<a href="https://colab.research.google.com/github/kenypatel233/XAI_Dump/blob/main/XAI_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Interpretable ML models with Microsoft's InterpretML
- Linear Regression
- Decision trees
- Explainable boosting machine


---
Dataset used: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset




In [2]:
!pip install interpret

Collecting interpret
  Downloading interpret-0.2.7-py3-none-any.whl (1.4 kB)
Collecting interpret-core[dash,debug,decisiontree,ebm,lime,linear,notebook,plotly,required,sensitivity,shap,skoperules,treeinterpreter]>=0.2.7
  Downloading interpret_core-0.2.7-py3-none-any.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 10.0 MB/s 
[?25hCollecting SALib>=1.3.3
  Downloading SALib-1.4.5-py2.py3-none-any.whl (756 kB)
[K     |████████████████████████████████| 756 kB 78.5 MB/s 
Collecting ipython>=7.4.0
  Downloading ipython-7.28.0-py3-none-any.whl (788 kB)
[K     |████████████████████████████████| 788 kB 64.0 MB/s 
[?25hCollecting ipykernel>=5.1.0
  Downloading ipykernel-6.4.1-py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 77.8 MB/s 
[?25hCollecting skope-rules>=1.0.1
  Downloading skope_rules-1.0.1-py3-none-any.whl (14 kB)
Collecting treeinterpreter>=0.2.2
  Downloading treeinterpreter-0.2.3-py2.py3-none-any.whl (6.0 kB)
Collecting shap>=0.28.5
 

In [1]:
#Imports
import pandas as pd
import numpy as np
from interpret.glassbox import (LogisticRegression,
                                ClassificationTree, 
                                ExplainableBoostingClassifier)
from interpret import show
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler




#### 1.0 Data exploration & pre-processing


In [2]:
df= pd.read_csv("/content/healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
df['bmi'].describe()


count    4909.000000
mean       28.893237
std         7.854067
min        10.300000
25%        23.500000
50%        28.100000
75%        33.100000
max        97.600000
Name: bmi, dtype: float64

In [12]:
df.bmi = df.bmi.fillna(28.9)

In [13]:
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [14]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893503,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.698018,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.8,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.4,0.0
75%,54682.0,61.0,0.0,0.0,114.09,32.8,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [15]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [16]:
# One-hot encode all categorical columns
categorical_cols = ["gender",
                            "ever_married",
                            "work_type",
                            "Residence_type",
                            "smoking_status"]
encoded = pd.get_dummies(df[categorical_cols], 
                                prefix=categorical_cols)

# Update data with new columns
df = pd.concat([encoded, df], axis=1)
df.drop(categorical_cols, axis=1, inplace=True)

df.head()

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,9046,67.0,0,1,228.69,36.6,1
1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,51676,61.0,0,0,202.21,28.9,1
2,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,31112,80.0,0,1,105.92,32.5,1
3,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,60182,49.0,0,0,171.23,34.4,1
4,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1665,79.0,1,0,174.12,24.0,1


In [17]:
df.drop("id",axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,67.0,0,1,228.69,36.6,1
1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,61.0,0,0,202.21,28.9,1
2,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,80.0,0,1,105.92,32.5,1
3,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,49.0,0,0,171.23,34.4,1
4,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,79.0,1,0,174.12,24.0,1


In [20]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [21]:
#Train-test split
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=25)

In [23]:
print(X_train.shape)
print(y_train.shape)

(4088, 21)
(4088,)


In [24]:
#Oversampling data
oversample = RandomOverSampler(sampling_strategy='minority')
# Convert to numpy and oversample
x_np = X_train.to_numpy()
y_np = y_train.to_numpy()
x_np, y_np = oversample.fit_resample(x_np, y_np)
# Convert back to pandas
x_over = pd.DataFrame(x_np, columns=X_train.columns)
y_over = pd.Series(y_np, name=y_train.name)



In [25]:
X_train = x_over
y_train = y_over

#### 2.1 Logistic Regression model

In [29]:
#Training the model
log_reg = LogisticRegression(random_state=25
                             , feature_names=X_train.columns, 
                             penalty='l1', 
                             solver='liblinear')
log_reg.fit(X_train, y_train)
print("Done with training!")

Done with training!


In [30]:
#Prediction and evaluation of model
#F1 score is used because of class imbalance
y_pred = log_reg.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")


F1 Score 0.5191289422586267
Accuracy 0.7221135029354208


In [31]:
#  Explain local prediction for first 100 values
lr_local = log_reg.explain_local(X_test[:100], y_test[:100], name='Logistic Regression')
show(lr_local)



  detected_envs


In [33]:
#  Explain global logistic regression model
lr_global = log_reg.explain_global(name='Logistic Regression')
show(lr_global)