# Importing libraries

In [1]:
import pandas as pd
from interpret.glassbox import (LogisticRegression,ClassificationTree,ExplainableBoostingClassifier,ebm,decisiontree,RegressionTree)
from interpret import show
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from interpret.blackbox import LimeTabular
import dice_ml


### reading csv files

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


### encoding of data  - changing object data types to int for data preprocessing

In [4]:

df.replace({'Sex':{'M':0,'F':1}},inplace = True)
df.replace({'ChestPainType':{'TA':1,'ATA':2,'NAP':3,'ASY':4}},inplace=True)
df.replace({'RestingECG':{'Normal':0,'ST':1,'LVH':2}},inplace = True)
df.replace({'ExerciseAngina':{'N':0,'Y':1}},inplace = True)
df.replace({'ST_Slope':{'Up':1,'Flat':2,'Down':3}},inplace = True)

### splitting and training dataset

In [5]:

x = df.iloc[:,:11].values
y = df['HeartDisease'].values
x.shape,y.shape

((918, 11), (918,))

In [6]:
X_train, X_test, y_train, y_test  = train_test_split(x,y,test_size=0.2)

## using Logistic Regression interpret ml library 

LogisticRegression take tranind data as input and generate explanations for locala and global test input

In [7]:
#training model for given datapoints
lr = LogisticRegression(random_state=2021, feature_names=df.iloc[:,:11].columns, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train)
print("Training finished.")

Training finished.


In [8]:
#predicting results
y_pred = lr.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)*100}")

F1 Score 0.8626660695626213
Accuracy 86.41304347826086


explain_local shows explanations for every entry in test dataset 
It explains how particular feature impacts the predcited output
measure of impact of features on individual data points

In [9]:
#generating explanations
'''explain_local give explaination for every data point i.e.
gives interscept show which parameter affect the dataset ans how (positively or negatively)'''
lr_local = lr.explain_local(X_test, y_test, name='Logistic Regression')
show(lr_local)

explain_global show the bigger picture. It represent the overall impact of individual feature on entire dataset

In [10]:
lr_global = lr.explain_global(name='Logistic Regression')
show(lr_global)

## using classification tree

In [11]:
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training finished.")
y_pred = tree.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

Training finished.
F1 Score 0.8638045891931903
Accuracy 0.8641304347826086


In [12]:
tree_local = tree.explain_local(X_test[:100], y_test[:100], name='Tree')
show(tree_local)

## using ExplainableBoostingClassifier

In [13]:
ebm = ExplainableBoostingClassifier(random_state=2021)
ebm.fit(X_train, y_train) 
print("Training finished.")
y_pred = ebm.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}*100")

Training finished.
F1 Score 0.8900047824007653
Accuracy 0.8913043478260869*100


In [14]:
ebm_local = ebm.explain_local(X_test[:100], y_test[:100], name='EBM')
show(ebm_local)

In [15]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# LIME

In [16]:
from sklearn.ensemble import RandomForestClassifier

 Local  Interpretable Model-agnostic Explanations generates local explanations 

In [19]:
features  = df.iloc[:,:11].columns.values
features

array(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol',
       'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak',
       'ST_Slope'], dtype=object)

In [43]:
from interpret.blackbox import LimeTabular

In [48]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

F1 Score 0.9012286770845759
Accuracy 0.9021739130434783


In [50]:
lime = LimeTabular(predict_fn=rf.predict_proba, 
                   data=X_train, 
                   random_state=1)

In [None]:
lime_local = lime.explain_local(X_test[-20:], 
                                y_test[-20:], 
                                name='LIME')

show(lime_local)

# SHAP

shap shows contribution of every feature in prediction

In [None]:
import shap

In [None]:
explainer = shap.TreeExplainer(rf)
# Calculate shapley values for test data
start_index = 1
end_index = 2
shap_values = explainer.shap_values(X_test[start_index:end_index])
X_test[start_index:end_index]

In [None]:
print(shap_values[0].shape)
shap_values

In [None]:
shap.initjs()
# Force plot
prediction = rf.predict(X_test[start_index:end_index])[0]
print(f"The RF predicted: {prediction}")
shap.force_plot(explainer.expected_value[1],
                shap_values[1],
                X_test[start_index:end_index],feature_names=df.iloc[:,:11].columns) # for values

In [29]:
shap.summary_plot(shap_values, X_test,feature_names=df.iloc[:,:11].columns)

NameError: name 'shap' is not defined

# Redefining model

In [30]:
df.iloc[:,:11].columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope'],
      dtype='object')

In [31]:
x_new = df[['ST_Slope','ChestPainType','Cholesterol','ExerciseAngina','Oldpeak','Age']].values

In [32]:
X1_train, X1_test, y1_train, y1_test  = train_test_split(x_new,y,test_size=0.2)

In [33]:
rf1 = RandomForestClassifier()
rf1.fit(X1_train, y1_train)
y1_pred = rf1.predict(X1_test)
print(f"F1 Score {f1_score(y1_test, y1_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y1_test, y1_pred)*100}")
print(f"Accuracy old {accuracy_score(y_test,y_pred)*100}")

F1 Score 0.8255717502073705
Accuracy 82.6086956521739
Accuracy old 89.67391304347827


In [34]:
data_dice = dice_ml.Data(dataframe=df, 
                         # For perturbation strategy
                         continuous_features=['Age','RestingBP','Cholesterol','FastingBS','MaxHR','Oldpeak'], 
                         outcome_name='HeartDisease')

In [35]:
rf_dice = dice_ml.Model(model=rf, 
                        backend="sklearn")
explainer = dice_ml.Dice(data_dice, 
                         rf_dice, 
                         method="random")

In [36]:
import numpy as np

In [37]:

# input_datapoint = X_test[0:1]
# features_to_vary=['RestingBP','Cholesterol','FastingBS']
# permitted_range={'RestingBP':[70,150],
#                  'Cholesterol':[40,200],
#                  'FastingBS':[50,2500]}
# columns = ['Age', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS','RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
# df = pd.DataFrame(data = input_datapoint,columns=columns)

# cf = explainer.generate_counterfactuals(df, 
#                                   total_CFs=3, 
#                                   desired_class="opposite",
#                                   permitted_range=permitted_range,
#                                   features_to_vary=features_to_vary
#                                 )

In [38]:
# cf.visualize_as_dataframe(show_only_changes=True)