# UCI Heart Disease Detection

This notebook is used as part of my thesis, comparing different XAI methods and libraries.
<br/>
The purpose of the models is to classify whether a patient is healthy or has heart disease.
<br/>
<br/>
Attributes:
1. age
2. sex (1=male, 0=female)
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholesterol in milligrams per deciliter (mg/dl)
6. fasting blood sugar > 120 mg/dl
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by fluoroscopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

Dataset: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

## 1 Set up Environment and Dataset <a class="anchor" id="ch1"></a>

### 1.1 Load Libraries and Set Up Parameters <a class="anchor" id="ch1.1"></a>

In [None]:
# Random seed for reproduction: passed below as random_state / seed to the
# train/test split, the scikit-learn models and the Anchor explainer.
seedNum = 23

In [None]:
# import dependencies
import math
import random
import urllib.request
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import catboost
import shap
import lime
import lime.lime_tabular
import graphviz
import tensorflow as tf

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.inspection import partial_dependence, plot_partial_dependence

from catboost import CatBoostClassifier
from alibi.explainers import AnchorTabular, CounterFactualProto, CounterFactual
from alibi.utils.mapping import ohe_to_ord, ord_to_ohe

# required installs:
# pip install shap
# pip install lime
# pip install alibi
# conda install python-graphviz AND install from https://graphviz.org/download/

In [None]:
# timer for the script processing
startTimeScript = datetime.now()

# seed the global Python and NumPy generators so that library code which falls
# back to global random state (instead of an explicit seed argument) gives the
# same results on every Restart & Run All
random.seed(seedNum)
np.random.seed(seedNum)

# number of parallel workers handed to the estimators / grid searches below
n_jobs = 6

# set flag for splitting the dataset
splitDataset = True
# fraction of the rows held out as test set
splitPercentage = 0.20

# set number of folds for cross validation
n_folds = 10

# metric used by every cross-validation / grid search in this notebook
scoring = 'accuracy'

In [None]:
# order of columns for explanation compatibility:
# the seven categorical features come first (positions 0-6), the target is last
new_order=["sex","cp","fbs","restecg","exang","slope","thal","age","trestbps","chol","thalach","oldpeak","ca","target"]

# dictionary of categorical variable values:
# key = column position in new_order, value = readable label for each integer code
category_map={0: ["female", "male"],
              1: ["typical angina","atypical angina","non-anginal pain","asymptomatic"],
              # unit fixed: fasting blood sugar is measured in mg/dl (was "ml/dl")
              2: ["below 120 mg/dl","above 120 mg/dl"],
              3: ["normal","ST-T wave abnormality","probable left ventricular hypertrophy"],
              4: ["no","yes"],
              5: ["upsloping","flat","downsloping"],
              6: ["no info","normal","fixed defect","reversable defect"]
             }

# dict of column names for renaming (short CSV header -> readable name)
col_names = {"cp":'chest pain type', "trestbps":'resting blood pressure', "chol":'serum cholesterol (mg/dl)',
             "fbs":'fasting blood sugar', "restecg":'resting ecg results',
             "thalach":'maximum heart rate achieved', "exang":'exercise induced angina',
             "oldpeak":'exercise induced ST depression',
             "slope":'slope of peak exercise ST segment', "ca":'vessels colored by flourosopy', "thal":"thalassemia type"}

In [None]:
# import dataset: read the CSV, put the columns into the order given by
# new_order and replace the short headers by the readable names in col_names
dataset_path = 'data/heart.csv'
Xy_original = (
    pd.read_csv(dataset_path)
      .loc[:, new_order]
      .rename(columns=col_names)
)
Xy_original.shape

### 1.2 Preprocessing <a class="anchor" id="ch1.2"></a>

In [None]:
# totCol holds the number of columns in the dataframe; the last column is the
# target, so the number of attributes is one less
totCol = Xy_original.shape[1]
totAttr = totCol - 1

# features = all columns before the last one, target = the last column
X_original = Xy_original.iloc[:, :totAttr]
y_original = Xy_original.iloc[:, totAttr]

print("Xy_original.shape: {} X_original.shape: {} y_original.shape: {}".format(
    *(frame.shape for frame in (Xy_original, X_original, y_original))))

In [None]:
# Hold out a test set of size splitPercentage; the split is stratified on the
# target and seeded with seedNum
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X_original,
    y_original,
    test_size=splitPercentage,
    stratify=y_original,
    random_state=seedNum,
)

for template, shapes in (
    ("X_train.shape: {} y_train_df.shape: {}", (X_train_df.shape, y_train_df.shape)),
    ("X_test_df.shape: {} y_test_df.shape: {}", (X_test_df.shape, y_test_df.shape)),
):
    print(template.format(*shapes))

In [None]:
# Convert the pandas train/test objects into NumPy arrays for the modeling
# and explanation steps
X_train, y_train = X_train_df.to_numpy(), y_train_df.to_numpy()
X_test, y_test = X_test_df.to_numpy(), y_test_df.to_numpy()

for template, arrays in (
    ("X_train.shape: {} y_train.shape: {}", (X_train, y_train)),
    ("X_test.shape: {} y_test.shape: {}", (X_test, y_test)),
):
    print(template.format(*(array.shape for array in arrays)))

In [None]:
# preprocessor

feature_names = X_original.columns.values

# the keys of category_map are the column positions of the categorical
# features; every other column position is treated as numeric
categorical_features = list(category_map.keys())
num_features = [x for x in range(len(feature_names)) if x not in categorical_features]

# numeric columns: fill missing values with the column median
num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

# categorical columns: fill missing values with the most frequent code, then one-hot encode.
# A median is not meaningful for category codes and can fall between two codes
# (e.g. 0.5 for a binary column), which would create a category that does not exist.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# numeric block first, one-hot block second in the transformed matrix
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features),
                                               ('cat', categorical_transformer, categorical_features)])
preprocessor.fit(X_train)

## 2 Tree-based Modeling <a class="anchor" id="ch2"></a>

### 2.1 Try Some Untuned Models <a class="anchor" id="ch2.1"></a>

In [None]:
# Set up Algorithms Spot-Checking Array

startTimeModule = datetime.now()

# containers filled by the spot-check loop: per-model CV scores, names and mean scores
train_results, train_model_names, train_metrics = [], [], []

# (short name, untuned estimator) pairs; every estimator that accepts them
# gets the shared seed and worker count
train_models = [
    ('LR', LogisticRegression(random_state=seedNum, n_jobs=n_jobs)),
    ('KN', KNeighborsClassifier(n_jobs=n_jobs)),
    ('DT', DecisionTreeClassifier(random_state=seedNum)),
    ('BT', BaggingClassifier(random_state=seedNum, n_jobs=n_jobs)),
    ('RF', RandomForestClassifier(random_state=seedNum, n_jobs=n_jobs)),
    ('ET', ExtraTreesClassifier(random_state=seedNum, n_jobs=n_jobs)),
    ('GB', GradientBoostingClassifier(random_state=seedNum)),
]

In [None]:
# Generate models in turn

# timer for the whole loop; startTimeModule is restarted for every model, so it
# cannot be used for the total at the end
startTimeAllModels = datetime.now()

# start from empty result containers so that re-running this cell does not
# append a second copy of every score
train_results.clear()
train_model_names.clear()
train_metrics.clear()

# identical for every model, so compute once outside the loop
X_train_transformed = preprocessor.transform(X_train)
kfold = KFold(n_splits=n_folds)

for name, model in train_models:
    startTimeModule = datetime.now()
    cv_results = cross_val_score(model, X_train_transformed, y_train, cv=kfold, scoring=scoring)
    train_results.append(cv_results)
    train_model_names.append(name)
    train_metrics.append(cv_results.mean())
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
    print(model)
    print ('Model training time:', (datetime.now() - startTimeModule), '\n')
print ('Average metrics ('+scoring+') from all models:',np.mean(train_metrics))
print ('Total training time for all models:',(datetime.now() - startTimeAllModels))

### 2.2 Train and Set Up Reference Models <a class="anchor" id="ch2.2"></a>

Decision Tree:

In [None]:
startTimeModule = datetime.now()

# hyper-parameter grid for the decision tree
decisiontree = DecisionTreeClassifier(random_state=seedNum)
param_grid_tree = {
    "max_depth": [4,6,8],
    "criterion" : ["gini","entropy"],
    "min_samples_leaf": [6,10,14]
}

kfold = KFold(n_splits=n_folds)
# With a single scoring metric the search always selects by that metric and
# refit is only checked for truthiness; the former refit="Recall" therefore
# never selected by recall. refit=True states what actually happens.
grid = GridSearchCV(decisiontree, param_grid_tree, scoring=scoring, cv=kfold, n_jobs=n_jobs, refit=True)

grid_result = grid.fit(preprocessor.transform(X_train), y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print ('Computing time:',(datetime.now() - startTimeModule))

# best_estimator_ has already been refit on the complete training data by the
# grid search, so no further fit is needed; both names refer to that model
clf_dt_be = grid_result.best_estimator_
clf_dt = clf_dt_be

Random Forest:

In [None]:
startTimeModule = datetime.now()

tune_model = RandomForestClassifier(random_state=seedNum, n_jobs=n_jobs)

# hyper-parameter grid for the random forest
n_estimators = [100]
criterion = ["gini","entropy"]
max_features =[None, "sqrt", 0.2, 0.3, 0.4, 0.5]

paramGrid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features)

kfold = KFold(n_splits=n_folds)
# With a single scoring metric the search always selects by that metric and
# refit is only checked for truthiness; the former refit="Recall" therefore
# never selected by recall. refit=True states what actually happens.
grid = GridSearchCV(estimator=tune_model, param_grid=paramGrid, scoring=scoring, cv=kfold, refit=True)
grid_result = grid.fit(preprocessor.transform(X_train), y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print ('Computing time:',(datetime.now() - startTimeModule))

# best_estimator_ has already been refit on the complete training data by the
# grid search, so no further fit is needed; both names refer to that model
clf_rf_be = grid_result.best_estimator_
clf_rf = clf_rf_be

Gradient Boosting:

In [None]:
# set to False to skip the (slow) grid search and use the fallback parameters
gridsearch_new = True

# parameters used for the reference model when no grid search is run
best_params_catb = {"depth": 6}

if gridsearch_new:
    param_grid_catB = {
        "depth": [6,10],
    }

    clf_catb = CatBoostClassifier(
        eval_metric='Accuracy',
    )

    startTimeModule = datetime.now()
    grid_search = clf_catb.grid_search(param_grid_catB, preprocessor.transform(X_train), y_train, partition_random_seed=seedNum, plot=False)

    print ('Computing time:',(datetime.now() - startTimeModule))

    # grid_search() returns a dict; its "params" entry holds the best
    # parameter combination, which is used for the reference model below
    # instead of a hard-coded depth
    best_params_catb = grid_search["params"]

print("Best parameters:", best_params_catb)

# reference model; built here (and not inside the if) so that the cell also
# works when the grid search is skipped
clf_cb_be = CatBoostClassifier(eval_metric='Accuracy', verbose=False, **best_params_catb)

startTimeModule = datetime.now()
clf_cb = clf_cb_be.fit(preprocessor.transform(X_train), y_train, verbose=False)
print ('Computing time:',(datetime.now() - startTimeModule))

### 2.3 Model Evaluation <a class="anchor" id="ch2.3"></a>

In [None]:
# the fitted preprocessor gives the same matrices every time, so encode once
X_test_encoded = preprocessor.transform(X_test)
X_train_encoded = preprocessor.transform(X_train)

# test-set predictions of the three fitted reference models
predictions_dt, predictions_rf, predictions_cb = (
    fitted_model.predict(X_test_encoded) for fitted_model in (clf_dt, clf_rf, clf_cb)
)

# cross-validated training scores of the three reference estimators
cv_dt, cv_rf, cv_cb = (
    cross_val_score(estimator, X_train_encoded, y_train, cv=kfold, scoring=scoring)
    for estimator in (clf_dt_be, clf_rf_be, clf_cb_be)
)

# one report per model: (model, heading, test predictions, CV scores)
evaluation_rows = [
    (clf_dt, "\n\nConfusion Matrix:", predictions_dt, cv_dt),
    (clf_rf, "\nConfusion Matrix:", predictions_rf, cv_rf),
    (clf_cb, "\nConfusion Matrix:", predictions_cb, cv_cb),
]

for position, (fitted_model, heading, predicted, cv_scores) in enumerate(evaluation_rows):
    # separator line between consecutive reports (none after the last one)
    if position > 0:
        print("--------------------------------------------------------\n")
    print(fitted_model, heading)
    print(confusion_matrix(y_test, predicted))
    print("\n\nClassification Report:\n\n",classification_report(y_test, predicted))
    print("Cross-Validation: %f (%f)" % (cv_scores.mean(), cv_scores.std()))

In [None]:
# A ROC curve needs continuous scores. Hard 0/1 predictions give only a single
# operating point and therefore a degenerate curve and a misleading AUC, so the
# predicted probability of the positive class is used instead.
# NOTE(review): column 1 of predict_proba is taken as the positive class,
# which assumes the target is encoded as 0/1 (consistent with pos_label=1).
X_test_scored = preprocessor.transform(X_test)
scores_dt = clf_dt.predict_proba(X_test_scored)[:, 1]
scores_rf = clf_rf.predict_proba(X_test_scored)[:, 1]
scores_cb = clf_cb.predict_proba(X_test_scored)[:, 1]

dt_fpr, dt_tpr, dt_thresholds = roc_curve(y_true=y_test, y_score=scores_dt, pos_label=1)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_true=y_test, y_score=scores_rf, pos_label=1)
cb_fpr, cb_tpr, cb_thresholds = roc_curve(y_true=y_test, y_score=scores_cb, pos_label=1)

print("Decision Tree AUC: ",auc(dt_fpr, dt_tpr))
print("Random Forest AUC: ",auc(rf_fpr, rf_tpr,))
print("Catboost AUC: ",auc(cb_fpr, cb_tpr,))

In [None]:
plt.figure(figsize=(12,10))

# (x values, y values, optional format/colour argument, legend label) per line;
# the last two lines are the chance diagonal and the ideal classifier
roc_lines = [
    (dt_fpr, dt_tpr, (), "Decision Tree"),
    (rf_fpr, rf_tpr, ("brown",), "Random Forest"),
    (cb_fpr, cb_tpr, ("violet",), "Gradient Boosting"),
    ([0,1], [0,1], ('r-',), 'Random Predictions'),
    ([0,0,1,1], [0,1,1,1], ('g-',), 'Perfect Predictions'),
]
for x_values, y_values, line_format, legend_label in roc_lines:
    plt.plot(x_values, y_values, *line_format, label=legend_label)

plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

## 3 Anchors <a class="anchor" id="ch3"></a>

In [None]:
# model to inspect and position of the test instance to look at
clf, pred_idx = clf_cb, 1

# class probabilities of that model for the whole test set
probabilities = clf.predict_proba(preprocessor.transform(X_test))

for caption, value in (
    ("Probabilities: ", probabilities[pred_idx]),
    ("Correct class: ", y_test[pred_idx]),
):
    print(caption, value)

In [None]:
# readable labels for the two target classes (index 0, index 1)
target_names = ["healthy", "sick"]
# feature names in column order, as a NumPy array
feature_names = X_original.columns.to_numpy()

In [None]:
def predict_fn(x):
    """Return class probabilities of the selected model `clf` for raw feature rows.

    The rows are first passed through the fitted `preprocessor`, so callers
    hand in data in the original (un-encoded) column layout.

    Uses `clf` instead of a hard-coded `clf_rf`, so the explanations refer to
    the same model whose probabilities are printed for the selected instance.
    """
    return clf.predict_proba(preprocessor.transform(x))

In [None]:
# Build the Anchor explainer around predict_fn. category_map supplies the
# readable value names of the categorical columns (keyed by column position)
# and seedNum is passed as the explainer's seed.
explainer = AnchorTabular(predict_fn, feature_names, categorical_names=category_map, seed=seedNum)

In [None]:
# Fit the explainer on the un-encoded training matrix.
# NOTE(review): disc_perc presumably gives the percentiles (here the quartiles)
# used to bin the numeric features - confirm against the installed alibi version.
explainer.fit(X_train, disc_perc=[25, 50, 75])

In [None]:
class_names = target_names

# explain the selected test instance with a precision threshold of 0.95
anchor_instance = X_test[pred_idx]
anch_exp = explainer.explain(anchor_instance, threshold=0.95)

anchor_rule = '\n AND '.join(anch_exp.anchor)
print('\nANCHOR:\n\nIF %s' % (anchor_rule))

# the explainer's predictor is called on a single-row batch and its first
# result is used as index into class_names
predicted_class = explainer.predictor(anchor_instance.reshape(1, -1))[0]
print('THEN PREDICT: ', class_names[predicted_class])

print('\nWITH PRECISION: %.2f' % anch_exp.precision)
print('WITH COVERAGE: %.2f' % anch_exp.coverage)

## 4 LIME

In [None]:
# LIME explainer built on the un-encoded training matrix. The categorical
# column positions and their readable value names come from
# categorical_features / category_map.
# random_state is set to the notebook seed so the explanations are the same on every run.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train ,feature_names = feature_names,class_names=class_names,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=category_map, kernel_width=3,
                                                   random_state=seedNum)

In [None]:
# show the class labels that were passed to the LIME explainer
class_names

In [None]:
# explain the selected test instance with LIME, limited to 5 features,
# and render the explanation in the notebook
lime_instance = X_test[pred_idx]
exp = lime_explainer.explain_instance(
    lime_instance,
    predict_fn,
    num_features=5,
)
exp.show_in_notebook(show_all=False)

In [None]:
# the explanation in list form, shown as cell output
# NOTE(review): presumably (feature description, weight) pairs - confirm against the LIME API
explist=exp.as_list()
explist