In [8]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'
# Global seaborn defaults for the plots in this notebook:
# the "whitegrid" style and the "CMRmap_r" colour palette.
sns.set_style("whitegrid")
sns.set_palette("CMRmap_r")

import sklearn
from sklearn.model_selection import train_test_split

# Signal that the imports and plotting configuration above ran without error.
print('Setup Complete.')

# Show which files are in ./data (path is relative to the current working directory).
print(os.listdir('./data'))

Setup Complete.
['heart.csv']


In [9]:
# Read the heart-disease CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('./data/heart.csv')
# Preview the first five rows as a quick check of the loaded data.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
!pip install eli5
!pip install shap
!pip install pdpbox

Collecting eli5
  Downloading eli5-0.10.1-py2.py3-none-any.whl (105 kB)
[K     |████████████████████████████████| 105 kB 1.9 MB/s eta 0:00:01
Collecting tabulate>=0.7.7
  Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)
Installing collected packages: tabulate, eli5
Successfully installed eli5-0.10.1 tabulate-0.8.7
Collecting shap
  Downloading shap-0.36.0.tar.gz (319 kB)
[K     |████████████████████████████████| 319 kB 2.2 MB/s eta 0:00:01
Collecting tqdm>4.25.0
  Downloading tqdm-4.51.0-py2.py3-none-any.whl (70 kB)
[K     |████████████████████████████████| 70 kB 7.5 MB/s eta 0:00:011
[?25hCollecting slicer
  Downloading slicer-0.0.4-py3-none-any.whl (13 kB)
Collecting numba
  Downloading numba-0.51.2-cp38-cp38-macosx_10_14_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 6.3 MB/s eta 0:00:01
Collecting llvmlite<0.35,>=0.34.0.dev0
  Downloading llvmlite-0.34.0-cp38-cp38-macosx_10_9_x86_64.whl (18.4 MB)
[K     |████████████████████████████████| 18.4 MB 7.4 M

  Building wheel for psutil (setup.py) ... [?25ldone
[?25h  Created wheel for psutil: filename=psutil-5.7.3-cp38-cp38-macosx_10_9_x86_64.whl size=234911 sha256=91daeeb972dc00735de6ade90359b8a5b62b6a0a667f990ab7a88f9ffa9cbcbd
  Stored in directory: /Users/jordansamek/Library/Caches/pip/wheels/f6/59/c2/38111ef4c354088a156bc95fbeb5396c0cac91a0f62f7158b9
Successfully built pdpbox psutil
Installing collected packages: psutil, pdpbox
Successfully installed pdpbox-0.2.0 psutil-5.7.3


In [10]:
# Scikit-learn and other imports

# Model evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

# Models we'll be using and testing
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

import eli5
from eli5.sklearn import PermutationImportance
import shap
from pdpbox import pdp, info_plots

# Seed NumPy's global random number generator so np.random draws are repeatable.
np.random.seed(42)

import warnings
# Ignore all warnings from this point on.
# NOTE(review): this blanket filter also hides pandas and scikit-learn warnings
# that can point at real problems — consider restricting it to specific categories.
warnings.filterwarnings('ignore')

Let's take a look at our columns again and see what the abbreviations mean.

In [11]:
# Show the first five rows again so the short column names are visible next to their descriptions.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


What do we have...

- **age** - The patient's age in years
- **sex** - The gender of the patient (1 = male; 0 = female)
- **cp** - Chest pain experienced (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
- **trestbps** - The patient's resting blood pressure (measured in mmHg)
- **chol** - Serum cholesterol level (measured in mg/dl)
- **fbs** - Fasting blood sugar (> 120 mg/dl; 1 = true, 0 = false)
- **restecg** - Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
- **thalach** - The patient's maximum heart rate achieved
- **exang** - Exercise induced angina (1 = yes; 0 = no)
- **oldpeak** - ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot)
- **slope** - The slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
- **ca** - The number of major vessels (0-3)
- **thal** - A blood disorder called Thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect)
- **target** - Heart disease (0 = no; 1 = yes)

We'll change a few of these columns around for better interpretability.

In [12]:
# Descriptive replacements for the dataset's abbreviated column names.
# Renaming by name (instead of assigning a positional list) keeps every label
# attached to the right data even if the CSV column order changes, and makes
# the cell safe to re-run: names that are already descriptive pass through.
COLUMN_RENAMES = {
    'age': 'age',
    'sex': 'sex',
    'cp': 'chest_pain_type',
    'trestbps': 'resting_blood_pressure',
    'chol': 'cholesterol',
    'fbs': 'fasting_blood_sugar',
    'restecg': 'rest_ecg',
    'thalach': 'max_heart_rate_achieved',
    'exang': 'exercise_induced_angina',
    'oldpeak': 'st_depression',
    'slope': 'st_slope',
    'ca': 'num_major_vessels',
    'thal': 'thalassemia',
    'target': 'target',
}
df.columns = [COLUMN_RENAMES.get(name, name) for name in df.columns]

# Fail loudly if any expected column is absent after renaming.
missing_columns = set(COLUMN_RENAMES.values()) - set(df.columns)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"

We'll also change around some of the values for the categorical variables for better interpretability of the dataset.

In [15]:
# Human-readable labels for the integer-coded categorical columns.
# Codes that are not listed for a column are left unchanged.
# NOTE(review): the preview of the raw data shows code 0 in chest_pain_type and
# st_slope, which has no label here, so those rows keep the integer 0 and the
# columns end up with mixed int/str values — confirm the intended coding
# against the dataset documentation.
# NOTE(review): the fasting_blood_sugar labels say "mg/ml" while the column
# description uses mg/dl; the strings are kept as-is because later cells may
# depend on them.
CATEGORY_LABELS = {
    'sex': {
        0: 'female',
        1: 'male',
    },
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-angina pain',
        4: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {
        0: 'no',
        1: 'yes',
    },
    'st_slope': {
        1: 'upsloping',
        2: 'flat',
        3: 'downsloping',
    },
    'thalassemia': {
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}

# Assign whole columns rather than using chained indexing (df[col][mask] = value):
# pandas may apply a chained assignment to a temporary copy, and it has no
# effect at all when Copy-on-Write is enabled. Series.replace leaves values
# without an entry in the mapping untouched, and re-running the cell is a no-op.
for column, labels in CATEGORY_LABELS.items():
    df[column] = df[column].replace(labels)

In [16]:
# Inspect the column dtypes after relabelling (the relabelled columns show as object).
df.dtypes

age                          int64
sex                         object
chest_pain_type             object
resting_blood_pressure       int64
cholesterol                  int64
fasting_blood_sugar         object
rest_ecg                    object
max_heart_rate_achieved      int64
exercise_induced_angina     object
st_depression              float64
st_slope                    object
num_major_vessels            int64
thalassemia                 object
target                       int64
dtype: object