In [32]:
%config Completer.use_jedi = False

In [227]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.experimental import enable_iterative_imputer

from sklearn.neighbors       import KNeighborsClassifier

from sklearn.naive_bayes     import GaussianNB

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm             import SVC

from sklearn.decomposition   import PCA

import imblearn
from   imblearn.pipeline          import make_pipeline # scikit-learn Pipeline does not work with imblearn

from sklearn.metrics import balanced_accuracy_score, fbeta_score, f1_score

In [14]:
reset -fs

In [17]:
data = pd.read_csv('data/heart_failure_clinical_records_dataset.csv')
data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [19]:
y = data.DEATH_EVENT # Our target variable is DEATH_EVENT 
X = data.drop(columns='DEATH_EVENT')  # Remove the target variable from X 

In [37]:
# How many observations do we have?
len(X)

299

After importing the data, the next step is to pull off a segment of the data that will be our testing set. This test set will be be hidden away from the models we are designing until we are ready to test one. Additionally, since there are only have 299 observations, I am going to split the data by a higher percentage to ensure that the testing set is somewhat representative of the data.

In [38]:
# Testing split that we are going to hide away from our model
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3)


In [63]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Feature Engineering
-----

In [64]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209 entries, 85 to 57
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       209 non-null    float64
 1   anaemia                   209 non-null    int64  
 2   creatinine_phosphokinase  209 non-null    int64  
 3   diabetes                  209 non-null    int64  
 4   ejection_fraction         209 non-null    int64  
 5   high_blood_pressure       209 non-null    int64  
 6   platelets                 209 non-null    float64
 7   serum_creatinine          209 non-null    float64
 8   serum_sodium              209 non-null    int64  
 9   sex                       209 non-null    int64  
 10  smoking                   209 non-null    int64  
 11  time                      209 non-null    int64  
dtypes: float64(3), int64(9)
memory usage: 21.2 KB


One can see that there are 12 columns in the data, and they all appear to have numeric data-types. However, some of the columns actually are actually categorical andd consist of 0s and 1s (corresponding to whether or not a specific patient falls into that category or not). In summary, these categorical columns have already been one-hot encoded, so there is no need to incorporate that into the preprocessing pipeline.

In [215]:
cat_cols = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
con_cols = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'age', 'time']
age_col = ['age']  # Split off age from con_cols since there MinMaxScaler(0,1) works well for ages

In [216]:
con_pipe = Pipeline([('scaler', StandardScaler()),  
                     ('imputer', SimpleImputer(strategy='median', add_indicator=True))])

# con_pipe = Pipeline([('scaler', StandardScaler()),  
#                      ('imputer', IterativeImputer())])

# age_pipe = Pipeline([('scaler', MinMaxScaler((0,1))),  # MinMax Scaler good to apply to age
#                      ('imputer', SimpleImputer(strategy='median', add_indicator=True))])

cat_pipe = Pipeline([('scaler', StandardScaler()),
                     ('imputer', SimpleImputer(strategy='most_frequent'))])

# cat_pipe = Pipeline([('imputer', IterativeImputer())])

preprocessing = ColumnTransformer([('categorical', cat_pipe, cat_cols),
                                   ('continuous', con_pipe, con_cols)])

In [102]:
# con_pipe = make_pipeline(imblearn.over_sampling.SMOTE(), StandardScaler(), SimpleImputer(strategy='median'))

# age_pipe = make_pipeline(imblearn.over_sampling.SMOTE(), MinMaxScaler((0,1)), SimpleImputer(strategy='median'))

# cat_pipe = make_pipeline(imblearn.over_sampling.SMOTE(), SimpleImputer(strategy='most_frequent'))

# preprocessing = ColumnTransformer([('categorical', cat_pipe, cat_cols),
#                                    ('continuous', con_pipe, con_cols),
#                                    ('age', age_pipe, age_col)])

# Algorithms & Search
-----

In [91]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [217]:

# pipe = make_pipeline(imblearn.over_sampling.SMOTE(),
#                      preprocessing,
#                      PCA(),
#                      DummyEstimator()
#                      )

pipe = Pipeline(steps = [('preprocessing', preprocessing),
                         ('dummyestimator', DummyEstimator())])


search_space = [
    {'dummyestimator': [LogisticRegression(n_jobs=-1)],
        'dummyestimator__C': np.linspace(0.01, 5, 10),
        'dummyestimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'dummyestimator__class_weight': ['balanced', None],
        'dummyestimator__penalty': ['l1', 'l2', 'elasticnet', 'none']},
    
    {'dummyestimator': [RandomForestClassifier(n_jobs=-1)],
        'dummyestimator__criterion': ['gini', 'entropy'],
        'dummyestimator__min_samples_leaf': np.linspace(1, 10, 4, dtype=int),
        'dummyestimator__bootstrap': [True, False],
        'dummyestimator__class_weight': [None, 'balanced', 'balanced_subsample'],
        'dummyestimator__n_estimators': np.linspace(50, 300, 5, dtype=int)},
    
    {'dummyestimator': [SVC()], 
         'dummyestimator__class_weight': ['balanced', None],
         'dummyestimator__C': np.linspace(1, 100, 10),
         'dummyestimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
         'dummyestimator__degree': range(1,10)}, 
    
    {'dummyestimator': [KNeighborsClassifier(n_jobs=-1)],
         'dummyestimator__leaf_size': np.linspace(5, 50, 5, dtype=int),
         'dummyestimator__n_neighbors': np.linspace(3, 13, 4, dtype=int),
         'dummyestimator__weights': ['uniform', 'distance'],
         'dummyestimator__p': [1,2]}, 
    
    {'dummyestimator': [GaussianNB()]}, 
    
    {'dummyestimator': [ExtraTreesClassifier(n_jobs=-1)], 
         'dummyestimator__criterion': ['gini', 'entropy'],
         'dummyestimator__min_samples_leaf': np.linspace(1, 30, 5, dtype=int),
         'dummyestimator__bootstrap': [True, False],
         'dummyestimator__class_weight': [None, 'balanced', 'balanced_subsample'],
         'dummyestimator__n_estimators': np.linspace(50, 500, 8, dtype=int)}
]


# print('''search_space = [
#     {'dummyestimator': [LogisticRegression(n_jobs=-1)],
#         'dummyestimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#         'clf__class_weight': ['balanced', None],
#         'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']},
    
#     {'dummyestimator': [RandomForestClassifier(n_jobs=-1)],
#         'clf__criterion': ['gini', 'entropy'],
#         'clf__min_samples_leaf': np.linspace(1, 10, 4, dtype=int),
#         'clf__bootstrap': [True, False],
#         'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
#         'clf__n_estimators': np.linspace(50, 300, 5, dtype=int)},
    
#     {'dummyestimator': [SVC()], 
#          'clf__class_weight': ['balanced', None],
#          'clf__C': np.linspace(1, 100, 10),
#          'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
#          'clf__degree': range(1,10)}, 
    
#     {'dummyestimator': [KNeighborsClassifier(n_jobs=-1)],
#          'clf__leaf_size': np.linspace(5, 50, 5, dtype=int),
#          'clf__n_neighbors': np.linspace(3, 13, 4, dtype=int),
#          'clf__weights': ['uniform', 'distance'],
#          'clf__p': [1,2]}
# ]'''.replace('clf__', 'dummyestimator__'))

In [218]:
gs = RandomizedSearchCV(pipe, 
                        search_space, 
                        scoring='f1_weighted', 
                        n_iter=30,
                        cv=5,
                        n_jobs=-1
                        )

gs.fit(X_train, y_train)

gs.best_score_, gs.best_params_

(0.8436796864430655,
 {'dummyestimator__n_estimators': 300,
  'dummyestimator__min_samples_leaf': 10,
  'dummyestimator__criterion': 'gini',
  'dummyestimator__class_weight': 'balanced_subsample',
  'dummyestimator__bootstrap': True,
  'dummyestimator': RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=10,
                         n_estimators=300, n_jobs=-1)})

In [222]:
model = gs.best_params_['dummyestimator']
model

RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=10,
                       n_estimators=300, n_jobs=-1)

# Evaluation Metrics
----

Now that I have our ideal model based on automated hyperparameter search and model selection, next I look at a variety of evaluation metrics on the training data to assess our models performance on the training set. The first metric I looked at was the weighted f1 score since that is what I used for the cross validation.

In [224]:
pipe = Pipeline(steps = [('preprocessing', preprocessing), 
                         ('clf', model)])

In [226]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_train)
f1_score(y_train, y_pred, average='weighted')

0.8991584936439413

In [230]:
f1_score(y_train, y_pred, average='weighted')

0.8991584936439413

I chose to look at a fbeta_score metric with a beta=2. I chose this metric since in a business setting use of this ML model deals with human lives. Therefore, this model should focus on reducing the amount of false negatives (we would rather tell someone they are at risk of a death event when they're not than miss identifying a person who has a death event. For this reason, I chose to look at fbeta_score with a beta value greater than 1 (as this puts more emphasis on false positives).

In [232]:
fbeta_score(y_train, y_pred, beta=2)

0.8793103448275862

Another evaluation metric I chose to look at 