In [1]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(Path(os.getenv('DATA')) / 'stroke/healthcare-dataset-stroke-data.csv')

In [3]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
df = df.drop('id', axis=1)

In [6]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

We have significantly unbalanced data. We'll have to fix this.

In [8]:
le = LabelEncoder()
en_df = df.apply(le.fit_transform)
en_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,88,0,1,1,2,1,3850,239,1,1
1,0,82,0,0,1,3,0,3588,418,2,1
2,1,101,0,1,1,2,0,2483,198,2,1
3,0,70,0,0,1,2,1,3385,217,3,1
4,0,100,1,0,1,3,0,3394,113,2,1


# Clean up dataset

In [9]:
en_df_imputed = en_df
imputer = KNNImputer(n_neighbors=4, weights="uniform")
imputer.fit_transform(en_df_imputed)

array([[  1.,  88.,   0., ..., 239.,   1.,   1.],
       [  0.,  82.,   0., ..., 418.,   2.,   1.],
       [  1., 101.,   0., ..., 198.,   2.,   1.],
       ...,
       [  0.,  56.,   0., ..., 179.,   2.,   0.],
       [  1.,  72.,   0., ..., 129.,   1.,   0.],
       [  0.,  65.,   0., ..., 135.,   0.,   0.]])

In [10]:
en_df_imputed.isnull().sum()


gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [11]:
features=['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type',
       'smoking_status']

In [12]:
from imblearn.over_sampling import SMOTE
X, y = en_df_imputed[features], en_df_imputed["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)

# Modeling

In [13]:
from functools import partial

from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn.metrics import accuracy_score, f1_score

In [14]:
num_trials = 100

## XGBoost

In [15]:
from xgboost import XGBClassifier

In [16]:
xgb_space={'max_depth': scope.int(hp.quniform("max_depth", 3, 18, 1)),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [17]:
def train_clf(clf, params):
    clf=clf(**params)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds>0.5)

    return {'loss': -accuracy, 'status': STATUS_OK}

In [18]:
def train_xgb(params):
    """
    xgb needs eval_metric or lots of warnings
    """
    clf=XGBClassifier(**params)
    clf.fit(X_train, y_train, eval_metric='logloss')
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)

    return {'loss': -accuracy, 'status': STATUS_OK}

In [19]:
trials = Trials()

fmin(fn = train_xgb,
    space = xgb_space,
    algo = tpe.suggest,
    max_evals = num_trials,
    trials = trials)

100%|█████████████████████████████████████████████| 100/100 [00:24<00:00,  4.17trial/s, best loss: -0.7553816046966731]


{'colsample_bytree': 0.680758006639028,
 'gamma': 2.3049640504078956,
 'max_depth': 10.0,
 'min_child_weight': 9.0,
 'reg_alpha': 40.0,
 'reg_lambda': 0.801884811049699}

In [20]:
best_hyperparams = space_eval(xgb_space, trials.argmin)

In [21]:
best_hyperparams

{'colsample_bytree': 0.680758006639028,
 'gamma': 2.3049640504078956,
 'max_depth': 10,
 'min_child_weight': 9.0,
 'n_estimators': 180,
 'reg_alpha': 40.0,
 'reg_lambda': 0.801884811049699,
 'seed': 0}

In [22]:
xgb_clf = XGBClassifier(**best_hyperparams)

In [23]:
xgb_clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.680758006639028,
              enable_categorical=False, gamma=2.3049640504078956, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=9.0, missing=nan, monotone_constraints='()',
              n_estimators=180, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=40.0,
              reg_lambda=0.801884811049699, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [24]:
xgb_preds = xgb_clf.predict(X_test)

In [25]:
f1_score(y_test, xgb_preds)

0.2378048780487805

In [26]:
accuracy_score(y_test, xgb_preds)

0.7553816046966731

## Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
rf_space = {
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 600, 50)),
    "max_depth": hp.quniform("max_depth", 1, 15, 1),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

In [None]:
trials = Trials()

fmin(fn = partial(train_clf, RandomForestClassifier),
    space = rf_space,
    algo = tpe.suggest,
    max_evals = num_trials,
    trials = trials)

 21%|█████████▋                                    | 21/100 [00:17<01:29,  1.13s/trial, best loss: -0.8463796477495108]

In [None]:
rf_best_hyperparams = space_eval(rf_space, trials.argmin)

In [None]:
rf_best_hyperparams

In [None]:
rf_clf = RandomForestClassifier(**rf_best_hyperparams)

In [None]:
rf_clf.fit(X_train, y_train)

In [None]:
rf_preds = rf_clf.predict(X_test)

In [None]:
f1_score(y_test, rf_preds)

In [None]:
accuracy_score(y_test, rf_preds)

## Ensembling

Some of these models are already ensemble models. But who says you can't ensemble ensemble models? No one that I'm currently listening to!

#### Averaging

In [None]:
xgb_preds + rf_preds

#### Voting

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
clfs = [('xbg', xgb_clf), ('rf', rf_clf)]
ensemble = VotingClassifier(clfs, voting='hard')

In [None]:
ensemble.fit(X_train, y_train)

In [None]:
ensemble_preds = ensemble.predict(X_test)

In [None]:
f1_score(ensemble_preds, y_test)

In [None]:
accuracy_score(ensemble.predict(X_test), y_test)