In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import (
    train_test_split,
    RepeatedStratifiedKFold,
    cross_val_score,
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import Pipeline
import joblib
import warnings

warnings.filterwarnings(action="ignore")


## Data exploration and pre-processing

In [38]:
df = pd.read_csv("../data/healthcare-dataset-stroke-data.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Dataset shape: {df.shape}")

Dataset shape: (5110, 12)


In [39]:
df.sample(n=4)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
2928,21963,Male,31.0,0,0,Yes,Private,Urban,108.51,26.7,Unknown,0
3238,61477,Female,25.0,0,0,No,Private,Urban,68.07,18.6,smokes,0
2541,28247,Male,82.0,0,0,No,Self-employed,Urban,101.57,24.3,smokes,0
205,40353,Female,61.0,0,0,Yes,Private,Urban,114.09,25.7,never smoked,0


In [40]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [41]:
# rename columns
df = df.rename(columns={"Residence_type": "residence_type"})

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [43]:
# convert string values to lowercase and normalize them
for col in ["gender", "ever_married", "work_type", "residence_type", "smoking_status"]:
    df[col] = df[col].str.lower()
    df[col] = df[col].str.replace(" ", "_")
    df[col] = df[col].str.replace("-", "_")
    print(f"'{col}' values:", df[col].unique())

'gender' values: ['male' 'female' 'other']
'ever_married' values: ['no' 'yes']
'work_type' values: ['self_employed' 'children' 'govt_job' 'private' 'never_worked']
'residence_type' values: ['rural' 'urban']
'smoking_status' values: ['unknown' 'never_smoked' 'smokes' 'formerly_smoked']


In [44]:
# only keep 'male' and 'female' samples
df = df[df["gender"] != "other"].copy()

In [45]:
# number of null values
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [46]:
# numerical data stats
round(df.describe(exclude=["object"]), 2)

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5109.0,5109.0,5109.0,5109.0,5109.0,4908.0,5109.0
mean,36513.99,43.23,0.1,0.05,106.14,28.89,0.05
std,21162.01,22.61,0.3,0.23,45.29,7.85,0.22
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17740.0,25.0,0.0,0.0,77.24,23.5,0.0
50%,36922.0,45.0,0.0,0.0,91.88,28.1,0.0
75%,54643.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [47]:
# categorical data stats
round(df.describe(exclude = ['float', 'int64']), 2)

Unnamed: 0,gender,ever_married,work_type,residence_type,smoking_status
count,5109,5109,5109,5109,5109
unique,2,2,5,2,4
top,female,yes,private,urban,never_smoked
freq,2994,3353,2924,2596,1892


In [48]:
# distribution of targets
round(df["stroke"].value_counts(normalize=True), 4)

0    0.9513
1    0.0487
Name: stroke, dtype: float64

## Compare models

In [49]:
def evaluate_model(X, y, model, n_splits=5, n_repeats=3, scoring_metric="f1"):
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=42
    )
    scores = cross_val_score(model, X, y, scoring=scoring_metric, cv=cv, n_jobs=-1)
    return scores


In [50]:
X = df.drop(columns=["id", "stroke"])
y = df["stroke"].copy()
categorical = X.select_dtypes(include="object").columns.to_list()
numerical = X.select_dtypes(include="number").columns.to_list()

In [51]:
models = [
    ("LogisticRegression", LogisticRegression(max_iter=5000, random_state=42)),
    ("LDA", LinearDiscriminantAnalysis()),
    ("RidgeClassifier", RidgeClassifier()),
    ("RandomForest", RandomForestClassifier(random_state=42)),
    ("BalancedRandomForestClassifier", BalancedRandomForestClassifier(random_state=42)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=42)),
    ("BalancedBaggingClassifier", BalancedBaggingClassifier(random_state=42)),
]

In [52]:
# compare models with imbalanced data
for name, model in models:
    transformer = ColumnTransformer(
        transformers=[
            ("imp", SimpleImputer(strategy="median"), numerical),
            ("o", OneHotEncoder(), categorical),
        ]
    )
    pipeline = Pipeline(steps=[("t", transformer), ("m", model)])
    scores = evaluate_model(X, y, model=pipeline, scoring_metric="recall")
    print(f"Model: {name}")
    print(f"recall: {round(np.mean(scores), 4)}")
    print(f"std: {round(np.std(scores), 4)}")
    print("="*50)


Model: LogisticRegression
recall: 0.0027
std: 0.0068
Model: LDA
recall: 0.063
std: 0.0273
Model: RidgeClassifier
recall: 0.0
std: 0.0
Model: RandomForest
recall: 0.0081
std: 0.0123
Model: BalancedRandomForestClassifier
recall: 0.8433
std: 0.0598
Model: ExtraTrees
recall: 0.0322
std: 0.0251
Model: AdaBoost
recall: 0.0094
std: 0.0124
Model: GradientBoosting
recall: 0.0134
std: 0.0141
Model: BalancedBaggingClassifier
recall: 0.6199
std: 0.064


In [50]:
# compare models with resampled data (SMOTE oversampling) 
for name, model in models:
    transformer = ColumnTransformer(
        transformers=[
            ("imp", SimpleImputer(strategy="median"), numerical),
            ("o", OneHotEncoder(), categorical),
        ]
    )
    pipeline = Pipeline(steps=[("t", transformer), ("over", SMOTE()), ("m", model)])
    scores = evaluate_model(X, y, model=pipeline, scoring_metric="recall")
    print(f"Model: {name}")
    print(f"recall: {round(np.mean(scores), 4)}")
    print(f"std: {round(np.std(scores), 4)}")
    print("="*50)


Model: LogisticRegression
recall: 0.7816
std: 0.0664
Model: LDA
recall: 0.7936
std: 0.0696
Model: RidgeClassifier
recall: 0.799
std: 0.0673
Model: RandomForest
recall: 0.0281
std: 0.0217
Model: BalancedRandomForestClassifier
recall: 0.0281
std: 0.0191
Model: ExtraTrees
recall: 0.0482
std: 0.0242
Model: AdaBoost
recall: 0.0549
std: 0.0324
Model: GradientBoosting
recall: 0.0335
std: 0.0229
Model: BalancedBaggingClassifier
recall: 0.0429
std: 0.0263


## Final model

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=13)
X_train.shape, X_test.shape

((4342, 10), (767, 10))

In [52]:
clf = LogisticRegression(C=1.9, class_weight={0: 1.3, 1: 1.0})
transformer = ColumnTransformer(
        transformers=[
            ("imp", SimpleImputer(strategy="median"), numerical),
            ("o", OneHotEncoder(), categorical),
        ]
    )
pipeline = Pipeline(steps=[("t", transformer), ("over", SMOTE(random_state=0)), ("m", clf)])

pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)

print(metrics.classification_report(y_test, y_preds, digits=4))
print(metrics.confusion_matrix(y_test, y_preds))

              precision    recall  f1-score   support

           0     0.9883    0.8090    0.8897       733
           1     0.1617    0.7941    0.2687        34

    accuracy                         0.8083       767
   macro avg     0.5750    0.8016    0.5792       767
weighted avg     0.9517    0.8083    0.8622       767

[[593 140]
 [  7  27]]


In [53]:
# export the pipeline
joblib.dump(pipeline, "../models/lr_stroke_prediction_v1.pkl")

['../models/lr_stroke_prediction_v1.pkl']