In [1]:
from functools import partial

import pandas as pd
from imblearn.over_sampling import SMOTE
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the cleaned stroke dataset, index the rows by the 'id' column,
# then print a per-column summary (dtype and non-null count).
stroke_csv = 'healthcare-dataset-stroke-data-cleaned.csv'
df = pd.read_csv(stroke_csv).set_index('id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 9046 to 44679
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             5109 non-null   float64
 1   hypertension                    5109 non-null   float64
 2   heart_disease                   5109 non-null   float64
 3   avg_glucose_level               5109 non-null   float64
 4   bmi                             5109 non-null   float64
 5   stroke                          5109 non-null   float64
 6   gender_Male                     5109 non-null   float64
 7   ever_married_Yes                5109 non-null   float64
 8   work_type_Never_worked          5109 non-null   float64
 9   work_type_Private               5109 non-null   float64
 10  work_type_Self-employed         5109 non-null   float64
 11  work_type_children              5109 non-null   float64
 12  Residence_type_Urban          

In [3]:
# Features are every column except the 'stroke' target; the target is cast to int labels.
X = df.drop(columns='stroke')
y = df['stroke'].astype('int')

# Hold out 20% for testing, stratified on the target so both splits keep
# the same class ratio. The seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=12345,
)

In [4]:
# Fix class imbalance with SMOTE (imported in the notebook's import cell).
# Only the training split is resampled; X_test / y_test are left untouched so
# evaluation still happens on the original class distribution.
# NOTE(review): SMOTE interpolates between neighbouring samples, so synthetic rows
# may carry fractional values in the 0/1 indicator columns (e.g. gender_Male,
# work_type_*). Consider SMOTENC if those columns must stay binary -- confirm.
smote_sampler = SMOTE(random_state=12345)
X_smo, y_smo = smote_sampler.fit_resample(X_train, y_train)

In [5]:
# Train and score LazyClassifier's suite of models on the SMOTE-resampled training
# data, evaluating against the untouched test split. recall_score is reported as an
# additional metric for each model.
# Preprocessing is left to LazyClassifier; per the original note it imputes missing
# values with the mean, applies StandardScaler to numerical columns and
# OneHotEncoder or OrdinalEncoder to categorical ones -- TODO confirm against the
# installed lazypredict version.
lazyclassify = LazyClassifier(
    verbose=0,
    predictions=True,
    custom_metric=recall_score,
)
models, predictions = lazyclassify.fit(X_smo, X_test, y_smo, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:19<00:00,  1.49it/s]


In [7]:
# Prediction results: leave the scores table as the cell's last expression so
# Jupyter renders it as a single rich table, instead of print()'s plain text that
# wraps wide tables into backslash-continued column chunks.
models

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
BernoulliNB                        0.71               0.80     0.80      0.79   
AdaBoostClassifier                 0.76               0.76     0.76      0.83   
CalibratedClassifierCV             0.74               0.76     0.76      0.82   
SGDClassifier                      0.76               0.76     0.76      0.83   
LogisticRegression                 0.74               0.76     0.76      0.81   
LinearSVC                          0.74               0.76     0.76      0.81   
LinearDiscriminantAnalysis         0.73               0.76     0.76      0.81   
RidgeClassifierCV                  0.73               0.76     0.76      0.81   
RidgeClassifier                    0.73               0.76     0.76      0.81   
NearestCentroid                    0.67               0.73     0.73      0.77   
NuSVC                       