In [79]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
 
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import joblib

In [30]:
data = pd.read_csv('../data/healthcare-dataset-stroke-data.csv')

In [31]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
data.shape

(5110, 12)

In [32]:
data.drop(data[data.gender == 'Other'].index, inplace=True)
data.drop(['id', 'avg_glucose_level'], axis=1, inplace=True)

In [33]:
Y = data[['stroke']]
X = data.drop('stroke', axis=1)

In [120]:
model = RandomForestClassifier(random_state=42)

categorical_cols = data.select_dtypes(include=[object]).columns

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop = 'if_binary'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough')

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

In [117]:
my_pipeline.fit(X, Y)

  return fit_method(estimator, *args, **kwargs)
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [119]:
my_pipeline[1].oob_score_
#Y.value_counts()
#1 - 249 / (249+4860)

0.04417670682730924

In [13]:
predictions = my_pipeline.predict(X)
print(precision_score(Y, predictions))
print(recall_score(Y, predictions))
print(classification_report(Y, predictions))

1.0
0.9919678714859438
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4860
           1       1.00      0.99      1.00       249

    accuracy                           1.00      5109
   macro avg       1.00      1.00      1.00      5109
weighted avg       1.00      1.00      1.00      5109



In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# You can still fit the pipeline to the entire training set and evaluate on the test set if needed
my_pipeline.fit(X_train, y_train)

# Make predictions on the test set
predictions = my_pipeline.predict(X_test)

# Evaluate the model
#accuracy = accuracy_score(y_test, predictions)
#conf_matrix = confusion_matrix(y_test, predictions)
#class_report = classification_report(y_test, predictions)
print(y_test.value_counts())
print(precision_score(y_test, predictions))
print(recall_score(y_test, predictions))
#print(classification_report(y_test, predictions))

stroke
0         960
1          62
Name: count, dtype: int64
0.0
0.0


  return fit_method(estimator, *args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
my_pipeline.steps[1][1].feature_importances_

array([4.12302032e-02, 2.07437280e-02, 1.41341557e-02, 6.99603574e-06,
       1.99791422e-02, 1.91599021e-02, 1.24888251e-03, 4.78446645e-02,
       1.92243603e-02, 1.72111180e-02, 1.98426733e-02, 1.92355686e-02,
       3.23612672e-01, 2.92961559e-02, 2.84774723e-02, 3.78752305e-01])

In [34]:
my_pipeline[:-1].get_feature_names_out()

array(['cat__gender_Male', 'cat__ever_married_Yes',
       'cat__work_type_Govt_job', 'cat__work_type_Never_worked',
       'cat__work_type_Private', 'cat__work_type_Self-employed',
       'cat__work_type_children', 'cat__Residence_type_Urban',
       'cat__smoking_status_Unknown',
       'cat__smoking_status_formerly smoked',
       'cat__smoking_status_never smoked', 'cat__smoking_status_smokes',
       'remainder__age', 'remainder__hypertension',
       'remainder__heart_disease', 'remainder__bmi'], dtype=object)

In [37]:
# just an attempt on the two most important features
Xtrain = X[['age','bmi']]
model.fit(Xtrain, Y)
print(precision_score(Y, model.predict(Xtrain)))
print(recall_score(Y, model.predict(Xtrain)))

  return fit_method(estimator, *args, **kwargs)


0.9095477386934674
0.7269076305220884


In [77]:
single_case = pd.DataFrame(['Female', 40.0, 0, 0, 'Yes', 'Self-employed', 'Urban', 16, 'formerly smoked']).T
single_case.columns = X.columns
#single_case = single_case.astype({'age': float, 'hypertension': int, 'heart_disease': int, 'bmi': float})

single_case

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,bmi,smoking_status
0,Female,40.0,0,0,Yes,Self-employed,Urban,16,formerly smoked


In [78]:
my_pipeline.predict(single_case)

array([0])

In [73]:
joblib.dump(my_pipeline, '../data/pipeline_model.pkl', compress = 1)

['../data/pipeline_model.pkl']

In [74]:
#mm = joblib.load('../data/pipeline_model.pkl')
#mm.predict(X)

array([1, 1, 1, ..., 0, 0, 0])

In [84]:
# grid search

my_pipeline.fit(X, Y)

  return fit_method(estimator, *args, **kwargs)
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [125]:

param_tuning = {
        'model__n_estimators': [30, 60, 100],
        'model__min_samples_leaf': [2, 4, 5],
        'model__max_samples' : [0.7, 1.0]
    }

In [126]:
gsearch = GridSearchCV(estimator = my_pipeline,
                           param_grid = param_tuning,                        
                           scoring = 'f1_macro',
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

gsearch.fit(X,Y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [128]:
gsearch.best_params_

{'model__max_samples': 1.0,
 'model__min_samples_leaf': 2,
 'model__n_estimators': 100}