In [19]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline


np.random.seed(0)

## get the data

data = pd.read_csv('healthcare-dataset-stroke-data.csv')
data.isna().sum()
data.rename(columns=str.lower, inplace=True)

## get the features that need finetuning
features = ['gender', 'ever_married', 'work_type', 'residence_type', 'smoking_status']
features_transform = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


bmi = ['bmi']
bmi_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

## get a columntransformer and transform all those features that need finetuning, save it as preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('feat', features_transform, features),
    ('bmi', bmi_transform, bmi)])

## stitch all these together with the model

model = Pipeline(steps=[('preprocessor', preprocessor), 
                        ('model', RandomForestClassifier())])

## split ur data

X = data.drop('stroke', axis=1)
y = data['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## fit the model

model.fit(X_train, y_train)
model.score(X_test, y_test)




0.9197651663405088

In [20]:
pipe_grid = {'preprocessor__bmi__imputer__strategy': ['mean', 'median'],
             'model__n_estimators': [100, 1000],
             'model__max_depth': [None, 5],
             'model__max_features': ['auto'],
             'model__min_samples_split': [2, 4],
             'model__min_samples_leaf': [1]}

gs_model = GridSearchCV(model, pipe_grid, cv=3, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__bmi__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__bmi__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__bmi__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__bmi__imputer__strategy=median; total time=   0.5s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100,

[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__bmi__imputer__strategy=median; total time=   0.4s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=1000, preprocessor__bmi__imputer__strategy=mean; total time=   5.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=1000, preprocessor__bmi__imputer__strategy=mean; total time=   3.8s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=1000, preprocessor__bmi__imputer__strategy=mean; total time=   3.9s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=1000, preprocessor__bmi__imputer__strategy=median; total time=   3.8s
[CV] EN

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('feat',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['gender',
                                                                          'ever_married',
                                                                          'work_type',
                                                                          'residence_type',
                                                                          'smoking_status']),
                                                                        ('bmi',
                                                                         Pipeline(st