## Importing the Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset=pd.read_csv('healthcare-dataset-stroke-data.csv')
dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


## Data preprocessing 

In [11]:
from category_encoders import TargetEncoder

import pickle

def preprocess_data_train(df):

    #making copy of dataset

    df=df.copy()

 

    #dropping column 'id'

    df=df.drop('id',axis=1)

 

    #binary encoding

    df['ever_married']=df['ever_married'].replace({'No':0,'Yes':1})

    df['Residence_type']=df['Residence_type'].replace({'Rural':0,'Urban':1})

   

    

    encoder = TargetEncoder()

    col_to_encode=['gender', 'work_type', 'smoking_status']

    encoder.fit(df[col_to_encode], df['stroke'])

    df[col_to_encode] = encoder.transform(df[col_to_encode])

   

    #### Create a Pickle file using serialization

 

    pickle_out = open("target.pkl","wb")

    pickle.dump(encoder, pickle_out)

    pickle_out.close()

    #Handling Missing Values with Strategy Mean

    df['bmi'].fillna(df['bmi'].mean(), inplace=True )

 

    return df

 

In [8]:
from category_encoders import TargetEncoder
import pickle

def preprocess_data_test(df):

    #making copy of dataset

    df=df.copy()

 

    #dropping column 'id'

    #df=df.drop('id',axis=1)

 

    #binary encoding

    df['ever_married']=df['ever_married'].replace({'No':0,'Yes':1})

    df['Residence_type']=df['Residence_type'].replace({'Rural':0,'Urban':1})

   

    ##read pickle file of encoder

    pickle_in = open(r"D:\temp\target.pkl","rb")

    encoder=pickle.load(pickle_in)

    col_to_encode=['gender', 'work_type', 'smoking_status']

    df[col_to_encode] = encoder.transform(df[col_to_encode])

   

    #### Create a Pickle file using serialization

 

    #Handling Missing Values with Strategy Mean

    df['bmi'].fillna(df['bmi'].mean(), inplace=True )

 

    return df

In [12]:
data=preprocess_data_train(dataset)
data



Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0.051064,67.0,0,1,1,0.050940,1,228.69,36.600000,0.079096,1
1,0.047094,61.0,0,0,1,0.079365,0,202.21,28.893237,0.047569,1
2,0.051064,80.0,0,1,1,0.050940,0,105.92,32.500000,0.047569,1
3,0.047094,49.0,0,0,1,0.050940,1,171.23,34.400000,0.053232,1
4,0.047094,79.0,1,0,1,0.079365,0,174.12,24.000000,0.047569,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0.047094,80.0,1,0,1,0.050940,1,83.75,28.893237,0.047569,0
5106,0.047094,81.0,0,0,1,0.079365,1,125.20,40.000000,0.047569,0
5107,0.047094,35.0,0,0,1,0.079365,0,82.99,30.600000,0.047569,0
5108,0.051064,51.0,0,0,1,0.050940,0,166.29,25.600000,0.079096,0


### x refers to features and y refers to output variable

In [13]:
y=data['stroke']
x=data.drop('stroke',axis=1)

### Test_Train_Split

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test =train_test_split(x,y,train_size=0.8,shuffle=True, random_state=1)

In [15]:
x_train.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

## Training Model Using Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(n_estimators=100,criterion='gini',random_state=0)
classifier.fit(x_train,y_train)
print("Random Forest Classifier Trained")

Random Forest Classifier Trained


In [17]:
y_train.value_counts()

0    3899
1     189
Name: stroke, dtype: int64

## Predicting the Test set results

In [18]:
y_pred=classifier.predict(x_test)

## Classification Report

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       962
           1       0.00      0.00      0.00        60

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.89      0.94      0.91      1022



## Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm=confusion_matrix(y_test,y_pred)
print("Confusion Matrix : ")
print(cm)
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy is :",accuracy)

Confusion Matrix : 
[[959   3]
 [ 60   0]]
Accuracy is : 0.9383561643835616


# Handling Class imbalance with SMOTE

In [21]:
from sklearn.utils import class_weight
# A array object that contains the weights for both classes
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# A dictionary object containing key-value pairs of both classes and # their weights
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
class_weights_dict

{0: 0.5242369838420108, 1: 10.814814814814815}

In [22]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
x_t_res,y_t_res=smote.fit_resample(x_train,y_train)

In [23]:
y_t_res.value_counts()

0    3899
1    3899
Name: stroke, dtype: int64

In [24]:
# Number of trees in random forest
n_estimators= [130, 180, 230]
# Maximum number of levels in tree
max_depth = [2,4,6,8]
# Minimum number of samples required to split a node
min_samples_split = [2, 5,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [4,8,12]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [25]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'criterion': ['gini', 'entropy'], 
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [130, 180, 230], 'max_depth': [2, 4, 6, 8], 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [4, 8, 12], 'bootstrap': [True, False]}


In [26]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
gcv=RandomizedSearchCV(RandomForestClassifier(random_state=1),param_distributions=param_grid,cv=5,n_iter=72, n_jobs = 4)
gcv.fit(x_t_res,y_t_res)
print("Random Forest Classifier Trained")

Random Forest Classifier Trained


In [27]:
y_predicted=gcv.predict(x_test)

In [28]:
best_params = gcv.best_params_
print(best_params)

{'n_estimators': 130, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 8, 'criterion': 'entropy', 'bootstrap': False}


In [29]:
best_forest = RandomForestClassifier(**best_params, class_weight=class_weights_dict)

In [30]:
# Fitting our model to the train set
fit_forest = best_forest.fit(x_train, y_train)
# Creating predicted variables to compare against y_test
y_pred = fit_forest.predict(x_test)

### Classification Report

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.97      0.83      0.90       962
           1       0.17      0.57      0.27        60

    accuracy                           0.82      1022
   macro avg       0.57      0.70      0.58      1022
weighted avg       0.92      0.82      0.86      1022



### Confusion Matrix

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm=confusion_matrix(y_test,y_pred)
print("Confusion Matrix : ")
print(cm)
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy is :",accuracy)

Confusion Matrix : 
[[801 161]
 [ 26  34]]
Accuracy is : 0.8170254403131115


## Feature Importance using ELI5

### Global Feature importance

In [33]:
import eli5
from eli5.sklearn import PermutationImportance
perm=PermutationImportance(best_forest,random_state=1).fit(x_test,y_test)
eli5.show_weights(perm,feature_names=x_test.columns.tolist())

Weight,Feature
0.0078  ± 0.0227,age
0.0016  ± 0.0050,gender
0.0014  ± 0.0049,Residence_type
0.0006  ± 0.0027,hypertension
-0.0016  ± 0.0032,heart_disease
-0.0020  ± 0.0072,ever_married
-0.0047  ± 0.0026,smoking_status
-0.0068  ± 0.0045,avg_glucose_level
-0.0096  ± 0.0050,work_type
-0.0129  ± 0.0065,bmi


### Local Feature Importance 

In [34]:
i = 1
x_test.iloc[[i]].shape

(1, 10)

In [35]:
eli5.show_prediction(best_forest, np.array(x_test)[i],show_feature_values=True, feature_names=x_test.columns.tolist())



Contribution?,Feature,Value
0.5,<BIAS>,1.0
0.266,age,78.0
0.01,work_type,0.051
0.001,ever_married,1.0
0.001,smoking_status,0.079
-0.001,Residence_type,1.0
-0.003,gender,0.047
-0.011,heart_disease,0.0
-0.022,hypertension,0.0
-0.022,bmi,30.1


In [36]:
### Create a Pickle file using serialization 
import pickle
pickle_out = open("best_forest.pkl","wb")
pickle.dump(best_forest, pickle_out)
pickle_out.close()