In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [29]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [41]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [42]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [43]:
df.shape

(5110, 12)

__Target Variable: Stroke__

In [44]:
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [45]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

__Gender, ever_married, work_type, Residence_type, smoking_status, hypertension, heart_disease : Categorical Variables__  
__age , avg_glucose_level, bmi : Numerical Variables__

In [46]:
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [47]:
df['hypertension'].value_counts()

0    4612
1     498
Name: hypertension, dtype: int64

In [48]:
df['heart_disease'].value_counts()

0    4834
1     276
Name: heart_disease, dtype: int64

In [49]:
df['ever_married'].value_counts()

Yes    3353
No     1757
Name: ever_married, dtype: int64

In [50]:
df['work_type'].value_counts()

Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

In [51]:
df['smoking_status'].value_counts()

never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

In [53]:
df['Residence_type'].value_counts()

Urban    2596
Rural    2514
Name: Residence_type, dtype: int64

#### Data Preprocessing

In [54]:
df = df[df['gender']!='Other'].copy()

In [55]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [56]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke'], dtype='object')

In [57]:
df = pd.get_dummies(columns=['gender','ever_married',
       'work_type', 'Residence_type','smoking_status'], data = df, prefix_sep= '_', drop_first= True)

In [58]:
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,1,1,0,1,0,0,1,1,0,0
1,51676,61.0,0,0,202.21,,1,0,1,0,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,1,1,0,1,0,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,0,1,0,1,0,0,1,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,0,1,0,0,1,0,0,0,1,0


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              5109 non-null   int64  
 1   age                             5109 non-null   float64
 2   hypertension                    5109 non-null   int64  
 3   heart_disease                   5109 non-null   int64  
 4   avg_glucose_level               5109 non-null   float64
 5   bmi                             4908 non-null   float64
 6   stroke                          5109 non-null   int64  
 7   gender_Male                     5109 non-null   uint8  
 8   ever_married_Yes                5109 non-null   uint8  
 9   work_type_Never_worked          5109 non-null   uint8  
 10  work_type_Private               5109 non-null   uint8  
 11  work_type_Self-employed         5109 non-null   uint8  
 12  work_type_children              51

In [60]:
df.isnull().sum()

id                                  0
age                                 0
hypertension                        0
heart_disease                       0
avg_glucose_level                   0
bmi                               201
stroke                              0
gender_Male                         0
ever_married_Yes                    0
work_type_Never_worked              0
work_type_Private                   0
work_type_Self-employed             0
work_type_children                  0
Residence_type_Urban                0
smoking_status_formerly smoked      0
smoking_status_never smoked         0
smoking_status_smokes               0
dtype: int64

bmi has ~200 missing values, which can be imputed.

In [61]:
from sklearn.impute import KNNImputer

In [62]:
knn_impute = KNNImputer(n_neighbors= 10, weights= 'distance')

In [63]:
imputed_bmi = knn_impute.fit_transform(df[['bmi']])

In [65]:
df = df.drop(columns=['bmi'], axis= 1)

In [66]:
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,stroke,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,1,1,1,0,1,0,0,1,1,0,0
1,51676,61.0,0,0,202.21,1,0,1,0,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,1,1,1,0,1,0,0,0,0,1,0
3,60182,49.0,0,0,171.23,1,0,1,0,1,0,0,1,0,0,1
4,1665,79.0,1,0,174.12,1,0,1,0,0,1,0,0,0,1,0


In [67]:
df['bmi_imputed'] = imputed_bmi

In [68]:
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,stroke,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,bmi_imputed
0,9046,67.0,0,1,228.69,1,1,1,0,1,0,0,1,1,0,0,36.6
1,51676,61.0,0,0,202.21,1,0,1,0,0,1,0,0,0,1,0,28.89456
2,31112,80.0,0,1,105.92,1,1,1,0,1,0,0,0,0,1,0,32.5
3,60182,49.0,0,0,171.23,1,0,1,0,1,0,0,1,0,0,1,34.4
4,1665,79.0,1,0,174.12,1,0,1,0,0,1,0,0,0,1,0,24.0


#### Modeling

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [71]:
df.columns

Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'stroke', 'gender_Male', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes', 'bmi_imputed'], dtype='object')

In [88]:
X = df.drop(columns=['id','stroke'])
y = df['stroke']

In [89]:
X.shape, y.shape

((5109, 15), (5109,))

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=420)

In [91]:
X_train.shape , X_test.shape

((3576, 15), (1533, 15))

In [92]:
model_rf = RandomForestClassifier(n_estimators= 100 , n_jobs= -1, max_depth=2, criterion= 'entropy')
model_svm = SVC()
model_dt = DecisionTreeClassifier(criterion='gini', splitter= 'best')
model_lr = LogisticRegression()

In [97]:
model_rf.fit(X_train, y_train)
model_svm.fit(X_train, y_train)
model_dt.fit(X_train, y_train)
#model_lr.fit(X_train,y_train)

DecisionTreeClassifier()

In [98]:
y_pred_rf = model_rf.predict(X_test)
y_pred_svm = model_svm.predict(X_test)
y_pred_dt = model_dt.predict(X_test)
y_pred_lr = model_lr.predict(X_test)

In [99]:
print('Accuracy of Random Forest', np.round(accuracy_score(y_pred= y_pred_rf, y_true = y_test),3))
print('Accuracy of SVM', np.round(accuracy_score(y_pred= y_pred_svm, y_true = y_test),3))
print('Accuracy of Decision Tree', np.round(accuracy_score(y_pred= y_pred_dt, y_true = y_test),3))
print('Accuracy of Logistic Regression', np.round(accuracy_score(y_pred= y_pred_lr, y_true = y_test),3))

Accuracy of Random Forest 0.951
Accuracy of SVM 0.951
Accuracy of Decision Tree 0.913
Accuracy of Logistic Regression 0.951


__We will go with Random Forest Classifier__