## Importing the Libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from collections import Counter

## Reading the dataset

In [3]:
# Load the stroke dataset; cells holding the literal string 'N/A' are parsed as missing values (NaN)
data = pd.read_csv('dataset/healthcare-dataset-stroke-data.csv', na_values='N/A')

In [4]:
# Trim leading/trailing whitespace from the values of every text-valued column

from pandas.api.types import is_string_dtype

# Decide first which columns hold text, then clean exactly those columns
text_columns = [name for name in data.columns if is_string_dtype(data[name].dtype)]
for name in text_columns:
    data[name] = data[name].str.strip()

In [5]:
# Separate the target column from the features (every other column).
# NOTE(review): the `id` column stays in X and is therefore fed to the models as a feature — confirm this is intended.
y = data['stroke']
X = data.drop(columns='stroke')

In [6]:
# Sanity check: X and y must have the same number of rows
print(X.shape, y.shape)

(5110, 11) (5110,)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# The fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified on y although the target is heavily imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [8]:
# Confirm the row counts of the train/test feature and target splits
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4088, 11) (1022, 11) (4088,) (1022,)


## Data Preprocessing

### Handling Missing Values

#### Simple Imputation by Sklearn - Mean/Median/Most Frequent

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Empty containers for the imputed train / test features; the imputation loop fills them column by column
si_X_train, si_X_test = pd.DataFrame(), pd.DataFrame()

In [11]:
# Impute every feature separately. The fill value is learned from the training split only
# and then applied to both splits, so the test split never influences the statistic.
for col_name in X_train.columns:
    # Text columns are filled with their most frequent value, all other columns with their median
    fill_strategy = 'most_frequent' if is_string_dtype(X_train[col_name].dtype) else 'median'
    imputer = SimpleImputer(strategy=fill_strategy).fit(X_train[[col_name]])
    # transform() is given a single-column frame; collapse the 2-D result to a 1-D vector
    si_X_train[col_name] = imputer.transform(X_train[[col_name]]).ravel()
    si_X_test[col_name] = imputer.transform(X_test[[col_name]]).ravel()

In [12]:
# Inspect the imputed training features
si_X_train

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,28932.0,Female,36.0,0.0,0.0,Yes,Private,Rural,67.29,36.7,formerly smoked
1,37150.0,Female,34.0,0.0,0.0,Yes,Private,Rural,83.53,48.5,formerly smoked
2,71669.0,Male,60.0,0.0,0.0,Yes,Private,Rural,65.16,30.8,never smoked
3,27153.0,Female,75.0,0.0,0.0,Yes,Self-employed,Rural,78.80,29.3,formerly smoked
4,58235.0,Male,76.0,0.0,0.0,Yes,Private,Urban,58.65,25.6,smokes
...,...,...,...,...,...,...,...,...,...,...,...
4083,30457.0,Female,53.0,1.0,0.0,Yes,Govt_job,Rural,98.61,38.8,smokes
4084,24836.0,Female,61.0,0.0,0.0,Yes,Private,Rural,72.01,26.0,formerly smoked
4085,17079.0,Male,44.0,0.0,0.0,Yes,Private,Rural,94.71,28.4,smokes
4086,72340.0,Male,21.0,0.0,0.0,No,Private,Urban,120.94,29.7,formerly smoked


### Handling Text Features

#### Label Encoder

In [13]:
# Collect the names of the text-valued (categorical) feature columns.
# The target column of this dataset is 'stroke' (the previous guard checked for a
# non-existent 'Class' column); it is skipped explicitly so it can never be picked up
# as a feature, even if it were stored as text.
categorical_features = [
    col
    for col in data.columns
    if col != 'stroke' and is_string_dtype(data[col].dtype)
]

In [14]:
# Show which columns were detected as text-valued
categorical_features

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [15]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used by the following cells to map labels to integer codes
le = LabelEncoder()

In [16]:
# Convert the target labels to integer codes.
# The encoder is fitted on the training target only; the test target is mapped with that same fitted encoder.

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [17]:
# Empty containers for the label-encoded train / test features (state before scaling)
l_X_train, l_X_test = pd.DataFrame(), pd.DataFrame()

In [22]:
# Convert the text features to integer codes.
# Every categorical column gets its own LabelEncoder, stored in `feature_encoders`, so that
#   (a) each column's category -> code mapping stays available afterwards
#       (e.g. for inverse_transform or for encoding new data), and
#   (b) the shared `le`, which was fitted on the target, is not refitted on feature columns.
feature_encoders = {}

for column in X_train.columns:
    if column in categorical_features:
        column_encoder = LabelEncoder()
        # Learn the categories from the training split only ...
        l_X_train[column] = column_encoder.fit_transform(si_X_train[column])
        # ... and reuse them for the test split. transform() raises ValueError if the
        # test split contains a category that never occurred in the training split.
        l_X_test[column] = column_encoder.transform(si_X_test[column])
        feature_encoders[column] = column_encoder
    else:
        # Non-categorical columns are carried over unchanged
        l_X_train[column] = si_X_train[column].copy()
        l_X_test[column] = si_X_test[column].copy()

In [25]:
# Count the remaining missing values per column of the encoded training features
l_X_train.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

## Oversampling the dataset

Whichever class has fewer samples is oversampled: its existing rows are randomly duplicated to increase the number of samples in that class.

In [26]:
from imblearn.over_sampling import RandomOverSampler

# sampling_strategy=0.75 -> after resampling, (minority count / majority count) is 0.75.
# It is passed by keyword because current imbalanced-learn releases declare the
# constructor arguments keyword-only, and random_state is fixed so that the same
# rows are duplicated on every run (reproducible training set).
# NOTE: the variable name `os` is kept so other cells keep working, although it shadows
# the name of the standard-library `os` module.
os = RandomOverSampler(sampling_strategy=0.75, random_state=0)
l_X_train_ns, y_train_ns = os.fit_resample(l_X_train, y_train)

# Class distribution of the training target before and after oversampling
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 3893, 1: 195})
The number of classes after fit Counter({0: 3893, 1: 2919})


## Feature Scaling

#### Standardization

In [27]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the (oversampled) training features only,
# then apply that same fitted transformation to both splits.
ss = StandardScaler()
ss.fit(l_X_train_ns)

l_X_train_ns = ss.transform(l_X_train_ns)
l_X_test = ss.transform(l_X_test)

In [28]:
# NOTE: this prints the label-encoded but *unscaled* training frame; the standardized
# (and oversampled) training data lives in l_X_train_ns
print(l_X_train)

           id  gender   age  hypertension  heart_disease  ever_married  \
0     28932.0       0  36.0           0.0            0.0             1   
1     37150.0       0  34.0           0.0            0.0             1   
2     71669.0       1  60.0           0.0            0.0             1   
3     27153.0       0  75.0           0.0            0.0             1   
4     58235.0       1  76.0           0.0            0.0             1   
...       ...     ...   ...           ...            ...           ...   
4083  30457.0       0  53.0           1.0            0.0             1   
4084  24836.0       0  61.0           0.0            0.0             1   
4085  17079.0       1  44.0           0.0            0.0             1   
4086  72340.0       1  21.0           0.0            0.0             0   
4087  52242.0       0  58.0           1.0            0.0             1   

      work_type  Residence_type  avg_glucose_level   bmi  smoking_status  
0             2               0     

## Model Building

## Classification Evaluation Metrics

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

def evaluate_preds(y_test, y_pred):
    """Print and return the standard classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"`` and ``"mcc"``,
        each rounded to two decimals.
    """
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 makes the "no positive predictions / no positive labels" case explicit:
    # the score is reported as 0 instead of relying on the warn-and-return-0 default.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    # Every metric is stored with the same precision (mcc used to be the only unrounded one)
    metric_dict = {
        "accuracy": round(accuracy, 2),
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1": round(f1, 2),
        "mcc": round(mcc, 2),
    }

    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")
    print(f'MCC Score: {mcc:.2f}')

    return metric_dict

### SVM

In [47]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier, trained on the oversampled and standardized training data
svc = SVC(kernel='rbf',random_state=0)
svc.fit(l_X_train_ns,y_train_ns)

SVC(random_state=0)

In [48]:
# Predict on the held-out test split and report the metrics.
# NOTE: `y_pred` and `model_metrics` are reassigned in every model section, so they only ever hold the latest model's results.
y_pred = svc.predict(l_X_test)
model_metrics = evaluate_preds(y_test, y_pred)

Acc: 82.29%
Precision: 0.15
Recall: 0.48
F1 score: 0.22
MCC Score: 0.19


### Naive Bayes

In [49]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, trained on the oversampled and standardized training data
naive = GaussianNB()
naive.fit(l_X_train_ns,y_train_ns)

GaussianNB()

In [50]:
# Predict on the held-out test split and report the metrics
y_pred = naive.predict(l_X_test)
model_metrics = evaluate_preds(y_test, y_pred)

Acc: 79.26%
Precision: 0.16
Recall: 0.69
F1 score: 0.26
MCC Score: 0.26


### Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the oversampled and standardized training data
logistic = LogisticRegression()
logistic.fit(l_X_train_ns,y_train_ns)

LogisticRegression()

In [52]:
# Predict on the held-out test split and report the metrics
y_pred = logistic.predict(l_X_test)
model_metrics = evaluate_preds(y_test, y_pred)

Acc: 80.72%
Precision: 0.16
Recall: 0.65
F1 score: 0.26
MCC Score: 0.26


### k Nearest Neighbours

In [53]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 40 closest training points
# NOTE(review): no cell shows how n_neighbors=40 was chosen — confirm or tune it
neigh = KNeighborsClassifier(n_neighbors=40)
neigh.fit(l_X_train_ns,y_train_ns)

KNeighborsClassifier(n_neighbors=40)

In [54]:
# Predict on the held-out test split and report the metrics
y_pred = neigh.predict(l_X_test)
model_metrics = evaluate_preds(y_test, y_pred)

Acc: 76.91%
Precision: 0.13
Recall: 0.59
F1 score: 0.21
MCC Score: 0.19


### RandomForestClassifier

In [55]:
from sklearn.ensemble import RandomForestClassifier

# 500 trees with entropy as split criterion, built in parallel on all available cores.
# random_state is fixed (0, the same seed used for the train/test split and the SVC) so that
# the randomness in forest construction — and therefore the reported metrics — is
# reproducible from run to run.
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=0)
rf.fit(l_X_train_ns,y_train_ns)

RandomForestClassifier(criterion='entropy', n_estimators=500, n_jobs=-1)

In [56]:
# Predict on the held-out test split and report the metrics.
# Accuracy alone is misleading on this heavily imbalanced target — read it together with recall, F1 and MCC.
y_pred = rf.predict(l_X_test)
model_metrics = evaluate_preds(y_test, y_pred)

Acc: 94.81%
Precision: 1.00
Recall: 0.02
F1 score: 0.04
MCC Score: 0.13
