# Heart Stroke prediction - Machine Learning approach

In [61]:
import pandas as pd 
import numpy as np 
import matplotlib as plt 

In [62]:
# Set the display format for floating-point numbers
pd.set_option('display.float_format', '{:.4f}'.format)

## Data Preview

In [63]:
stroke_data = pd.read_csv('data/stroke_data.csv')

In [64]:
stroke_data

Unnamed: 0,id,age,avg_glucose_level,bmi,ever_married,feat01,feat02,feat03,feat04,feat05,...,feat08,feat09,feat10,gender,heart_disease,hypertension,Residence_type,smoking_status,stroke,work_type
0,1,43.0000,92.7100,30.5000,Yes,0.6216,0.6636,1.0703,1.2745,1.3907,...,1.1747,1.1307,0.4158,Male,0,0,Urban,formerly smoked,0,Private
1,2,59.0000,93.9000,42.2000,Yes,0.2858,0.4051,1.2722,0.9503,1.5604,...,0.9100,0.5848,0.5955,Male,0,0,Rural,never smoked,0,Private
2,3,25.0000,92.1400,36.2000,Yes,0.6959,0.4963,1.4668,1.0263,0.5139,...,1.5253,0.8223,0.1276,Male,0,0,Rural,Unknown,0,Private
3,4,74.0000,205.8400,54.6000,Yes,0.7181,0.4085,0.6438,0.8952,0.9654,...,1.4003,1.2426,0.3291,Female,0,1,Urban,never smoked,0,Self-employed
4,5,34.0000,79.8000,37.4000,Yes,0.4722,0.4638,1.1612,1.3084,0.8025,...,1.5124,1.3106,0.3367,Female,0,0,Urban,smokes,0,Private
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5354,5355,62.0000,98.0500,27.9000,Yes,0.6588,0.4631,1.1928,0.8352,1.0432,...,1.2921,0.7789,0.5882,Female,0,0,Rural,never smoked,0,Private
5355,5356,78.0000,244.9700,,Yes,0.2496,0.4180,0.8447,0.7113,1.8057,...,0.7072,0.8729,0.4280,Male,0,0,Urban,formerly smoked,1,Private
5356,5357,56.0000,227.0400,23.0000,Yes,0.5955,0.6009,1.2593,0.2409,1.0876,...,1.3512,1.0232,0.7545,Female,0,0,Rural,smokes,0,Private
5357,5358,77.0000,190.6900,31.4000,Yes,0.3484,0.2541,0.9378,0.6646,1.4418,...,1.0762,1.1184,0.5017,Female,0,0,Rural,never smoked,1,Govt_job


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5359 entries, 0 to 5358
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5359 non-null   int64  
 1   age                5359 non-null   float64
 2   avg_glucose_level  5359 non-null   float64
 3   bmi                5118 non-null   float64
 4   ever_married       5359 non-null   object 
 5   feat01             5359 non-null   float64
 6   feat02             5359 non-null   float64
 7   feat03             5359 non-null   float64
 8   feat04             5359 non-null   float64
 9   feat05             5359 non-null   float64
 10  feat06             5359 non-null   float64
 11  feat07             5359 non-null   float64
 12  feat08             5359 non-null   float64
 13  feat09             5359 non-null   float64
 14  feat10             5359 non-null   float64
 15  gender             5359 non-null   object 
 16  heart_disease      5359 

In [66]:
stroke_data.columns

Index(['id', 'age', 'avg_glucose_level', 'bmi', 'ever_married', 'feat01',
       'feat02', 'feat03', 'feat04', 'feat05', 'feat06', 'feat07', 'feat08',
       'feat09', 'feat10', 'gender', 'heart_disease', 'hypertension',
       'Residence_type', 'smoking_status', 'stroke', 'work_type'],
      dtype='object')

In [67]:
stroke_data.isna().sum()

id                     0
age                    0
avg_glucose_level      0
bmi                  241
ever_married           0
feat01                 0
feat02                 0
feat03                 0
feat04                 0
feat05                 0
feat06                 0
feat07                 0
feat08                 0
feat09                 0
feat10                 0
gender                 0
heart_disease          0
hypertension           0
Residence_type         0
smoking_status         0
stroke                 0
work_type              0
dtype: int64

In [77]:
stroke_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5359.0,44.3651,22.8401,0.08,26.0,46.0,62.0,82.0
avg_glucose_level,5359.0,107.3836,46.5744,48.78,77.34,92.23,115.41,271.74
bmi,5118.0,28.9577,7.8034,10.3,23.7,28.1,33.1,97.6
feat01,5359.0,0.5251,0.1331,0.0,0.4335,0.5259,0.6156,1.0
feat02,5359.0,0.5467,0.1095,0.0,0.4737,0.5481,0.6201,1.0
feat03,5359.0,0.9955,0.3201,0.1413,0.7371,0.9975,1.2523,1.8259
feat04,5359.0,0.9769,0.321,0.0596,0.7265,0.9802,1.2291,1.8603
feat05,5359.0,1.0426,0.3136,0.2189,0.7999,1.0428,1.296,1.8057
feat06,5359.0,0.961,0.319,0.0206,0.7131,0.9571,1.2129,1.8526
feat07,5359.0,0.9784,0.3244,0.1056,0.7233,0.9785,1.2294,1.8549


We definitely do not want to include `id` field into the modelling part, so we remove it now.

In [69]:
stroke_data = stroke_data.drop(columns='id')

## Stratified train-test sampling

In [70]:
stroke_data['stroke'].mean().round(4)*100

9.29

In [71]:
# Perform stratified sampling with sklearn 
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(stroke_data, stroke_data['stroke']):
    train_set = stroke_data.loc[train_index]
    test_set = stroke_data.loc[test_index]

In [72]:
test_set['stroke'].value_counts()/len(test_set)

stroke
0   0.9067
1   0.0933
Name: count, dtype: float64

In [73]:
train_set['stroke'].value_counts()/len(train_set)

stroke
0   0.9072
1   0.0928
Name: count, dtype: float64

## One Hot Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
stroke_data

## Exploratory Data Analysis

In [74]:
from sklearn.impute import SimpleImputer

In [75]:
imputer =SimpleImputer(strategy="median")

In [76]:
imputer.fit(train_set)

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'No'