In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_fscore_support

In [2]:
df = pd.read_csv('Dataset/Raw Data/stroke.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [82]:
#drop id
df.drop('id', axis=1, inplace=True)

In [83]:
#check for missing values
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [84]:
# Onehot encoding gender
df= pd.get_dummies(df, columns = ['gender','smoking_status','Residence_type','ever_married',])
df.head()

Unnamed: 0,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,Residence_type_Rural,Residence_type_Urban,ever_married_No,ever_married_Yes
0,67.0,0,1,Private,228.69,36.6,1,0,1,0,0,1,0,0,0,1,0,1
1,61.0,0,0,Self-employed,202.21,,1,1,0,0,0,0,1,0,1,0,0,1
2,80.0,0,1,Private,105.92,32.5,1,0,1,0,0,0,1,0,1,0,0,1
3,49.0,0,0,Private,171.23,34.4,1,1,0,0,0,0,0,1,0,1,0,1
4,79.0,1,0,Self-employed,174.12,24.0,1,1,0,0,0,0,1,0,1,0,0,1


In [85]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,Residence_type_Rural,Residence_type_Urban,ever_married_No,ever_married_Yes
0,67.0,0,1,Private,228.69,36.6,1,0,1,0,0,1,0,0,0,1,0,1
1,61.0,0,0,Self-employed,202.21,,1,1,0,0,0,0,1,0,1,0,0,1
2,80.0,0,1,Private,105.92,32.5,1,0,1,0,0,0,1,0,1,0,0,1
3,49.0,0,0,Private,171.23,34.4,1,1,0,0,0,0,0,1,0,1,0,1
4,79.0,1,0,Self-employed,174.12,24.0,1,1,0,0,0,0,1,0,1,0,0,1


In [94]:
# Co relation matrix
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
age,1.0,0.276398,0.263796,0.238171,0.333398,0.245257
hypertension,0.276398,1.0,0.108306,0.174474,0.167811,0.127904
heart_disease,0.263796,0.108306,1.0,0.161857,0.041357,0.134914
avg_glucose_level,0.238171,0.174474,0.161857,1.0,0.175502,0.131945
bmi,0.333398,0.167811,0.041357,0.175502,1.0,0.042374
stroke,0.245257,0.127904,0.134914,0.131945,0.042374,1.0


In [110]:
# Drop gender','smoking_status','Residence_type','ever_married
df.drop('id', axis=1, inplace=True)
df.drop('smoking_status', axis=1, inplace=True)
df.drop('Residence_type', axis=1, inplace=True)
#df.drop('ever_married', axis=1, inplace=True)
df.drop('work_type', axis=1, inplace=True)
df.drop('gender', axis=1, inplace=True)
df.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,67.0,0,1,Yes,228.69,36.6,1
1,61.0,0,0,Yes,202.21,,1
2,80.0,0,1,Yes,105.92,32.5,1
3,49.0,0,0,Yes,171.23,34.4,1
4,79.0,1,0,Yes,174.12,24.0,1


In [113]:
# Onehot encoding married yes=1 and no = 1
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})
df.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,67.0,0,1,,228.69,36.6,1
1,61.0,0,0,,202.21,,1
2,80.0,0,1,,105.92,32.5,1
3,49.0,0,0,,171.23,34.4,1
4,79.0,1,0,,174.12,24.0,1


In [4]:
print('Hello')

Hello
