# HEART STROKE DETECTION

# IMPORTING NECESSARY LIBRARIES

In [37]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
import numpy as np

# DATA PREPROCESSING

Import heart stroke dataset :

In [38]:
stroke_data=pd.read_csv(r'D:\Keethu\MSc data science\5th sem\ml lab\New folder\healthcare-dataset-stroke-data.csv')
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Specify independent and dependent variables :

In [39]:
X=stroke_data.drop('stroke',axis=1)
y=stroke_data["stroke"]

Detecting Missing values in independent variables : 

In [40]:
X.isna().any()

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
dtype: bool

Replacing missing values with mode (since data is categorical) :

In [41]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputerimputer= imputer.fit(X)  
X= imputer.transform(X)  
X=pd.DataFrame(data=X,columns=['id','gender','age','hypertension','heart_disease','ever_married','work_type','Residence_type','avg_glucose_level','bmi','smoking_status'])

In [42]:
X.isna().any()

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
dtype: bool

Changing column type to categorical :

In [43]:
X['gender'] = X['gender'].astype('category')
X['ever_married']=X['ever_married'].astype('category')
X['work_type']=X['work_type'].astype('category')
X['Residence_type']=X['Residence_type'].astype('category')
X['smoking_status']=X['smoking_status'].astype('category')

Encoding categorical data :

In [44]:
X['gender'] = X['gender'].cat.codes
X['ever_married']=X['ever_married'].cat.codes
X['work_type']=X['work_type'].cat.codes
X['Residence_type']=X['Residence_type'].cat.codes
X['smoking_status']=X['smoking_status'].cat.codes

In [45]:
X.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.7,2
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2


Splitting the Dataset into the Training set and Test set :

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.70, random_state=0)

Feature scaling :

In [47]:
from sklearn.preprocessing import StandardScaler
ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)
ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

# INITIALIZING VARIOUS BINARY CLASSIFIERS

# Logistic Regression :

In [48]:
models = {}
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# K-Nearest Neighbors :

In [49]:
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

# Naive Bayes :

In [50]:
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# Random Forest :

In [51]:
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Decision Trees :

In [52]:
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Support Vector Machines :

In [53]:
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# PERFORMANCE EVALUATION

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    models[key].fit(X_train, y_train)
    predictions = models[key].predict(X_test)
    accuracy[key] = accuracy_score(predictions, y_test)

In [57]:
evaluation = pd.DataFrame(columns=['Model','Accuracy'])
evaluation['Model']=models.keys()
evaluation['Accuracy'] = accuracy.values()

evaluation

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.952195
1,K-Nearest Neighbor,0.951356
2,Naive Bayes,0.878669
3,Random Forest,0.953033
4,Decision Trees,0.911378
5,Support Vector Machines,0.953592


# Best model :

In [67]:
best_model=evaluation[evaluation.Accuracy == evaluation.Accuracy.max()]
b_model=best_model['Model'].loc[best_model.index[0]]
b_accuracy=best_model['Accuracy'].loc[best_model.index[0]]*100
print("Best Model :",b_model,"\nAccuracy :",round(b_accuracy,2),"%")

Best Model : Support Vector Machines 
Accuracy : 95.36 %
