# Importing all packages and loading the dataset

In [1]:
import pandas as pd
from statistics import median,mode,mean
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
# Read the dataset from health_care.csv (path is relative to the working directory).
df=pd.read_csv('health_care.csv')
def Average(lst):
    """Return the arithmetic mean of the values in ``lst``.

    ``lst`` must support ``sum()`` and ``len()``; an empty input raises
    ``ZeroDivisionError`` because the count is zero.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

# Data types of the dataset columns

In [7]:
# Print the pandas dtype of every column in df.
lst=df.dtypes
print(lst)

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


# Dataset information

In [None]:
# Summary of df before any cleaning is applied.
df.info()

In [None]:
# Descriptive statistics of df, used here to spot missing values and the spread of the data.
df.describe() # missing values and spread of data

# Replacing null values in bmi with the median

In [None]:
# Impute missing bmi values with the median of the observed (non-null) values.
# The filled column is assigned back to df instead of calling
# ``df['bmi'].fillna(..., inplace=True)``: that chained in-place form is
# deprecated by pandas and does not update df under Copy-on-Write.
w = df['bmi']
print(int(w.count()))  # number of non-null bmi entries
b = w.median()  # Series.median ignores NaN by default
df['bmi'] = w.fillna(b)

# Removing null value rows

In [None]:
# Discard every row that still contains at least one null value.
df = df.dropna(axis="index")

In [None]:
# Summary of df after the null-handling cells above.
df.info()

# Separating the dataframe into numeric and categorical columns

In [None]:
# Split df by dtype: int64/float64 columns go to df_2, object columns go to
# df_object. select_dtypes keeps the original column order and replaces the
# positional ``df.dtypes[i]`` lookups, which pandas deprecates on a
# label-indexed Series. ``.copy()`` makes both frames independent of df so
# that later column assignments cannot write through to the source frame.
df_2 = df.select_dtypes(include=['int64', 'float64']).copy()
df_object = df.select_dtypes(include=['object']).copy()

In [None]:
# Remove the id column from the numeric frame.
df_2 = df_2.drop(columns='id')

In [None]:
# Summary of the object-dtype (categorical) columns.
df_object.info()

# Label Encoder for categorical columns

In [None]:
# Replace each categorical column with integer codes. One encoder object is
# reused and re-fitted per column, so after the loop it only remembers the
# classes of the last column.
label_encoder = preprocessing.LabelEncoder()
for column_name in df_object.columns:
    encoded_values = label_encoder.fit_transform(df_object[column_name])
    df_object[column_name] = encoded_values

In [None]:
# Copy every encoded categorical column into the numeric frame df_2.
for column_name, encoded_column in df_object.items():
    df_2[column_name] = encoded_column

In [None]:
# Summary of the combined frame (numeric columns plus encoded categorical columns).
df_2.info()

# Algorithm

In [None]:
# Target is the stroke column; the features are every other column of df_2.
y = df_2['stroke']
X = df_2.drop(columns='stroke')

In [None]:
# Hold out 20% of the rows for testing; random_state=9 pins the shuffle so the
# split is reproducible. shuffle must be the boolean True: the string 'True'
# only worked because non-empty strings are truthy, and scikit-learn's
# parameter validation rejects it in recent releases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9, shuffle=True
)

# Random forest

In [None]:
# Random forest classifier with random_state=42, fitted on the training split.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X=X_train, y=y_train)

In [None]:
# Despite its name, predict_train_data holds predictions for the test split.
predict_train_data = random_forest_model.predict(X_test)
print(predict_train_data)
# Test-set accuracy, printed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = 100 * acc
print(f"{accuracy}%")

In [None]:
# Confusion matrix of the test labels against the random forest predictions.
array = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
print(array)

In [None]:
# Draw the 2x2 confusion matrix as an annotated heatmap.
class_labels = range(2)
df_cm = pd.DataFrame(data=array, index=class_labels, columns=class_labels)
sns.set(font_scale=2.4)  # label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
plt.show()

In [None]:
# 5-fold cross-validation over the full X, y. Despite its name, y_train_pred
# holds the per-fold scores; their mean is printed.
y_train_pred = cross_val_score(estimator=random_forest_model, X=X, y=y, cv=5)
average = Average(y_train_pred)
print(average)

# KNN

In [None]:
# k-nearest-neighbours classifier (k=3), fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X=X_train, y=y_train)

In [None]:
# Despite its name, predict_train_data holds predictions for the test split.
predict_train_data = knn_model.predict(X_test)
print(predict_train_data)
# Test-set accuracy, printed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = 100 * acc
print(f"{accuracy}%")

In [None]:
# Confusion matrix of the test labels against the KNN predictions.
array = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
print(array)

In [None]:
# Draw the 2x2 confusion matrix as an annotated heatmap.
class_labels = range(2)
df_cm = pd.DataFrame(data=array, index=class_labels, columns=class_labels)
sns.set(font_scale=2.4)  # label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
plt.show()

In [None]:
# 5-fold cross-validation over the full X, y. Despite its name, y_train_pred
# holds the per-fold scores; their mean is printed.
y_train_pred = cross_val_score(estimator=knn_model, X=X, y=y, cv=5)
average = Average(y_train_pred)
print(average)

# Logistic regression

In [None]:
# Logistic regression with the iteration cap set to 1000, fitted on the training split.
logistic_reg_model = LogisticRegression(max_iter=1000)
logistic_reg_model.fit(X=X_train, y=y_train)

In [None]:
# Despite its name, predict_train_data holds predictions for the test split.
predict_train_data = logistic_reg_model.predict(X_test)
print(predict_train_data)
# Test-set accuracy, printed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = 100 * acc
print(f"{accuracy}%")

In [None]:
# Confusion matrix of the test labels against the logistic regression predictions.
array = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
print(array)

In [None]:
# Draw the 2x2 confusion matrix as an annotated heatmap.
class_labels = range(2)
df_cm = pd.DataFrame(data=array, index=class_labels, columns=class_labels)
sns.set(font_scale=2.4)  # label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
plt.show()

In [None]:
# 5-fold cross-validation over the full X, y. Despite its name, y_train_pred
# holds the per-fold scores; their mean is printed.
y_train_pred = cross_val_score(estimator=logistic_reg_model, X=X, y=y, cv=5)
average = Average(y_train_pred)
print(average)

# Support vector machine

In [None]:
# Support vector classifier with a linear kernel, fitted on the training split.
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Despite its name, predict_train_data holds predictions for the test split.
predict_train_data = svm_model.predict(X_test)
print(predict_train_data)
# Test-set accuracy, printed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = 100 * acc
print(f"{accuracy}%")

In [None]:
# Confusion matrix of the test labels against the SVM predictions.
array = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
print(array)

In [None]:
# Draw the 2x2 confusion matrix as an annotated heatmap.
class_labels = range(2)
df_cm = pd.DataFrame(data=array, index=class_labels, columns=class_labels)
sns.set(font_scale=2.4)  # label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
plt.show()

In [None]:
# 5-fold cross-validation over the full X, y. Despite its name, y_train_pred
# holds the per-fold scores; their mean is printed.
y_train_pred = cross_val_score(estimator=svm_model, X=X, y=y, cv=5)
average = Average(y_train_pred)
print(average)

# Decision Tree

In [None]:
# Decision tree classifier with default settings (no random_state is passed),
# fitted on the training split.
clf = tree.DecisionTreeClassifier()
clf.fit(X=X_train, y=y_train)

In [None]:
# Despite its name, predict_train_data holds predictions for the test split.
predict_train_data = clf.predict(X_test)
print(predict_train_data)
# Test-set accuracy, printed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = 100 * acc
print(f"{accuracy}%")

In [None]:
# Confusion matrix of the test labels against the decision tree predictions.
array = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
print(array)

In [None]:
# Draw the 2x2 confusion matrix as an annotated heatmap.
class_labels = range(2)
df_cm = pd.DataFrame(data=array, index=class_labels, columns=class_labels)
sns.set(font_scale=2.4)  # label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
plt.show()

In [None]:
# 5-fold cross-validation over the full X, y. Despite its name, y_train_pred
# holds the per-fold scores; their mean is printed.
y_train_pred = cross_val_score(estimator=clf, X=X, y=y, cv=5)
average = Average(y_train_pred)
print(average)