In [3]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [4]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Read Dataset

In [5]:
# Load the brain-stroke CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    "/kaggle/input/brain-stroke-dataset/brain_stroke.csv"
)

# Check the head and tail of the dataset

In [11]:
# Preview the first five rows.
df.head(n=5)

In [10]:

# Preview the last five rows.
df.tail(n=5)

# Check the shape of the dataset

In [7]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [12]:
# Print the per-column dtype and non-null count summary.
df.info()

In [13]:
# Summary statistics for the columns pandas includes by default in describe().
df.describe()

# Check for null values in the dataset

In [14]:
# Number of missing values in each column.
df.isna().sum()

In [15]:
# Number of distinct values in each column.
df.nunique()

# Now check the columns of the dataset

In [16]:
# Column labels of the dataset.
df.columns

In [18]:
# Subset of the columns that are plotted as categories below
# (the `stroke` column is included so it can be used as a hue).
df_cat = df.loc[
    :,
    [
        'gender',
        'hypertension',
        'heart_disease',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
        'stroke',
    ],
]

# Now check the unique values of each categorical column

In [19]:
# Print the distinct values found in each column of df_cat.
for col_name in df_cat:
    print(df_cat[col_name].unique())

In [20]:
# Print the frequency table of each column of df_cat.
for _col_name, col_values in df_cat.items():
    print(col_values.value_counts())

# Visualization

In [21]:
# Draw one count plot per column of df_cat.
# The column is selected by name through `x=`: recent seaborn releases take `data`
# as the first positional parameter, so passing a Series positionally together with
# `data=` raises "got multiple values for argument 'data'".
for i in df_cat.columns:
    plt.figure(figsize = (15,6))
    sns.countplot(data = df_cat, x = i, palette = 'hls')
    plt.xticks(rotation = 90)
    plt.show()

In [22]:
# Draw one pie chart per column of df_cat showing the share of each value.
for i in df_cat.columns:
    plt.figure(figsize = (15,6))
    df_cat[i].value_counts().plot(kind = 'pie', autopct = '%1.1f%%')
    # Label each figure with its column: pandas uses the name of the value_counts()
    # Series as the y-label, which is not guaranteed to be the column name, so
    # without a title the pies cannot be told apart.
    plt.title(i)
    plt.ylabel('')
    # No tick rotation here: a pie chart has no x ticks to rotate.
    plt.show()

In [23]:
# Draw one count plot per column of df_cat, split by the `stroke` column.
# The column is selected by name through `x=`: recent seaborn releases take `data`
# as the first positional parameter, so passing a Series positionally together with
# `data=` raises "got multiple values for argument 'data'".
for i in df_cat.columns:
    plt.figure(figsize = (15,6))
    sns.countplot(data = df_cat, x = i, hue = 'stroke', palette = 'hls')
    plt.xticks(rotation = 90)
    plt.show()

In [24]:
# cufflinks provides the DataFrame.iplot() calls used by the cells below.
import cufflinks as cf
cf.go_offline()
# NOTE(review): offline=False here looks at odds with go_offline() above — confirm
# which mode is intended before changing either call.
cf.set_config_file(offline=False, world_readable=True)  

In [25]:
# Sum of the `stroke` column per gender, drawn as an interactive pie chart.
gender = df.groupby('gender')['stroke'].sum()
data_gender = pd.DataFrame({'labels': gender.index, 'values': gender.to_numpy()})
colors = ['lightpink', 'lightskyblue']
data_gender.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='The Proportion of Stroke among Gender',
    colors=colors,
)

In [28]:
# Sum of the `stroke` column per work type, drawn as an interactive pie chart.
job = df.groupby('work_type')['stroke'].sum()
data_job = pd.DataFrame({'labels': job.index, 'values': job.to_numpy()})
# This palette is reused by the pie charts in the following cells.
colors2 = ['palegreen', 'paleturquoise', 'thistle', 'moccasin']
data_job.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Work type of people who had stroke',
    colors=colors2,
    pull=[0.1, 0.1, 0.1, 0.2],
)

In [29]:
# Sum of the `stroke` column per smoking status, drawn as an interactive pie chart.
smoke = df.groupby('smoking_status')['stroke'].sum()
data_smoke = pd.DataFrame({'labels': smoke.index, 'values': smoke.to_numpy()})
data_smoke.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Smoking status of people who had stroke',
    colors=colors2,
    pull=[0.02, 0.02, 0.1, 0.02],
)

In [30]:
# Sum of the `stroke` column per residence type, drawn as an interactive donut chart.
Residence = df.groupby('Residence_type')['stroke'].sum()
data_Residence = pd.DataFrame({'labels': Residence.index, 'values': Residence.to_numpy()})
data_Residence.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Residence area of people who had stroke',
    colors=colors2,
    pull=[0.02, 0.02],
    hole=0.3,
)

In [31]:
# Sum of the `stroke` column per marriage status, drawn as an interactive donut chart.
Married = df.groupby('ever_married')['stroke'].sum()
data_Married = pd.DataFrame({'labels': Married.index, 'values': Married.to_numpy()})
data_Married.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Marriage status of people who had stroke',
    colors=colors2,
    pull=[0.02, 0.02],
    hole=0.3,
)

In [32]:
# Subset of the columns that are plotted as histograms below.
df_num = df.loc[:, ['age', 'avg_glucose_level', 'bmi']]


In [33]:
# Draw one histogram per column of df_num.
for _num_col, num_values in df_num.items():
    plt.figure(figsize=(15, 6))
    sns.histplot(num_values, palette='hls')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Keep only the rows where `stroke` equals 1, then split `age` into one column per
# gender; rows of the other gender are left as NaN by index alignment.
stroke = df[df['stroke'].eq(1)].reset_index()
stroke["male_age"] = stroke.loc[stroke["gender"].eq("Male"), "age"]
stroke["female_age"] = stroke.loc[stroke["gender"].eq("Female"), "age"]
stroke[["male_age", "female_age"]].iplot(
    kind="histogram",
    bins=20,
    theme="white",
    title="Stroke Ages",
    xTitle='Ages',
    yTitle='Count',
)

In [35]:
# Binary-encode two columns in place:
#   ever_married -> 1 where the value is 'Yes',    0 otherwise
#   gender       -> 1 where the value is 'Female', 0 otherwise
# The dtype guard makes the cell safe to re-run: without it, a second execution
# would compare the already-encoded integers against the strings and silently
# turn both columns into all zeros.
for column, positive_label in (('ever_married', 'Yes'), ('gender', 'Female')):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].eq(positive_label).astype(int)

In [36]:
# Preview the first five rows after the binary encoding.
df.head(n=5)

In [37]:
# One-hot encode the remaining multi-valued categorical columns.
df = pd.get_dummies(
    data=df,
    columns=['work_type', 'Residence_type', 'smoking_status'],
)

In [38]:
# Show ten randomly chosen rows of the encoded frame.
df.sample(n=10)

# Now split the dataset using train test split

In [39]:
from sklearn.model_selection import train_test_split

In [41]:
# Features are every column except the target; the target is the `stroke` column.
X = df.drop(columns=['stroke'])
y = df.loc[:, 'stroke']

In [42]:
# Hold out 25% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of each `stroke` class the same in the train and
# test parts, so the held-out metrics are not distorted by an uneven split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=3,
    stratify=y,
)

In [43]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

In [44]:
# Length of the target vector.
y.shape

# Now Model Building Part

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# LogisticRegression

In [48]:
# Fit a logistic-regression baseline and predict on the held-out rows.
# max_iter is raised from the default of 100: the features are not scaled here,
# and the default budget tends to stop the solver before it converges
# (scikit-learn reports this as a ConvergenceWarning).
log_reg = LogisticRegression(random_state=3, max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

In [49]:
from sklearn.metrics import confusion_matrix

In [50]:
# Confusion matrix of the current predictions against the held-out labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [51]:
# Show the confusion matrix computed in the previous cell.
print(cm)

In [52]:
# Accuracy of the logistic-regression model on the rows it was trained on.
print('Training-set accuracy score:', log_reg.score(X_train, y_train))

In [53]:
# Accuracy of the logistic-regression model on the held-out rows.
print('Test-set accuracy score:', log_reg.score(X_test, y_test))

In [54]:
from sklearn import metrics

In [55]:
# Store the held-out accuracy of the logistic-regression predictions and display it.
acc_log_clf = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
acc_log_clf

In [56]:
# Print the test score and draw the confusion-matrix heatmap for each listed model.
pred_list = [log_reg]

for i in pred_list:
    print("Score : ",i.score(X_test,y_test))
    y_pred = i.predict(X_test)
    # fmt='d' prints the cell counts as plain integers; the default annotation
    # format switches to scientific notation for larger counts.
    sns.heatmap(confusion_matrix(y_test,y_pred),annot = True,fmt = 'd')
    plt.xlabel("Y_pred")
    plt.ylabel("Y_test")
    plt.title(i)
    plt.show()

# Now Decision Tree

In [57]:
# Fit a depth-limited decision tree (Gini criterion, fixed seed) and predict on the
# held-out rows.
dec_clf = DecisionTreeClassifier(
    max_depth=5,
    criterion='gini',
    random_state=3,
).fit(X_train, y_train)
y_pred = dec_clf.predict(X_test)

In [58]:
# Confusion matrix of the current predictions against the held-out labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [59]:
# Accuracy of the decision tree on the training rows and on the held-out rows.
print('Training-set accuracy score:', dec_clf.score(X_train, y_train))
print('Test-set accuracy score:', dec_clf.score(X_test, y_test))

In [60]:
# Store the held-out accuracy of the decision-tree predictions and display it.
acc_dec_clf = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
acc_dec_clf

In [61]:
from sklearn.metrics import confusion_matrix
# Print the test score and draw the confusion-matrix heatmap for each listed model.
pred_list = [dec_clf]

for i in pred_list:
    print("Score : ",i.score(X_test,y_test))
    y_pred = i.predict(X_test)
    # fmt='d' prints the cell counts as plain integers; the default annotation
    # format switches to scientific notation for larger counts.
    sns.heatmap(confusion_matrix(y_test,y_pred),annot = True,fmt = 'd')
    plt.xlabel("Y_pred")
    plt.ylabel("Y_test")
    plt.title(i)
    plt.show()

# RandomForestClassifier

In [69]:
# Fit a 20-tree random forest (entropy criterion) and predict on the held-out rows.
# random_state is fixed to the same seed as the other two models: without it the
# forest is rebuilt differently on every run and the reported scores and confusion
# matrix cannot be reproduced.
rand_clf = RandomForestClassifier(n_estimators=20, criterion="entropy", random_state=3)
rand_clf.fit(X_train, y_train)
y_pred = rand_clf.predict(X_test)

In [63]:
# Confusion matrix of the current predictions against the held-out labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [66]:
# Accuracy of the random forest on the training rows and on the held-out rows.
print('Training-set accuracy score:', rand_clf.score(X_train, y_train))
print('Testing-set accuracy score:', rand_clf.score(X_test, y_test))

In [67]:
# Store the held-out accuracy of the current predictions and display it.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [68]:
from sklearn.metrics import confusion_matrix
# Print the test score and draw the confusion-matrix heatmap for each listed model.
pred_list = [rand_clf]

for i in pred_list:
    print("Score : ",i.score(X_test,y_test))
    y_pred = i.predict(X_test)
    # fmt='d' prints the cell counts as plain integers; the default annotation
    # format switches to scientific notation for larger counts.
    sns.heatmap(confusion_matrix(y_test,y_pred),annot = True,fmt = 'd')
    plt.xlabel("Y_pred")
    plt.ylabel("Y_test")
    plt.title(i)
    plt.show()