In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
# Note: the loop variables stay bound after the loop ends, so `dirname` and
# `filename` keep the last directory and file that os.walk visited;
# `filename` is never assigned if no file is found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score
from sklearn.metrics import auc

In [None]:
# Load the dataset into a DataFrame.
# The CSV is looked up explicitly instead of reusing the `dirname`/`filename`
# variables left over from the directory-listing loop: those hold whichever
# file os.walk happened to visit last, and do not exist at all if that cell
# was skipped or found no files.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
# With several CSVs present, the alphabetically first path is used so the
# choice is deterministic; point this at the intended file explicitly if the
# input directory ever holds more than one dataset.
df = pd.read_csv(csv_paths[0])

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Remove the unnecessary id column.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because 'id' has already been removed.
df.drop(columns=['id'], inplace=True, errors='ignore')

In [None]:
# Preview the first five rows after the column drop.
df.head(5)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value
# (the original author counted 201 null values out of 5111).
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-check the per-column missing-value counts; all should now be zero.
df.isna().sum(axis=0)

In [None]:
# Bar chart of how many rows fall into each value of the target label.
# NOTE(review): recent seaborn releases emit a FutureWarning when `palette`
# is passed without `hue` - confirm against the installed version.
stroke_ax = sns.countplot(data=df, x="stroke", palette="bwr")
plt.show()

In [None]:
# Bar chart of row counts per hypertension value.
# NOTE(review): recent seaborn releases emit a FutureWarning when `palette`
# is passed without `hue` - confirm against the installed version.
hypertension_ax = sns.countplot(data=df, x="hypertension", palette="rocket")
plt.show()

In [None]:
# Bar chart of row counts per gender value.
# NOTE(review): recent seaborn releases emit a FutureWarning when `palette`
# is passed without `hue` - confirm against the installed version.
gender_ax = sns.countplot(data=df, x="gender", palette="deep")
plt.show()

In [None]:
# Tabulate how often each gender value occurs.
df.loc[:, 'gender'].value_counts()

In [None]:
# Remove the rows whose gender is 'Other'
# (the original author notes that the value occurs only once).
Others = df.loc[df['gender'].eq('Other')].index
df.drop(index=Others, inplace=True)

In [None]:
# Replace gender with its integer category code. Codes follow the sorted order
# of the category labels, which the original author records as 0-Female, 1-Male.
df["gender"] = df["gender"].astype('category').cat.codes

In [None]:
# Replace ever_married with its integer category code
# (recorded by the original author as 1-Yes, 0-No).
df["ever_married"] = pd.Categorical(df["ever_married"]).codes

In [None]:
# Tabulate how often each work type occurs.
df.loc[:, 'work_type'].value_counts()

In [None]:
# Work types have no natural order or ranking, so the column is one-hot
# encoded into indicator columns prefixed with 'w_type'.
df = pd.get_dummies(df, columns=['work_type'], prefix='w_type')

In [None]:
# One possible encoding: anyone who smokes or used to smoke becomes 1;
# 'never smoked' and 'Unknown' both become 0.
smoking_to_flag = {
    'formerly smoked': 1,
    'smokes': 1,
    'never smoked': 0,
    'Unknown': 0,
}
df['smoking_status'] = df['smoking_status'].map(smoking_to_flag)

In [None]:
# Tabulate how often each residence type occurs.
df.loc[:, 'Residence_type'].value_counts()

In [None]:
# One-hot encode the residence type. The prefix already ends in an underscore,
# so with the default separator the new column names contain a double underscore.
df = pd.get_dummies(
    data=df,
    columns=['Residence_type'],
    prefix={'Residence_type': 'residency_'},
)

In [None]:
# Preview the first five rows of the fully encoded frame.
df.iloc[:5]

In [None]:
#Starting with Logistic Regression
#f(x) = 1/(1+e^(-x))
from sklearn.linear_model import LogisticRegression

In [None]:
#Method for XY Split
def XYsplit(df, label_col):
    """Separate a frame into its feature columns and its label column.

    Parameters
    ----------
    df : DataFrame holding both the features and the label.
    label_col : name of the label column.

    Returns
    -------
    (X, y) where X is `df` without `label_col` and y is an independent copy
    of the `label_col` column. `df` itself is left unmodified.
    """
    target = df[label_col].copy()
    features = df.drop(columns=label_col)
    return features, target

In [None]:
# Separate the feature matrix from the 'stroke' label.
X, y = XYsplit(df, label_col='stroke')

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of each label value the same in the train
# and test sets, so the evaluation metrics are not skewed by an unlucky split
# of the positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [None]:
# Fit a logistic-regression classifier on the training split; max_iter is
# raised to 1000 to give the solver more iterations to converge.
LogReg = LogisticRegression(max_iter=1000)
LogReg.fit(X=X_train, y=y_train)

In [None]:
# Hard 0/1 class predictions, used by the threshold-based metrics below.
predictions = LogReg.predict(X_test)

# Accuracy, as a percentage
accuracy = accuracy_score(y_test, predictions)*100

# Precision of the positive class (label 1), as a percentage
precision = precision_score(y_test, predictions,pos_label=1,labels=[0,1])*100

# Recall of the positive class (label 1), as a percentage
recall = recall_score(y_test, predictions,pos_label=1,labels=[0,1])*100

# ROC curve: false-positive rate (1 - specificity) and true-positive rate
# (sensitivity). It has to be built from continuous scores, not from the hard
# 0/1 predictions: with binary input the "curve" collapses to one operating
# point and the resulting AUC is not the real ROC-AUC. The score used is the
# predicted probability of label 1, located through classes_ rather than
# assumed to be the second column.
positive_col = list(LogReg.classes_).index(1)
positive_scores = LogReg.predict_proba(X_test)[:, positive_col]
fpr , tpr, _ = roc_curve(y_test, positive_scores)

# Area under the ROC curve
auc_val = auc(fpr, tpr)

# F1 score of the positive class (a fraction in [0, 1]; unlike the metrics
# above it is not scaled to a percentage)
f_score = f1_score(y_test, predictions)

In [None]:
        # Print the evaluation metrics, then draw the ROC curve against the
        # chance diagonal.
        # NOTE(review): every line of this cell carries the same leading indent.
        # IPython appears to strip a uniform leading indent before running a
        # cell, but the code would be an IndentationError in a plain .py
        # export - consider dedenting the cell.
        for caption, value in (
            ("Accuracy: \n", accuracy),
            ("Precision of event Happening: \n", precision),
            ("Recall of event Happening: \n", recall),
            ("AUC: \n", auc_val),
            ("F-Score:\n", f_score),
        ):
            print(caption, value)

        plt.title('ROC Curve')
        plt.plot(fpr, tpr, label='AUC = ' + format(auc_val, '.2f'))
        # Red dashed diagonal: the line of a classifier with no skill.
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim(-0.1, 1.1)
        plt.ylim(-0.1, 1.1)
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc='lower right')
        plt.show()

In [None]:
# Confusion matrix with rows and columns in the label order [0, 1].
conf_mat = confusion_matrix(y_test, predictions, labels=[0, 1])
print("Confusion Matrix: \n", conf_mat)