In [28]:
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

## Read data

In [3]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


## Pre-processing

In [4]:
df.drop('id',axis=1,inplace=True)

In [5]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [8]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


In [9]:
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [10]:
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [11]:
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

#### Label Encoding

In [13]:
encoder = LabelEncoder()
df['gender'] = encoder.fit_transform(df['gender'])
df['smoking_status'] = encoder.fit_transform(df['smoking_status'])
df['ever_married'] = encoder.fit_transform(df['ever_married'])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,Private,Urban,228.69,36.6,1,1
1,0,61.0,0,0,1,Self-employed,Rural,202.21,28.893237,2,1
2,1,80.0,0,1,1,Private,Rural,105.92,32.5,2,1
3,0,49.0,0,0,1,Private,Urban,171.23,34.4,3,1
4,0,79.0,1,0,1,Self-employed,Rural,174.12,24.0,2,1


In [16]:
dummies = pd.concat([pd.get_dummies(df['work_type']),
                     pd.get_dummies(df['Residence_type'])] ,
                     axis = 1)
df = pd.concat([df , dummies], axis=1)
df.drop(['work_type' , 'Residence_type' ] , axis=1 , inplace = True)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban
0,1,67.0,0,1,1,228.69,36.6,1,1,0,0,1,0,0,0,1
1,0,61.0,0,0,1,202.21,28.893237,2,1,0,0,0,1,0,1,0
2,1,80.0,0,1,1,105.92,32.5,2,1,0,0,1,0,0,1,0
3,0,49.0,0,0,1,171.23,34.4,3,1,0,0,1,0,0,0,1
4,0,79.0,1,0,1,174.12,24.0,2,1,0,0,0,1,0,1,0


### Scaling

In [18]:
y = df['stroke']
x = df.drop('stroke' , axis = 1)

In [20]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(x), columns = x.columns)
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban
0,1.188073,1.051434,-0.328602,4.185032,0.723884,2.706375,1.001234e+00,-0.351781,-0.384111,-0.065756,0.864297,-0.436881,-0.394112,-0.984080,0.984080
1,-0.840344,0.786070,-0.328602,-0.238947,0.723884,2.121559,4.615554e-16,0.581552,-0.384111,-0.065756,-1.157010,2.288955,-0.394112,1.016178,-1.016178
2,1.188073,1.626390,-0.328602,4.185032,0.723884,-0.005028,4.685773e-01,0.581552,-0.384111,-0.065756,0.864297,-0.436881,-0.394112,1.016178,-1.016178
3,-0.840344,0.255342,-0.328602,-0.238947,0.723884,1.437358,7.154182e-01,1.514885,-0.384111,-0.065756,0.864297,-0.436881,-0.394112,-0.984080,0.984080
4,-0.840344,1.582163,3.043196,-0.238947,0.723884,1.501184,-6.357112e-01,0.581552,-0.384111,-0.065756,-1.157010,2.288955,-0.394112,1.016178,-1.016178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,-0.840344,1.626390,3.043196,-0.238947,0.723884,-0.494658,4.615554e-16,0.581552,-0.384111,-0.065756,0.864297,-0.436881,-0.394112,-0.984080,0.984080
5106,-0.840344,1.670617,-0.328602,-0.238947,0.723884,0.420775,1.442949e+00,0.581552,-0.384111,-0.065756,-1.157010,2.288955,-0.394112,-0.984080,0.984080
5107,-0.840344,-0.363842,-0.328602,-0.238947,0.723884,-0.511443,2.217363e-01,0.581552,-0.384111,-0.065756,-1.157010,2.288955,-0.394112,1.016178,-1.016178
5108,1.188073,0.343796,-0.328602,-0.238947,0.723884,1.328257,-4.278451e-01,-0.351781,-0.384111,-0.065756,0.864297,-0.436881,-0.394112,1.016178,-1.016178


### Splitting Data

In [22]:
X_train , X_test , Y_train , Y_test = train_test_split(X ,y ,train_size=0.8)

In [23]:
model1 = LogisticRegression(random_state=0 , solver= 'saga', penalty = 'none', max_iter = 5000, intercept_scaling = 2, fit_intercept= True, C= 5)

model1.fit(X_train , Y_train)



In [24]:
y_pred1 = model1.predict(X_test)

In [26]:
print(f"Logistic Accuracy {accuracy_score(Y_test ,y_pred1)}")

Logistic Accuracy 0.9500978473581213


In [29]:
print(classification_report(Y_test , y_pred1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       971
           1       0.00      0.00      0.00        51

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022



In [30]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'Govt_job',
       'Never_worked', 'Private', 'Self-employed', 'children', 'Rural',
       'Urban'],
      dtype='object')