In [5]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


# Handling missing values

In [8]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [9]:
val = df.mean(numeric_only=True)
val

id                   36517.829354
age                     43.226614
hypertension             0.097456
heart_disease            0.054012
avg_glucose_level      106.147677
bmi                     28.893237
stroke                   0.048728
dtype: float64

In [10]:
df.fillna(val, inplace=True)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [11]:
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

# Checking invalid data

In [12]:
df[df.bmi > 30]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
11,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1
15,58202,Female,50.0,1,0,Yes,Self-employed,Rural,167.41,30.9,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5092,56799,Male,76.0,0,0,Yes,Govt_job,Urban,82.35,38.9,never smoked,0
5097,64520,Male,68.0,0,0,Yes,Self-employed,Urban,91.68,40.8,Unknown,0
5103,22127,Female,18.0,0,0,No,Private,Urban,82.85,46.9,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0


# Encoding

columns need to be encoded = `gender, ever_married, work_type, residence_type, smoking_status`

## `gender`

male = 1; female = 0

In [13]:
df["gender_2"] = df.gender.map({"Male": 1, "Female": 0})
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_2
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,1.0
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1,0.0
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,1.0
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,0.0
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,0.0


## `ever_married`

yes = 1; no = 0

In [14]:
df["ever_married_2"] = df.ever_married.map({"Yes": 1, "No": 1})
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_2,ever_married_2
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,1.0,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1,0.0,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,1.0,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,0.0,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,0.0,1


## `Residence_type`

Urban = 1; Rural = 0

In [15]:
df["Residence_type_2"] = df.Residence_type.map({"Urban": 1, "Rural": 0})
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_2,ever_married_2,Residence_type_2
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,1.0,1,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1,0.0,1,0
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,1.0,1,0
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,0.0,1,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,0.0,1,0


## `work_type`

known values: private, self-employed, children, govt_job, never_worked

In [16]:
wrk = df["work_type"].values.reshape(-1,1)
wrk

array([['Private'],
       ['Self-employed'],
       ['Private'],
       ...,
       ['Self-employed'],
       ['Private'],
       ['Govt_job']], dtype=object)

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
ohe = OneHotEncoder()
wrk = ohe.fit_transform(wrk).toarray()
wrk

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [19]:
ohe.categories_

[array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
       dtype=object)]

In [20]:
categories_array = ohe.categories_[0]
categories_array

array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
      dtype=object)

In [21]:
df_ohe = pd.DataFrame(wrk, columns=categories_array)
df_ohe.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [22]:
df = pd.concat([df_ohe, df], axis=1)
df.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_2,ever_married_2,Residence_type_2
0,0.0,0.0,1.0,0.0,0.0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,1.0,1,1
1,0.0,0.0,0.0,1.0,0.0,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1,0.0,1,0
2,0.0,0.0,1.0,0.0,0.0,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,1.0,1,0
3,0.0,0.0,1.0,0.0,0.0,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,0.0,1,1
4,0.0,0.0,0.0,1.0,0.0,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,0.0,1,0


## `smoking_status`

known values: 

In [23]:
df.groupby("smoking_status").size()

smoking_status
Unknown            1544
formerly smoked     885
never smoked       1892
smokes              789
dtype: int64

In [24]:
df = pd.get_dummies(df, columns=["smoking_status"], prefix_sep="_")

In [25]:
df.columns = df.columns.str.replace(" ", "_")

In [26]:
df.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,id,gender,age,hypertension,heart_disease,...,avg_glucose_level,bmi,stroke,gender_2,ever_married_2,Residence_type_2,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes
0,0.0,0.0,1.0,0.0,0.0,9046,Male,67.0,0,1,...,228.69,36.6,1,1.0,1,1,False,True,False,False
1,0.0,0.0,0.0,1.0,0.0,51676,Female,61.0,0,0,...,202.21,28.893237,1,0.0,1,0,False,False,True,False
2,0.0,0.0,1.0,0.0,0.0,31112,Male,80.0,0,1,...,105.92,32.5,1,1.0,1,0,False,False,True,False
3,0.0,0.0,1.0,0.0,0.0,60182,Female,49.0,0,0,...,171.23,34.4,1,0.0,1,1,False,False,False,True
4,0.0,0.0,0.0,1.0,0.0,1665,Female,79.0,1,0,...,174.12,24.0,1,0.0,1,0,False,False,True,False


# Check

In [27]:
df.iloc[3116]

Govt_job                              0.0
Never_worked                          0.0
Private                               1.0
Self-employed                         0.0
children                              0.0
id                                  56156
gender                              Other
age                                  26.0
hypertension                            0
heart_disease                           0
ever_married                           No
work_type                         Private
Residence_type                      Rural
avg_glucose_level                  143.33
bmi                                  22.4
stroke                                  0
gender_2                              NaN
ever_married_2                          1
Residence_type_2                        0
smoking_status_Unknown              False
smoking_status_formerly_smoked       True
smoking_status_never_smoked         False
smoking_status_smokes               False
Name: 3116, dtype: object

In [28]:
df.drop(labels=3116, axis=0, inplace=True)

In [29]:
df.isna().sum()

Govt_job                          0
Never_worked                      0
Private                           0
Self-employed                     0
children                          0
id                                0
gender                            0
age                               0
hypertension                      0
heart_disease                     0
ever_married                      0
work_type                         0
Residence_type                    0
avg_glucose_level                 0
bmi                               0
stroke                            0
gender_2                          0
ever_married_2                    0
Residence_type_2                  0
smoking_status_Unknown            0
smoking_status_formerly_smoked    0
smoking_status_never_smoked       0
smoking_status_smokes             0
dtype: int64

Dataset finally has zero missing values

# Scaling Numerical Attributes

## Separating

In [30]:
age = df.pop('age')
age.head()

0    67.0
1    61.0
2    80.0
3    49.0
4    79.0
Name: age, dtype: float64

In [31]:
gluc = df.pop('avg_glucose_level')
gluc.head()

0    228.69
1    202.21
2    105.92
3    171.23
4    174.12
Name: avg_glucose_level, dtype: float64

In [32]:
bmi = df.pop('bmi')
bmi.head()

0    36.600000
1    28.893237
2    32.500000
3    34.400000
4    24.000000
Name: bmi, dtype: float64

In [33]:
df_num = pd.concat([age, bmi, gluc], axis=1)
df_num.head()

Unnamed: 0,age,bmi,avg_glucose_level
0,67.0,36.6,228.69
1,61.0,28.893237,202.21
2,80.0,32.5,105.92
3,49.0,34.4,171.23
4,79.0,24.0,174.12


In [34]:
scaler = StandardScaler()

df_num_scaled = scaler.fit_transform(df_num)
df_num_scaled = pd.DataFrame(df_num_scaled, columns=df_num.columns)

In [35]:
joblib.dump(scaler, 'scaler_model.joblib')

['scaler_model.joblib']

In [36]:
df_num_scaled.head()

Unnamed: 0,age,bmi,avg_glucose_level
0,1.051242,1.001041,2.70645
1,0.785889,-0.000165,2.121652
2,1.626174,0.468399,-0.004867
3,0.255182,0.715233,1.437473
4,1.581949,-0.635858,1.501297


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5109 entries, 0 to 5109
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Govt_job                        5109 non-null   float64
 1   Never_worked                    5109 non-null   float64
 2   Private                         5109 non-null   float64
 3   Self-employed                   5109 non-null   float64
 4   children                        5109 non-null   float64
 5   id                              5109 non-null   int64  
 6   gender                          5109 non-null   object 
 7   hypertension                    5109 non-null   int64  
 8   heart_disease                   5109 non-null   int64  
 9   ever_married                    5109 non-null   object 
 10  work_type                       5109 non-null   object 
 11  Residence_type                  5109 non-null   object 
 12  stroke                          5109 no

## Concat

In [38]:
df1 = pd.concat([df, df_num_scaled], axis=1)

In [39]:
df1.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,id,gender,hypertension,heart_disease,ever_married,...,gender_2,ever_married_2,Residence_type_2,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,age,bmi,avg_glucose_level
0,0.0,0.0,1.0,0.0,0.0,9046.0,Male,0.0,1.0,Yes,...,1.0,1.0,1.0,False,True,False,False,1.051242,1.001041,2.70645
1,0.0,0.0,0.0,1.0,0.0,51676.0,Female,0.0,0.0,Yes,...,0.0,1.0,0.0,False,False,True,False,0.785889,-0.000165,2.121652
2,0.0,0.0,1.0,0.0,0.0,31112.0,Male,0.0,1.0,Yes,...,1.0,1.0,0.0,False,False,True,False,1.626174,0.468399,-0.004867
3,0.0,0.0,1.0,0.0,0.0,60182.0,Female,0.0,0.0,Yes,...,0.0,1.0,1.0,False,False,False,True,0.255182,0.715233,1.437473
4,0.0,0.0,0.0,1.0,0.0,1665.0,Female,1.0,0.0,Yes,...,0.0,1.0,0.0,False,False,True,False,1.581949,-0.635858,1.501297


In [40]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5110 entries, 0 to 3116
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Govt_job                        5109 non-null   float64
 1   Never_worked                    5109 non-null   float64
 2   Private                         5109 non-null   float64
 3   Self-employed                   5109 non-null   float64
 4   children                        5109 non-null   float64
 5   id                              5109 non-null   float64
 6   gender                          5109 non-null   object 
 7   hypertension                    5109 non-null   float64
 8   heart_disease                   5109 non-null   float64
 9   ever_married                    5109 non-null   object 
 10  work_type                       5109 non-null   object 
 11  Residence_type                  5109 non-null   object 
 12  stroke                          5109 no

In [41]:
df1.isna().sum()

Govt_job                          1
Never_worked                      1
Private                           1
Self-employed                     1
children                          1
id                                1
gender                            1
hypertension                      1
heart_disease                     1
ever_married                      1
work_type                         1
Residence_type                    1
stroke                            1
gender_2                          1
ever_married_2                    1
Residence_type_2                  1
smoking_status_Unknown            1
smoking_status_formerly_smoked    1
smoking_status_never_smoked       1
smoking_status_smokes             1
age                               1
bmi                               1
avg_glucose_level                 1
dtype: int64

In [42]:
df1.dropna(inplace=True)

In [43]:
df1.isna().sum()

Govt_job                          0
Never_worked                      0
Private                           0
Self-employed                     0
children                          0
id                                0
gender                            0
hypertension                      0
heart_disease                     0
ever_married                      0
work_type                         0
Residence_type                    0
stroke                            0
gender_2                          0
ever_married_2                    0
Residence_type_2                  0
smoking_status_Unknown            0
smoking_status_formerly_smoked    0
smoking_status_never_smoked       0
smoking_status_smokes             0
age                               0
bmi                               0
avg_glucose_level                 0
dtype: int64

# Final Dataset

In [44]:
df_fin = df1.drop(["id", "gender", "ever_married", "work_type", "Residence_type"], axis=1)

col = df_fin.pop('stroke')
df_fin.insert(17, "result", col)
df_fin.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children,hypertension,heart_disease,gender_2,ever_married_2,Residence_type_2,smoking_status_Unknown,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,age,bmi,avg_glucose_level,result
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,False,True,False,False,1.051242,1.001041,2.70645,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,False,False,True,False,0.785889,-0.000165,2.121652,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,False,False,True,False,1.626174,0.468399,-0.004867,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,False,False,False,True,0.255182,0.715233,1.437473,1.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,False,False,True,False,1.581949,-0.635858,1.501297,1.0


In [45]:
df_fin.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5108 entries, 0 to 5108
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Govt_job                        5108 non-null   float64
 1   Never_worked                    5108 non-null   float64
 2   Private                         5108 non-null   float64
 3   Self-employed                   5108 non-null   float64
 4   children                        5108 non-null   float64
 5   hypertension                    5108 non-null   float64
 6   heart_disease                   5108 non-null   float64
 7   gender_2                        5108 non-null   float64
 8   ever_married_2                  5108 non-null   float64
 9   Residence_type_2                5108 non-null   float64
 10  smoking_status_Unknown          5108 non-null   object 
 11  smoking_status_formerly_smoked  5108 non-null   object 
 12  smoking_status_never_smoked     5108 no

## Saving to `.csv`

In [46]:
df_fin.to_csv("healthcare-dataset-stroke-data_CLEAN.csv", index=False)