In [9]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 1.9 MB/s eta 0:00:01
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from scipy.stats import chi2_contingency
from scipy.stats import norm 
import math
from scipy import stats
from scipy.stats import t
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce 
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Read post_eda.csv, keeping only the listed columns and using the `id`
# column as the row index.
df = pd.read_csv(
    'post_eda.csv',
    index_col='id',
    usecols=[
        'id',
        'gender',
        'age',
        'hypertension',
        'heart_disease',
        'ever_married',
        'work_type',
        'Residence_type',
        'avg_glucose_level',
        'bmi',
        'smoking_status',
        'stroke',
    ],
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


During EDA, residence type and gender came back with no statistical significance. I don't feel comfortable assuming that gender is not a significant factor when it comes to stroke risk, so I will only focus on what happens if I remove the Residence_type column. I want to keep both dataframes to see the difference in results further down the line.

In [4]:
# Second variant of the data without Residence_type; `df` itself is left
# untouched because drop() returns a new frame.
df_nores = df.drop(columns=['Residence_type'])
df_nores.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9046,Male,67.0,0,1,Yes,Private,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,202.21,28.893237,never smoked,1
31112,Male,80.0,0,1,Yes,Private,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,174.12,24.0,never smoked,1


# Creating indicator variables for categorical columns using Binary Encoder

* Went with Binary Encoder over Ordinal Encoding here since my data is not necessarily comprised of ordered variables. Binary encoding keeps the dimensionality down (vs. One Hot Encoding) and saves on memory.

In [5]:
# Binary-encode the string-valued columns of the full frame (including
# Residence_type); the other columns are passed through unchanged.
encoder = ce.BinaryEncoder(
    cols=[
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ],
    drop_invariant=False,
    return_df=True,
)
encoded_df = encoder.fit_transform(df)
encoded_df.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,gender_0,gender_1,gender_2,age,hypertension,heart_disease,ever_married_0,ever_married_1,work_type_0,work_type_1,work_type_2,work_type_3,Residence_type_0,Residence_type_1,avg_glucose_level,bmi,smoking_status_0,smoking_status_1,smoking_status_2,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9046,0,0,1,67.0,0,1,0,1,0,0,0,1,0,1,228.69,36.6,0,0,1,1
51676,0,1,0,61.0,0,0,0,1,0,0,1,0,1,0,202.21,28.893237,0,1,0,1
31112,0,0,1,80.0,0,1,0,1,0,0,0,1,1,0,105.92,32.5,0,1,0,1
60182,0,1,0,49.0,0,0,0,1,0,0,0,1,0,1,171.23,34.4,0,1,1,1
1665,0,1,0,79.0,1,0,0,1,0,0,1,0,1,0,174.12,24.0,0,1,0,1


In [6]:
# Same binary encoding for the frame without Residence_type. A fresh encoder
# is bound to `encoder`, replacing the one fitted on the full frame.
encoder = ce.BinaryEncoder(
    cols=[
        'gender',
        'ever_married',
        'work_type',
        'smoking_status',
    ],
    return_df=True,
)
encoded_df_nores = encoder.fit_transform(df_nores)
encoded_df_nores.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,gender_0,gender_1,gender_2,age,hypertension,heart_disease,ever_married_0,ever_married_1,work_type_0,work_type_1,work_type_2,work_type_3,avg_glucose_level,bmi,smoking_status_0,smoking_status_1,smoking_status_2,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9046,0,0,1,67.0,0,1,0,1,0,0,0,1,228.69,36.6,0,0,1,1
51676,0,1,0,61.0,0,0,0,1,0,0,1,0,202.21,28.893237,0,1,0,1
31112,0,0,1,80.0,0,1,0,1,0,0,0,1,105.92,32.5,0,1,0,1
60182,0,1,0,49.0,0,0,0,1,0,0,0,1,171.23,34.4,0,1,1,1
1665,0,1,0,79.0,1,0,0,1,0,0,1,0,174.12,24.0,0,1,0,1


# Train Test split 

In [7]:
# df with Residence Type
# The results carry a `_res` suffix because the following cell assigns
# X, y, X_train, X_test, y_train and y_test for the no-residence frame;
# without distinct names this split would be overwritten before it could
# be used.
X_res = encoded_df.drop(columns='stroke')
y_res = encoded_df['stroke']
# random_state matches the no-residence split (12) so both variants are
# shuffled with the same seed; with different seeds, any difference in
# downstream results could come from the partition rather than from
# dropping Residence_type.
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(
    X_res, y_res, test_size=0.3, random_state=12
)

In [8]:
# df without Residence Type
# Features are every column except `stroke`; `stroke` is the target.
X = encoded_df_nores.drop('stroke', axis=1)
y = encoded_df_nores.loc[:, 'stroke']
# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12,
    test_size=0.3,
)

# Normalizing numerical features
* EDA showed that none of the numerical features were normally distributed, so min-max scaling is used below

In [9]:
# List each column's dtype; the float64 columns are the ones rescaled below.
encoded_df.dtypes

gender_0               int64
gender_1               int64
gender_2               int64
age                  float64
hypertension           int64
heart_disease          int64
ever_married_0         int64
ever_married_1         int64
work_type_0            int64
work_type_1            int64
work_type_2            int64
work_type_3            int64
Residence_type_0       int64
Residence_type_1       int64
avg_glucose_level    float64
bmi                  float64
smoking_status_0       int64
smoking_status_1       int64
smoking_status_2       int64
stroke                 int64
dtype: object

In [10]:
# A single MinMaxScaler instance; it is refit separately on each frame in the
# cells that follow.
scaler = MinMaxScaler()

In [11]:
# Columns to rescale with the min-max scaler.
cols_to_norm = ['age', 'avg_glucose_level', 'bmi']
# NOTE(review): the scaler is fit on every row of encoded_df, including rows
# assigned to the test split earlier in the notebook. If the scaled frame is
# used for model evaluation, consider fitting on the training rows only.
scaler.fit(encoded_df[cols_to_norm])
encoded_df[cols_to_norm] = scaler.transform(encoded_df[cols_to_norm])
encoded_df

Unnamed: 0_level_0,gender_0,gender_1,gender_2,age,hypertension,heart_disease,ever_married_0,ever_married_1,work_type_0,work_type_1,work_type_2,work_type_3,Residence_type_0,Residence_type_1,avg_glucose_level,bmi,smoking_status_0,smoking_status_1,smoking_status_2,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9046,0,0,1,0.816895,0,1,0,1,0,0,0,1,0,1,0.801265,0.301260,0,0,1,1
51676,0,1,0,0.743652,0,0,0,1,0,0,1,0,1,0,0.679023,0.212981,0,1,0,1
31112,0,0,1,0.975586,0,1,0,1,0,0,0,1,1,0,0.234512,0.254296,0,1,0,1
60182,0,1,0,0.597168,0,0,0,1,0,0,0,1,0,1,0.536008,0.276060,0,1,1,1
1665,0,1,0,0.963379,1,0,0,1,0,0,1,0,1,0,0.549349,0.156930,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18234,0,1,0,0.975586,1,0,0,1,0,0,0,1,0,1,0.132167,0.212981,0,1,0,0
44873,0,1,0,0.987793,0,0,0,1,0,0,1,0,0,1,0.323516,0.340206,0,1,0,0
19723,0,1,0,0.426270,0,0,0,1,0,0,1,0,1,0,0.128658,0.232532,0,1,0,0
37544,0,0,1,0.621582,0,0,0,1,0,0,0,1,1,0,0.513203,0.175258,0,0,1,0


In [12]:
# Refit the shared scaler on the no-residence frame and overwrite the same
# columns (cols_to_norm is defined in the previous cell).
# NOTE(review): as with the full frame, the fit uses every row, including
# rows assigned to the test split earlier in the notebook.
scaler.fit(encoded_df_nores[cols_to_norm])
encoded_df_nores[cols_to_norm] = scaler.transform(encoded_df_nores[cols_to_norm])
encoded_df_nores

Unnamed: 0_level_0,gender_0,gender_1,gender_2,age,hypertension,heart_disease,ever_married_0,ever_married_1,work_type_0,work_type_1,work_type_2,work_type_3,avg_glucose_level,bmi,smoking_status_0,smoking_status_1,smoking_status_2,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9046,0,0,1,0.816895,0,1,0,1,0,0,0,1,0.801265,0.301260,0,0,1,1
51676,0,1,0,0.743652,0,0,0,1,0,0,1,0,0.679023,0.212981,0,1,0,1
31112,0,0,1,0.975586,0,1,0,1,0,0,0,1,0.234512,0.254296,0,1,0,1
60182,0,1,0,0.597168,0,0,0,1,0,0,0,1,0.536008,0.276060,0,1,1,1
1665,0,1,0,0.963379,1,0,0,1,0,0,1,0,0.549349,0.156930,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18234,0,1,0,0.975586,1,0,0,1,0,0,0,1,0.132167,0.212981,0,1,0,0
44873,0,1,0,0.987793,0,0,0,1,0,0,1,0,0.323516,0.340206,0,1,0,0
19723,0,1,0,0.426270,0,0,0,1,0,0,1,0,0.128658,0.232532,0,1,0,0
37544,0,0,1,0.621582,0,0,0,1,0,0,0,1,0.513203,0.175258,0,0,1,0


In [13]:
# Write the encoded and scaled frame (with the Residence_type columns) to CSV;
# the `id` index is written as the first column.
encoded_df.to_csv('encodeddf.csv')

In [14]:
# Write the encoded and scaled frame without Residence_type to CSV;
# the `id` index is written as the first column.
encoded_df_nores.to_csv('encodednoresdf.csv')