In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [147]:
path = '../data/raw/heart_2020_cleaned.csv'

In [148]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


The dataset has boolean, categorical, and numerical type features.<br>
The boolean features include HeartDisease, AlcoholDrinking, Stroke, DiffWalking, Diabetic, PhysicalActivity, Asthma, KidneyDisease, and SkinCancer. In preprocessing, these features can be represented with 1 to answer "Yes", or a 0 to answer "No". Sex could also be represented with a 1 for "Male" and a 0 for "Female".<br>

Categorical features include AgeCategory, Race, and GenHealth. AgeCategory and GenHealth could be represented with a numerical scale. Race could also be transformed to be represented with a lookup table but it doesn't seem necessary to transform it.<br>

Numerical features include BMI, PhysicalHealth, MentalHealth, and SleepTime. These will be scaled using sklearn's StandardScaler.


In [149]:
new_df = pd.DataFrame()

In [150]:
#function: create dummy dataframe for boolean-type columns and return the column which indicates an affirmative response.
def yes_dummies(col_name):
    return pd.get_dummies(df[col_name], prefix=col_name)[[col_name+'_Yes']]

In [151]:
new_df = pd.concat([new_df,yes_dummies('Smoking')])
new_df = pd.concat([new_df,yes_dummies('AlcoholDrinking')],axis=1)

In [152]:
#loop through all boolean-type features and create dummy-like responses.
bool_cat = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
for cat in bool_cat:
    new_df = pd.concat([new_df,yes_dummies(cat)],axis=1)
new_df.columns

Index(['Smoking_Yes', 'AlcoholDrinking_Yes', 'HeartDisease_Yes', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'Stroke_Yes', 'DiffWalking_Yes', 'Diabetic_Yes',
       'PhysicalActivity_Yes', 'Asthma_Yes', 'KidneyDisease_Yes',
       'SkinCancer_Yes'],
      dtype='object')

In [153]:
new_df.head()

Unnamed: 0,Smoking_Yes,AlcoholDrinking_Yes,HeartDisease_Yes,Smoking_Yes.1,AlcoholDrinking_Yes.1,Stroke_Yes,DiffWalking_Yes,Diabetic_Yes,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,1,0,0,1,0,0,0,1,1,1,0,1
1,0,0,0,0,0,1,0,0,1,0,0,0
2,1,0,0,1,0,0,0,1,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,1,0,0,0


In [154]:
df['Sex'].unique()

array(['Female', 'Male'], dtype=object)

In [155]:
dummy_sex = pd.get_dummies(df['Sex'],drop_first=True).rename(columns={'Male':'Dummy_Sex'})
dummy_sex.head(3)

Unnamed: 0,Dummy_Sex
0,0
1,0
2,1


In [156]:
new_df = pd.concat([new_df, dummy_sex], axis=1)
new_df.columns

Index(['Smoking_Yes', 'AlcoholDrinking_Yes', 'HeartDisease_Yes', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'Stroke_Yes', 'DiffWalking_Yes', 'Diabetic_Yes',
       'PhysicalActivity_Yes', 'Asthma_Yes', 'KidneyDisease_Yes',
       'SkinCancer_Yes', 'Dummy_Sex'],
      dtype='object')

Scaling the numerical features: BMI, PhysicalHealth, MentalHealth, and SleepTime.

In [157]:
to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
for x in to_scale:
    scaler = StandardScaler()
    scaler_df = pd.DataFrame(scaler.fit_transform(df[[x]]),columns=[x])
    new_df = pd.concat([new_df, scaler_df], axis=1)
new_df.columns

Index(['Smoking_Yes', 'AlcoholDrinking_Yes', 'HeartDisease_Yes', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'Stroke_Yes', 'DiffWalking_Yes', 'Diabetic_Yes',
       'PhysicalActivity_Yes', 'Asthma_Yes', 'KidneyDisease_Yes',
       'SkinCancer_Yes', 'Dummy_Sex', 'BMI', 'PhysicalHealth', 'MentalHealth',
       'SleepTime'],
      dtype='object')

In [158]:
new_df.head(3)

Unnamed: 0,Smoking_Yes,AlcoholDrinking_Yes,HeartDisease_Yes,Smoking_Yes.1,AlcoholDrinking_Yes.1,Stroke_Yes,DiffWalking_Yes,Diabetic_Yes,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes,Dummy_Sex,BMI,PhysicalHealth,MentalHealth,SleepTime
0,1,0,0,1,0,0,0,1,1,1,0,1,0,-1.84475,-0.046751,3.281069,-1.460354
1,0,0,0,0,0,1,0,0,1,0,0,0,0,-1.256338,-0.42407,-0.490039,-0.067601
2,1,0,0,1,0,0,0,1,1,1,0,0,1,-0.274603,2.091388,3.281069,0.628776


Transforming the categorical responses to a numbered scale: AgeCategory, Race, GenHealth.

In [159]:
#AgeCategory
enum_age = enumerate(sorted(df['AgeCategory'].unique()))
age_lookup = {v: i for i,v in enum_age}

In [160]:
enum_age_dict

{'18-24': 0,
 '25-29': 1,
 '30-34': 2,
 '35-39': 3,
 '40-44': 4,
 '45-49': 5,
 '50-54': 6,
 '55-59': 7,
 '60-64': 8,
 '65-69': 9,
 '70-74': 10,
 '75-79': 11,
 '80 or older': 12}

In [161]:
new_df = pd.concat([new_df, df[['AgeCategory']].replace(age_lookup).rename(columns={'AgeCategory':'Code_AgeCategory'})], axis=1)

In [162]:
new_df.head(3)

Unnamed: 0,Smoking_Yes,AlcoholDrinking_Yes,HeartDisease_Yes,Smoking_Yes.1,AlcoholDrinking_Yes.1,Stroke_Yes,DiffWalking_Yes,Diabetic_Yes,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes,Dummy_Sex,BMI,PhysicalHealth,MentalHealth,SleepTime,Code_AgeCategory
0,1,0,0,1,0,0,0,1,1,1,0,1,0,-1.84475,-0.046751,3.281069,-1.460354,7
1,0,0,0,0,0,1,0,0,1,0,0,0,0,-1.256338,-0.42407,-0.490039,-0.067601,12
2,1,0,0,1,0,0,0,1,1,1,0,0,1,-0.274603,2.091388,3.281069,0.628776,9


In [163]:
df['GenHealth'].unique()

array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)

In [164]:
GenHealth_lookup = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent':4}
new_df = pd.concat([new_df, df[['GenHealth']].replace(GenHealth_lookup).rename(columns={'GenHealth': 'Code_GenHealth'})], axis=1)

In [165]:
#The Race feature won't be codified as there isn't as much sense to transform Race to a numerical scale like what was done for the AgeCategory and GenHealth features.
#The Race feature will just be added to the new dataframe.
df['Race'].unique()

array(['White', 'Black', 'Asian', 'American Indian/Alaskan Native',
       'Other', 'Hispanic'], dtype=object)

In [166]:
new_df = pd.concat([new_df, df['Race']],axis=1)

In [167]:
new_df.head(3)

Unnamed: 0,Smoking_Yes,AlcoholDrinking_Yes,HeartDisease_Yes,Smoking_Yes.1,AlcoholDrinking_Yes.1,Stroke_Yes,DiffWalking_Yes,Diabetic_Yes,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes,Dummy_Sex,BMI,PhysicalHealth,MentalHealth,SleepTime,Code_AgeCategory,Code_GenHealth,Race
0,1,0,0,1,0,0,0,1,1,1,0,1,0,-1.84475,-0.046751,3.281069,-1.460354,7,3,White
1,0,0,0,0,0,1,0,0,1,0,0,0,0,-1.256338,-0.42407,-0.490039,-0.067601,12,3,White
2,1,0,0,1,0,0,0,1,1,1,0,0,1,-0.274603,2.091388,3.281069,0.628776,9,1,White


In [168]:
#export dataframe as a data set.
new_df.to_csv('../data/processed/heart_2020_preprocessed.csv')