# 1.Setup & Imports
Setting all the properties and imports necessary for the model.

In [75]:
import pandas as pd
import numpy as np
import re

%matplotlib inline

In [76]:
# Show every column when a frame is displayed, but at most 50 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

# 2.Data read
Reading all data using pandas

In [77]:
# Load the raw survey file; the trailing expression displays (rows, columns).
data = pd.read_csv(
    './data/heart_disease_data.csv',
    sep=',',
    encoding='utf-8',
)
data.shape

(67500, 19)

In [78]:
# Remove rows that are exact copies of an earlier row (the first occurrence is kept).
data = data.drop_duplicates(keep='first')


BMI Fix

In [79]:
# Pull the first number out of each raw BMI entry. The fractional part is optional
# so whole-number entries such as "27" are kept; entries without any number
# (including missing ones) become NaN instead of a 0.0 placeholder.
bmi = pd.to_numeric(
    data['BMI'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)
# A BMI of exactly 0.0 is still treated as "missing", as it was before.
bmi = bmi.mask(bmi == 0.0)
# Impute missing entries with the mean of the real readings only: placeholders no
# longer drag the mean down. Assigning the result back (instead of an in-place
# replace on a column selection) also works under pandas Copy-on-Write.
data['BMI'] = bmi.fillna(bmi.mean())

In [80]:
#Changes the values from 'Yes' or 'No' to True or False
def fixing_boolean_columns(data):
    """Convert a raw yes/no answer to a boolean.

    Parameters
    ----------
    data : object
        One cell value. Strings are searched case-insensitively for "yes".
        Values that are already booleans are returned as plain ``bool`` so the
        cleaning cells can be re-run safely.

    Returns
    -------
    bool
        True when the text contains "yes" in any casing; False for "no",
        for unrecognised text, and for any other non-string value (e.g. NaN).
    """
    if isinstance(data, str):
        # Every answer without "yes" maps to False, so "no" needs no branch of its own.
        return 'yes' in data.lower()
    if isinstance(data, (bool, np.bool_)):
        return bool(data)
    return False

Smoking Fix

In [81]:
# A missing answer is read as 'No' (the respondent is assumed not to smoke);
# fixing_boolean_columns then turns every answer into True/False.
data['Smoking'] = data['Smoking'].fillna('No').apply(fixing_boolean_columns)

AlcoholDrinking Fix

In [82]:
# A missing answer is read as 'No' (the respondent is assumed not to drink);
# fixing_boolean_columns then turns every answer into True/False.
data['AlcoholDrinking'] = data['AlcoholDrinking'].fillna('No').apply(fixing_boolean_columns)


Stroke Fix

In [83]:
# A missing answer is read as 'No' (the respondent is assumed not to have had a
# stroke); fixing_boolean_columns then turns every answer into True/False.
data['Stroke'] = data['Stroke'].fillna('No').apply(fixing_boolean_columns)

PhysicalHealth Fix

In [84]:
# Keep the first run of digits found in each raw answer and convert it to a
# number; answers without any digit become NaN instead of crashing the cast.
physical_health = pd.to_numeric(
    data['PhysicalHealth'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)
# Computed once, outside the per-row function. As before, it is taken over the
# column before correction, so out-of-range values still take part in it.
physical_health_median = physical_health.median()


def fixing_physical_health(n):
    """Return ``n`` unchanged unless it is missing or greater than 30.

    Missing values and values above 30 are replaced with the column median
    computed above.
    """
    if pd.isna(n) or n > 30:
        return physical_health_median
    else:
        return n


# The final cast truncates a fractional median, exactly as the original cast did.
data['PhysicalHealth'] = physical_health.apply(fixing_physical_health).astype(int)

MentalHealth Fix

In [85]:
# Keep the first run of digits found in each raw answer and convert it to a
# number; answers without any digit become NaN instead of crashing the cast.
mental_health = pd.to_numeric(
    data['MentalHealth'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)
# Computed once, outside the per-row function. As before, it is taken over the
# column before correction, so out-of-range values still take part in it.
mental_health_median = mental_health.median()


def fixing_mental_health(n):
    """Return ``n`` unchanged unless it is missing or greater than 30.

    Missing values and values above 30 are replaced with the column median
    computed above.
    """
    if pd.isna(n) or n > 30:
        return mental_health_median
    else:
        return n


# The final cast truncates a fractional median, exactly as the original cast did.
data['MentalHealth'] = mental_health.apply(fixing_mental_health).astype(int)

DiffWalking Fix

In [86]:
# A missing answer is read as 'No' (the respondent is assumed to have no
# difficulty walking); fixing_boolean_columns then turns every answer into True/False.
data['DiffWalking'] = data['DiffWalking'].fillna('No').apply(fixing_boolean_columns)

Sex Fix

In [87]:
# Counts the missing values in 'Sex'. The value is neither stored nor shown:
# a notebook cell only displays its last expression, and this is not the last one.
data['Sex'].isnull().sum()
def fixing_sex(data):
    """Normalise a raw sex label to 'Female' or 'Male'.

    Parameters
    ----------
    data : object
        One cell value, normally a string that contains the label somewhere.

    Returns
    -------
    str or float
        'Female', 'Male', or ``np.nan`` when the value is not a string or
        contains neither label.
    """
    if not isinstance(data, str):
        return np.nan
    # 'Female' must be tested first: both 'Female' and 'female' contain 'male'.
    if( 'Female' in data or 'female' in data ):
        return 'Female'
    elif( 'Male' in data or 'male' in data ):
        return 'Male'
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
# First pass: remove the rows where 'Sex' is missing before normalising.
data.dropna(axis='index', subset=['Sex'], inplace=True)
# Normalise every remaining label with fixing_sex.
data['Sex'] = data['Sex'].apply(fixing_sex)
# Second pass: remove the rows whose label fixing_sex could not recognise.
data.dropna(axis='index', subset=['Sex'], inplace=True)
# Displayed as the cell output: the distinct labels that are left.
data['Sex'].unique()

array(['Male', 'Female'], dtype=object)

Age Category Fix

In [88]:
def change_age_category(data):
    """Return '80 or older' for the reversed label 'or older 80'; any other value is returned unchanged."""
    return '80 or older' if data == 'or older 80' else data
# Put the reversed label 'or older 80' back into its usual form.
data['AgeCategory'] = data['AgeCategory'].apply(change_age_category)
age_ranges = data['AgeCategory'].unique()


def _upper_age_bound(label):
    """Return the upper bound of an 'NN-MM' label as an int, or None if it has none."""
    # Missing values and the open-ended '80 or older' label have no upper bound.
    if not isinstance(label, str) or label == '80 or older':
        return None
    try:
        return int(label.split('-')[1])
    except (IndexError, ValueError):
        # Labels that are not a dash-separated numeric range are left alone.
        return None


# Collect every age range whose upper bound is below 50 ...
ranges_to_drop = []
for e in age_ranges:
    upper_bound = _upper_age_bound(e)
    if upper_bound is not None and upper_bound < 50:
        ranges_to_drop.append(e)
# ... and remove all of their rows in a single pass.
data.drop(data[data['AgeCategory'].isin(ranges_to_drop)].index, inplace=True)
    

Race Fixing

In [89]:
# Respondents with no recorded race are placed in the "Other" group.
data["Race"] = data["Race"].fillna(value="Other")
def fixing_race(row):
    """Map a raw race string to one of the canonical labels.

    The labels are tried in a fixed order (White, Hispanic, Black, Asian); the
    first one found in ``row``, capitalised or fully lower-case, is returned.
    "Other" is returned when none of them is found.
    """
    for label in ("White", "Hispanic", "Black", "Asian"):
        if label in row or label.lower() in row:
            return label
    return "Other"
# Replace every raw race value with the label chosen by fixing_race.
data['Race'] = data['Race'].map(fixing_race)

Diabetic Fixing

In [90]:
# A missing answer is read as "No".
data["Diabetic"] = data["Diabetic"].fillna(value="No")
def fixing_diabetic(row):
    """Map a raw diabetic answer to one of the four canonical labels.

    The two long labels are matched first, exactly as written. After that any
    text containing "No"/"no" gives "No", text containing "Yes"/"yes" gives
    "Yes", and everything else falls back to "No".
    """
    for label in ("Yes (during pregnancy)", "No, borderline diabetes"):
        if label in row:
            return label
    if "No" in row or "no" in row:
        return "No"
    return "Yes" if ("Yes" in row or "yes" in row) else "No"
# Replace every raw answer with the label chosen by fixing_diabetic.
data['Diabetic'] = data['Diabetic'].map(fixing_diabetic)

Physical Activity fix

In [91]:
# A missing answer is read as 'No' (the respondent is assumed not to exercise);
# fixing_boolean_columns then turns every answer into True/False.
data['PhysicalActivity'] = data['PhysicalActivity'].fillna('No').apply(fixing_boolean_columns)

GenHealth fix

In [92]:
# A missing answer is read as "Fair".
data["GenHealth"] = data["GenHealth"].fillna(value="Fair")
def fixing_gen_health(row):
    """Map a raw general-health answer to one of the canonical labels.

    The labels are tried in a fixed order and matching is case-sensitive; the
    first label contained in ``row`` is returned. "Fair" is the fallback when
    no label is found.
    """
    for label in ("Very good", "Good", "Excellent", "Fair", "Poor"):
        if label in row:
            return label
    return "Fair"
# Replace every raw answer with the label chosen by fixing_gen_health.
data['GenHealth'] = data['GenHealth'].map(fixing_gen_health)

SleepTime fix

In [93]:
# Parse the raw values. Missing or unparseable entries become NaN rather than a
# 0.0 placeholder, and the sign is dropped so negative entries count as positive.
sleep_time = pd.to_numeric(data['SleepTime'], errors='coerce').abs()
# Mean of the usable values only (above 0 and at most 24), so that placeholders
# and out-of-range entries no longer skew the value used for imputation.
mean = sleep_time[(sleep_time > 0) & (sleep_time <= 24)].mean()


def fixing_negatives_sleeptime(row):
    """Return a usable sleep time for one raw value.

    The sign of ``row`` is dropped first. If the result is missing, zero, or
    greater than 24 the mean computed above is returned; otherwise the
    absolute value is returned. Checking the range after dropping the sign
    means a value such as -30 can no longer slip through as 30.
    """
    hours = abs(row)
    if pd.isna(hours) or hours == 0 or hours > 24:
        return mean
    return hours


# Assigning the result back avoids an in-place replace on a column selection,
# which does not update the frame under pandas Copy-on-Write.
data['SleepTime'] = sleep_time.apply(fixing_negatives_sleeptime)

Asthma fix

In [94]:
# A missing answer is read as 'No' (the respondent is assumed not to have
# asthma); fixing_boolean_columns then turns every answer into True/False.
data['Asthma'] = data['Asthma'].fillna('No').apply(fixing_boolean_columns)

KidneyDisease fix

In [95]:
# A missing answer is read as 'No' (the respondent is assumed not to have kidney
# disease); fixing_boolean_columns then turns every answer into True/False.
data['KidneyDisease'] = data['KidneyDisease'].fillna('No').apply(fixing_boolean_columns)

SkinCancer fix

In [96]:
# A missing answer is read as 'No' (the respondent is assumed not to have skin
# cancer); fixing_boolean_columns then turns every answer into True/False.
data['SkinCancer'] = data['SkinCancer'].fillna('No').apply(fixing_boolean_columns)

HeartDisease fix

In [97]:

#Changes the values from 'Yes' or 'No' to True or False, defaulting to True
def fixing_boolean_columns2(data):
    """Convert a raw yes/no answer to a boolean, defaulting to True.

    Parameters
    ----------
    data : object
        One cell value. Strings are searched case-insensitively, "yes" first
        and then "no". Values that are already booleans are returned as plain
        ``bool`` so the cell can be re-run safely.

    Returns
    -------
    bool
        True when the text contains "yes"; False when it contains "no" but not
        "yes"; True for unrecognised text and for any other non-string value.
    """
    if isinstance(data, (bool, np.bool_)):
        return bool(data)
    if not isinstance(data, str):
        return True
    answer = data.lower()
    if 'yes' in answer:
        return True
    # Unrecognised text keeps the original default of True.
    return 'no' not in answer

# A missing label is filled with 'Yes', so it ends up as True after conversion.
# fixing_boolean_columns2 then turns every label into True/False.
data['HeartDisease'] = data['HeartDisease'].fillna('Yes').apply(fixing_boolean_columns2)

In [98]:
# Convert every column to the best dtype pandas can infer for it.
# The trailing expression displays the (rows, columns) of the cleaned frame.
data = data.convert_dtypes()
data.shape

(40604, 19)

In [99]:
# Write the cleaned frame next to the raw file, without the index column.
data.to_csv(
    path_or_buf='./data/heart_disease_data_clean.csv',
    index=False,
    encoding='utf-8',
)