In [160]:
import tensorflow as tf
from tensorflow import keras 

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

import matplotlib.pyplot as plt




### Create df

In [144]:
# Load the feature and label tables; the df_ori_* frames are kept as the
# untouched originals.

df_ori_features = pd.read_csv('heart-disease-values.csv')

df_ori_labels = pd.read_csv('heart-disease-labels.csv')


# Working copies that we will manipulate. The .copy() matters: a plain
# assignment only binds a second name to the same object, so any in-place
# change to the working frame would silently change the "original" as well.

df_features = df_ori_features.copy()

df_labels = df_ori_labels.copy()

In [145]:
# merge both dfs on patient_id (NB: we don't need to do this to feed X and y into our model,
# but it's good to know how to). TODO: the merge itself is not written in this cell yet.



In [146]:
# inspect the labels dataframe
df_labels

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0
1,ryoo3j,0
2,yt1s1x,1
3,l2xjde,1
4,oyt4ek,0
...,...,...
175,5qfar3,1
176,2s2b1f,1
177,nsd00i,1
178,0xw93k,0


### define X and y

In [147]:
# Build the model inputs (X) and the target (y).
# patient_id is removed from both frames before modelling.
# NOTE(review): this relies on both CSVs listing patients in the same row
# order, since X and y are only matched by position -- confirm.

id_cols = ['patient_id']

X = df_features.drop(columns=id_cols)

y = df_labels.drop(columns=id_cols)

In [148]:
# inspect the feature dataframe X (patient_id already dropped)

X

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
1,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
2,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
3,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
4,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,2,reversible_defect,125,4,2,1,0,254,0.2,1,67,163,0
176,2,normal,180,4,0,0,1,327,3.4,0,55,117,1
177,2,reversible_defect,125,3,0,0,0,309,1.8,1,64,131,1
178,1,normal,124,3,2,1,0,255,0.0,1,48,175,0


In [149]:
# inspect the target dataframe y (patient_id already dropped)

y 

# y already holds integer 0/1 values, so label encoding won't be necessary

Unnamed: 0,heart_disease_present
0,0
1,0
2,1
3,1
4,0
...,...
175,1
176,1
177,1
178,0


In [150]:
# inspect the data some more: how's the distribution? This influences which feature scaling I'll use

X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
slope_of_peak_exercise_st_segment,180.0,1.55,0.618838,1.0,1.0,1.0,2.0,3.0
resting_blood_pressure,180.0,131.311111,17.010443,94.0,120.0,130.0,140.0,180.0
chest_pain_type,180.0,3.155556,0.938454,1.0,3.0,3.0,4.0,4.0
num_major_vessels,180.0,0.694444,0.969347,0.0,0.0,0.0,1.0,3.0
fasting_blood_sugar_gt_120_mg_per_dl,180.0,0.161111,0.368659,0.0,0.0,0.0,0.0,1.0
resting_ekg_results,180.0,1.05,0.998742,0.0,0.0,2.0,2.0,2.0
serum_cholesterol_mg_per_dl,180.0,249.211111,52.717969,126.0,213.75,245.5,281.25,564.0
oldpeak_eq_st_depression,180.0,1.01,1.121357,0.0,0.0,0.8,1.6,6.2
sex,180.0,0.688889,0.464239,0.0,0.0,1.0,1.0,1.0
age,180.0,54.811111,9.334737,29.0,48.0,55.0,62.0,77.0


### remove outliers

In [151]:
# describe() row for this column (count, mean, std, min, 25%, 50%, 75%, max):
# serum_cholesterol_mg_per_dl	180.0	249.211111	52.717969	126.0	213.75	245.5	281.25	564.0
# max is so much higher than the 75% value (upper quartile) -- could this indicate outliers in this column?

X['serum_cholesterol_mg_per_dl']

# drop rows more than 3 std devs from the mean (std = 53, mean = 249, 3 std devs = 159,
# therefore anything over 408 or under 90 is an outlier)


0      308
1      214
2      304
3      223
4      270
      ... 
175    254
176    327
177    309
178    255
179    201
Name: serum_cholesterol_mg_per_dl, Length: 180, dtype: int64

In [152]:
# Collect the index labels of rows whose serum cholesterol falls outside the
# 90 .. 408 band (mean +/- 3 std devs, rounded).

chol = X['serum_cholesterol_mg_per_dl']


# rows above the upper cut-off (> 408)

condition1 = chol.gt(408)

condition1_idx = X.index[condition1]

list_over_408 = condition1_idx.tolist()

print(list_over_408)


# rows below the lower cut-off (< 90)

condition2 = chol.lt(90)

condition2_idx = X.index[condition2]

list_under_90 = condition2_idx.tolist()

print(list_under_90) # no values under 90, cf describe().transpose() above, min = 126



[43, 60]
[]


In [153]:
# remove the outlier rows (index labels [43, 60]) from X
# errors='ignore' keeps this cell safe to re-run: once the rows are gone, a
# second drop of the same labels would otherwise raise KeyError.

X = X.drop(index=list_over_408, errors='ignore')

In [154]:
# remove the same outlier rows from y so that X and y keep matching rows
# errors='ignore' keeps this cell safe to re-run: once the rows are gone, a
# second drop of the same labels would otherwise raise KeyError.

y = y.drop(index=list_over_408, errors='ignore')

### define X_train, y_train, X_val, y_val, X_test, y_test

In [155]:
# First split: hold out 20% of the rows (test_size = 0.2); the rest is the
# training set. random_state is fixed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

# check the shapes of the four resulting pieces
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((142, 13), (36, 13), (142, 1), (36, 1))

In [156]:
# Second split: divide the hold-out from the previous cell in half
# (test_size = 0.5) into a validation set and a test set.
# NOTE: X_val / y_val are both the input and an output here, so re-running
# this cell on its own splits the already-halved validation set again.
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, random_state = 0)

# check the shapes of the validation and test pieces
X_val.shape, X_test.shape, y_val.shape, y_test.shape

((18, 13), (18, 13), (18, 1), (18, 1))

In [157]:
# inspect X_train (training features, not yet encoded or scaled)

X_train

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
163,2,reversible_defect,142,4,3,0,2,309,0.0,1,45,147,1
94,2,normal,108,3,0,0,0,141,0.6,0,44,175,0
96,2,fixed_defect,145,4,2,0,2,212,2.0,1,64,132,0
176,2,normal,180,4,0,0,1,327,3.4,0,55,117,1
24,1,normal,156,2,0,0,2,245,0.0,1,70,143,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,2,normal,130,4,2,0,0,303,2.0,0,64,122,0
69,2,reversible_defect,120,3,3,0,0,188,2.0,1,49,139,0
119,2,normal,138,4,3,1,0,294,1.9,0,62,106,0
48,2,reversible_defect,120,2,1,0,2,281,1.4,1,62,103,0


In [180]:
# inspect X_val (validation features, not yet encoded or scaled)

X_val

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
18,1,normal,130,2,0,0,2,204,0.0,1,29,202,0
170,1,reversible_defect,150,4,0,0,2,270,0.8,1,58,111,1
162,3,reversible_defect,160,4,3,0,2,164,6.2,0,62,145,0
128,3,reversible_defect,145,4,0,0,0,174,2.6,1,70,125,1
153,1,normal,112,4,1,0,2,290,0.0,1,44,153,0
5,1,normal,130,3,0,0,0,180,0.0,1,42,150,0
113,2,reversible_defect,180,3,0,1,2,274,1.6,1,68,150,1
62,3,reversible_defect,130,4,0,1,2,283,1.6,1,56,103,1
7,2,fixed_defect,150,4,1,0,2,276,0.6,1,57,112,1
106,1,reversible_defect,150,3,0,0,2,232,1.6,1,54,165,0


### feature scale the values

In [175]:

# Columns grouped by the preprocessing they receive.
# 'thal' holds strings (normal / fixed_defect / reversible_defect), so it is
# one-hot encoded; every other column is numeric and gets rescaled.
categorical_cols = ['thal']

numeric_cols = [
    'slope_of_peak_exercise_st_segment',
    'resting_blood_pressure',
    'chest_pain_type',
    'num_major_vessels',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'resting_ekg_results',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'sex',
    'age',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
]

# let's use MinMax because we've removed outliers from the dataframe,
# but let's try running our model with both types and see outcomes:
# swap in StandardScaler() here to compare.
scaler = MinMaxScaler()

ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    (scaler, numeric_cols),
    # Always return a dense ndarray. This replaces OneHotEncoder(sparse=False):
    # that argument was deprecated in scikit-learn 1.2 and later removed,
    # whereas sparse_threshold=0 works on old and new versions alike.
    sparse_threshold=0,
)

In [179]:
# Learn the encoding / scaling parameters from the training split only and
# apply them to it in one step; validation and test are then transformed
# with those same fitted parameters.

X_train_scaled = ct.fit_transform(X_train)

X_val_scaled = ct.transform(X_val)

X_test_scaled = ct.transform(X_test)


# inspect the first rows and the shapes of the transformed arrays

X_train_scaled[:5], X_train_scaled.shape, X_val_scaled[:5], X_val_scaled.shape

(array([[0.        , 0.        , 1.        , 0.5       , 0.55813953,
         1.        , 1.        , 0.        , 1.        , 0.8061674 ,
         0.        , 1.        , 0.25581395, 0.53125   , 1.        ],
        [0.        , 1.        , 0.        , 0.5       , 0.1627907 ,
         0.66666667, 0.        , 0.        , 0.        , 0.0660793 ,
         0.10714286, 0.        , 0.23255814, 0.82291667, 0.        ],
        [1.        , 0.        , 0.        , 0.5       , 0.59302326,
         1.        , 0.66666667, 0.        , 1.        , 0.37885463,
         0.35714286, 1.        , 0.69767442, 0.375     , 0.        ],
        [0.        , 1.        , 0.        , 0.5       , 1.        ,
         1.        , 0.        , 0.        , 0.5       , 0.88546256,
         0.60714286, 0.        , 0.48837209, 0.21875   , 1.        ],
        [0.        , 1.        , 0.        , 0.        , 0.72093023,
         0.33333333, 0.        , 0.        , 1.        , 0.52422907,
         0.        , 1.       