# Project 7 - Notes for choosing hyperparameter for other variables _MLP

**Author: Linh Nguyen**<br>

**StudentID: 23161711**<br>

**Date: Sep 2022**

# 1. Load data and import package


In [37]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Capstone_project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Capstone_project


In [94]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
import seaborn as sns
from matplotlib import pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
import  tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,ConfusionMatrixDisplay,classification_report, roc_auc_score



In [41]:
! ls /content/drive/MyDrive/Capstone_project/raw

diagnosis.csv  icd9toicd10cmgem.csv  pyxis.csv	 vitalsign.csv
edstays.csv    medrecon.csv	     triage.csv


# 2. Data cleansing


In [42]:
edstays = pd.read_csv('raw/edstays.csv')
edstays['y_var_adm_text'] = edstays['disposition']
edstays.loc[edstays['y_var_adm_text'] == 'TRANSFER', 'y_var_adm_text'] = 'ADMITTED'
edstays.loc[edstays['y_var_adm_text'] == 'ELOPED','y_var_adm_text'] = 'MISSING'
edstays.loc[edstays['y_var_adm_text'] == 'LEFT WITHOUT BEING SEEN','y_var_adm_text'] = 'MISSING'
edstays.loc[edstays['y_var_adm_text'] == 'OTHER','y_var_adm_text'] = 'MISSING'
edstays.loc[edstays['y_var_adm_text'] == 'LEFT AGAINST MEDICAL ADVICE','y_var_adm_text'] = 'MISSING'
edstays.loc[edstays['y_var_adm_text'] == 'EXPIRED','y_var_adm_text'] = 'MISSING'
edstays_admit = edstays[edstays['y_var_adm_text'] !='MISSING'].reset_index(drop = True)
edstays_admit['key'] = edstays_admit['subject_id'].astype(str)+'_'+edstays_admit['stay_id'].astype(str)

In [43]:
edstays_admit.loc[edstays_admit['race'].str.contains('WHITE'), 'race_color'] = 'WHITE'
edstays_admit.loc[edstays_admit['race'].str.contains('BLACK'), 'race_color'] = 'BLACK'
edstays_admit.loc[edstays_admit['race'].str.contains('ASIAN'), 'race_color'] = 'ASIAN'
edstays_admit.loc[edstays_admit['race'].str.contains('LATINO'), 'race_color'] = 'HISPANIC/LATINO'
edstays_admit.loc[edstays_admit['race'].str.contains('HISPANIC'), 'race_color'] = 'HISPANIC/LATINO'
edstays_admit.loc[edstays_admit['race_color'].isna(), 'race_color'] = 'OTHER'

In [44]:
edstays_admit['DateTime_in'] = pd.to_datetime(edstays_admit['intime'], format='%d/%m/%Y %H:%M')
edstays_admit['DateTime_out'] = pd.to_datetime(edstays_admit['outtime'], format='%d/%m/%Y %H:%M')

In [45]:
edstays_admit['stay'] = edstays_admit['DateTime_out'] - edstays_admit['DateTime_in']

In [46]:
edstays_admit_1 = edstays_admit.sort_values(by = ['subject_id','DateTime_in'], ascending=True).reset_index(drop=True)

In [47]:
edstay_hist = {}
for i in range(len(edstays_admit_1)):
    if i == 0:
        edstay_hist[i] = 'NO HISTORY'
    else:
        if edstays_admit_1['subject_id'][i] == edstays_admit_1['subject_id'][i-1]:
            edstay_hist[i] = edstays_admit_1['y_var_adm_text'][i-1]
        else:
            edstay_hist[i] = 'NO HISTORY'

In [48]:
edstay_laststay = {}
for i in range(len(edstays_admit_1)):
    if i == 0:
        edstay_laststay[i] = np.timedelta64(0, 'D')
    else:
        if edstays_admit_1['subject_id'][i] == edstays_admit_1['subject_id'][i-1]: 
          edstay_laststay[i] = edstays_admit_1['stay'][i-1]+edstay_laststay[i-1]
        else:
            edstay_laststay[i] = np.timedelta64(0, 'D')

In [49]:
df1 = pd.DataFrame.from_dict(edstay_hist,orient='index')
df1.columns = ['historical_stay_status']
edstays_admit_1 = pd.merge(edstays_admit_1,df1,how = 'left',left_index=True,right_index=True) 

In [50]:
df1 = pd.DataFrame.from_dict(edstay_laststay,orient='index')
df1.columns = ['historical_stay_length']
edstays_admit_1 = pd.merge(edstays_admit_1,df1,how = 'left',left_index=True,right_index=True) 

In [51]:
edstays_admit_1['historical_stay_length_in_day'] = edstays_admit_1['historical_stay_length']/(np.timedelta64(1, 'D'))

In [52]:
edstays_admit_2 = pd.merge(edstays_admit,edstays_admit_1[['historical_stay_status','historical_stay_length','historical_stay_length_in_day','subject_id','stay_id']],how = 'left',on=['subject_id', 'stay_id'])

In [53]:
edstays_admit_2.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'gender',
       'race', 'arrival_transport', 'disposition', 'y_var_adm_text', 'key',
       'race_color', 'DateTime_in', 'DateTime_out', 'stay',
       'historical_stay_status', 'historical_stay_length',
       'historical_stay_length_in_day'],
      dtype='object')

In [54]:
triage = pd.read_csv('raw/triage.csv')


In [55]:
edstay_ad = pd.merge(edstays_admit_2,triage,how = 'left',on=['subject_id', 'stay_id'])

In [56]:
edstay_ad['y_var'] = 0
edstay_ad.loc[edstay_ad['y_var_adm_text'] == 'ADMITTED', 'y_var']  = 1

In [57]:
edstay_ad.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'gender',
       'race', 'arrival_transport', 'disposition', 'y_var_adm_text', 'key',
       'race_color', 'DateTime_in', 'DateTime_out', 'stay',
       'historical_stay_status', 'historical_stay_length',
       'historical_stay_length_in_day', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'chiefcomplaint', 'y_var'],
      dtype='object')

In [58]:
edstay_ad = edstay_ad.drop(['subject_id','hadm_id','stay_id','intime','outtime','race', 'race_color', 'gender',	'disposition','y_var_adm_text','key', 'DateTime_in', 'DateTime_out','stay','historical_stay_length','chiefcomplaint', 'pain'], axis = 1)

In [59]:
edstay_ad

Unnamed: 0,arrival_transport,historical_stay_status,historical_stay_length_in_day,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,y_var
0,AMBULANCE,NO HISTORY,0.000000,98.4,70.0,16.0,97.0,106.0,63.0,3.0,1
1,AMBULANCE,ADMITTED,0.175694,98.9,88.0,18.0,97.0,116.0,88.0,3.0,1
2,AMBULANCE,ADMITTED,1.309722,99.4,105.0,18.0,96.0,106.0,57.0,3.0,1
3,AMBULANCE,ADMITTED,0.409722,97.8,87.0,14.0,97.0,71.0,43.0,2.0,0
4,AMBULANCE,HOME,0.972222,98.7,77.0,16.0,98.0,96.0,50.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...
428279,WALK IN,ADMITTED,0.551389,98.8,92.0,18.0,100.0,122.0,77.0,3.0,1
428280,WALK IN,NO HISTORY,0.000000,98.6,80.0,18.0,100.0,161.0,100.0,3.0,1
428281,AMBULANCE,ADMITTED,1.011111,96.6,112.0,18.0,100.0,110.0,82.0,2.0,1
428282,WALK IN,NO HISTORY,0.000000,98.1,83.0,18.0,100.0,107.0,75.0,2.0,1


In [60]:
edstay_ad.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428284 entries, 0 to 428283
Data columns (total 11 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   arrival_transport              428284 non-null  object 
 1   historical_stay_status         428284 non-null  object 
 2   historical_stay_length_in_day  428284 non-null  float64
 3   temperature                    405762 non-null  float64
 4   heartrate                      412132 non-null  float64
 5   resprate                       408822 non-null  float64
 6   o2sat                          408594 non-null  float64
 7   sbp                            410935 non-null  float64
 8   dbp                            410124 non-null  float64
 9   acuity                         422216 non-null  float64
 10  y_var                          428284 non-null  int64  
dtypes: float64(8), int64(1), object(2)
memory usage: 39.2+ MB


In [61]:
updated_edstay_ad = edstay_ad
updated_edstay_ad['temperaturemissing'] = updated_edstay_ad['temperature'].isnull().astype(int)
updated_edstay_ad['heartratemissing'] = updated_edstay_ad['heartrate'].isnull().astype(int)
updated_edstay_ad['respratemissing'] = updated_edstay_ad['resprate'].isnull().astype(int)
updated_edstay_ad['o2satmissing'] = updated_edstay_ad['o2sat'].isnull().astype(int)
updated_edstay_ad['sbpmissing'] = updated_edstay_ad['sbp'].isnull().astype(int)
updated_edstay_ad['dbpmissing'] = updated_edstay_ad['dbp'].isnull().astype(int)
updated_edstay_ad['acuitymissing'] = updated_edstay_ad['acuity'].isnull().astype(int)

In [62]:
num_attr = edstay_ad.select_dtypes(include=['float64']).columns
cat_attr = edstay_ad.select_dtypes(include=['O']).columns
#bool_attr = edstay_ad.select_dtypes(include=['bool']).columns

In [63]:
my_imputer = SimpleImputer(strategy = 'median')
updated_edstay_ad[num_attr] = my_imputer.fit_transform(updated_edstay_ad[num_attr])
updated_edstay_ad.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428284 entries, 0 to 428283
Data columns (total 18 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   arrival_transport              428284 non-null  object 
 1   historical_stay_status         428284 non-null  object 
 2   historical_stay_length_in_day  428284 non-null  float64
 3   temperature                    428284 non-null  float64
 4   heartrate                      428284 non-null  float64
 5   resprate                       428284 non-null  float64
 6   o2sat                          428284 non-null  float64
 7   sbp                            428284 non-null  float64
 8   dbp                            428284 non-null  float64
 9   acuity                         428284 non-null  float64
 10  y_var                          428284 non-null  int64  
 11  temperaturemissing             428284 non-null  int64  
 12  heartratemissing              

# 3. Split dataset

In [70]:
#set random index, create train set and validation set from the original train set

#create array of random index
np.random.seed(123)
ind = np.arange(len(updated_edstay_ad))
np.random.shuffle(ind)
#create testing index  = 0 to 0.20 * random index
test_index = ind[:int(len(updated_edstay_ad) * 0.20)]
#create validation index  = 0.20 to 0.30 * random index
val_index = ind[int(len(updated_edstay_ad) * 0.20):int(len(updated_edstay_ad) * 0.30)]
#create train index = 0.30 * random index to end
train_index = ind[int(len(updated_edstay_ad) * 0.30):]

In [71]:
X_train = updated_edstay_ad.drop('y_var',axis = 1).iloc[train_index]
y_train = updated_edstay_ad['y_var'].iloc[train_index]
X_val = updated_edstay_ad.drop('y_var',axis = 1).iloc[val_index]
y_val = updated_edstay_ad['y_var'].iloc[val_index]
X_test = updated_edstay_ad.drop('y_var',axis = 1).iloc[test_index]
y_test = updated_edstay_ad['y_var'].iloc[test_index]
#Train = updated_edstay_ad.iloc[train_index]
#Test = updated_edstay_ad.iloc[test_index]

5. Transform dataset

In [72]:
trans_pip = ColumnTransformer([
    ("num",StandardScaler(), num_attr),
    ("cat",OneHotEncoder(),cat_attr)
],remainder='passthrough')



# 4. MLP

In [73]:
X_train_in = trans_pip.fit_transform(X_train)
X_val = trans_pip.transform(X_val)
X_test = trans_pip.transform(X_test)

In [74]:
grid_param = [["glorot_uniform",0,"elu"],["glorot_uniform",0.2,"relu"],["he_normal",0,"elu"],["he_normal",0.2,"relu"],
              ["glorot_uniform",0.2,"elu"],["glorot_uniform",0,"relu"],["he_normal",0.2,"elu"],["he_normal",0,"relu"],
              ]

In [108]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)
#create model structure as above explanation
def build_model(kernel_initializer = "glorot_uniform", rate = 0.2, activation = "relu"):
    #set initial hyperparameter
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(100, activation=activation,kernel_initializer = kernel_initializer))
    #create first hidden layers
    model.add(keras.layers.Dropout(rate = rate))
    model.add(keras.layers.Dense(50, activation=activation,kernel_initializer = kernel_initializer))
    #create second hidden layers
    model.add(keras.layers.Dropout(rate = rate))
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    #create output layers
    optimizer = tf.keras.optimizers.SGD()
    #create optimizer with SGD
    
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ["accuracy"])
    #model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=["accuracy"])
    #Compile model with loss function, optimizer and metrics of accuracy
    return model

In [110]:
for i in range(len(grid_param)):
  model = build_model(kernel_initializer = grid_param[i][0], rate = grid_param[i][1], activation = grid_param[i][2])
  weights = {0:1-(sum(y_train)/len(y_train)),1:sum(y_train)/len(y_train)}
  model.fit(X_train_in, y_train, epochs=100,validation_data=(X_val, y_val), callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)], class_weight = weights)
  y_pred_valid = model.predict(X_val)
  print(grid_param[i])
  print("roc auc for validation set:", roc_auc_score(y_val, y_pred_valid))
  y_pred_valid_class = np.where(y_pred_valid>0.5, 1, 0)
  print("f1 score for validation set:", f1_score(y_val, y_pred_valid_class))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
['glorot_uniform', 0, 'elu']
roc auc for validation set: 0.7972900283875811
f1 score for validation set: 0.5987396397013494
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
['glorot_uniform', 0.2, 'relu']
roc auc for validation set: 0.8005821610731503
f1 score for validation set: 0.5897400055912775
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch

In [116]:
 y_pred_test = model.predict(X_test)
  
print("roc auc for validation set:", roc_auc_score(y_test, y_pred_test))
y_pred_test_class = np.where(y_pred_test>0.5, 1, 0)
print("accuracy score for test set:", accuracy_score(y_test, y_pred_test_class))
print("f1 score for test set:", f1_score(y_test, y_pred_test_class))

roc auc for validation set: 0.802862580087294
accuracy score for test set: 0.7265340431493416
f1 score for test set: 0.5909757630788574
