In [27]:
import os
import re
import scipy
from scipy import stats
import pickle
import subprocess
import shlex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [28]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

In [29]:
static_df = pd.read_csv('static.csv')
dynamic_df = pd.read_csv('dynamic.csv')
notes_df = pd.read_csv('notes.csv')


In [30]:
static_df.head()
static_df.describe()

Unnamed: 0,id,los_icu,icu_death,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke
count,20414.0,20414.0,20414.0,20414.0,19802.0,11000.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0
mean,24994260.0,4.854449,0.104732,67.179316,85.476669,169.445921,5.791026,0.268982,0.128833,0.346527,0.427305,0.11208,0.282012,0.334427,0.719114,0.47198,0.121387
std,2871874.0,5.973425,0.306215,15.588895,28.316987,10.813324,2.914619,0.443441,0.335023,0.475875,0.494699,0.315473,0.44999,0.471801,0.449443,0.499226,0.326585
min,20001300.0,1.0,0.0,18.009528,1.0,122.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22516580.0,1.8,0.0,57.357844,68.0,163.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25009580.0,2.88,0.0,68.484516,81.4,170.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,27461540.0,5.28,0.0,78.898969,98.3,178.0,8.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
max,29999620.0,101.73,1.0,100.058421,1010.0,208.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
# getting number of unique patient ID
print(static_df.shape)
static_df.id.unique

(20414, 25)


<bound method Series.unique of 0        24099382
1        27824879
2        27146619
3        22476441
4        23169158
           ...   
20409    29968077
20410    29975784
20411    29987115
20412    29990494
20413    29994623
Name: id, Length: 20414, dtype: int64>

In [32]:
from sklearn.model_selection import train_test_split

#splitting the data into 70 20 10 based on unique ID

XY_features = ['id', 'los_icu', 'icu_death']
static_split = static_df[XY_features]

X = static_split.id
y = static_split.los_icu

# Split dataset into training set, test set, and holdout set

X_train_temp, X_holdout = train_test_split(static_split, test_size=0.1, random_state=42)
X_train, X_test = train_test_split(X_train_temp, test_size=0.22222, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(X_holdout.shape)

(14289, 3)
(4083, 3)
(2042, 3)


In [33]:
# Getting the different unique id from training, testing and holdout data

train_id = X_train.id.unique().tolist()
test_id = X_test.id.unique().tolist()
holdout_id = X_holdout.id.unique().tolist()

print(len(train_id))
print(len(test_id))
print(len(holdout_id))

14289
4083
2042


In [34]:
# encoding static_df female to 0, male to 1

static_df.loc[static_df.gender == 'F', 'gender'] = 0
static_df.loc[static_df.gender == 'M', 'gender'] = 1



In [35]:
# Reducing number of race
static_df[["race_encode"]] = 'placeholder'
static_df.loc[static_df.race.astype(str).str.lower().str.contains('asian|indian'), "race_encode"] = 'Asian'
static_df.loc[static_df.race.astype(str).str.lower().str.contains('black'), "race_encode"] = 'African'
static_df.loc[static_df.race.astype(str).str.lower().str.contains('white|portuguese'), "race_encode"] = 'Caucasian'
static_df.loc[static_df.race.astype(str).str.lower().str.contains('hispanic'), "race_encode"] = 'Hispanic'
static_df.loc[static_df.race.astype(str).str.lower().str.contains('other|multiple|unknown|declined|unable'), "race_encode"] = 'Not Specified'
static_df.loc[static_df.race.astype(str).str.lower().str.contains('south american'), "race_encode"] = 'South American'

In [36]:
# smoothing icu_los followed by encoding ICU outcome
# los_icu 25th percentile = 1.8, 50th = 2.88, 75th = 5.28


static_df[['icu_outcome']] = 'placeholder'
static_df.loc[(static_df['los_icu'] <= 1.8) & (static_df['icu_death'] == 0), 'icu_outcome'] = 1
static_df.loc[(static_df['los_icu'] > 1.8) & (static_df['los_icu'] <= 2.88) & (static_df['icu_death'] == 0), 'icu_outcome'] = 2
static_df.loc[(static_df['los_icu'] > 2.88) & (static_df['los_icu'] <= 5.28) & (static_df['icu_death'] == 0), 'icu_outcome'] = 3
static_df.loc[(static_df['los_icu'] > 5.28) & (static_df['icu_death'] == 0), 'icu_outcome'] = 4
static_df.loc[(static_df['los_icu'] <= 1.8) & (static_df['icu_death'] == 1), 'icu_outcome'] = 5
static_df.loc[(static_df['los_icu'] > 1.8) & (static_df['los_icu'] <= 2.88) & (static_df['icu_death'] == 1), 'icu_outcome'] = 6
static_df.loc[(static_df['los_icu'] > 2.88) & (static_df['los_icu'] <= 5.28) & (static_df['icu_death'] == 1), 'icu_outcome'] = 7
static_df.loc[(static_df['los_icu'] > 5.28) & (static_df['icu_death'] == 1), 'icu_outcome'] = 8

static_df.head()

Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,admission_age,weight_admit,height,admission_type,first_careunit,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,race_encode,icu_outcome
0,24099382,6/11/11 7:15,6/18/11 21:45,6/11/11 14:31,6/13/11 18:05,2.15,0,1,OTHER,58.441631,92.0,170.0,ELECTIVE,Trauma SICU (TSICU),11,0,0,0,0,0,1,1,1,0,0,Not Specified,2
1,27824879,9/30/28 20:55,10/10/28 15:54,9/30/28 20:58,10/5/28 18:39,4.9,0,0,WHITE,79.748598,55.0,,EW EMER.,Trauma SICU (TSICU),5,0,0,0,0,0,1,0,1,1,0,Caucasian,3
2,27146619,9/20/41 0:34,9/22/41 12:05,9/20/41 7:13,9/21/41 18:10,1.46,0,0,WHITE,81.717397,59.0,,EW EMER.,Medical/Surgical Intensive Care Unit (MICU/SICU),8,0,0,1,1,0,0,0,0,1,0,Caucasian,1
3,22476441,10/8/73 16:46,10/16/73 13:05,10/8/73 18:28,10/10/73 13:26,1.79,0,0,BLACK/AFRICAN AMERICAN,63.769278,73.6,160.0,EW EMER.,Medical/Surgical Intensive Care Unit (MICU/SICU),8,0,0,1,1,0,1,1,1,1,0,African,1
4,23169158,3/12/27 1:19,3/19/27 13:15,3/12/27 2:43,3/15/27 1:53,2.97,0,0,WHITE,84.191979,79.1,160.0,OBSERVATION ADMIT,Medical/Surgical Intensive Care Unit (MICU/SICU),6,1,0,1,1,0,0,1,1,0,0,Caucasian,3


In [37]:
print(static_df.shape)
static_df.race_encode.unique()

(20414, 27)


array(['Not Specified', 'Caucasian', 'African', 'Asian', 'Hispanic',
       'South American'], dtype=object)

In [38]:
# one-hot encoding of race (6), admission_type (8), first_careunit (9)
# expecting to see 27 - 1 + 6 - 1 + 8 - 1 + 9 = 47 features

static_df = pd.get_dummies(static_df, columns=['race_encode'])
static_df = pd.get_dummies(static_df, columns=['admission_type'])
static_df = pd.get_dummies(static_df, columns=['first_careunit'])
print(static_df.shape)
static_df.head()

(20414, 47)


Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,icu_outcome,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU)
0,24099382,6/11/11 7:15,6/18/11 21:45,6/11/11 14:31,6/13/11 18:05,2.15,0,1,OTHER,58.441631,92.0,170.0,11,0,0,0,0,0,1,1,1,0,0,2,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,27824879,9/30/28 20:55,10/10/28 15:54,9/30/28 20:58,10/5/28 18:39,4.9,0,0,WHITE,79.748598,55.0,,5,0,0,0,0,0,1,0,1,1,0,3,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,27146619,9/20/41 0:34,9/22/41 12:05,9/20/41 7:13,9/21/41 18:10,1.46,0,0,WHITE,81.717397,59.0,,8,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,22476441,10/8/73 16:46,10/16/73 13:05,10/8/73 18:28,10/10/73 13:26,1.79,0,0,BLACK/AFRICAN AMERICAN,63.769278,73.6,160.0,8,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,23169158,3/12/27 1:19,3/19/27 13:15,3/12/27 2:43,3/15/27 1:53,2.97,0,0,WHITE,84.191979,79.1,160.0,6,1,0,1,1,0,0,1,1,0,0,3,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


In [39]:
#imputing height and weight with median values

static_df['height'] = static_df['height'].fillna(static_df['height'].median())
static_df['weight_admit'] = static_df['weight_admit'].fillna(static_df['weight_admit'].median())
static_df.head()

Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,icu_outcome,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU)
0,24099382,6/11/11 7:15,6/18/11 21:45,6/11/11 14:31,6/13/11 18:05,2.15,0,1,OTHER,58.441631,92.0,170.0,11,0,0,0,0,0,1,1,1,0,0,2,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,27824879,9/30/28 20:55,10/10/28 15:54,9/30/28 20:58,10/5/28 18:39,4.9,0,0,WHITE,79.748598,55.0,170.0,5,0,0,0,0,0,1,0,1,1,0,3,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,27146619,9/20/41 0:34,9/22/41 12:05,9/20/41 7:13,9/21/41 18:10,1.46,0,0,WHITE,81.717397,59.0,170.0,8,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,22476441,10/8/73 16:46,10/16/73 13:05,10/8/73 18:28,10/10/73 13:26,1.79,0,0,BLACK/AFRICAN AMERICAN,63.769278,73.6,160.0,8,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,23169158,3/12/27 1:19,3/19/27 13:15,3/12/27 2:43,3/15/27 1:53,2.97,0,0,WHITE,84.191979,79.1,160.0,6,1,0,1,1,0,0,1,1,0,0,3,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


In [40]:
static_train_df = static_df[static_df['id'].isin(train_id)]
static_test_df = static_df[static_df['id'].isin(test_id)]
static_holdout_df = static_df[static_df['id'].isin(holdout_id)]
print(static_train_df.shape)
print(static_test_df.shape)
print(static_holdout_df.shape)

(14289, 47)
(4083, 47)
(2042, 47)


In [41]:
# merging dynamic data with encoded icu_outcome from static

static_icu_outcome = static_df[['id', 'icu_outcome']]

dynamic_df = pd.merge(dynamic_df, static_icu_outcome, on='id', how='left')

print(dynamic_df.shape)
dynamic_df.head()

(93604, 71)


Unnamed: 0,id,charttime,albumin,globulin,total_protein,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,heart_rate,sbp,dbp,mbp,resp_rate,temperature,spo2,hematocrit,hemoglobin,mch,mchc,mcv,platelet,rbc,rdw,wbc,alt,ast,alp,amylase,bilirubin_total,bilirubin_direct,bilirubin_indirect,ck_cpk,ck_mb,ggt,ld_ldh,so2,po2,pco2,fio2_chartevents,aado2,aado2_calc,pao2fio2ratio,ph,baseexcess,bicarbonate_bg,totalco2,hematocrit_bg,hemoglobin_bg,carboxyhemoglobin,methemoglobin,chloride_bg,calcium_bg,temperature_bg,potassium_bg,sodium_bg,lactate_bg,glucose_bg,d_dimer,fibrinogen,thrombin,inr,pt,ptt,urineoutput,icu_outcome
0,28793466,4/12/29 3:35,,,,16.0,21.0,12.0,8.8,106.0,0.6,154.0,139.0,3.8,,,,,,,,30.0,9.6,30.3,32.0,95.0,256.0,3.17,12.5,12.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,10.9,27.8,,4
1,25611175,11/29/40 3:50,,,,15.0,27.0,28.0,8.5,101.0,1.7,223.0,139.0,3.5,,,,,,,,28.0,9.1,30.3,32.5,93.0,216.0,3.0,13.8,6.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3
2,26115624,9/7/50 0:22,3.5,,,12.0,22.0,9.0,7.9,111.0,0.6,97.0,141.0,3.6,,,,,,,,26.4,9.6,31.3,36.5,86.0,115.0,3.07,15.4,0.8,18.0,17.0,141.0,,0.8,,,,,,155.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.2,13.9,25.8,,2
3,28164589,3/11/59 1:11,,,,12.0,34.0,58.0,8.3,103.0,2.1,89.0,146.0,3.4,,,,,,,,30.4,8.9,30.7,29.3,105.0,105.0,2.9,20.0,4.8,9.0,20.0,75.0,,0.7,,,,4.0,,250.0,,,,,,,,,,,,,,,,,,,,,,,,,,3.8,41.7,40.2,,2
4,26115624,9/1/50 20:14,,,,,,11.0,7.8,,,,,3.3,,,,,,,,,,,,,,,,,,,,,,,,,,,201.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2


In [42]:
# Imputing remaining missing values with the median values from the same class label
nan_counts_per_feature = dynamic_df.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature:")
print(nan_counts_per_feature)
print(features_with_nan)

NaN counts for each feature:
id                        0
charttime                 0
albumin               74954
globulin              93085
total_protein         92629
aniongap               6226
bicarbonate            5861
bun                    5207
calcium               15185
chloride               3511
creatinine             4830
glucose                7908
sodium                 3606
potassium              2538
heart_rate            89345
sbp                   90223
dbp                   90225
mbp                   90201
resp_rate             89367
temperature           92300
spo2                  89609
hematocrit            19188
hemoglobin            21569
mch                   21880
mchc                  21861
mcv                   21863
platelet              21583
rbc                   21862
rdw                   21939
wbc                   21849
alt                   61830
ast                   61544
alp                   62109
amylase               90772
bilirubin_total    

In [16]:
# dropping features with > 50% missing values in dynamic_df
missing_percentage = (dynamic_df.isna().sum() / len(dynamic_df)) * 100

# Identify features with more than 50% missing values
features_to_drop = missing_percentage[missing_percentage >= 50].index

print(len(features_to_drop))

dynamic_dropped = dynamic_df.drop(columns=features_to_drop)
print(dynamic_df.shape)
print(dynamic_dropped.shape)

47
(93604, 71)
(93604, 24)


In [17]:
# Imputing missing values in dynamic_dropped with simple median values of each column

dynamic_dropped = dynamic_dropped.fillna(dynamic_df.median())

  dynamic_dropped = dynamic_dropped.fillna(dynamic_df.median())


In [18]:
# # Extract median imputation method to Excel for comparison of different imputation method

# dynamic_dropped.head()

# dynamic_df_median = dynamic_dropped.drop(columns=['icu_outcome'])
# print(dynamic_df_median.shape)

# dynamic_df_median.to_excel('dynamic_full_median.xlsx', index=False)

In [19]:
# manually splitting dynamic_change_df into training, testing and holdout set based on previous split unique id above

dynamic_train_df = dynamic_dropped[dynamic_dropped['id'].isin(train_id)]
dynamic_test_df = dynamic_dropped[dynamic_dropped['id'].isin(test_id)]
dynamic_holdout_df = dynamic_dropped[dynamic_dropped['id'].isin(holdout_id)]
print(dynamic_train_df.shape)
print(dynamic_test_df.shape)
print(dynamic_holdout_df.shape)

(64985, 24)
(18781, 24)
(9838, 24)


In [20]:
# Imputing remaining missing values with the median values from the same class label
nan_counts_per_feature = static_df.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature:")
print(nan_counts_per_feature)
print(features_with_nan)

NaN counts for each feature:
id                                                                 0
hosp_admittime                                                     0
hosp_dischtime                                                     0
icu_intime                                                         0
icu_outtime                                                        0
los_icu                                                            0
icu_death                                                          0
gender                                                             0
race                                                               0
admission_age                                                      0
weight_admit                                                       0
height                                                             0
charlson_score                                                     0
atrial_fibrillation                                                0
malig

In [21]:
# nan count for dynamic_dropped
nan_counts_per_feature = dynamic_dropped.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature:")
print(nan_counts_per_feature)
print(features_with_nan)

NaN counts for each feature:
id             0
charttime      0
aniongap       0
bicarbonate    0
bun            0
calcium        0
chloride       0
creatinine     0
glucose        0
sodium         0
potassium      0
hematocrit     0
hemoglobin     0
mch            0
mchc           0
mcv            0
platelet       0
rbc            0
rdw            0
wbc            0
inr            0
pt             0
ptt            0
icu_outcome    0
dtype: int64
[]


In [22]:
# # imputing missing values based on median value of the same icu_outcome from the training dataset


# # Group the data by the class label
# grouped = dynamic_train_df.groupby('icu_outcome')

# grouped.describe()

# # nan count for dynamic_train_df
# nan_counts_per_feature = dynamic_train_df.isna().sum(axis=0)
# features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
# print("NaN counts for each feature:")
# print(nan_counts_per_feature)
# print(features_with_nan)


# # Iterate over each feature
# for feature in features_with_nan:  # Exclude the last column which is the class label
#     # Calculate the median value of the feature within each group
#     median_per_class = grouped[feature].median()
#     #print(grouped[feature].median())

    
#     # Replace missing values in each feature with the corresponding median value from the appropriate class label
#     # also replace missing values in test and holdout dataset with median values from the training dataset with the same 'icu_outcome'
#     for class_label, median_value in median_per_class.items():
#         dynamic_train_df.loc[(dynamic_train_df[feature].isnull()) & (dynamic_train_df['icu_outcome'] == class_label), feature] = median_value
#         dynamic_test_df.loc[(dynamic_test_df[feature].isnull()) & (dynamic_test_df['icu_outcome'] == class_label), feature] = median_value
#         dynamic_holdout_df.loc[(dynamic_holdout_df[feature].isnull()) & (dynamic_holdout_df['icu_outcome'] == class_label), feature] = median_value

In [23]:
nan_counts_per_feature = dynamic_test_df.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature in test set:")
print(nan_counts_per_feature)
print(features_with_nan)

nan_counts_per_feature = dynamic_holdout_df.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature in holdout set:")
print(nan_counts_per_feature)
print(features_with_nan)

NaN counts for each feature in test set:
id             0
charttime      0
aniongap       0
bicarbonate    0
bun            0
calcium        0
chloride       0
creatinine     0
glucose        0
sodium         0
potassium      0
hematocrit     0
hemoglobin     0
mch            0
mchc           0
mcv            0
platelet       0
rbc            0
rdw            0
wbc            0
inr            0
pt             0
ptt            0
icu_outcome    0
dtype: int64
[]
NaN counts for each feature in holdout set:
id             0
charttime      0
aniongap       0
bicarbonate    0
bun            0
calcium        0
chloride       0
creatinine     0
glucose        0
sodium         0
potassium      0
hematocrit     0
hemoglobin     0
mch            0
mchc           0
mcv            0
platelet       0
rbc            0
rdw            0
wbc            0
inr            0
pt             0
ptt            0
icu_outcome    0
dtype: int64
[]


In [24]:
nan_counts_per_feature = dynamic_train_df.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature in training dataset:")
print(nan_counts_per_feature)
print(features_with_nan)

NaN counts for each feature in training dataset:
id             0
charttime      0
aniongap       0
bicarbonate    0
bun            0
calcium        0
chloride       0
creatinine     0
glucose        0
sodium         0
potassium      0
hematocrit     0
hemoglobin     0
mch            0
mchc           0
mcv            0
platelet       0
rbc            0
rdw            0
wbc            0
inr            0
pt             0
ptt            0
icu_outcome    0
dtype: int64
[]


In [25]:
# combining dynamic train, test and holdout into a single dynamic_imputed_df

dynamic_imputed_df = pd.concat([dynamic_train_df, dynamic_test_df, dynamic_holdout_df], axis=0)
dynamic_imputed_df.reset_index(drop=True, inplace=True)
dynamic_imputed_df.shape

(93604, 24)

In [26]:
nan_counts_per_feature = dynamic_imputed_df.isna().sum(axis=0)
features_with_nan = nan_counts_per_feature[nan_counts_per_feature > 0].index.tolist()
print("NaN counts for each feature in dynamic_imputed dataset:")
print(nan_counts_per_feature)
print(features_with_nan)

NaN counts for each feature in dynamic_imputed dataset:
id             0
charttime      0
aniongap       0
bicarbonate    0
bun            0
calcium        0
chloride       0
creatinine     0
glucose        0
sodium         0
potassium      0
hematocrit     0
hemoglobin     0
mch            0
mchc           0
mcv            0
platelet       0
rbc            0
rdw            0
wbc            0
inr            0
pt             0
ptt            0
icu_outcome    0
dtype: int64
[]


In [27]:
# finding out the first and last dynamic measurement for each patient ID

from datetime import datetime

def hours_difference(date_string):
    date_obj = datetime.strptime(date_string, '%m/%d/%y %H:%M')
    current_year = datetime.now().year
    if date_obj.year > current_year:
        date_obj = date_obj.replace(year=date_obj.year - 100)
    current_time = datetime.now()
    time_difference = current_time - date_obj
    hours_difference = time_difference.total_seconds() / 3600

    return hours_difference

dynamic_imputed_df['charttime_hours'] = dynamic_imputed_df['charttime'].apply(lambda x: hours_difference(x))

dynamic_imputed_df.head()

Unnamed: 0,id,charttime,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,hemoglobin,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt,icu_outcome,charttime_hours
0,28793466,4/12/29 3:35,16.0,21.0,12.0,8.8,106.0,0.6,154.0,139.0,3.8,30.0,9.6,30.3,32.0,95.0,256.0,3.17,12.5,12.1,1.0,10.9,27.8,4,832532.163676
1,25611175,11/29/40 3:50,15.0,27.0,28.0,8.5,101.0,1.7,223.0,139.0,3.5,28.0,9.1,30.3,32.5,93.0,216.0,3.0,13.8,6.4,1.4,14.9,33.4,3,730555.913676
2,26115624,9/7/50 0:22,12.0,22.0,9.0,7.9,111.0,0.6,97.0,141.0,3.6,26.4,9.6,31.3,36.5,86.0,115.0,3.07,15.4,0.8,1.2,13.9,25.8,2,644903.380342
3,28164589,3/11/59 1:11,12.0,34.0,58.0,8.3,103.0,2.1,89.0,146.0,3.4,30.4,8.9,30.7,29.3,105.0,105.0,2.9,20.0,4.8,3.8,41.7,40.2,2,570334.563676
4,26115624,9/1/50 20:14,15.0,23.0,11.0,7.8,102.0,1.3,127.0,138.0,3.3,30.0,9.8,30.1,32.7,92.0,181.0,3.29,15.6,9.9,1.4,14.9,33.4,2,645027.513676


In [28]:
def get_last_measurement_index(group):
    last_measurement = group['charttime_hours'].min()
    last_measurement_index = group[group['charttime_hours'] == last_measurement].index[-1]
    return last_measurement_index

def get_first_measurement_index(group):
    first_measurement = group['charttime_hours'].max()
    first_measurement_index = group[group['charttime_hours'] == first_measurement].index[-1]
    return first_measurement_index

last_measurement_indices = dynamic_imputed_df.groupby('id').apply(get_last_measurement_index)
first_measurement_indices = dynamic_imputed_df.groupby('id').apply(get_first_measurement_index)

last_measurement_indices_list = last_measurement_indices.tolist()
first_measurement_indices_list = first_measurement_indices.tolist()

print(last_measurement_indices_list)
print(first_measurement_indices_list)

[8267, 59907, 70340, 61551, 15648, 33984, 70288, 978, 63763, 83458, 10307, 44561, 7167, 10675, 31436, 35877, 66503, 48700, 45690, 71959, 4623, 40582, 28587, 84681, 38973, 78154, 65013, 18183, 21662, 69305, 12427, 5919, 14850, 46494, 11128, 46527, 12404, 15285, 60953, 40358, 62430, 55874, 21968, 25546, 61782, 37457, 54559, 38233, 40812, 78506, 31313, 46526, 45813, 25223, 31318, 29113, 49213, 68849, 72098, 14626, 46246, 42873, 75152, 32119, 9715, 63690, 70438, 11389, 40773, 91739, 23248, 47006, 84045, 49333, 89611, 42717, 10735, 38476, 12445, 83152, 70391, 86859, 65427, 76217, 31473, 75926, 31030, 70008, 55303, 18794, 87810, 39052, 85994, 42078, 48640, 49370, 8666, 57972, 69518, 74294, 49387, 58073, 53749, 60426, 59475, 28092, 40697, 63142, 11082, 44280, 74260, 39302, 22136, 78222, 68, 62431, 20744, 52517, 10945, 24586, 27568, 38353, 64585, 23801, 78325, 8599, 20867, 22167, 25008, 92422, 76811, 60410, 45004, 28942, 55509, 67245, 71195, 38946, 76837, 34879, 53554, 27004, 15647, 35736, 598

In [29]:
set_last = set(last_measurement_indices_list)
set_first = set(first_measurement_indices_list)
index_with_1_measurement = set_last.intersection(set_first)

print(index_with_1_measurement)
print(len(index_with_1_measurement))

{1, 8, 9, 90120, 8206, 16399, 24592, 73746, 32787, 16406, 32, 33, 65570, 81956, 49193, 32811, 46, 49198, 24633, 57402, 73788, 57406, 73793, 32837, 70, 24647, 81989, 32841, 32842, 90184, 57433, 41052, 32865, 8292, 8303, 24690, 116, 82040, 90234, 57468, 65669, 73866, 65675, 82060, 16528, 65683, 8345, 82078, 65696, 24737, 49317, 41127, 57514, 32942, 57519, 65713, 57522, 49331, 73910, 49338, 65726, 8383, 82110, 24770, 41154, 41156, 57539, 73927, 24776, 90316, 57555, 73941, 41178, 49370, 82144, 90336, 227, 230, 16614, 57575, 33001, 49386, 41197, 49390, 82170, 57595, 90362, 57602, 57603, 41221, 90387, 16660, 57620, 24855, 65815, 82200, 24862, 82212, 49447, 57642, 65835, 90410, 65837, 74029, 309, 41271, 49463, 65848, 82232, 74046, 65859, 8520, 49490, 8531, 41312, 49505, 24933, 82279, 90472, 364, 41328, 82288, 41331, 74102, 24951, 49528, 49531, 49538, 33159, 8584, 33160, 33162, 49545, 82311, 90504, 8590, 90512, 57745, 33171, 41365, 57749, 57751, 74136, 49561, 90517, 16798, 33183, 49568, 24993,

In [30]:
dynamic_df_last = dynamic_imputed_df.iloc[last_measurement_indices_list]
print(dynamic_df_last.id.nunique())

dynamic_df_first = dynamic_imputed_df.iloc[first_measurement_indices_list]
print(dynamic_df_first.id.nunique())

20414
20414


In [31]:
# Merge the two DataFrames based on the ID column
merged_dynamic_df = pd.merge(dynamic_df_last, dynamic_df_first, on='id', suffixes=('_last', '_first'))

# merged_dynamic_df.info()

dynamic_change_df = merged_dynamic_df.copy()


for column in dynamic_df_last.columns:
    if column != 'id' and column != 'charttime':
        dynamic_change_df[column + '_delta'] = merged_dynamic_df[column + '_last'] - merged_dynamic_df[column + '_first']
        # change_df.drop(column + '_first')

# Droping the measurement from the first reading
dynamic_change_df.drop(columns=[column + '_first' for column in dynamic_df_first.columns if column not in ['id', 'charttime']], inplace=True)

In [32]:
print(dynamic_change_df.columns.nunique())
dynamic_change_df.head()

49


Unnamed: 0,id,charttime_last,aniongap_last,bicarbonate_last,bun_last,calcium_last,chloride_last,creatinine_last,glucose_last,sodium_last,potassium_last,hematocrit_last,hemoglobin_last,mch_last,mchc_last,mcv_last,platelet_last,rbc_last,rdw_last,wbc_last,inr_last,pt_last,ptt_last,icu_outcome_last,charttime_hours_last,charttime_first,aniongap_delta,bicarbonate_delta,bun_delta,calcium_delta,chloride_delta,creatinine_delta,glucose_delta,sodium_delta,potassium_delta,hematocrit_delta,hemoglobin_delta,mch_delta,mchc_delta,mcv_delta,platelet_delta,rbc_delta,rdw_delta,wbc_delta,inr_delta,pt_delta,ptt_delta,icu_outcome_delta,charttime_hours_delta
0,20001305,3/25/78 21:55,13.0,24.0,50.0,10.8,108.0,0.9,131.0,141.0,4.1,30.0,9.8,30.1,32.7,92.0,181.0,3.29,15.6,9.9,1.3,14.1,33.1,6,403417.830359,3/25/78 8:20,-2.0,1.0,3.0,-0.6,0.0,0.1,-23.0,-1.0,-0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,-0.8,-0.3,0,-13.583349
1,20001361,5/5/43 15:02,18.0,23.0,40.0,7.9,107.0,3.7,123.0,144.0,3.8,30.0,9.8,30.1,32.7,92.0,181.0,3.29,15.6,9.9,1.6,17.7,30.0,4,709256.71379,5/4/43 17:24,4.0,1.0,12.0,1.6,0.0,1.2,-38.0,7.0,-2.0,-3.9,-1.8,-0.6,-1.4,2.0,-35.0,-0.47,2.5,-10.9,-0.2,-1.7,-3.5,0,-21.633307
2,20001770,1/26/17 3:56,14.0,22.0,23.0,7.4,86.0,1.3,80.0,122.0,4.5,27.9,9.8,34.6,35.1,99.0,173.0,2.83,20.2,13.8,1.7,18.5,49.8,2,62947.81381,1/26/17 3:56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
3,20002506,3/20/32 1:01,14.0,22.0,9.0,8.8,108.0,0.7,94.0,144.0,4.0,37.8,12.6,30.1,33.3,90.0,175.0,4.18,12.0,8.3,1.2,13.2,30.9,4,806782.73046,3/19/32 6:13,-2.0,0.0,-4.0,-0.5,7.0,0.0,-6.0,5.0,-0.1,-2.2,-1.2,0.0,-1.2,3.0,-48.0,-0.4,0.2,-2.4,0.0,0.6,2.2,0,-18.799963
4,20003425,7/23/55 2:19,11.0,21.0,23.0,8.5,101.0,0.8,162.0,133.0,5.0,24.4,7.6,28.0,31.1,90.0,211.0,2.71,15.7,10.0,1.4,14.9,33.4,3,602181.430373,7/21/55 23:27,3.0,-8.0,-3.0,-0.9,4.0,0.0,45.0,-1.0,0.0,-4.2,-1.3,-0.2,0.0,-1.0,-23.0,-0.45,0.2,4.4,0.0,0.2,-2.2,0,-26.866735


In [33]:
print(static_df.columns.nunique())
print(dynamic_change_df.columns.nunique())

47
49


In [34]:
static_df_dropped = static_df.drop(columns=['hosp_admittime',
                                            'hosp_dischtime',
                                            'icu_intime',
                                            'icu_outtime',
                                            'race',
                                            'icu_outcome'
                                            ])
print(static_df_dropped.columns.nunique())


dynamic_df_dropped = dynamic_change_df.drop(columns=['charttime_last',
                                                    'icu_outcome_last',
                                                    'charttime_hours_last',
                                                    'charttime_first',
                                                    'icu_outcome_delta',
                                                    'charttime_hours_delta'
                                                    ])
print(dynamic_df_dropped.columns.nunique())

41
43


In [35]:
# combining both static and dynamic df into a single df
combined_df = pd.merge(static_df_dropped, dynamic_df_dropped, on='id', how='left')

combined_df.shape

(20414, 83)

In [36]:
# splitting pre-processed data back into train, test, holdout based on patient id
Train = combined_df[combined_df['id'].isin(train_id)]
Test = combined_df[combined_df['id'].isin(test_id)]
Holdout = combined_df[combined_df['id'].isin(holdout_id)]

In [37]:
X_features_to_drop = ['id', 'los_icu', 'icu_death']

X_train = Train.drop(columns=X_features_to_drop)
y_train = Train.los_icu

X_test = Test.drop(columns=X_features_to_drop)
y_test = Test.los_icu

X_holdout = Holdout.drop(columns=X_features_to_drop)
y_holdout = Holdout.los_icu

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_holdout.shape)
print(y_holdout.shape)
X_train.head()

(14289, 80)
(14289,)
(4083, 80)
(4083,)
(2042, 80)
(2042,)


Unnamed: 0,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),aniongap_last,bicarbonate_last,bun_last,calcium_last,chloride_last,creatinine_last,glucose_last,sodium_last,potassium_last,hematocrit_last,hemoglobin_last,mch_last,mchc_last,mcv_last,platelet_last,rbc_last,rdw_last,wbc_last,inr_last,pt_last,ptt_last,aniongap_delta,bicarbonate_delta,bun_delta,calcium_delta,chloride_delta,creatinine_delta,glucose_delta,sodium_delta,potassium_delta,hematocrit_delta,hemoglobin_delta,mch_delta,mchc_delta,mcv_delta,platelet_delta,rbc_delta,rdw_delta,wbc_delta,inr_delta,pt_delta,ptt_delta
0,1,58.441631,92.0,170.0,11,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13.0,26.0,14.0,8.7,106.0,0.8,102.0,141.0,4.1,35.5,11.3,31.7,31.9,99.0,286.0,3.57,16.6,12.0,1.4,14.9,33.4,-2.0,3.0,0.0,0.6,-1.0,-0.1,-61.0,0.0,-0.3,-0.5,1.5,1.6,-0.8,7.0,105.0,0.28,1.0,2.1,0.0,0.0,0.0
1,0,79.748598,55.0,170.0,5,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,15.0,22.0,17.0,7.9,103.0,0.7,155.0,136.0,3.9,41.8,14.2,31.7,34.1,93.0,248.0,4.49,15.1,27.3,1.1,11.6,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,81.717397,59.0,170.0,8,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,16.0,19.0,60.0,7.7,113.0,2.3,97.0,143.0,5.1,30.9,10.1,30.3,32.5,93.0,153.0,3.32,13.3,9.3,1.5,15.7,27.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,84.191979,79.1,160.0,6,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,17.0,12.0,44.0,10.1,112.0,2.2,96.0,136.0,4.7,27.9,8.4,26.9,30.1,89.0,157.0,3.12,17.6,8.8,1.2,13.4,32.6,2.0,-3.0,-2.0,-0.8,5.0,-0.3,-57.0,4.0,-0.1,-6.0,-1.7,-0.3,0.3,-2.0,-38.0,-0.59,0.2,-4.1,0.1,1.4,4.5
6,0,58.03979,61.5,150.0,5,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,15.0,23.0,27.0,8.4,90.0,1.3,143.0,128.0,3.4,21.1,6.9,30.0,32.7,92.0,48.0,2.3,20.9,7.6,1.7,18.6,84.0,-2.0,2.0,-1.0,0.2,0.0,-0.1,11.0,0.0,-0.2,-0.6,-0.1,0.3,0.4,0.0,26.0,-0.06,-0.2,0.4,-0.3,-2.7,-66.0


In [38]:
# Perform normalization using data from X_train to transform other X_test and X_holdout

from sklearn.preprocessing import StandardScaler

num_cols = X_train.columns[X_train.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
# print(num_cols)
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
X_holdout[num_cols] = scaler.transform(X_holdout[num_cols])


In [39]:
X_train.head()

Unnamed: 0,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),aniongap_last,bicarbonate_last,bun_last,calcium_last,chloride_last,creatinine_last,glucose_last,sodium_last,potassium_last,hematocrit_last,hemoglobin_last,mch_last,mchc_last,mcv_last,platelet_last,rbc_last,rdw_last,wbc_last,inr_last,pt_last,ptt_last,aniongap_delta,bicarbonate_delta,bun_delta,calcium_delta,chloride_delta,creatinine_delta,glucose_delta,sodium_delta,potassium_delta,hematocrit_delta,hemoglobin_delta,mch_delta,mchc_delta,mcv_delta,platelet_delta,rbc_delta,rdw_delta,wbc_delta,inr_delta,pt_delta,ptt_delta
0,1,-0.563436,0.232553,0.031491,1.774812,-0.60449,-0.385171,-0.724663,-0.861473,-0.35647,1.58892,1.412212,0.623726,-0.944235,-0.374677,-0.388041,-0.164393,-1.40008,-0.197225,2.453196,-0.026464,-0.211895,-0.011832,6.041849,-0.027756,-1.127697,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,2.692582,-0.44616,0.572232,-0.787179,0.510644,0.406851,-0.575701,-0.624524,0.557767,-0.232457,0.966095,0.753433,0.744964,-0.564587,1.17674,0.968523,0.340908,0.389384,0.046106,-0.169646,-0.198219,-0.189829,-0.295891,0.806992,-0.026762,0.896634,-0.402932,-0.101381,-0.540333,-0.023639,-0.298955,0.309287,1.251619,0.918505,-0.754492,1.543857,1.583985,0.840607,0.57489,0.295461,0.037331,0.0491,-0.00331
1,0,0.802145,-1.059453,0.031491,-0.277074,-0.60449,-0.385171,-0.724663,-0.861473,-0.35647,1.58892,-0.708109,0.623726,1.059058,-0.374677,-0.388041,-0.164393,0.714245,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,0.886763,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,2.692582,0.027162,-0.240756,-0.661702,-0.546756,-0.05332,-0.62805,0.290062,-0.425034,-0.548031,2.18509,2.428902,0.744964,0.913484,0.239148,0.577903,1.861534,-0.320388,1.867253,-0.605488,-0.66191,-0.588194,0.203428,0.031239,-0.026762,0.139565,-0.193431,0.002449,0.146179,-0.023639,0.081062,0.408876,0.35966,-0.022089,-0.127666,0.071052,0.246367,0.356865,-0.090573,0.018995,0.037331,0.0491,-0.00331
2,0,0.928327,-0.919777,0.031491,0.748869,-0.60449,-0.385171,1.379952,1.160802,-0.35647,-0.629358,-0.708109,-1.603267,1.059058,-0.374677,-0.388041,-0.164393,0.714245,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,0.886763,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,2.04198,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,0.263824,-0.850497,1.1368,-0.811106,1.480582,0.209541,-0.710806,0.950887,1.345414,0.076036,0.060136,0.154568,-0.161476,0.239148,-0.398646,-0.072306,-1.172115,-0.275272,-0.024366,-0.085809,-0.575745,0.203428,0.031239,-0.026762,0.139565,-0.193431,0.002449,0.146179,-0.023639,0.081062,0.408876,0.35966,-0.022089,-0.127666,0.071052,0.246367,0.356865,-0.090573,0.018995,0.037331,0.0491,-0.00331
4,0,1.086925,-0.217903,-1.230875,0.064907,1.654286,-0.385171,1.379952,1.160802,-0.35647,-0.629358,1.412212,0.623726,-0.944235,-0.374677,-0.388041,-0.164393,0.714245,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,-1.127697,2.70325,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,2.04198,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,0.500485,-2.273226,0.46759,2.361094,1.327192,0.157191,-0.728063,-0.425034,0.714266,-0.504438,-0.922035,-1.27925,-1.773917,-0.385914,-0.357528,-0.402877,0.862565,-0.334787,-0.460207,-0.408988,-0.239625,0.702747,-0.744515,-0.177644,-0.869861,0.854072,-0.30904,-0.495316,1.020506,-0.045611,-0.786192,-0.651228,-0.19845,0.107394,-0.349749,-0.237723,-0.662448,0.04252,-0.52077,0.160913,0.216241,0.215927
6,0,-0.58919,-0.832479,-2.493242,-0.277074,-0.60449,-0.385171,1.379952,-0.861473,2.805289,-0.629358,-0.708109,0.623726,-0.944235,-0.374677,-0.388041,-0.164393,0.714245,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,-1.127697,-0.369925,-0.273432,2.136411,-0.363662,-0.37078,1.664387,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,0.027162,-0.037509,-0.243446,0.114119,-2.047393,-0.313954,0.082986,-1.997516,-1.336966,-1.820179,-1.788657,0.028055,-0.027106,0.082882,-1.47799,-1.758218,2.424064,-0.477622,0.266195,0.321677,2.959744,-0.295891,0.548408,-0.102203,0.391922,-0.193431,-0.101381,0.269976,-0.023639,-0.172283,0.289369,0.300196,0.154272,0.185747,0.071052,0.577587,0.253206,-0.223665,0.071655,-0.333414,-0.273244,-3.21878


In [40]:
X_test.describe()

Unnamed: 0,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),aniongap_last,bicarbonate_last,bun_last,calcium_last,chloride_last,creatinine_last,glucose_last,sodium_last,potassium_last,hematocrit_last,hemoglobin_last,mch_last,mchc_last,mcv_last,platelet_last,rbc_last,rdw_last,wbc_last,inr_last,pt_last,ptt_last,aniongap_delta,bicarbonate_delta,bun_delta,calcium_delta,chloride_delta,creatinine_delta,glucose_delta,sodium_delta,potassium_delta,hematocrit_delta,hemoglobin_delta,mch_delta,mchc_delta,mcv_delta,platelet_delta,rbc_delta,rdw_delta,wbc_delta,inr_delta,pt_delta,ptt_delta
count,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0,4083.0
mean,-0.010129,-0.00515,-0.017707,-0.030828,0.006259,-0.009846,7.1e-05,0.009745,0.002064,-0.011087,-0.001335,-0.0177,-0.009562,-0.019847,-0.005332,0.0437,0.003256,-0.010157,-0.013856,0.00132,-0.018656,0.029575,0.033646,-0.018926,-0.000823,0.028993,-0.012544,-0.019003,0.013797,0.010909,-0.014963,-0.004835,0.005053,0.013567,-0.004339,0.007482,-0.008186,-0.003558,-0.015566,-0.016637,-0.008701,-0.00636,-0.034475,0.002575,-0.018531,0.013247,0.013804,0.017969,0.01216,0.019165,0.001948,0.018603,0.009612,-0.012919,0.037349,0.036276,0.038295,0.018249,0.013142,-0.012207,0.004131,-0.010069,-0.001479,0.020911,0.018049,0.00724,0.016841,-0.014686,-0.009868,-0.013047,0.00939,-0.02176,-0.019075,-0.008954,0.017192,0.015327,0.006589,0.015521,-0.008629
std,0.992835,0.931019,1.002064,0.985506,1.003383,0.989127,1.000146,1.001533,1.002645,0.994726,0.999651,1.0086,0.999528,0.976886,0.994255,1.121177,0.999,0.975003,0.98575,1.024749,0.956967,1.870632,1.094019,0.564264,1.000221,1.032991,0.978587,0.983957,1.016358,1.012675,0.99202,0.996351,1.029712,1.1177,0.985764,1.007367,0.990541,1.01227,0.993517,1.017186,0.960353,0.974983,0.913502,1.123624,0.992046,1.015128,1.021172,1.030721,1.038687,1.020869,1.038186,0.992558,1.016151,1.010604,1.221275,1.01436,1.021341,1.014994,1.010072,1.035389,1.022897,0.939934,0.993556,0.976196,0.981561,0.992777,0.975633,1.029089,1.027068,1.059829,0.990958,1.037706,1.032768,1.029932,0.991105,1.042899,0.955144,0.904105,0.984688
min,-3.110447,-2.945083,-5.396685,-1.98698,-0.60449,-0.385171,-0.724663,-0.861473,-0.35647,-0.629358,-0.708109,-1.603267,-0.944235,-0.374677,-0.388041,-0.164393,-1.40008,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,-1.127697,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,-3.049434,-3.695955,-1.330912,-8.345083,-4.194856,-0.889797,-1.849725,-5.339039,-2.75705,-4.316216,-4.272972,-5.791562,-4.394134,-4.917611,-1.920007,-4.187915,-2.023841,-1.370341,-1.041329,-1.083448,-1.14217,-7.286357,-5.916205,-8.777876,-7.935841,-7.735456,-13.287756,-10.32032,-4.20022,-6.505901,-5.745724,-5.586739,-7.311692,-5.377332,-6.87217,-12.976937,-4.584214,-8.408857,-25.771261,-15.657553,-11.54333,-6.258861
25%,-0.651384,-0.605505,-0.220982,-0.619056,-0.60449,-0.385171,-0.724663,-0.861473,-0.35647,-0.629358,-0.708109,-1.603267,-0.944235,-0.374677,-0.388041,-0.164393,-1.40008,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,-1.127697,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,-0.682821,-0.64725,-0.661702,-0.546756,-0.513491,-0.575701,-0.590012,-0.621595,-0.705818,-0.639882,-0.633161,-0.393656,-0.430217,-0.542179,-0.51172,-0.568163,-0.651615,-0.370496,-0.314927,-0.240373,-0.239625,-0.54555,-0.48593,-0.253084,-0.491326,-0.402932,-0.101381,-0.168941,-0.545712,-0.425628,-0.547178,-0.5323,-0.316024,-0.441079,-0.349749,-0.314158,-0.524236,-0.157119,-0.25747,-0.08625,-0.016563,-0.081261
50%,0.070556,-0.137589,0.031491,0.064907,-0.60449,-0.385171,-0.724663,-0.861473,-0.35647,-0.629358,-0.708109,0.623726,-0.944235,-0.374677,-0.388041,-0.164393,0.714245,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,0.886763,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,-0.600822,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,-0.209499,-0.037509,-0.285271,0.114119,0.10007,-0.313954,-0.193116,-0.031914,-0.07467,-0.098107,-0.113188,0.070226,-0.027106,0.082882,-0.110821,-0.121892,-0.083798,-0.203855,-0.169646,-0.198219,-0.189829,0.203428,0.031239,-0.026762,0.139565,-0.193431,0.002449,0.146179,-0.023639,0.081062,0.149945,0.181268,-0.022089,-0.127666,0.071052,0.182671,0.166824,-0.090573,0.018995,0.037331,0.0491,-0.00331
75%,0.739931,0.442067,0.031491,0.748869,1.654286,-0.385171,1.379952,1.160802,-0.35647,1.58892,1.412212,0.623726,1.059058,-0.374677,-0.388041,-0.164393,0.714245,-0.197225,-0.407631,-0.026464,-0.211895,-0.011832,-0.165512,-0.027756,0.886763,-0.369925,-0.273432,-0.468075,-0.363662,-0.37078,1.664387,-0.489721,-0.083527,-0.054295,-0.148916,-0.421541,-0.371391,0.500485,0.572232,0.300287,0.510644,0.560241,0.104842,0.290062,0.557767,0.556479,0.501716,0.522334,0.449766,0.577559,0.395413,0.362034,0.506193,0.294747,0.212747,-0.169646,-0.198219,-0.189829,0.453088,0.548408,0.275,0.391922,0.435071,0.210108,0.337502,0.498433,0.461079,0.408876,0.35966,0.271847,0.49916,0.281453,0.322803,0.391418,0.242159,0.308626,0.284495,0.216241,0.167207
max,2.091125,14.479535,4.828485,3.484718,1.654286,2.596251,1.379952,1.160802,2.805289,1.58892,1.412212,0.623726,1.059058,2.668963,2.577048,6.082981,0.714245,5.070346,2.453196,37.787564,4.719321,84.519229,6.041849,36.027767,0.886763,2.70325,3.657219,2.136411,2.749803,2.697014,1.664387,2.04198,11.97219,18.417771,6.715173,2.372246,2.692582,7.363661,5.246913,8.288981,7.648095,5.928899,8.899544,37.701834,6.847692,8.919194,6.790182,5.375415,6.564582,3.936811,6.646029,8.266942,5.762272,8.85933,53.680666,14.358393,14.752307,7.067883,7.942873,6.237267,13.099908,8.088793,8.186596,13.500313,25.14198,9.11263,6.41468,7.499613,6.008739,9.031128,4.730234,6.593473,10.628826,6.507299,9.758276,20.003482,9.0588,7.295861,6.101212


In [41]:
print(len(X_train.columns))
print(len(num_cols))

80
79


In [41]:
#dropping icu_outcome column

# dynamic_train_df = dynamic_train_df.drop('icu_outcome', axis=1, inplace=True)

In [42]:
# converting pre-proceesed & splitted static and dynamic df to CSV

# static_train_df.to_csv('static_train_df.csv')
# static_test_df.to_csv('static_test_df.csv')
# static_holdout_df.to_csv('static_holdout_df.csv')
# dynamic_train_df.to_csv('dynamic_train_df.csv')
# dynamic_test_df.to_csv('dynamic_test_df.csv')
# dynamic_holdout_df.to_csv('dynamic_holdout_df.csv')

In [43]:
# static_train_df.to_excel('static_train_df.xlsx', index=False)
# static_test_df.to_excel('static_test_df.xlsx', index=False)
# static_holdout_df.to_excel('static_holdout_df.xlsx', index=False)

# dynamic_train_df.to_excel('dynamic_train_df.xlsx', index=False)
# dynamic_test_df.to_excel('dynamic_test_df.xlsx', index=False)
# dynamic_holdout_df.to_excel('dynamic_holdout_df.xlsx', index=False)

# Applying vanilla linear regression without regularization

In [44]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression()
linear.fit(X_train, y_train)

In [45]:
# Testing linear regression with X_test

from sklearn.metrics import mean_squared_error

y_train_pred = linear.predict(X_train)
mse_train_linear = mean_squared_error(y_train, y_train_pred)
print("Linear Regression y_train MSE:", mse_train_linear)

y_test_pred = linear.predict(X_test)
mse_test_linear = mean_squared_error(y_test, y_test_pred)
print("Linear Regression y_test MSE:", mse_test_linear)

y_holdout_pred = linear.predict(X_holdout)
mse_holdout_linear = mean_squared_error(y_holdout, y_holdout_pred)
print("Linear Regression y_holdout MSE:", mse_holdout_linear)

Linear Regression y_train MSE: 34.796165458405504
Linear Regression y_test MSE: 34.356602804573214
Linear Regression y_holdout MSE: 28.736974176381732


In [46]:
linear_coef = linear.coef_

feature_list = X_train.columns.tolist()

linear_coefficients_df = pd.DataFrame({'Feature': feature_list, 'Coefficient': linear_coef})

print(linear_coefficients_df)

                                              Feature   Coefficient
0                                              gender  4.221965e-01
1                                       admission_age -3.478295e-01
2                                        weight_admit  1.778044e-01
3                                              height -1.753110e-01
4                                      charlson_score  4.842271e-02
5                                 atrial_fibrillation  2.488993e-01
6                                    malignant_cancer -1.673190e-01
7                                                 chf  3.095528e-02
8                                                 ckd -6.332692e-02
9                                                 cld -1.461160e-01
10                                               copd  1.657092e-01
11                                           diabetes -8.184950e-02
12                                       hypertension -1.326995e-01
13                                              

# Applying linear regression with L1 Lasso regularization

In [47]:
from sklearn.linear_model import Lasso, Ridge

# Training with L1 lasso regularization

lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

coefficients = lasso_model.coef_

print("Coefficients:", coefficients)

Coefficients: [ 0.         -0.17819846  0.12098763 -0.         -0.          0.1375742
 -0.07217239  0.         -0.         -0.          0.05939971 -0.
 -0.06426873 -0.12554869  0.0899759  -0.          0.         -0.
 -0.          0.09734681 -0.         -0.         -0.         -0.0151305
 -0.          0.         -0.         -0.          0.43972567  0.
 -0.         -0.         -0.01286525  0.         -0.          0.28063254
  0.15077891  0.16537898  0.10123311 -0.10738325  0.1736559  -0.17473904
  0.         -0.          0.00070585  0.05287164  0.          0.01192785
  0.         -0.          0.         -0.         -0.          0.
  0.          0.08374917  0.          0.          0.11001539  0.
 -0.          0.          0.06947934  0.          0.10629339 -0.
  0.          0.         -0.06354962 -0.         -0.          0.07388259
 -0.04487873 -0.         -0.         -0.         -0.         -0.
 -0.         -0.        ]


In [48]:
feature_list = X_train.columns.tolist()

coefficients_df = pd.DataFrame({'Feature': feature_list, 'Coefficient': coefficients})

print(coefficients_df)

                                              Feature  Coefficient
0                                              gender     0.000000
1                                       admission_age    -0.178198
2                                        weight_admit     0.120988
3                                              height    -0.000000
4                                      charlson_score    -0.000000
5                                 atrial_fibrillation     0.137574
6                                    malignant_cancer    -0.072172
7                                                 chf     0.000000
8                                                 ckd    -0.000000
9                                                 cld    -0.000000
10                                               copd     0.059400
11                                           diabetes    -0.000000
12                                       hypertension    -0.064269
13                                                ihd    -0.12

In [49]:
# Calculating MSE for L1 Lasso Regression

y_train_pred = lasso_model.predict(X_train)
mse_train_lasso = mean_squared_error(y_train, y_train_pred)
print("L1 Lasso Regression y_train MSE:", mse_train_lasso)

y_test_pred = lasso_model.predict(X_test)
mse_test_lasso = mean_squared_error(y_test, y_test_pred)
print("L1 Lasso Regression y_test MSE:", mse_test_lasso)

y_holdout_pred = lasso_model.predict(X_holdout)
mse_holdout_lasso = mean_squared_error(y_holdout, y_holdout_pred)
print("L1 Lasso Regression y_holdout MSE:", mse_holdout_lasso)

L1 Lasso Regression y_train MSE: 35.27315956854741
L1 Lasso Regression y_test MSE: 34.50933257866222
L1 Lasso Regression y_holdout MSE: 29.083469089978998


# Applying linear regression with L2 Ridge regularization

In [50]:
# Training with L2 ridge regularization

ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)


ridge_coef = ridge_model.coef_

print("Coefficients:", ridge_coef)

Coefficients: [ 0.41479684 -0.34675012  0.1779152  -0.17562258  0.04814804  0.2479838
 -0.16730008  0.03154143 -0.06321129 -0.14555773  0.16508206 -0.08214574
 -0.13191647 -0.21780276  0.18558639  0.03523566  0.02600076 -0.06857361
 -0.06220043  0.08470641 -0.05344598 -0.05318671 -0.04091621 -0.14910381
 -0.09581368 -0.08350441 -0.17273188 -0.08219028  0.40764516  0.00709377
 -0.18166375 -0.13770322 -0.16757444  0.04898225 -0.04751266  0.35103449
  0.18840906  0.19563104  0.19881895 -0.08474947  0.41062057 -0.37459536
 -0.03492169 -0.24149476  0.08944356  0.13609016  0.05053629  0.8232198
 -0.31800181 -1.09663038  0.74184181  0.81193989  0.01174358 -0.38375766
  0.14752538  0.14622311  0.29539669 -0.26081994  0.27009276 -0.22377647
 -0.27887785 -0.05064506  0.26277969 -0.18266342  0.22933568 -0.01540406
  0.1881237   0.0012219  -0.73507914  0.87965861  0.53727361 -0.45903663
 -0.73279512 -0.01926704 -0.33376002 -0.10402063 -0.02740856 -0.13206845
  0.10180934 -0.13906927]


In [51]:
feature_list = X_train.columns.tolist()

ridge_coefficients_df = pd.DataFrame({'Feature': feature_list, 'Coefficient': ridge_coef})

print(ridge_coefficients_df)

                                              Feature  Coefficient
0                                              gender     0.414797
1                                       admission_age    -0.346750
2                                        weight_admit     0.177915
3                                              height    -0.175623
4                                      charlson_score     0.048148
5                                 atrial_fibrillation     0.247984
6                                    malignant_cancer    -0.167300
7                                                 chf     0.031541
8                                                 ckd    -0.063211
9                                                 cld    -0.145558
10                                               copd     0.165082
11                                           diabetes    -0.082146
12                                       hypertension    -0.131916
13                                                ihd    -0.21

In [52]:
# Calculating MSE for L1 Lasso Regression

y_train_pred = ridge_model.predict(X_train)
mse_train_ridge = mean_squared_error(y_train, y_train_pred)
print("L2 Ridge Regression y_train MSE:", mse_train_ridge)

y_test_pred = ridge_model.predict(X_test)
mse_test_ridge = mean_squared_error(y_test, y_test_pred)
print("L2 Ridge Regression y_test MSE:", mse_test_ridge)

y_holdout_pred = ridge_model.predict(X_holdout)
mse_holdout_ridge = mean_squared_error(y_holdout, y_holdout_pred)
print("L2 Ridge Regression y_holdout MSE:", mse_holdout_ridge)

L2 Ridge Regression y_train MSE: 34.796114675752584
L2 Ridge Regression y_test MSE: 34.359273820645505
L2 Ridge Regression y_holdout MSE: 28.738525332556467


# Applying linear regression using pytorch

In [53]:
import torch
import torch.nn as nn
from torch.optim import SGD #gradient descent optimizer

# NumPy for math operations, and Pandas for processing tabular data.
import numpy as np
import pandas as pd

# Plotly plotting package
import plotly.graph_objects as go
import plotly.express as px

In [54]:
X_train['gender'] = X_train['gender'].astype(int)
X_test['gender'] = X_test['gender'].astype(int)
X_holdout['gender'] = X_holdout['gender'].astype(int)
print(X_train.dtypes)

gender                                                               int32
admission_age                                                      float64
weight_admit                                                       float64
height                                                             float64
charlson_score                                                     float64
atrial_fibrillation                                                float64
malignant_cancer                                                   float64
chf                                                                float64
ckd                                                                float64
cld                                                                float64
copd                                                               float64
diabetes                                                           float64
hypertension                                                       float64
ihd                      

In [55]:
X_train_tensor = torch.tensor(X_train.to_numpy(),dtype=torch.float32)
m,n = X_train_tensor.shape
y_train_tensor = torch.tensor(y_train.to_numpy(),dtype=torch.float32).reshape(m,1)

X_test_tensor = torch.tensor(X_test.to_numpy(),dtype=torch.float32)
m,n = X_test_tensor.shape
y_test_tensor = torch.tensor(y_test.to_numpy(),dtype=torch.float32).reshape(m,1)

X_holdout_tensor = torch.tensor(X_holdout.to_numpy(),dtype=torch.float32)
m,n = X_holdout_tensor.shape
y_holdout_tensor = torch.tensor(y_holdout.to_numpy(),dtype=torch.float32).reshape(m,1)

In [56]:
print('Training input has size: ',X_train_tensor.shape)
print('y_Training input has size: ',y_train_tensor.shape)

Training input has size:  torch.Size([14289, 80])
y_Training input has size:  torch.Size([14289, 1])


In [57]:
h = torch.nn.Linear(
    in_features=80,
    out_features=1,
    bias=True
)

# For torch SGD, we need to tell it which parameter we what to optimize.
GD_optimizer = torch.optim.SGD(lr=0.05, params=h.parameters())
J_MSE = torch.nn.MSELoss()

# Apply gradient descent 10000 times
nIter = 10000
printInterval = 500

# PyTorch optimization steps:
# 1. Clear gradient
# 2. Calculate model and loss values (so-called forward path)
# 3. Calculate gradient of loss (so-called backward path)
# 4. Ask optimizer to update parameters of model
for i in range(nIter):
    # Step 1
    GD_optimizer.zero_grad()
    # Step 2
    pred = h(X_train_tensor)
    loss = J_MSE(pred,y_train_tensor)
    # Step 3
    loss.backward()
    # Step 4
    GD_optimizer.step()
    # Print loss value to track optimization progress
    if i == 0 or ((i+1) % printInterval) == 0:
        # We take square root of MSE (PyTorch internally averaged in J_MSE)
        # so that scale of printout is same as scale of y values.
        print('Iter {} : average rooted training MSE {:.3f}'.format(i+1,torch.sqrt(loss).item()))

Iter 1 : average rooted training MSE 7.788
Iter 500 : average rooted training MSE 5.900
Iter 1000 : average rooted training MSE 5.899
Iter 1500 : average rooted training MSE 5.899
Iter 2000 : average rooted training MSE 5.899
Iter 2500 : average rooted training MSE 5.899
Iter 3000 : average rooted training MSE 5.899
Iter 3500 : average rooted training MSE 5.899
Iter 4000 : average rooted training MSE 5.899
Iter 4500 : average rooted training MSE 5.899
Iter 5000 : average rooted training MSE 5.899
Iter 5500 : average rooted training MSE 5.899
Iter 6000 : average rooted training MSE 5.899
Iter 6500 : average rooted training MSE 5.899
Iter 7000 : average rooted training MSE 5.899
Iter 7500 : average rooted training MSE 5.899
Iter 8000 : average rooted training MSE 5.899
Iter 8500 : average rooted training MSE 5.899
Iter 9000 : average rooted training MSE 5.899
Iter 9500 : average rooted training MSE 5.899
Iter 10000 : average rooted training MSE 5.899


In [58]:
# Calculate testing results.

with torch.no_grad():
    y_train_pred = h(X_train_tensor)
with torch.no_grad():
    y_test_pred = h(X_test_tensor)
with torch.no_grad():
    y_holdout_pred = h(X_holdout_tensor)


mse_train_torch = mean_squared_error(y_train, y_train_pred)
print("pyTorch y_train MSE:", mse_train_torch)

mse_test_torch = mean_squared_error(y_test, y_test_pred)
print("pyTorch y_test MSE:", mse_test_torch)

mse_holdout_torch = mean_squared_error(y_holdout, y_holdout_pred)
print("pyTorch y_holdout MSE:", mse_holdout_torch)

pyTorch y_train MSE: 34.796371660872424
pyTorch y_test MSE: 34.360678704285974
pyTorch y_holdout MSE: 28.737634913214976


# Random Forest Regressor

In [59]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=8, min_samples_split=10)

rf_regressor.fit(X_train, y_train)

In [60]:
feature_importances = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                                              Feature  Importance
3                                              height    0.255938
41                                       calcium_last    0.072697
2                                        weight_admit    0.062627
48                                    hemoglobin_last    0.056466
62                                      calcium_delta    0.035223
49                                           mch_last    0.026974
40                                           bun_last    0.021938
43                                    creatinine_last    0.021407
79                                          ptt_delta    0.021373
55                                           wbc_last    0.021040
1                                       admission_age    0.020059
58                                           ptt_last    0.019103
64                                   creatinine_delta    0.016848
68                                   hematocrit_delta    0.016101
76        

In [61]:
# Calculating MSE for Random Forest Regressor

y_train_pred = rf_regressor.predict(X_train)
mse_train_rfregressor = mean_squared_error(y_train, y_train_pred)
print("RF regressor y_train MSE:", mse_train_rfregressor)

y_test_pred = rf_regressor.predict(X_test)
mse_test_rfregressor = mean_squared_error(y_test, y_test_pred)
print("RF regressor y_test MSE:", mse_test_ridge)

y_holdout_pred = rf_regressor.predict(X_holdout)
mse_holdout_rfregressor = mean_squared_error(y_holdout, y_holdout_pred)
print("RF regressor y_holdout MSE:", mse_holdout_rfregressor)

RF regressor y_train MSE: 27.175973345794997
RF regressor y_test MSE: 34.359273820645505
RF regressor y_holdout MSE: 27.983530612107405


In [62]:
# Experimenting with removal of height and weight from the X datasets and training RF regressor again

# X_train = X_train.drop(columns=['height', 'weight_admit'])
# X_test = X_test.drop(columns=['height', 'weight_admit'])
# X_holdout = X_holdout.drop(columns=['height', 'weight_admit'])

In [63]:
rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=8, min_samples_split=10)

rf_regressor.fit(X_train, y_train)

In [64]:
feature_importances = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                                              Feature  Importance
3                                              height    0.261690
41                                       calcium_last    0.065153
2                                        weight_admit    0.053085
48                                    hemoglobin_last    0.048017
62                                      calcium_delta    0.038449
49                                           mch_last    0.026387
43                                    creatinine_last    0.026289
55                                           wbc_last    0.024483
58                                           ptt_last    0.021666
1                                       admission_age    0.021331
40                                           bun_last    0.019768
79                                          ptt_delta    0.019615
76                                          wbc_delta    0.019341
46                                     potassium_last    0.017501
64        

In [65]:
# Calculating MSE for Random Forest Regressor

y_train_pred = rf_regressor.predict(X_train)
mse_train_rfregressor = mean_squared_error(y_train, y_train_pred)
print("RF regressor y_train MSE:", mse_train_rfregressor)

y_test_pred = rf_regressor.predict(X_test)
mse_test_rfregressor = mean_squared_error(y_test, y_test_pred)
print("RF regressor y_test MSE:", mse_test_ridge)

y_holdout_pred = rf_regressor.predict(X_holdout)
mse_holdout_rfregressor = mean_squared_error(y_holdout, y_holdout_pred)
print("RF regressor y_holdout MSE:", mse_holdout_rfregressor)

RF regressor y_train MSE: 27.15226107313759
RF regressor y_test MSE: 34.359273820645505
RF regressor y_holdout MSE: 27.829574714314177


# Support Vector Regressor 

In [66]:
from sklearn.svm import SVR

svr_regressor = SVR()

svr_regressor.fit(X_train, y_train)

In [67]:
# Calculating MSE for Support Vector Regressor

y_train_pred = svr_regressor.predict(X_train)
mse_train_SVR = mean_squared_error(y_train, y_train_pred)
print("Support Vector regressor y_train MSE:", mse_train_SVR)

y_test_pred = svr_regressor.predict(X_test)
mse_test_SVR = mean_squared_error(y_test, y_test_pred)
print("RF regressor y_test MSE:", mse_test_SVR)

y_holdout_pred = svr_regressor.predict(X_holdout)
mse_holdout_SVR = mean_squared_error(y_holdout, y_holdout_pred)
print("RF regressor y_holdout MSE:", mse_holdout_SVR)

Support Vector regressor y_train MSE: 35.65760621784725
RF regressor y_test MSE: 36.84467450819346
RF regressor y_holdout MSE: 30.545340376453588
