In [1]:
import os
import re
import scipy
from scipy import stats
import pickle
import subprocess
import shlex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Start here - load processed timestep5 data

train_df = pd.read_excel('Timestep5_train.xlsx')
test_df = pd.read_excel('Timestep5_test.xlsx')
holdout_df = pd.read_excel('Timestep5_holdout.xlsx')

In [3]:
train_id_list = train_df.id.unique().tolist()
test_id_list = test_df.id.unique().tolist()
holdout_id_list = holdout_df.id.unique().tolist()
print(len(train_id_list))

14289


In [4]:
print(train_df.shape)
print(test_df.shape)
print(holdout_df.shape)

(71445, 70)
(20415, 70)
(10210, 70)


In [5]:
combined_df = pd.concat([train_df, test_df, holdout_df], axis=0)
combined_df.shape

(102070, 70)

In [6]:
combined_df.head(20)

Unnamed: 0,id,charttime,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,...,first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),hour,icu_cat
0,20001305,1978-03-25 08:20:00,15,23.0,47,11.4,108,0.8,154,142.0,...,0,0,1,0,0,0,0,0,5.35,1
1,20001305,1978-03-25 08:20:00,15,23.0,47,11.4,108,0.8,154,142.0,...,0,0,1,0,0,0,0,0,5.35,1
2,20001305,1978-03-25 08:20:00,15,23.0,47,11.4,108,0.8,154,142.0,...,0,0,1,0,0,0,0,0,5.35,1
3,20001305,1978-03-25 13:45:00,13,25.0,48,10.8,107,0.9,149,140.0,...,0,0,1,0,0,0,0,0,10.766667,1
4,20001305,1978-03-25 21:55:00,13,24.0,50,10.8,108,0.9,131,141.0,...,0,0,1,0,0,0,0,0,18.933333,1
5,20001361,2043-05-04 17:24:00,14,22.0,28,6.3,107,2.5,161,137.0,...,0,0,1,0,0,0,0,0,0.533333,2
6,20001361,2043-05-04 21:07:00,15,20.0,32,6.5,108,2.5,124,137.0,...,0,0,1,0,0,0,0,0,4.25,2
7,20001361,2043-05-05 04:27:00,15,23.0,36,7.2,108,2.9,98,142.0,...,0,0,1,0,0,0,0,0,11.583333,2
8,20001361,2043-05-05 11:50:00,15,23.0,27,8.4,107,1.3,134,142.0,...,0,0,1,0,0,0,0,0,18.966667,2
9,20001361,2043-05-05 15:02:00,18,23.0,40,7.9,107,3.7,123,144.0,...,0,0,1,0,0,0,0,0,22.166667,2


In [7]:
columns_to_drop = ['id',
                   'charttime',
                   'hour',
                   'icu_cat',
                   'hosp_admittime',
                   'hosp_dischtime',
                   'icu_intime',
                   'icu_outtime',
                   'los_icu',
                   'icu_death',
                   'race'
                  ]

static_columns_to_drop = ['gender',
 'admission_age',
 'weight_admit',
 'height',
 'charlson_score',
 'atrial_fibrillation',
 'malignant_cancer',
 'chf',
 'ckd',
 'cld',
 'copd',
 'diabetes',
 'hypertension',
 'ihd',
 'stroke',
 'race_encode_African',
 'race_encode_Asian',
 'race_encode_Caucasian',
 'race_encode_Hispanic',
 'race_encode_Not Specified',
 'race_encode_South American',
 'admission_type_DIRECT EMER.',
 'admission_type_DIRECT OBSERVATION',
 'admission_type_ELECTIVE',
 'admission_type_EU OBSERVATION',
 'admission_type_EW EMER.',
 'admission_type_OBSERVATION ADMIT',
 'admission_type_SURGICAL SAME DAY ADMISSION',
 'admission_type_URGENT',
 'first_careunit_Cardiac Vascular Intensive Care Unit (CVICU)',
 'first_careunit_Coronary Care Unit (CCU)',
 'first_careunit_Medical Intensive Care Unit (MICU)',
 'first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU)',
 'first_careunit_Neuro Intermediate',
 'first_careunit_Neuro Stepdown',
 'first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU)',
 'first_careunit_Surgical Intensive Care Unit (SICU)',
 'first_careunit_Trauma SICU (TSICU)'
                         ]

dynamic_columns_to_drop = ['aniongap',
 'bicarbonate',
 'bun',
 'calcium',
 'chloride',
 'creatinine',
 'glucose',
 'sodium',
 'potassium',
 'hematocrit',
 'hemoglobin',
 'mch',
 'mchc',
 'mcv',
 'platelet',
 'rbc',
 'rdw',
 'wbc',
 'inr',
 'pt',
 'ptt',
                          ]

X_train_df = train_df.drop(columns=columns_to_drop)
X_train_df = X_train_df.drop(columns=static_columns_to_drop)
X_test_df = test_df.drop(columns=columns_to_drop)
X_test_df = X_test_df.drop(columns=static_columns_to_drop)
X_holdout_df = holdout_df.drop(columns=columns_to_drop)
X_holdout_df = X_holdout_df.drop(columns=static_columns_to_drop)

combined_df_dynamic = combined_df.drop(columns=columns_to_drop)
combined_df_dynamic = combined_df_dynamic.drop(columns=static_columns_to_drop) 

combined_df_static = combined_df.drop(columns=dynamic_columns_to_drop)

print(X_train_df.shape)
print(X_test_df.shape)
print(X_holdout_df.shape)    
print(combined_df_dynamic.shape)
print(combined_df_static.shape)

(71445, 21)
(20415, 21)
(10210, 21)
(102070, 21)
(102070, 49)


In [8]:
combined_static = combined_df[static_columns_to_drop]
combined_static.shape

(102070, 38)

In [9]:
# Perform normalization using data from X_train to transform X_test

from sklearn.preprocessing import StandardScaler

num_cols = X_train_df.columns[X_train_df.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
# print(num_cols)
scaler = StandardScaler()
X_train_df[num_cols] = scaler.fit_transform(X_train_df[num_cols])
X_test_df[num_cols] = scaler.transform(X_test_df[num_cols])
X_holdout_df[num_cols] = scaler.transform(X_holdout_df[num_cols])

In [10]:
combined_df_dynamic[num_cols] = scaler.transform(combined_df_dynamic[num_cols])

In [11]:
combined_df_dynamic.describe()

Unnamed: 0,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,...,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
count,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,...,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0,102070.0
mean,-0.00462,-0.000637,-0.005993,-0.002668,-0.004582,-0.00736,-0.001062,-0.008652,-0.00153,0.002703,...,0.00462,0.003243,0.003446,0.005167,0.001369,-0.00441,0.009301,0.007706,0.006099,0.006913
std,1.001152,0.998981,0.998929,0.998056,0.995505,0.976615,0.999925,0.997647,0.998267,1.00222,...,1.006947,1.00362,1.00637,1.002025,1.000487,0.995392,1.027643,1.025527,1.013393,1.010381
min,-4.373593,-4.093518,-1.298131,-8.182625,-6.124863,-0.890567,-1.815757,-8.745787,-4.084741,-4.269258,...,-5.435353,-5.699177,-5.733005,-1.85966,-4.191853,-2.044529,-1.266673,-0.956764,-1.046013,-1.153885
25%,-0.546588,-0.600528,-0.647104,-0.549026,-0.575844,-0.546173,-0.561937,-0.580979,-0.650235,-0.592042,...,-0.402901,-0.546077,-0.544904,-0.532063,-0.552956,-0.644977,-0.403577,-0.433995,-0.342141,-0.34232
50%,-0.096352,-0.018363,-0.280901,0.056815,0.008264,-0.300177,-0.262111,-0.024287,-0.12185,-0.251559,...,0.072606,0.033647,0.048022,-0.202518,-0.246061,-0.058068,-0.20938,-0.172611,-0.203881,-0.186914
75%,0.353884,0.563802,0.329437,0.541488,0.592371,0.093416,0.228514,0.532404,0.538632,0.599648,...,0.468862,0.548957,0.492716,0.353002,0.58694,0.348253,0.20059,-0.172611,-0.166173,-0.186914
max,9.133483,5.221122,9.362437,25.623313,7.601659,38.419564,29.666018,8.697212,11.502631,5.809037,...,6.17495,9.631296,6.570205,13.082868,6.111048,9.6485,48.631054,26.096515,16.777032,6.52432


In [12]:
X_test_df.describe()

Unnamed: 0,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,...,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
count,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,...,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0,20415.0
mean,-0.005952,-0.004759,-0.018932,-0.001351,-0.016178,-0.037825,-0.004492,-0.024483,-0.000375,0.019326,...,0.010449,0.012173,0.005055,0.026587,0.017427,-0.018536,0.024501,0.029066,0.023381,0.020272
std,1.014202,0.999285,1.008593,1.001648,0.979664,0.887121,1.016245,1.004946,0.999261,1.021322,...,1.034937,1.013095,1.036404,1.020192,1.021687,1.001434,1.067312,1.097595,1.047359,1.024264
min,-3.022885,-3.705408,-1.298131,-7.697952,-4.810622,-0.890567,-1.802128,-6.147893,-3.820548,-4.269258,...,-5.435353,-4.926212,-4.695384,-1.85966,-4.191853,-1.999382,-1.266673,-0.956764,-1.020875,-1.073304
25%,-0.546588,-0.600528,-0.687793,-0.549026,-0.575844,-0.546173,-0.561937,-0.580979,-0.650235,-0.609067,...,-0.402901,-0.546077,-0.544904,-0.513232,-0.56757,-0.690124,-0.403577,-0.433995,-0.329572,-0.336565
50%,-0.096352,-0.018363,-0.280901,0.056815,0.008264,-0.300177,-0.262111,-0.024287,-0.12185,-0.251559,...,0.072606,0.033647,0.048022,-0.202518,-0.246061,-0.058068,-0.20938,-0.172611,-0.203881,-0.186914
75%,0.353884,0.563802,0.288748,0.541488,0.592371,0.044217,0.214886,0.532404,0.538632,0.616672,...,0.468862,0.613371,0.492716,0.409495,0.616168,0.303106,0.227562,-0.172611,-0.153604,-0.163891
max,9.133483,5.221122,8.22314,18.110882,6.579471,8.358875,29.666018,8.697212,7.407644,5.809037,...,6.17495,3.898472,6.273742,9.665012,4.956539,8.474682,48.631054,26.096515,16.714187,6.52432


In [13]:
# Converting training, testing and holdout data into 3D numpy array

steps = 5

numpy_train_data = X_train_df.values
numpy_test_data = X_test_df.values
numpy_holdout_data = X_holdout_df.values
numpy_combined = combined_df.values

# print(len(numpy_data))
X_train_input = []
y_train = []
X_test_input = []
y_test = []
X_holdout_input = []
y_holdout = []

combined_last4time = []


for i in range(int(len(numpy_train_data)/steps)):
    sample = X_train_df.iloc[i*steps:i*steps+steps-1]
    label = X_train_df.iloc[i*steps+steps-1]
    X_train_input.append(sample)
    y_train.append(label)

for i in range(int(len(numpy_test_data)/steps)):
    sample = X_test_df.iloc[i*steps:i*steps+steps-1]
    label = X_test_df.iloc[i*steps+steps-1]
    X_test_input.append(sample)
    y_test.append(label)

for i in range(int(len(numpy_holdout_data)/steps)):
    sample = X_holdout_df.iloc[i*steps:i*steps+steps-1]
    label = X_holdout_df.iloc[i*steps+steps-1]
    X_holdout_input.append(sample)
    y_holdout.append(label)

for i in range(int(len(numpy_combined)/steps)):
    sample = combined_df_dynamic.iloc[i*steps+1:i*steps+steps] # getting the last 4 time point
    combined_last4time.append(sample)

X_train_input = np.array(X_train_input)
y_train = np.array(y_train)

X_test_input = np.array(X_test_input)
y_test = np.array(y_test)

X_holdout_input = np.array(X_holdout_input)
y_holdout = np.array(y_holdout)

combined_last4time = np.array(combined_last4time)

In [14]:
print(X_train_input.shape)
print(X_test_input.shape)
print(X_holdout_input.shape)
print(y_train.shape)
print(y_test.shape)
print(y_holdout.shape)
print(combined_last4time.shape)

(14289, 4, 21)
(4083, 4, 21)
(2042, 4, 21)
(14289, 21)
(4083, 21)
(2042, 21)
(20414, 4, 21)


In [15]:
# converting combined static from 5 time step into single static row

steps = 5

combined_static_1row = pd.DataFrame()

for i in range(0, len(combined_df_static), 5):
    selected_row = combined_df_static.iloc[i:i+1]  # Select every 5th row
    combined_static_1row = combined_static_1row.append(selected_row)

combined_static_1row.head()

Unnamed: 0,id,charttime,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,...,first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),hour,icu_cat
0,20001305,1978-03-25 08:20:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,0,WHITE,...,0,0,1,0,0,0,0,0,5.35,1
5,20001361,2043-05-04 17:24:00,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,1,WHITE,...,0,0,1,0,0,0,0,0,0.533333,2
10,20002506,2032-03-19 06:13:00,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,1,UNKNOWN,...,0,0,0,0,0,1,0,0,0.383333,2
15,20003425,2055-07-21 23:27:00,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,1,WHITE,...,0,0,0,0,0,0,1,0,-17.766667,1
20,20003491,1997-12-17 15:33:00,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,1,HISPANIC/LATINO - GUATEMALAN,...,0,0,1,0,0,0,0,0,-14.616667,1


In [16]:
combined_static_1row.dtypes

id                                                                   int64
charttime                                                           object
hosp_admittime                                                      object
hosp_dischtime                                                      object
icu_intime                                                          object
icu_outtime                                                         object
los_icu                                                            float64
icu_death                                                            int64
gender                                                               int64
race                                                                object
admission_age                                                      float64
weight_admit                                                       float64
height                                                             float64
charlson_score           

In [69]:
combined_static_1row.shape

(20414, 49)

In [17]:
static_scaled_columns = ['gender',
 'admission_age',
 'weight_admit',
 'height',
 'charlson_score',
 'atrial_fibrillation',
 'malignant_cancer',
 'chf',
 'ckd',
 'cld',
 'copd',
 'diabetes',
 'hypertension',
 'ihd',
 'stroke',
 'race_encode_African',
 'race_encode_Asian',
 'race_encode_Caucasian',
 'race_encode_Hispanic',
 'race_encode_Not Specified',
 'race_encode_South American',
 'admission_type_DIRECT EMER.',
 'admission_type_DIRECT OBSERVATION',
 'admission_type_ELECTIVE',
 'admission_type_EU OBSERVATION',
 'admission_type_EW EMER.',
 'admission_type_OBSERVATION ADMIT',
 'admission_type_SURGICAL SAME DAY ADMISSION',
 'admission_type_URGENT',
 'first_careunit_Cardiac Vascular Intensive Care Unit (CVICU)',
 'first_careunit_Coronary Care Unit (CCU)',
 'first_careunit_Medical Intensive Care Unit (MICU)',
 'first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU)',
 'first_careunit_Neuro Intermediate',
 'first_careunit_Neuro Stepdown',
 'first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU)',
 'first_careunit_Surgical Intensive Care Unit (SICU)',
 'first_careunit_Trauma SICU (TSICU)'
                         ]

print(len(static_scaled_columns))

38


In [18]:
id_y = ['id', 'icu_cat']

combined_y = combined_static_1row[id_y]

combined_y.head()

Unnamed: 0,id,icu_cat
0,20001305,1
5,20001361,2
10,20002506,2
15,20003425,1
20,20003491,1


In [19]:
import torch
import torch.nn as nn

# Define the LSTM model
class LSTMPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.0):
        super(LSTMPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, output_size)
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out) 
        out = self.dropout(out)
        out = self.relu(out)
        out = self.fc2(out[:, -1, :]) # Predicting the next time point
        return out

# Prepare the data
X_train = torch.tensor(X_train_input, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

# Define hyperparameters
input_size = 21  # Number of features
hidden_size = 64  # Number of LSTM units
num_layers = 1  # Number of LSTM layers
output_size = 21  # Number of output features
dropout_rate = 0.3  # Example dropout rate

# Instantiate the LSTM model
model = LSTMPredictor(input_size, hidden_size, num_layers, output_size, dropout_rate)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 1000
for epoch in range(num_epochs):
    # model.train()
    optimizer.zero_grad()
    outputs = model.forward(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/1000], Loss: 0.8594
Epoch [2/1000], Loss: 0.8549
Epoch [3/1000], Loss: 0.8509
Epoch [4/1000], Loss: 0.8469
Epoch [5/1000], Loss: 0.8431
Epoch [6/1000], Loss: 0.8391
Epoch [7/1000], Loss: 0.8352
Epoch [8/1000], Loss: 0.8316
Epoch [9/1000], Loss: 0.8277
Epoch [10/1000], Loss: 0.8235
Epoch [11/1000], Loss: 0.8197
Epoch [12/1000], Loss: 0.8158
Epoch [13/1000], Loss: 0.8108
Epoch [14/1000], Loss: 0.8067
Epoch [15/1000], Loss: 0.8019
Epoch [16/1000], Loss: 0.7973
Epoch [17/1000], Loss: 0.7915
Epoch [18/1000], Loss: 0.7865
Epoch [19/1000], Loss: 0.7808
Epoch [20/1000], Loss: 0.7751
Epoch [21/1000], Loss: 0.7688
Epoch [22/1000], Loss: 0.7626
Epoch [23/1000], Loss: 0.7553
Epoch [24/1000], Loss: 0.7486
Epoch [25/1000], Loss: 0.7412
Epoch [26/1000], Loss: 0.7339
Epoch [27/1000], Loss: 0.7262
Epoch [28/1000], Loss: 0.7181
Epoch [29/1000], Loss: 0.7102
Epoch [30/1000], Loss: 0.7019
Epoch [31/1000], Loss: 0.6941
Epoch [32/1000], Loss: 0.6867
Epoch [33/1000], Loss: 0.6784
Epoch [34/1000], Lo

In [20]:
# Prepare the test data
X_test = torch.tensor(X_test_input, dtype=torch.float32)
y_test  = torch.tensor(y_test, dtype=torch.float32)
X_holdout = torch.tensor(X_holdout_input, dtype=torch.float32)
y_holdout = torch.tensor(y_holdout, dtype=torch.float32)


# Pass the test data through the model to get predictions
with torch.no_grad():
    model.eval()
    predicted_y_test = model(X_test)

# Convert the predictions and ground truth to NumPy arrays for evaluation
predicted_y_test_np = predicted_y_test.numpy()
y_test_np = y_test.numpy()

# Evaluate the performance using a suitable metric (e.g., Mean Squared Error)
mse = ((predicted_y_test_np - y_test_np) ** 2).mean()
print(f'Mean Squared Error on Test Data: {mse:.4f}')

Mean Squared Error on Test Data: 0.4212


In [22]:
print(predicted_y_test_np.shape)
print(predicted_y_test.shape)      

(4083, 21)
torch.Size([4083, 21])


In [23]:
# Prepare the test data

combined_last4time = torch.tensor(combined_last4time, dtype=torch.float32)

# Pass the test data through the model to get predictions
with torch.no_grad():
    model.eval()
    predicted_combined = model(combined_last4time)

predicted_combined_np = predicted_combined.numpy()

print(predicted_combined.shape)
print(predicted_combined_np.shape)



torch.Size([20414, 21])
(20414, 21)


In [28]:
first_five_rows = predicted_combined[:5]

print(first_five_rows)

tensor([[-3.9499e-01,  2.0769e-01,  4.2876e-01,  2.3056e+00,  5.3807e-01,
         -2.2985e-01, -6.8217e-02,  6.1360e-01, -2.3311e-02, -2.6833e-01,
         -1.7647e-01,  1.7742e-01,  2.9100e-01, -9.3012e-03, -3.8369e-01,
         -2.1834e-01, -2.4097e-01, -4.8529e-01, -2.8703e-01, -3.0086e-01,
         -7.8858e-02],
        [-1.2189e-02, -4.3462e-02,  1.9252e-01, -2.8423e-01,  4.6670e-01,
          4.3596e-01, -2.7878e-01,  5.4064e-01, -3.1594e-01, -1.0286e-01,
         -4.1337e-02,  9.7828e-02,  2.5544e-01, -5.1499e-02, -2.4386e-01,
         -9.6929e-02, -3.4039e-01, -2.4119e-01, -1.9492e-01, -2.1430e-01,
         -2.0442e-01],
        [-3.6038e-01,  1.2468e-01, -6.7689e-01,  3.0196e-01,  6.4003e-01,
         -4.8148e-01, -3.0178e-01,  6.8452e-01, -3.9721e-01,  6.6666e-01,
          7.7587e-01,  1.4132e-01,  5.1429e-01, -1.6730e-01, -1.2378e-01,
          6.6139e-01, -9.5189e-01, -2.7140e-01, -3.3544e-01, -3.4904e-01,
         -2.1774e-01],
        [-6.4480e-01,  6.1487e-04, -2.7994e

In [29]:
first_five_rows = predicted_combined_np[:5]

print(first_five_rows)

[[-3.94989789e-01  2.07691252e-01  4.28759128e-01  2.30560565e+00
   5.38074553e-01 -2.29850397e-01 -6.82166219e-02  6.13597035e-01
  -2.33111531e-02 -2.68328965e-01 -1.76470518e-01  1.77416265e-01
   2.91004449e-01 -9.30120051e-03 -3.83689344e-01 -2.18340307e-01
  -2.40966931e-01 -4.85289335e-01 -2.87031382e-01 -3.00858855e-01
  -7.88578689e-02]
 [-1.21893287e-02 -4.34617698e-02  1.92521423e-01 -2.84226865e-01
   4.66701180e-01  4.35962260e-01 -2.78783828e-01  5.40642381e-01
  -3.15937936e-01 -1.02855325e-01 -4.13372889e-02  9.78283286e-02
   2.55436182e-01 -5.14994748e-02 -2.43858650e-01 -9.69293118e-02
  -3.40389788e-01 -2.41192952e-01 -1.94922984e-01 -2.14303404e-01
  -2.04423055e-01]
 [-3.60384256e-01  1.24682680e-01 -6.76888466e-01  3.01956356e-01
   6.40028000e-01 -4.81478333e-01 -3.01780522e-01  6.84517682e-01
  -3.97209346e-01  6.66661263e-01  7.75867760e-01  1.41322196e-01
   5.14288664e-01 -1.67304903e-01 -1.23784125e-01  6.61394238e-01
  -9.51891065e-01 -2.71397650e-01 -3.3

In [24]:
# predicted_combined_dynamic_df = pd.DataFrame(predicted_combined_np, columns=[f'prediction_{i}' for i in range(predicted_combined.shape[1])])
predicted_combined_dynamic_df = pd.DataFrame(predicted_combined_np, columns=[dynamic_columns_to_drop])

In [25]:
predicted_combined_dynamic_df.head()

Unnamed: 0,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,...,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
0,-0.39499,0.207691,0.428759,2.305606,0.538075,-0.22985,-0.068217,0.613597,-0.023311,-0.268329,...,0.177416,0.291004,-0.009301,-0.383689,-0.21834,-0.240967,-0.485289,-0.287031,-0.300859,-0.078858
1,-0.012189,-0.043462,0.192521,-0.284227,0.466701,0.435962,-0.278784,0.540642,-0.315938,-0.102855,...,0.097828,0.255436,-0.051499,-0.243859,-0.096929,-0.34039,-0.241193,-0.194923,-0.214303,-0.204423
2,-0.360384,0.124683,-0.676888,0.301956,0.640028,-0.481478,-0.301781,0.684518,-0.397209,0.666661,...,0.141322,0.514289,-0.167305,-0.123784,0.661394,-0.951891,-0.271398,-0.335441,-0.349038,-0.217738
3,-0.644798,0.000615,-0.279942,0.178031,-0.168479,-0.328684,-0.247705,-0.703199,0.609102,-0.791099,...,-0.463774,-0.593994,-0.155309,0.010233,-0.691954,-0.141471,-0.153174,-0.270119,-0.283493,-0.231409
4,0.274095,-0.504555,0.504493,-0.507762,-0.367926,0.130588,-0.072209,-0.803737,1.343336,0.159621,...,0.022985,-0.047857,0.048202,-0.219825,0.128122,-0.008897,-0.196616,0.027592,0.011165,0.207145


In [30]:
static_scaled_columns = ['gender',
 'admission_age',
 'weight_admit',
 'height',
 'charlson_score',
 'atrial_fibrillation',
 'malignant_cancer',
 'chf',
 'ckd',
 'cld',
 'copd',
 'diabetes',
 'hypertension',
 'ihd',
 'stroke',
 'race_encode_African',
 'race_encode_Asian',
 'race_encode_Caucasian',
 'race_encode_Hispanic',
 'race_encode_Not Specified',
 'race_encode_South American',
 'admission_type_DIRECT EMER.',
 'admission_type_DIRECT OBSERVATION',
 'admission_type_ELECTIVE',
 'admission_type_EU OBSERVATION',
 'admission_type_EW EMER.',
 'admission_type_OBSERVATION ADMIT',
 'admission_type_SURGICAL SAME DAY ADMISSION',
 'admission_type_URGENT',
 'first_careunit_Cardiac Vascular Intensive Care Unit (CVICU)',
 'first_careunit_Coronary Care Unit (CCU)',
 'first_careunit_Medical Intensive Care Unit (MICU)',
 'first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU)',
 'first_careunit_Neuro Intermediate',
 'first_careunit_Neuro Stepdown',
 'first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU)',
 'first_careunit_Surgical Intensive Care Unit (SICU)',
 'first_careunit_Trauma SICU (TSICU)'
                         ]
# num_cols = combined_static_1row.columns[combined_static_1row.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
# print(num_cols)
scaler = StandardScaler()
combined_static_1row[static_scaled_columns] = scaler.fit_transform(combined_static_1row[static_scaled_columns])


In [31]:
combined_static_1row.describe()

Unnamed: 0,id,los_icu,icu_death,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,...,first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),hour,icu_cat
count,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,...,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0,20414.0
mean,24994260.0,4.854449,0.104732,1.092928e-16,-6.874311e-17,-9.954699000000001e-17,-1.970056e-16,1.228674e-16,6.891715e-17,1.870857e-17,...,-4.35083e-18,-2.192818e-17,1.350498e-16,-5.586466000000001e-17,2.5060780000000002e-17,-5.743095e-17,1.371382e-16,4.4552500000000004e-17,-48.893207,0.833546
std,2871874.0,5.973425,0.306215,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,...,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,6135.165906,0.824786
min,20001300.0,1.0,0.0,-1.131091,-3.154232,-3.023908,-6.00616,-1.986938,-0.6065934,-0.3845592,...,-0.3714398,-0.5977573,-0.4899674,-0.08458003,-0.05383819,-0.1475563,-0.4231381,-0.3694732,-876578.766667,0.0
25%,22516580.0,1.8,0.0,-1.131091,-0.6300455,-0.6078629,-0.2142305,-0.6145124,-0.6065934,-0.3845592,...,-0.3714398,-0.5977573,-0.4899674,-0.08458003,-0.05383819,-0.1475563,-0.4231381,-0.3694732,-7.816667,0.0
50%,25009580.0,2.88,0.0,0.8841025,0.08372828,-0.1382756,0.03759255,0.0717004,-0.6065934,-0.3845592,...,-0.3714398,-0.5977573,-0.4899674,-0.08458003,-0.05383819,-0.1475563,-0.4231381,-0.3694732,1.033333,1.0
75%,27461540.0,5.28,0.0,0.8841025,0.7518134,0.4460201,0.03759255,0.7579132,1.648551,-0.3845592,...,-0.3714398,1.67292,-0.4899674,-0.08458003,-0.05383819,-0.1475563,-0.4231381,-0.3694732,4.066667,2.0
max,29999620.0,101.73,1.0,0.8841025,2.109188,33.14507,4.82223,4.532084,1.648551,2.60038,...,2.692226,1.67292,2.040952,11.82312,18.57418,6.777074,2.363295,2.706557,23.95,2.0


In [33]:
print(combined_static_1row.shape)
combined_static_1row.head()

(20414, 49)


Unnamed: 0,id,charttime,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,...,first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),hour,icu_cat
0,20001305,1978-03-25 08:20:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,-1.131091,WHITE,...,-0.37144,-0.597757,2.040952,-0.08458,-0.053838,-0.147556,-0.423138,-0.369473,5.35,1
5,20001361,2043-05-04 17:24:00,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,0.884103,WHITE,...,-0.37144,-0.597757,2.040952,-0.08458,-0.053838,-0.147556,-0.423138,-0.369473,0.533333,2
10,20002506,2032-03-19 06:13:00,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,0.884103,UNKNOWN,...,-0.37144,-0.597757,-0.489967,-0.08458,-0.053838,6.777074,-0.423138,-0.369473,0.383333,2
15,20003425,2055-07-21 23:27:00,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,0.884103,WHITE,...,-0.37144,-0.597757,-0.489967,-0.08458,-0.053838,-0.147556,2.363295,-0.369473,-17.766667,1
20,20003491,1997-12-17 15:33:00,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,0.884103,HISPANIC/LATINO - GUATEMALAN,...,-0.37144,-0.597757,2.040952,-0.08458,-0.053838,-0.147556,-0.423138,-0.369473,-14.616667,1


In [35]:
print(combined_static_1row.shape)
print(predicted_combined_dynamic_df.shape)

combined_static_1row = combined_static_1row.reset_index(drop=True)

combined_static_1row.head()

# # Reset index of predicted_combined_dynamic_df if needed
# predicted_combined_dynamic_df = predicted_combined_dynamic_df.reset_index(drop=True)

# combined_predicted_static = pd.concat([combined_static_1row, predicted_combined_dynamic_df], axis=1)

(20414, 49)
(20414, 21)


Unnamed: 0,id,charttime,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,...,first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),hour,icu_cat
0,20001305,1978-03-25 08:20:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,-1.131091,WHITE,...,-0.37144,-0.597757,2.040952,-0.08458,-0.053838,-0.147556,-0.423138,-0.369473,5.35,1
1,20001361,2043-05-04 17:24:00,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,0.884103,WHITE,...,-0.37144,-0.597757,2.040952,-0.08458,-0.053838,-0.147556,-0.423138,-0.369473,0.533333,2
2,20002506,2032-03-19 06:13:00,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,0.884103,UNKNOWN,...,-0.37144,-0.597757,-0.489967,-0.08458,-0.053838,6.777074,-0.423138,-0.369473,0.383333,2
3,20003425,2055-07-21 23:27:00,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,0.884103,WHITE,...,-0.37144,-0.597757,-0.489967,-0.08458,-0.053838,-0.147556,2.363295,-0.369473,-17.766667,1
4,20003491,1997-12-17 15:33:00,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,0.884103,HISPANIC/LATINO - GUATEMALAN,...,-0.37144,-0.597757,2.040952,-0.08458,-0.053838,-0.147556,-0.423138,-0.369473,-14.616667,1


In [68]:
predicted_combined_dynamic_df.head()

Unnamed: 0,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,...,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
0,-0.39499,0.207691,0.428759,2.305606,0.538075,-0.22985,-0.068217,0.613597,-0.023311,-0.268329,...,0.177416,0.291004,-0.009301,-0.383689,-0.21834,-0.240967,-0.485289,-0.287031,-0.300859,-0.078858
1,-0.012189,-0.043462,0.192521,-0.284227,0.466701,0.435962,-0.278784,0.540642,-0.315938,-0.102855,...,0.097828,0.255436,-0.051499,-0.243859,-0.096929,-0.34039,-0.241193,-0.194923,-0.214303,-0.204423
2,-0.360384,0.124683,-0.676888,0.301956,0.640028,-0.481478,-0.301781,0.684518,-0.397209,0.666661,...,0.141322,0.514289,-0.167305,-0.123784,0.661394,-0.951891,-0.271398,-0.335441,-0.349038,-0.217738
3,-0.644798,0.000615,-0.279942,0.178031,-0.168479,-0.328684,-0.247705,-0.703199,0.609102,-0.791099,...,-0.463774,-0.593994,-0.155309,0.010233,-0.691954,-0.141471,-0.153174,-0.270119,-0.283493,-0.231409
4,0.274095,-0.504555,0.504493,-0.507762,-0.367926,0.130588,-0.072209,-0.803737,1.343336,0.159621,...,0.022985,-0.047857,0.048202,-0.219825,0.128122,-0.008897,-0.196616,0.027592,0.011165,0.207145


In [65]:
# checking the dtypes of predicted_combined_dynamic_df columns

predicted_combined_dynamic_df.columns.dtype

dtype('O')

In [66]:
combined_static_1row.columns.dtype

dtype('O')

In [44]:
predicted_combined_dynamic_df.columns.unique().tolist()

# combined_processed_df.dtypes

[('aniongap',),
 ('bicarbonate',),
 ('bun',),
 ('calcium',),
 ('chloride',),
 ('creatinine',),
 ('glucose',),
 ('sodium',),
 ('potassium',),
 ('hematocrit',),
 ('hemoglobin',),
 ('mch',),
 ('mchc',),
 ('mcv',),
 ('platelet',),
 ('rbc',),
 ('rdw',),
 ('wbc',),
 ('inr',),
 ('pt',),
 ('ptt',)]

In [75]:
# Example list of column names as tuples
# column_names_tuples = [('aniongap',), ('bicarbonate',), ('bun',), ('calcium',), ('chloride',), ('creatinine',), ('glucose',), ('sodium',), ('potassium',), ('hematocrit',), ('hemoglobin',), ('mch',), ('mchc',), ('mcv',), ('platelet',), ('rbc',), ('rdw',), ('wbc',), ('inr',), ('pt',), ('ptt',)]

# Convert tuples to strings
column_names_strings = [col[0] for col in predicted_combined_dynamic_df.columns.unique()]

# Assuming predicted_combined_dynamic_df is your DataFrame
# Rename columns with strings derived from tuples
predicted_combined_dynamic_df.columns = column_names_strings

print(predicted_combined_dynamic_df.columns.unique().tolist())

['aniongap', 'bicarbonate', 'bun', 'calcium', 'chloride', 'creatinine', 'glucose', 'sodium', 'potassium', 'hematocrit', 'hemoglobin', 'mch', 'mchc', 'mcv', 'platelet', 'rbc', 'rdw', 'wbc', 'inr', 'pt', 'ptt']


In [77]:
# merging normalized static data and predicted 6th time point dynamic data into the same df

predicted_combined_dynamic_df = predicted_combined_dynamic_df.reset_index(drop=True)

combined_processed_df = pd.merge(combined_static_1row, predicted_combined_dynamic_df, left_index=True, right_index=True)
# combined_processed_df = pd.merge(predicted_combined_dynamic_df, combined_static_1row, left_index=True, right_index=True)

print(combined_processed_df.shape)
combined_processed_df.head()

(20414, 70)


Unnamed: 0,id,charttime,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,race,...,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
0,20001305,1978-03-25 08:20:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,-1.131091,WHITE,...,0.177416,0.291004,-0.009301,-0.383689,-0.21834,-0.240967,-0.485289,-0.287031,-0.300859,-0.078858
1,20001361,2043-05-04 17:24:00,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,0.884103,WHITE,...,0.097828,0.255436,-0.051499,-0.243859,-0.096929,-0.34039,-0.241193,-0.194923,-0.214303,-0.204423
2,20002506,2032-03-19 06:13:00,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,0.884103,UNKNOWN,...,0.141322,0.514289,-0.167305,-0.123784,0.661394,-0.951891,-0.271398,-0.335441,-0.349038,-0.217738
3,20003425,2055-07-21 23:27:00,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,0.884103,WHITE,...,-0.463774,-0.593994,-0.155309,0.010233,-0.691954,-0.141471,-0.153174,-0.270119,-0.283493,-0.231409
4,20003491,1997-12-17 15:33:00,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,0.884103,HISPANIC/LATINO - GUATEMALAN,...,0.022985,-0.047857,0.048202,-0.219825,0.128122,-0.008897,-0.196616,0.027592,0.011165,0.207145


# using predicted 6th time point and static data to classify icu_cat

In [78]:
train = combined_processed_df[combined_processed_df.id.isin(train_id_list)]
test = combined_processed_df[combined_processed_df.id.isin(test_id_list)]
holdout = combined_processed_df[combined_processed_df.id.isin(holdout_id_list)]

print(train.shape)
print(test.shape)
print(holdout.shape)

(14289, 70)
(4083, 70)
(2042, 70)


In [79]:
columns_to_drop = ['id',
                   'charttime',
                   'hour',
                   'icu_cat',
                   'hosp_admittime',
                   'hosp_dischtime',
                   'icu_intime',
                   'icu_outtime',
                   'los_icu',
                   'icu_death',
                   'race'
                  ]

X_train = train.drop(columns=columns_to_drop)
y_train_cat = train.icu_cat
y_train_reg = train.los_icu

X_test = test.drop(columns=columns_to_drop)
y_test_cat = test.icu_cat
y_test_reg = test.los_icu

X_holdout = holdout.drop(columns=columns_to_drop)
y_holdout_cat = holdout.icu_cat
y_holdout_reg = holdout.los_icu

print(X_train.shape)
print(X_test.shape)
print(X_holdout.shape)
print(y_train_cat.shape)
print(y_test_cat.shape)
print(y_holdout_cat.shape)
print(y_train_reg.shape)
print(y_test_reg.shape)
print(y_holdout_reg.shape)

(14289, 59)
(4083, 59)
(2042, 59)
(14289,)
(4083,)
(2042,)
(14289,)
(4083,)
(2042,)


In [80]:
y_test_cat.head()

14289    1
14290    1
14291    0
14292    0
14293    1
Name: icu_cat, dtype: int64

In [81]:
y_test_reg.head()

14289    2.87
14290    3.32
14291    2.42
14292    2.43
14293    2.60
Name: los_icu, dtype: float64

# Random Forest classifier

In [82]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

In [83]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(
    max_depth=7, 
    min_samples_split=10,
    n_estimators=100,
    random_state=42, 
)
random_forest.fit(X_train, y_train_cat)

In [84]:
y_train_pred = random_forest.predict(X_train)
y_test_pred = random_forest.predict(X_test)

In [86]:
print("Train Accuracy:\n", classification_report(y_train_cat, y_train_pred))
print("\n\nTest Accuracy:\n", classification_report(y_test_cat, y_test_pred))

Train Accuracy:
               precision    recall  f1-score   support

           0       0.49      0.97      0.65      6248
           1       1.00      0.02      0.04      4198
           2       0.69      0.35      0.46      3843

    accuracy                           0.52     14289
   macro avg       0.73      0.45      0.38     14289
weighted avg       0.69      0.52      0.42     14289



Test Accuracy:
               precision    recall  f1-score   support

           0       0.47      0.94      0.63      1794
           1       0.12      0.00      0.00      1150
           2       0.53      0.23      0.32      1139

    accuracy                           0.48      4083
   macro avg       0.37      0.39      0.32      4083
weighted avg       0.39      0.48      0.36      4083



In [87]:
print(confusion_matrix(y_test_cat, y_test_pred))

[[1682    1  111]
 [1027    1  122]
 [ 871    6  262]]


# Random Forest Regression

In [100]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=8, min_samples_split=10, random_state=26)

rf_regressor.fit(X_train, y_train_reg)

In [101]:
# Calculating MSE for Random Forest Regressor

y_train_pred = rf_regressor.predict(X_train)
mse_train_rfregressor = mean_squared_error(y_train_reg, y_train_pred)
print("Ensemble LSTM + Random Forest regressor train MSE:", mse_train_rfregressor)

y_test_pred = rf_regressor.predict(X_test)
mse_test_rfregressor = mean_squared_error(y_test_reg, y_test_pred)
print("Ensemble LSTM + Random Forest regressor test MSE:", mse_test_rfregressor)

y_holdout_pred = rf_regressor.predict(X_holdout)
mse_holdout_rfregressor = mean_squared_error(y_holdout_reg, y_holdout_pred)
print("Ensemble LSTM + Random Forest regressor holdout MSE:", mse_holdout_rfregressor)

Ensemble LSTM + Random Forest regressor train MSE: 26.90076184325122
Ensemble LSTM + Random Forest regressor test MSE: 33.51580100891895
Ensemble LSTM + Random Forest regressor holdout MSE: 28.124747201178657


# Deep Neural Network Classification

In [88]:
import torch
import torch.nn as nn
import torch.optim as optim

In [89]:
X_train_tensor = torch.tensor(X_train.to_numpy(),dtype=torch.float32)
m,n = X_train_tensor.shape
y_train_cat_tensor = torch.tensor(y_train_cat.to_numpy(),dtype=torch.long).reshape(m).squeeze()

X_test_tensor = torch.tensor(X_test.to_numpy(),dtype=torch.float32)
m,n = X_test_tensor.shape
y_test_cat_tensor = torch.tensor(y_test_cat.to_numpy(),dtype=torch.long).reshape(m).squeeze()

X_holdout_tensor = torch.tensor(X_holdout.to_numpy(),dtype=torch.float32)
m,n = X_holdout_tensor.shape
y_holdout_cat_tensor = torch.tensor(y_holdout_cat.to_numpy(),dtype=torch.long).reshape(m).squeeze()

In [90]:
print(X_train_tensor.shape)

print(y_train_cat_tensor.shape)

torch.Size([14289, 59])
torch.Size([14289])


In [91]:
class NN_Classifier(nn.Module):
    def __init__(self, input_size, output_size, dropout_prob): # set the arguments you'd need, including activation function
        super(NN_Classifier, self).__init__()
        self.input_size = input_size
        self.output_size = output_size

        self.layer1 = nn.Linear(self.input_size, 512)
        self.hidden2 = nn.Linear(512, 512)
        self.hidden3 = nn.Linear(512, 128)
        # self.hidden4 = nn.Linear(128, 64)
        # self.hidden5 = nn.Linear(64, 32)
        #self.hidden6 = nn.Linear(16, 8)
        self.output4 = nn.Linear(128, output_size)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.hidden2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.hidden3(x)
        x = self.relu(x)
        x = self.dropout(x)

        # x = self.hidden4(x)
        # x = self.relu(x)
        # x = self.dropout(x)

        # x = self.hidden5(x)
        # x = self.relu(x)
        # x = self.dropout(x)

        # x = self.hidden6(x)
        # x = self.relu(x)
        # x = self.dropout(x)

        x = self.output4(x)
        x = self.softmax(x)

        return x


def train_model(x_train, y_train, epochs=1000):
    model = NN_Classifier(59, 3, 0.3)
    optimiser = optim.Adam(model.parameters(), lr=0.005)
    loss_fn = nn.CrossEntropyLoss() 

    for i in range(epochs):
        # reset gradients to 0
        optimiser.zero_grad()

        # get predictions
        y_pred = model.forward(x_train)
        
        # compute loss (uncomment the next line and fill right hand side)
        abs_loss = loss_fn(y_pred, y_train) 

        # backpropagate
        abs_loss.backward()
    
        # update the model weights
        optimiser.step()
        
        print (f"{i:5d}", abs_loss.item(), sep='\t')
        
    return model
                
NNClassifier = train_model(X_train_tensor, y_train_cat_tensor)

    0	1.0957326889038086
    1	1.0986990928649902
    2	1.073049545288086
    3	1.07496976852417
    4	1.067990779876709
    5	1.061991810798645
    6	1.0620737075805664
    7	1.0577560663223267
    8	1.055496335029602
    9	1.0544182062149048
   10	1.0518580675125122
   11	1.0495094060897827
   12	1.0480698347091675
   13	1.0464664697647095
   14	1.0437901020050049
   15	1.0419929027557373
   16	1.0393742322921753
   17	1.0368907451629639
   18	1.0336835384368896
   19	1.0325275659561157
   20	1.0299570560455322
   21	1.0264873504638672
   22	1.0235618352890015
   23	1.021468162536621
   24	1.0192124843597412
   25	1.0152597427368164
   26	1.0152653455734253
   27	1.0114874839782715
   28	1.0099823474884033
   29	1.0076594352722168
   30	1.0047214031219482
   31	1.0032435655593872
   32	1.001099705696106
   33	0.999060869216919
   34	0.9972102642059326
   35	0.9935401678085327
   36	0.9922860264778137
   37	0.989987850189209
   38	0.986903190612793
   39	0.9852657318115234
   40	0.982

In [92]:
with torch.no_grad():
    y_test_pred = NNClassifier(X_test_tensor)


y_test_pred_NN = torch.argmax(y_test_pred, dim=1)

with torch.no_grad():
    y_holdout_pred = NNClassifier(X_holdout_tensor)

y_holdout_pred_NN = torch.argmax(y_holdout_pred, dim=1)

print("\n\nNN Test Accuracy:\n", classification_report(y_test_cat_tensor, y_test_pred_NN, digits=4))
print("\n\nNN Holdout Accuracy:\n", classification_report(y_holdout_cat_tensor, y_holdout_pred_NN, digits=4))



NN Test Accuracy:
               precision    recall  f1-score   support

           0     0.5111    0.5624    0.5356      1794
           1     0.2834    0.2783    0.2808      1150
           2     0.4306    0.3705    0.3983      1139

    accuracy                         0.4289      4083
   macro avg     0.4084    0.4037    0.4049      4083
weighted avg     0.4245    0.4289    0.4255      4083



NN Holdout Accuracy:
               precision    recall  f1-score   support

           0     0.5258    0.6002    0.5605       883
           1     0.3178    0.2769    0.2959       614
           2     0.4188    0.3835    0.4004       545

    accuracy                         0.4452      2042
   macro avg     0.4208    0.4202    0.4189      2042
weighted avg     0.4347    0.4452    0.4382      2042



In [93]:
conf_matrix_test = confusion_matrix(y_test_cat_tensor, y_test_pred_NN)

conf_matrix_test_df = pd.DataFrame(conf_matrix_test, columns=['Predicted_0', 'Predicted_1', 'Predicted_2'],
                               index=['True_0', 'True_1', 'True_2'])

conf_matrix_holdout = confusion_matrix(y_holdout_cat_tensor, y_holdout_pred_NN)

conf_matrix_holdout_df = pd.DataFrame(conf_matrix_holdout, columns=['Predicted_0', 'Predicted_1', 'Predicted_2'],
                               index=['True_0', 'True_1', 'True_2'])


print("NN Test Confusion Matrix:")
print(conf_matrix_test_df)
print('\n\n')
print("NN Holdout Confusion Matrix:")
print(conf_matrix_holdout_df)

NN Test Confusion Matrix:
        Predicted_0  Predicted_1  Predicted_2
True_0         1009          486          299
True_1          571          320          259
True_2          394          323          422



NN Holdout Confusion Matrix:
        Predicted_0  Predicted_1  Predicted_2
True_0          530          221          132
True_1          286          170          158
True_2          192          144          209


# Deep Neural Network Regression

In [96]:
X_train_tensor = torch.tensor(X_train.to_numpy(),dtype=torch.float32)
m,n = X_train_tensor.shape
y_train_reg_tensor = torch.tensor(y_train_reg.to_numpy(),dtype=torch.float32).reshape(m,1)

X_test_tensor = torch.tensor(X_test.to_numpy(),dtype=torch.float32)
m,n = X_test_tensor.shape
y_test_reg_tensor = torch.tensor(y_test_reg.to_numpy(),dtype=torch.float32).reshape(m,1)

X_holdout_tensor = torch.tensor(X_holdout.to_numpy(),dtype=torch.float32)
m,n = X_holdout_tensor.shape
y_holdout_reg_tensor = torch.tensor(y_holdout_reg.to_numpy(),dtype=torch.float32).reshape(m,1)

In [97]:
class NN_Regressor(nn.Module):
    def __init__(self, input_size, output_size, dropout_prob): # set the arguments you'd need, including activation function
        super(NN_Regressor, self).__init__()
        self.input_size = input_size
        self.output_size = output_size

        self.layer1 = nn.Linear(self.input_size, 128)
        self.hidden2 = nn.Linear(128, 128)
        self.hidden3 = nn.Linear(128, 32)
        # self.hidden4 = nn.Linear(32, 16)
        # self.hidden5 = nn.Linear(16, 16)
        # self.hidden6 = nn.Linear(16, 8)
        self.output7 = nn.Linear(32, output_size)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.hidden2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.hidden3(x)
        x = self.relu(x)
        x = self.dropout(x)

        # x = self.hidden4(x)
        # x = self.relu(x)
        # x = self.dropout(x)

        # x = self.hidden5(x)
        # x = self.relu(x)
        # x = self.dropout(x)

        # x = self.hidden6(x)
        # x = self.relu(x)
        # x = self.dropout(x)

        x = self.output7(x)
        # x = self.relu(x)

        return x


def train_model(x_train, y_train, epochs=400):
    model = NN_Regressor(59, 1, 0.3)
    optimiser = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss() 

    for i in range(epochs):
        # reset gradients to 0
        optimiser.zero_grad()

        # get predictions
        y_pred = model.forward(x_train)
        
        # compute loss (uncomment the next line and fill right hand side)
        abs_loss = loss_fn(y_pred, y_train) 

        # backpropagate
        abs_loss.backward()
    
        # update the model weights
        optimiser.step()
        
        print (f"{i:5d}", abs_loss.item(), sep='\t')
        
    return model
                

NNRegressor = train_model(X_train_tensor, y_train_reg_tensor)

    0	61.961788177490234
    1	61.53815460205078
    2	61.151371002197266
    3	60.75511169433594
    4	60.35018539428711
    5	59.9146728515625
    6	59.4320068359375
    7	58.87564468383789
    8	58.23600769042969
    9	57.48786163330078
   10	56.68841552734375
   11	55.821224212646484
   12	54.8539924621582
   13	53.78291702270508
   14	52.62723159790039
   15	51.4119873046875
   16	50.10017395019531
   17	48.76298141479492
   18	47.24089813232422
   19	45.78483963012695
   20	44.381744384765625
   21	42.936912536621094
   22	41.38352584838867
   23	40.397796630859375
   24	39.44026184082031
   25	39.01725387573242
   26	38.666568756103516
   27	38.573699951171875
   28	39.03358840942383
   29	39.400428771972656
   30	39.608707427978516
   31	39.774757385253906
   32	39.42011642456055
   33	39.05213928222656
   34	38.65628433227539
   35	38.25012969970703
   36	38.087242126464844
   37	37.52791213989258
   38	37.23508071899414
   39	37.271690368652344
   40	37.14517593383789
   41	3

In [99]:
# checking MSE on testing set
from sklearn.metrics import mean_squared_error

NNRegressor.eval()
with torch.no_grad():
    y_test_pred = NNRegressor(X_test_tensor)

mse_test_ensemble_NNregressor = mean_squared_error(y_test_reg_tensor, y_test_pred)
print("Ensemble LSTM + NN regressor test MSE:", mse_test_ensemble_NNregressor)



with torch.no_grad():
    y_holdout_pred = NNRegressor(X_holdout_tensor)

mse_holdout_ensemble_NNregressor = mean_squared_error(y_holdout_reg_tensor, y_holdout_pred)
print("Ensemble LSTM + NN regressor test MSE:", mse_holdout_ensemble_NNregressor)

Ensemble LSTM + NN regressor test MSE: 33.57175
Ensemble LSTM + NN regressor test MSE: 28.148096
