# BIOSTAT707 Group Project

RNN/LSTM Model

Developed by: Jackson Dial and Caitlyn Nguyen

The following code is adapted from:
https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/

In [1]:
# Library import
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras import optimizers

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Set Seed
tf.random.set_random_seed(1)

In [4]:
# All training dataset
all_train_df = pd.read_csv("train.csv").iloc[: , 1:]
all_train_df.head(5)

Unnamed: 0,subjid,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,...,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,height_cleaned,MechVent_cleaned,in_hosp_death
0,132547,06:36,,45.0,47.0,64,,15.0,,212.0,...,35.8,1.3,,140.0,,114.0,7.29,180.3,1,0
1,132547,06:38,,45.0,47.0,64,,15.0,,212.0,...,35.8,1.3,,140.0,,114.0,7.29,180.3,1,0
2,132547,06:53,,45.0,47.0,64,,15.0,,212.0,...,35.8,1.3,,400.0,,114.0,7.29,180.3,1,0
3,132547,07:23,,45.0,47.0,64,,15.0,,212.0,...,35.8,1.3,,400.0,24.0,114.0,7.29,180.3,1,0
4,132547,07:53,,45.0,47.0,64,,15.0,,212.0,...,35.8,1.3,,400.0,24.0,114.0,7.29,180.3,1,0


In [5]:
# Testing dataset
test_df = pd.read_csv("test.csv").iloc[: , 1:]
test_df.head(5)

Unnamed: 0,subjid,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,...,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,height_cleaned,MechVent_cleaned,in_hosp_death
0,132540,01:11,,,,76,,,,,...,35.2,,,,7.4,76.0,7.45,175.3,1,0
1,132540,01:26,,,,76,,,,,...,35.1,,,770.0,7.4,76.0,7.45,175.3,1,0
2,132540,01:27,,,,76,,,,,...,35.1,,,0.0,7.4,76.0,7.45,175.3,1,0
3,132540,01:31,,,,76,,,,,...,34.8,,,0.0,7.4,76.0,7.45,175.3,1,0
4,132540,01:38,,,,76,,,,,...,34.8,,,0.0,7.4,76.0,7.44,175.3,1,0


In [6]:
# Concat df
df = pd.concat([all_train_df, test_df])
len(df["subjid"].unique())

1815

In [61]:
# Scale continuous vars
nonscaled_cols = ["subjid", "Time", "Gender", "ICUType", "in_hosp_death", "MechVent_cleaned"]
all_cols = list(df.columns.values)
scaled_cols = set(all_cols) - set(nonscaled_cols)
clean_df = df.drop(nonscaled_cols, axis = 1)
# Normalize the dataset
scaled_df = (clean_df-clean_df.min())/(clean_df.max()-clean_df.min())
clean_df = pd.concat((df[nonscaled_cols], scaled_df), 1)
# Change dtypes
clean_df = clean_df.fillna(-1)
clean_df ["in_hosp_death"] = clean_df["in_hosp_death"].astype('category')
clean_df["Gender"] = clean_df["Gender"].astype('category')
clean_df["ICUType"] = clean_df["ICUType"].astype('category')
clean_df["MechVent_cleaned"] = clean_df["MechVent_cleaned"].astype('category')
clean_df.dtypes

Unnamed: 0,subjid,Time,Gender,ICUType,in_hosp_death,MechVent_cleaned,ALP,ALT,AST,Age,...,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,height_cleaned
0,132547,06:36,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.528169,0.37931,0.02045,-1.0,0.012727,-1.0,0.449287,0.512195,0.696897
1,132547,06:38,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.535211,0.37931,0.02045,-1.0,0.012727,-1.0,0.449287,0.512195,0.696897
2,132547,06:53,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.426056,0.37931,0.02045,-1.0,0.036364,-1.0,0.449287,0.512195,0.696897
3,132547,07:23,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.426056,0.37931,0.02045,-1.0,0.036364,0.153698,0.449287,0.512195,0.696897
4,132547,07:53,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.422535,0.37931,0.02045,-1.0,0.036364,0.153698,0.449287,0.512195,0.696897


In [62]:
# Get train dataset
new_train = clean_df[clean_df["subjid"].isin(all_train_df["subjid"].unique())]
len(new_train["subjid"].unique())

1270

In [63]:
# Get test dataset
new_test = clean_df[clean_df["subjid"].isin(test_df["subjid"].unique())]
len(new_test["subjid"].unique())

545

In [64]:
# Validation dataset
all_train_ids = new_train["subjid"].unique()
n = len(all_train_ids)
n_85 = math.ceil(n*0.85)
train_ids = random.sample(set(all_train_ids), n_85)
val_ids = set(all_train_ids) - set(train_ids)

val_df = new_train[new_train["subjid"].isin(val_ids)]
print(len(val_df["subjid"].unique()))
val_df.head(5)

190


Unnamed: 0,subjid,Time,Gender,ICUType,in_hosp_death,MechVent_cleaned,ALP,ALT,AST,Age,...,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,height_cleaned
58,132551,00:38,0,3,1,0,-1.0,-1.0,-1.0,0.835616,...,-1.0,0.373239,0.632184,-1.0,-1.0,-1.0,-1.0,0.189382,0.646341,0.48568
59,132551,00:40,0,3,1,0,-1.0,-1.0,-1.0,0.835616,...,-1.0,0.366197,0.632184,-1.0,-1.0,-1.0,-1.0,0.189382,0.646341,0.48568
60,132551,00:50,0,3,1,0,0.023458,0.004298,0.004785,0.835616,...,-1.0,0.366197,0.632184,0.06544,-1.0,-1.0,0.102894,0.189382,0.646341,0.48568
61,132551,01:10,0,3,1,0,0.023458,0.004298,0.004785,0.835616,...,-1.0,0.387324,0.632184,0.06544,-1.0,-1.0,0.102894,0.189382,0.646341,0.48568
62,132551,01:40,0,3,1,0,0.023458,0.004298,0.004785,0.835616,...,-1.0,0.419014,0.632184,0.06544,-1.0,0.010909,0.102894,0.189382,0.646341,0.48568


In [65]:
train_df = new_train[new_train["subjid"].isin(train_ids)]
print(len(train_df["subjid"].unique()))
train_df.head(5)

1080


Unnamed: 0,subjid,Time,Gender,ICUType,in_hosp_death,MechVent_cleaned,ALP,ALT,AST,Age,...,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,height_cleaned
0,132547,06:36,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.528169,0.37931,0.02045,-1.0,0.012727,-1.0,0.449287,0.512195,0.696897
1,132547,06:38,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.535211,0.37931,0.02045,-1.0,0.012727,-1.0,0.449287,0.512195,0.696897
2,132547,06:53,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.426056,0.37931,0.02045,-1.0,0.036364,-1.0,0.449287,0.512195,0.696897
3,132547,07:23,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.426056,0.37931,0.02045,-1.0,0.036364,0.153698,0.449287,0.512195,0.696897
4,132547,07:53,1,1,0,1,-1.0,0.004203,0.002552,0.643836,...,-1.0,0.422535,0.37931,0.02045,-1.0,0.036364,0.153698,0.449287,0.512195,0.696897


In [103]:
# Dataset matrix
def create_dataset(dataset, num_obs):
    subjids = dataset["subjid"].unique()
    data = []
    line = np.repeat(-1, 42)
    for subjid in subjids:
        ds = dataset[dataset["subjid"] == subjid].iloc[:, 2:]
        while len(ds) < num_obs:
            ds.loc[len(ds)] = line
        data.append(np.array(ds))
    return data

In [104]:
n_obs = max(clean_df.groupby('subjid').count()["Time"])
train_df_x = train_df.copy().drop("in_hosp_death", axis = 1)
train_x = create_dataset(train_df_x, n_obs)
train_x = tf.stack(train_x)
train_x

<tf.Tensor 'stack_3:0' shape=(1080, 161, 42) dtype=float64>

In [119]:
train_y = train_df.groupby('subjid')["in_hosp_death"].first()
train_y = train_y.astype('category')
train_y = tf.stack(train_y)
train_y

<tf.Tensor 'stack_1:0' shape=(1080,) dtype=int64>

In [121]:
# Create the RNN
tf.keras.backend.clear_session()

model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_obs, 42)))
model.add(Dense(1))

model.compile(optimizer= "Adam", loss = tf.keras.losses.BinaryCrossentropy(from_logits=False))

In [122]:
model.fit(train_x, train_y, epochs=100, verbose=1, steps_per_epoch = 50)

Epoch 1/100


InvalidArgumentError: Requested tensor connection from unknown node: "lstm_input:0".

In [None]:
model.evaluate(train_x)



0.0

In [124]:
pip install tensorflow --upgrade

^C
Note: you may need to restart the kernel to use updated packages.
