In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from scipy.ndimage.interpolation import shift
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
cols = ['Time', 'stay_id', 'stay_key', 'hadm_id', 'age', 'gender', 'Heart Rate',
       'Respiratory Rate', 'SpO2/SaO2', 'pH', 'Potassium', 'Calcium',
       'Glucose', 'Sodium', 'HCO3', 'White Blood Cells', 'Hemoglobin',
       'Red Blood Cells', 'Platelet Count', 'Weight', 'Urea Nitrogen',
       'Creatinine', 'Blood Pressure', '1 hours urine output',
       '6 hours urine output', 'AKI', 'gcs',
       'ventilation', 'vasoactive medications', 'sedative medications']
features = ['time_since', 'age', 'gender', 'Heart Rate',
       'Respiratory Rate', 'SpO2/SaO2', 'pH', 'Potassium', 'Calcium',
       'Glucose', 'Sodium', 'HCO3', 'White Blood Cells', 'Hemoglobin',
       'Red Blood Cells', 'Platelet Count', 'Weight', 'Urea Nitrogen',
       'Creatinine', 'Blood Pressure', '1 hours urine output',
       '6 hours urine output', 'gcs',
       'ventilation', 'vasoactive medications', 'sedative medications']

In [3]:
class AKIDataset(Dataset):

    def __init__(self, csv_file):
        self.dataframe = pd.read_csv(csv_file)
        self.dataframe['stay_key'] = self.dataframe['stay_id']
        self.dataframe = self.dataframe.groupby('stay_id')[cols].ffill().bfill()
        in_time = self.dataframe.groupby('stay_key')[['Time']].first()
        self.dataframe = pd.merge(self.dataframe, in_time, left_on=['stay_key'], right_index=True, how='left')
        self.dataframe['time_since'] = (pd.to_datetime(self.dataframe['Time_x']) - pd.to_datetime(self.dataframe['Time_y'])) / np.timedelta64(1, 'h')
        self.stay_ids = self.dataframe.stay_key.unique()

    def __len__(self):
        return len(self.stay_ids)

    def __getitem__(self, idx):
        
        data = self.dataframe[self.dataframe.stay_key == self.stay_ids[idx]][features].to_numpy()
        label = self.dataframe[self.dataframe.stay_key == self.stay_ids[idx]]['AKI'].to_numpy()
        
        return data, label

In [4]:
ds = AKIDataset('C:/Users/Kevin/Desktop/SPH6004 Assignment 2/assignment2_data/mimiciv_aki/time_series.csv')

In [5]:
## Notebook is currently set to only include the first 1000 unique stay_ids. Set to "len(ds)" to include all 39,742 stay IDs. Very slow though!!!
n_stay_id = 1000

In [6]:
## Han's function returns a tuple. Separate the tuple into x (features) and y (outcomes).
## x is a list "n_stay_id"s long. Each entry in the list contains a 2D array of shape (timesteps, features).
## y is a list "n_stay_id"s long. Each entry in the list contains a 1D array of shape (timesteps)

x = []
y = []
for i in range(n_stay_id):
    temp_x, temp_y = ds[i]
    x.append(temp_x)
    y.append(temp_y)

In [7]:
## Extract the number of timesteps in x for each stay_id.

x_timesteps = np.zeros(shape=(n_stay_id))

for i in range(n_stay_id):
    x_timesteps[i] = x[i].shape[0]
     
print(x_timesteps.shape)    
print('The maximum number of timesteps in the dataset is', x_timesteps.max(), 'steps.')

(1000,)
The maximum number of timesteps in the dataset is 687.0 steps.


In [8]:
## Manually pad the arrays for each stay_id in x and y to the maximum number of timesteps in the dataset. Pads with 0's.
## Returns x_padded and y_padded.

max_timesteps = int(x_timesteps.max())

x_padded = []
for i in range(n_stay_id):
    temp_padded = np.pad(x[i], pad_width=((0, (max_timesteps-x[i].shape[0])), (0, 0)), mode='constant', constant_values=0)
    x_padded.append(temp_padded)
    
y_padded = []
for i in range(n_stay_id):
    temp_padded = np.pad(y[i], pad_width=(0, max_timesteps-y[i].shape[0]), mode='constant', constant_values=0)
    y_padded.append(temp_padded)

In [9]:
## Shift y-values (AKI status) one step backward. So that the features at time t predict the AKI values at time t+1.

y_padded_shifted = []
for i in range(n_stay_id):
    next_y = shift(y_padded[i], -1, mode='constant', cval=0)
    y_padded_shifted.append(next_y)

In [10]:
print(y_padded[3].shape)
print(y_padded_shifted[3].shape)

(687,)
(687,)


In [11]:
## Convert the x and y lists into arrays to feed into the LSTM.
## X is a 3D array of shape (n_stay_id, timesteps, features)
## Y is a 2D array of shape (n_stay_id, timesteps)

X = np.asarray(x_padded)
Y = np.asarray(y_padded_shifted)

print(X.shape)
print(Y.shape)

(1000, 687, 26)
(1000, 687)


In [12]:
## Create a small neural network with an LSTM layer, that feeds into 2 fully-connected hidden layers. Output is a sigmoid activation function representing the probably of AKI at time t.

X_inputs = keras.Input(shape=(X.shape[1], X.shape[2]))

X_next = layers.LSTM(512, return_sequences=True)(X_inputs)
X_next = layers.TimeDistributed(layers.Dense(256, activation='relu', kernel_regularizer='l2'))(X_next)
X_next = layers.TimeDistributed(layers.Dense(64, activation='relu', kernel_regularizer='l2'))(X_next)                                
output = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'))(X_next)  

model = keras.Model(inputs=X_inputs, outputs=output)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 687, 26)]         0         
_________________________________________________________________
lstm (LSTM)                  (None, 687, 512)          1103872   
_________________________________________________________________
time_distributed (TimeDistri (None, 687, 256)          131328    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 687, 64)           16448     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 687, 1)            65        
Total params: 1,251,713
Trainable params: 1,251,713
Non-trainable params: 0
_________________________________________________________________


In [13]:
## Define optimizer, loss function, and evaluation metrics.

adam = keras.optimizers.Adam()

binarycrossentropy = keras.losses.BinaryCrossentropy()

binary_accuracy = keras.metrics.BinaryAccuracy()
AUC = keras.metrics.AUC() # Need AUC because accuracy is a poor measure in this case.

model.compile(optimizer=adam, loss=binarycrossentropy, metrics=[binary_accuracy, AUC])

In [14]:
## Fit and evaluate model with cross-validated metrics.

model.fit(x=X, y=Y, epochs=10, validation_split=0.2, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x24bd378a8e0>