In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
from torch.utils import data
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from torchtext import data, datasets

from google.colab import auth

device = "cuda" if torch.cuda.is_available() else "cpu"

# Read in data into relevant DF's, remove all but the first admission

In [0]:
# Recursively mirror the PIC database (v1.0.0) from PhysioNet into ./physionet.org/; wget prompts for the password.
# NOTE(review): the PhysioNet username is hard-coded below -- consider parameterizing it before sharing the notebook.
!wget -r -N -c -np --user kyleliu --ask-password https://physionet.org/files/picdb/1.0.0/

In [0]:
# Load every PIC table into its own DataFrame.

def _read_gz_csv(path):
  """Read one gzip-compressed CSV file into a DataFrame."""
  return pd.read_csv(path, compression='gzip')

# Admission-level and event tables.
admissions = _read_gz_csv('physionet.org/files/picdb/1.0.0/ADMISSIONS.csv.gz')
chartevents = _read_gz_csv('physionet.org/files/picdb/1.0.0/CHARTEVENTS.csv.gz')
diagnoses_icd = _read_gz_csv('physionet.org/files/picdb/1.0.0/DIAGNOSES_ICD.csv.gz')

# Dictionary ("D_") tables that map codes to labels.
d_icd_diagnoses = _read_gz_csv('physionet.org/files/picdb/1.0.0/D_ICD_DIAGNOSES.csv.gz')
d_items = _read_gz_csv('physionet.org/files/picdb/1.0.0/D_ITEMS.csv.gz')
d_labitems = _read_gz_csv('physionet.org/files/picdb/1.0.0/D_LABITEMS.csv.gz')

# Remaining clinical tables.
emr_symptoms = _read_gz_csv('physionet.org/files/picdb/1.0.0/EMR_SYMPTOMS.csv.gz')
icu_stays = _read_gz_csv('physionet.org/files/picdb/1.0.0/ICUSTAYS.csv.gz')
input_events = _read_gz_csv('physionet.org/files/picdb/1.0.0/INPUTEVENTS.csv.gz')
lab_events = _read_gz_csv('physionet.org/files/picdb/1.0.0/LABEVENTS.csv.gz')
patients = _read_gz_csv('physionet.org/files/picdb/1.0.0/PATIENTS.csv.gz')
prescriptions = _read_gz_csv('physionet.org/files/picdb/1.0.0/PRESCRIPTIONS.csv.gz')
surgery_vital_signs = _read_gz_csv('physionet.org/files/picdb/1.0.0/SURGERY_VITAL_SIGNS.csv.gz')
microbiology_events = _read_gz_csv('physionet.org/files/picdb/1.0.0/MICROBIOLOGYEVENTS.csv.gz')

In [0]:
# Convenience lookups built straight from the dictionary tables.

# chart ITEMID -> human-readable label
item_dict = dict(zip(d_items.ITEMID, d_items.LABEL))

# lab ITEMID -> human-readable label
lab_item_dict = dict(zip(d_labitems.ITEMID, d_labitems.LABEL))

# Chinese ICD-10 code -> ICD-10 code
ICD_CN_TO_ICD = dict(zip(d_icd_diagnoses.ICD10_CODE_CN, d_icd_diagnoses.ICD10_CODE))


Here we include only the first admission of each patient.

In [0]:
# Clean: keep only each patient's earliest admission.

# Order every time-stamped table chronologically.
admissions = admissions.sort_values(by='ADMITTIME')
chartevents = chartevents.sort_values(by='CHARTTIME')
lab_events = lab_events.sort_values(by='CHARTTIME')

# After sorting, the first row seen for a SUBJECT_ID is that patient's first admission.
_first_admissions = admissions.drop_duplicates(subset='SUBJECT_ID', keep='first')
admits_to_keep = _first_admissions.HADM_ID.tolist()
seen_patients = set(_first_admissions.SUBJECT_ID)

In [0]:
def remove_admits(df):
  """Return the rows of `df` whose HADM_ID is one of the retained admissions."""
  is_kept = df['HADM_ID'].isin(admits_to_keep)
  return df[is_kept]

# Restrict every admission-keyed table to the retained admissions.
(admissions, chartevents, diagnoses_icd,
 emr_symptoms, icu_stays, input_events,
 lab_events, prescriptions, surgery_vital_signs) = [
    remove_admits(table)
    for table in (admissions, chartevents, diagnoses_icd,
                  emr_symptoms, icu_stays, input_events,
                  lab_events, prescriptions, surgery_vital_signs)
]


# Helper Functions for Date/Times. Align by first admit time (hours)

In [0]:
from datetime import date, timedelta, time, datetime

def to_datetime(x):
  """Parse a 'YYYY-MM-DD HH:MM:SS' string into a `datetime`.

  The string is split on whitespace; the first field is read as the
  dash-separated date and the second as the colon-separated time.
  """
  fields = x.split()
  ymd = fields[0].split("-")
  hms = fields[1].split(":")

  # Year, month, day, hour, minute, second -- converted in that order.
  numbers = [int(ymd[k]) for k in range(3)] + [int(hms[k]) for k in range(3)]
  return datetime(*numbers)

# SUBJECT_ID -> date of birth
birth_date = {subject: to_datetime(dob)
              for subject, dob in zip(patients.SUBJECT_ID, patients.DOB)}

# SUBJECT_ID -> admission time, and SUBJECT_ID -> age (a timedelta) at that admission
admit_date = dict()
age_at_admission = dict()
for subject, admit_str in zip(admissions.SUBJECT_ID, admissions.ADMITTIME):
  admitted_at = to_datetime(admit_str)
  admit_date[subject] = admitted_at
  age_at_admission[subject] = admitted_at - birth_date[subject]

In [0]:
# Time since admission (hours)

def isFloat(string):
    """Return True if `float(string)` succeeds, False if it raises ValueError."""
    try:
        float(string)
    except ValueError:
        return False
    return True

def normalize_time(patient_id, x):
  """Hours elapsed between the patient's admission and timestamp string `x`.

  Anything that `isFloat` accepts (e.g. a NaN in place of a timestamp) is
  not a timestamp string, so the sentinel 100000 is returned instead.
  """
  if not isFloat(x):
    elapsed = to_datetime(x) - admit_date[patient_id]
    return elapsed.total_seconds() / 3600.0
  return 100000

In [0]:
# Every SUBJECT_ID in the patients table, in table order.
patient_set = list(patients.SUBJECT_ID)

In [0]:
def _hours_since_admission(events, time_col):
  """Per-row hours since admission, computed from the `time_col` timestamps."""
  return [normalize_time(subject, stamp)
          for subject, stamp in zip(events['SUBJECT_ID'], events[time_col])]

# Add the admission-relative time (in hours) to each event table.
chartevents['HOURS_IN'] = _hours_since_admission(chartevents, 'CHARTTIME')
lab_events['HOURS_IN'] = _hours_since_admission(lab_events, 'CHARTTIME')
surgery_vital_signs['HOURS_IN'] = _hours_since_admission(surgery_vital_signs, 'MONITORTIME')

# Feature Specification

Here we do our feature selection. Right now I have something very simple: most common chart, lab events, as well as age/gender (usually concatenated to the other two at each time step)

In [0]:
import math 
## Feature Set

## Chart features: item ids 1001-1016, with 1005 left out
chart_feats = [item_id for item_id in range(1001, 1017) if item_id != 1005]

## Surgery vital signs: every ITEMID in the table, most frequent first
surgery_feats = surgery_vital_signs['ITEMID'].value_counts().index.tolist()

## Lab features
lab_feats = [5225, 5097, 5141, 5129, 5257, 5132]



We use these to index into the tensors that follow (i.e. `chart_X[patient_index_of[subject_id]]` is what you want, not `chart_X[subject_id]`). The same applies to item ids.

In [0]:
# Position of each feature id inside its feature list.
chart_index_of = {item_id: pos for pos, item_id in enumerate(chart_feats)}
lab_index_of = {item_id: pos for pos, item_id in enumerate(lab_feats)}
surgery_index_of = {item_id: pos for pos, item_id in enumerate(surgery_feats)}

print(chart_index_of)
print(lab_index_of)
print(surgery_index_of)

# Row index of each patient in the per-patient arrays.
patient_index_of = {subject: pos for pos, subject in enumerate(patient_set)}
cc = len(patient_set)
  

In [0]:
# Prediction set-up: observe WINDOW_SIZE hours, then leave a GAP_TIME-hour gap.
# A label has to be first satisfied after GAP_TIME + WINDOW_SIZE.
GAP_TIME = 6      # hours
WINDOW_SIZE = 24  # hours of data collection

# Patients discharged before the window plus gap has elapsed are excluded.
subjects_to_remove = set()
_min_stay_hours = GAP_TIME + WINDOW_SIZE

for subject, discharge in zip(admissions.SUBJECT_ID, admissions.DISCHTIME):
  if normalize_time(subject, discharge) < _min_stay_hours:
    subjects_to_remove.add(subject)

In [0]:
# Set up the mortality label and the static covariates (gender one-hot + age).

patient_set = list(patient_set)

mort_icu = dict()                                   # SUBJECT_ID -> EXPIRE_FLAG
gender_one_hot = np.zeros((len(patient_set), 2))    # column 0: 'M', column 1: anything else
age_vec = np.zeros((len(patient_set), 1))           # age at admission, in hours

# Membership is tested through the patient_index_of dict (O(1)) rather than by
# scanning the patient_set list for every row, which made this cell quadratic.
for subject, expire_flag, gender in zip(patients.SUBJECT_ID, patients.EXPIRE_FLAG, patients.GENDER):
  p_idx = patient_index_of.get(subject)
  if p_idx is None:
    continue
  mort_icu[subject] = expire_flag
  age_vec[p_idx][0] = age_at_admission[subject].total_seconds() / 3600.0
  gender_one_hot[p_idx][0 if gender == 'M' else 1] = 1

static_vec = np.concatenate((gender_one_hot, age_vec), axis = 1)
# [num_patients, 3]
# [num_patients, 3]

# time_vec [num_patients, window_size, num_lab_features + num_chart_features + num_vital_features]

# concatenate this with static_vec [num_patients, 3]

# Setting Up Data for Aggregated Stuff

A lot of these setups are task specific. Like I said earlier, the inputs X are the same to each model, but you should flatten them for LR/RF.

The labels need to be different. See how I've defined Y in each of the models.
For LR/RF, we can leave Y as-is. For LSTM, we try to predict Y at each time point, so we should expand Y (by stacking WINDOW_SIZE copies of it).

Also, we need to protect against label leakage uniquely for each task. For instance, SEPTIC[i] records the first time patient i had the sepsis label (in hours). If SEPTIC[i] < GAP_TIME + WINDOW_SIZE, we have risk of label leakage, so we need to remove it. Append the subject id to my_subjects_to_remove.

Once done, get a boolean mask from get_mask(my_subjects_to_remove), and index into your covariates / labels accordingly (return covars[mask, ...], labels[mask, ...])

In [0]:
def get_mask(removed_subjects):
  """Build a per-patient boolean list: False for removed subjects, True otherwise.

  The list is ordered like `patient_set`; positions come from `patient_index_of`.
  """
  mask = [True] * len(patient_set)
  for subject in removed_subjects:
    mask[patient_index_of[subject]] = False
  return mask


In [0]:
from sklearn.metrics import roc_curve
def plot_roc(title, labels, probs):
  """Plot the ROC curve of `probs` against `labels`; `title` is the legend entry."""
  fpr, tpr, _ = roc_curve(labels, probs)

  fig, ax = plt.subplots()
  ax.plot(fpr, tpr, label=title)
  ax.plot([0, 1], [0, 1],'r--')  # chance diagonal
  ax.set_xlim([0.0, 1.0])
  ax.set_ylim([0.0, 1.05])
  ax.set_xlabel('1 - Specificity')
  ax.set_ylabel('Sensitivity')
  ax.set_title('ROC')
  ax.legend(loc="lower right")
  plt.show()

Note we need to further preprocess this data (zero mean, unit variance, PCA, etc..)


Let's also do some upsampling to try to deal with class imbalance.

# Transformer

Transformer Architecture. Motivated by https://arxiv.org/pdf/1711.03905.pdf page 3. We use a different positional embedding structure, and pass the outputs of the Encoder directly into our output layer.

## Wrapper Classes

Here, we create our wrapper class. Takes in src, src_times, src_mask

In [0]:
class Transformer(nn.Module):
    """Wrapper that chains embedding, positional encoding, encoder and read-out.

    With ``type == 'Interpolated'`` the encoder output is passed through
    ``decoder``; otherwise the encoder state at sequence position 0 is returned.
    """

    def __init__(self, encoder, decoder, pos_embed, embedding, generator, type='Interpolated'):
        super(Transformer, self).__init__()
        # `decoder` is not a standard Transformer decoder; it only post-processes
        # the encoder output.
        self.encoder = encoder
        self.decoder = decoder
        self.pos_embed = pos_embed
        self.embedding = embedding
        self.generator = generator
        self.type = type

    def forward(self, src, src_times, src_mask):
        # Shapes noted by the original author:
        #   src: (B x T x D_input), src_times: (B x T), src_mask: (B x 1 x D)
        memory = self.encode(src, src_times, src_mask)
        if self.type != 'Interpolated':
            # Read out the first sequence position only.
            return memory[:, 0, :]
        return self.decode(memory, src_times, src_mask)

    def encode(self, src, src_times, src_mask):
        embedded = self.embedding(src)
        positioned = self.pos_embed(embedded, src_times)
        return self.encoder(positioned, src_mask)

    def decode(self, memory, src_times, src_mask):
        return self.decoder(memory, src_times, src_mask)

In [0]:
class Generator(nn.Module):
    """Final linear read-out.

    ``proj`` maps a ``d_model`` vector to ``vocab`` outputs; ``proj2`` maps a
    ``d_model * M`` vector to ``vocab`` outputs and is the one used when
    ``type == 'Interpolated'``.
    """

    def __init__(self, d_model, M, vocab, type='Interpolated'):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)
        self.proj2 = nn.Linear(d_model*M, vocab)
        self.type = type

    def forward(self, x):
        head = self.proj2 if self.type == 'Interpolated' else self.proj
        return head(x)
      

Once we get the logits out of EncoderDecoder, we use this to get our output.

Encoder  Stacks. The original paper uses N = 6 stacks for encoder and decoder. Let's just use that for now.



In [0]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of `module`."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

## Layer Normalization

After multihead attention and after the feed forward network, we normalize layers.

In [0]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learned gain and bias."""

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps  # eps is added to the std, not the variance
        scaled = self.a_2 * centered
        return scaled / spread + self.b_2

The output of each sublayer S with input x is LayerNorm(x + S(x)). We should definitely apply dropout here to both the input x and the output of LayerNorm as well. All sublayers produce outputs of d_model = 512.

## Encoder Specification

In [0]:
class Encoder(nn.Module):
    """A stack of N copies of `layer`, followed by a final LayerNorm."""

    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed `x` through every layer in order (same mask each time), then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed- forward network.

In [0]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then a feed-forward net, each in a SublayerConnection."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply the attention sublayer, then the feed-forward sublayer."""
        def self_attend(normed):
            # Query, key and value are all the same (normalised) input.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, self_attend)
        return self.sublayer[1](attended, self.feed_forward)

In [0]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer: ``x + dropout(sublayer(norm(x)))``.
    The layer norm is applied to the sublayer's input (pre-norm), not its output.
    """

    def __init__(self, size, dropout=0.0):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """`sublayer` is any callable whose output has the same size as `x`."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

## Decoder (Returns Dense Interpolation)

Take the logits out of Encoder, pass into this. We use a modification of dense interpolation for continuous values. Let's see how this works. This returns a dense interpolation of dimension B x (d_model*M)

In [0]:
#### Might not need to unsqueeze depending on src_mask


class Decoder(nn.Module):
    """Dense-interpolation read-out of the encoder states.

    Each of the M interpolation slots collects a weighted sum of the encoder
    states, the weight of time step ``t`` for slot ``m`` being
    ``(1 - |M * time_t / max_time - m| / M) ** 2``. Masked time steps contribute
    nothing. The result is flattened to ``(B, D * M)``.

    Args:
        d_model: accepted for interface compatibility; not used.
        max_time: time value used to rescale ``times`` onto the M slots.
        M: number of interpolation slots.
    """

    def __init__(self, d_model, max_time, M):
        super(Decoder, self).__init__()
        self.M = M
        self.max_time = max_time

    def forward(self, x, times, src_mask):
        """Zero the masked positions of `x`, then interpolate.

        Shapes noted by the original author:
        x (B, T, D), times (B, T), src_mask (B, 1, T). Returns (B, D * M).
        """
        keep = src_mask.transpose(-2, -1).expand_as(x)
        x = x.masked_fill(keep == False, 0)
        return self.dense_interpolate(x, times, src_mask)

    def dense_interpolate(self, x, times, src_mask):
        B = x.size(0)
        T = x.size(1)

        # Build everything on x's device/dtype: a bare torch.arange lives on the
        # CPU and would break bmm as soon as the model runs on a GPU.
        s = self.M * times.to(device=x.device, dtype=x.dtype) / self.max_time      # B x T
        m = torch.arange(1, self.M + 1, device=x.device, dtype=x.dtype)            # M
        w = torch.abs(s.unsqueeze(2) - m.view(1, 1, self.M)) / self.M              # B x T x M
        w = (1.0 - w) * (1.0 - w)

        # Padded / masked time steps must not contribute to any slot.
        w = w.masked_fill(src_mask.transpose(1, 2).expand(B, T, self.M) == 0, 0)

        u = torch.bmm(x.transpose(1, 2), w)  # B x D x M
        return u.reshape(u.size(0), u.size(1) * u.size(2))

          


## Attention Mechanism

Attention Mechanism -- Concise via query, key, value matrices -- Luong Attention. Divide by sqrt(dimension) so that the result still has zero mean, unit variance.

In [0]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Returns ``(weights @ value, weights)`` where ``weights`` is the softmax over
    the last axis of ``query @ key^T / sqrt(d_k)``. Positions where ``mask == 0``
    receive a score of -1e9 before the softmax; ``dropout`` (if given) is applied
    to the weights.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    weights = torch.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

Multi-headed attention. Each has dimension d_k = d_v = d_model / H. We use 8-head attention -- 64.


In [0]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention with `h` heads of width ``d_model // h``."""

    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # Keys and values share the same per-head width.
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None  # attention weights of the most recent forward pass
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project, attend per head, merge the heads, and project again."""
        nbatches = query.size(0)
        if mask is not None:
            # Extra axis so the same mask broadcasts over all h heads.
            mask = mask.unsqueeze(1)

        def split_heads(projection, tensor):
            # (B, T, d_model) -> (B, h, T, d_k)
            return projection(tensor).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        attended, self.attn = attention(q_heads, k_heads, v_heads,
                                        mask=mask, dropout=self.dropout)

        # (B, h, T, d_k) -> (B, T, h * d_k)
        merged = attended.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](merged)

## Pointwise FFN

In addition to attention, we apply a feed forward network in each encoder/decoder layer. Hidden layer has d_ff = 256.

In [0]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward net: ``w_2(dropout(relu(w_1(x))))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = torch.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## Feature + Position Embeddings

Embedding Layer. We have a feature vector of length d_input. Concatenate all numerical features (centered / scaled) with one-hots for all pos/neg tests. Include age. Time is not included! We pass in times separately to the positional encoding.


In [0]:
class Embeddings(nn.Module):
  """Embed a raw feature vector: d_input -> d_hidden (ReLU, dropout) -> d_model."""

  def __init__(self, d_input, d_hidden, d_model, dropout=0.1):
    super(Embeddings, self).__init__()
    self.d_input = d_input
    self.d_hidden = d_hidden
    self.d_model = d_model

    self.w_1 = nn.Linear(d_input, d_hidden)
    self.w_2 = nn.Linear(d_hidden, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    hidden = torch.relu(self.w_1(x))
    hidden = self.dropout(hidden)
    return self.w_2(hidden)

We still need to use positional encodings. Let's use this. https://openreview.net/attachment?id=SkeJMCVFDS&name=original_pdf 

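$$PE_{(t,\,2i)} = \sin\left(\frac{t}{T^{2i/d}}\right)$$

$$PE_{(t,\,2i+1)} = \cos\left(\frac{t}{T^{2i/d}}\right)$$

Here $t$ is the (continuous) observation time, $T$ is `max_time` and $d$ is `d_model`.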

In [0]:
class PositionalEncoding(nn.Module):
    """Sinusoidal encoding of continuous observation times, added to the input.

    For time ``t`` and even channel ``2i`` the encoding is
    ``sin(t / max_time ** (2i / d_model))``; the following odd channel holds the
    matching cosine.

    Args:
        d_model: channel width of the input (odd values are supported).
        dropout: dropout probability applied to the sum ``x + encoding``.
        max_time: base of the geometric frequency progression.
    """

    def __init__(self, d_model, dropout, max_time=1000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_time = max_time
        self.dropout = nn.Dropout(p=dropout)
        # One frequency per even channel index: max_time ** (-2i / d_model).
        self.div_term = torch.exp(torch.arange(0., d_model, 2) *
                                  -(math.log(max_time) / d_model))

    def forward(self, x, times):
        """Add the encoding of `times` (batch, seq_len) to `x` (batch, seq_len, d_model)."""
        # Follow x's device and dtype so the module also works on a GPU;
        # div_term is a plain attribute and is not moved by model.to(...).
        div_term = self.div_term.to(device=x.device, dtype=x.dtype)
        position = times.to(device=x.device, dtype=x.dtype).unsqueeze(2)
        angles = position * div_term  # (batch, seq_len, ceil(d_model / 2))

        pe = x.new_zeros(x.size(0), x.size(1), self.d_model)
        pe[:, :, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd channel than even channels.
        pe[:, :, 1::2] = torch.cos(angles[:, :, : self.d_model // 2])

        return self.dropout(x + pe)

In [0]:
# Visualise four channels of the positional encoding over integer times 0..99.
plt.figure(figsize=(15, 5))

pe = PositionalEncoding(20, 0)
x = torch.zeros(5, 100, 20)
t = torch.arange(0., 100, 1).expand(5, 100)  # the same time axis for every batch row
y = pe.forward(Variable(x), t)

shown_dims = [4, 5, 6, 7]
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
plt.legend(["dim %d"%p for p in shown_dims])

## Instancing a Model

Now let's make our model.

In [0]:
def make_model(d_input, d_target, N=6, M=12, d_model=128, d_hidden=256, d_ff=512, h=8, max_observation_time=24.0, max_encoding_time=1000.0, dropout=0.1, type='Interpolated'):
    """Construct a Transformer from hyperparameters.

    Args:
        d_input: width of the raw feature vector per time step.
        d_target: number of outputs of the generator.
        N: number of encoder layers.
        M: number of dense-interpolation slots (Decoder / Generator).
        d_model, d_hidden, d_ff: model, embedding-hidden and feed-forward widths.
        h: number of attention heads.
        max_observation_time: `max_time` of the Decoder.
        max_encoding_time: `max_time` of the PositionalEncoding.
        dropout: dropout for the feed-forward net, positional encoding and
            sublayer connections (attention and Embeddings keep their defaults).
        type: 'Interpolated' or anything else for the position-0 read-out.

    Returns:
        The assembled model, with every parameter of dimension > 1
        Xavier-uniform initialised.
    """
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout, max_time = max_encoding_time)

    model = Transformer(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(d_model, max_time=max_observation_time, M=M),
        c(position), Embeddings(d_input, d_hidden, d_model),
        Generator(d_model, M, d_target, type),
        type)

    # Initialize weight matrices with Glorot / fan_avg. The in-place
    # `xavier_uniform_` replaces the deprecated `xavier_uniform` alias.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

## Training Details

Now let's define our training regime.

The optimizer wrapper is taken directly from Alexander Rush's "The Annotated Transformer" (TODO: add a proper citation).

In [0]:
class NoamOpt:
    """Optimizer wrapper implementing the warm-up / inverse-square-root schedule.

    The learning rate at step ``s`` is
    ``factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Advance one step: set the new rate on every param group, then step the optimizer."
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step = None):
        """Learning rate at `step` (defaults to the current step).

        Step 0 (or below) returns 0.0, the value of the linear warm-up term
        there; evaluating ``0 ** -0.5`` would raise ZeroDivisionError.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model, factor=2, warmup=4000):
    """Wrap Adam (lr=0, betas=(0.9, 0.98), eps=1e-9) for `model` in a NoamOpt.

    `factor` and `warmup` are forwarded to NoamOpt; the defaults reproduce the
    previously hard-coded values. The model size is read from
    `model.pos_embed.d_model`.
    """
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(model.pos_embed.d_model, factor, warmup, adam)

In [0]:
# Compare the schedule for three (model_size, warmup) settings.
schedule_settings = [(512, 4000), (512, 8000), (256, 4000)]
opts = [NoamOpt(size, 1, warmup, None) for size, warmup in schedule_settings]

steps = np.arange(1, 20000)
rates = [[opt.rate(i) for opt in opts] for i in range(1, 20000)]
plt.plot(steps, rates)
plt.legend(["512:4000", "512:8000", "256:4000"])
None

Let's use label smoothing as well.

### Training Loops

In [0]:
import time

def timeSince(since):
    """Format the wall-clock time elapsed since `since` (a `time.time()` value) as 'Xm Ys'."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def train_epoch(train_iter, model, criterion, opt, transpose=False):
    """Run one training epoch.

    Args:
        train_iter: iterable of ``(src, src_times, src_mask, trg)`` batches.
        model: called as ``model(src, src_times, src_mask)``; its ``generator``
            maps the result to logits.
        criterion: loss called as ``criterion(logits, targets)``.
        opt: optimizer wrapper exposing ``step()`` and ``_rate`` (e.g. NoamOpt).
            This argument is now actually used; previously the function
            silently stepped the global ``model_opt`` instead.
        transpose: unused; kept for interface compatibility.
    """
    start = time.time()
    running_loss = 0.0
    batches_in_window = 0

    model.train()
    for i, (src, src_times, src_mask, trg) in enumerate(train_iter, start=1):
        model.zero_grad()
        src_mask = src_mask.unsqueeze(-2)
        out = model(src, src_times, src_mask)
        logits = model.generator(out).squeeze(-1)

        # Targets built from numpy are float64; match the logits' dtype.
        loss = criterion(logits, trg.to(logits.dtype))
        loss.backward()
        opt.step()

        running_loss += loss.item()
        batches_in_window += 1

        if i % 20 == 19:
            # Average over the batches actually accumulated (the first window holds 19).
            print(i, " Batches, Average Loss: ", running_loss / batches_in_window, " Opt Rate: ", opt._rate, " Time: ", timeSince(start))
            running_loss = 0.0
            batches_in_window = 0
        

In [0]:
def valid_epoch(valid_iter, model, criterion, transpose=False):
    """Run the model over `valid_iter` in eval mode and collect predictions.

    Args:
        valid_iter: iterable of ``(src, src_times, src_mask, trg)`` batches.
        model: called as ``model(src, src_times, src_mask)``; its ``generator``
            maps the result to logits.
        criterion: unused; kept for interface compatibility.
        transpose: unused; kept for interface compatibility.

    Returns:
        ``(probs, preds, trgs)``: three lists with one entry per sample --
        sigmoid probabilities, their 0/1 thresholding at 0.5, and the targets.
    """
    probs = []
    preds = []
    trgs = []

    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time without changing the outputs.
    with torch.no_grad():
        for src, src_times, src_mask, trg in valid_iter:
            out = model(src, src_times, src_mask.unsqueeze(-2))
            prob = torch.sigmoid(model.generator(out))
            probs.extend(prob)
            preds.extend((prob >= 0.5).float())
            trgs.extend(trg)

    return probs, preds, trgs
        

## Batch Stuff

# Set Up Time Series (Full Resolution)

Here let's put the age and gender in at each time point. Categoricals are one-hotted, continuous are centered/scaled.

In [0]:
# Lol
def is_number(s):
    """Return True only if `s` converts to a real (non-NaN) float.

    Missing values are rejected: NaN would otherwise count as a "number" and be
    written into the feature tensors, and None (TypeError) would crash the scan.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    return not math.isnan(value)

In [0]:
# Constants and buffers for the full-resolution time series.
MAX_TIMESTAMPS = 150  # the author notes ~150 timestamps at most for a 24-hour window
LEN_FEATS = len(chart_feats)+len(surgery_feats)+len(lab_feats)
PAD = '9999-01-01 01:01:01'
PAD_TIME = 10000000
cnt = 0

_event_tables = (lab_events, chartevents, surgery_vital_signs)

# Subjects with at least one event, and subjects with any event before admission.
keep_tmp = np.unique(np.concatenate([table.SUBJECT_ID for table in _event_tables]))
rem_tmp = np.concatenate([table[table['HOURS_IN'] < 0].SUBJECT_ID for table in _event_tables])

_n_patients = len(patient_set)
X_feats = np.zeros((_n_patients, MAX_TIMESTAMPS, LEN_FEATS))
X_present = np.zeros((_n_patients, MAX_TIMESTAMPS, LEN_FEATS), dtype=bool)
X_times = np.zeros((_n_patients, MAX_TIMESTAMPS))
X_masks = np.zeros((_n_patients, MAX_TIMESTAMPS), dtype=bool)

time_lists = [[] for _ in patient_set]

# Drop patients without any event, or with an event before admission.
_has_events = set(keep_tmp.tolist())
_has_negative_time = set(rem_tmp.tolist())
for p in patient_set:
  if p not in _has_events or p in _has_negative_time:
    subjects_to_remove.add(p)
# Collect, per patient, every timestamp inside the window that carries a numeric value.
def _collect_timestamps(events, feats, time_col):
  """Append the `time_col` value of each usable row to its patient's list in `time_lists`."""
  in_window = events[(events['HOURS_IN'] < WINDOW_SIZE) & events['ITEMID'].isin(feats)]
  for _, row in in_window.iterrows():
    if row.SUBJECT_ID in subjects_to_remove or not is_number(row.VALUE):
      continue
    time_lists[patient_index_of[row.SUBJECT_ID]].append(row[time_col])

_collect_timestamps(chartevents, chart_feats, 'CHARTTIME')
_collect_timestamps(lab_events, lab_feats, 'CHARTTIME')
_collect_timestamps(surgery_vital_signs, surgery_feats, 'MONITORTIME')

# De-duplicate and order each patient's timestamps.
time_lists[:] = [sorted(set(stamps)) for stamps in time_lists]

lens = [len(stamps) for stamps in time_lists]
print(max(lens))

def _fill_features(events, feats, time_col, column_offset, index_of):
  """Write each usable row's VALUE into X_feats and flag it in X_present / X_masks.

  The time slot is the position of the row's timestamp in the patient's
  `time_lists` entry, shifted by one because slot 0 is reserved for [CLS].
  """
  in_window = events[(events['HOURS_IN'] < WINDOW_SIZE) & events['ITEMID'].isin(feats)]
  for _, row in in_window.iterrows():
    if row.SUBJECT_ID in subjects_to_remove or not is_number(row.VALUE):
      continue
    p_idx = patient_index_of[row.SUBJECT_ID]
    t_idx = time_lists[p_idx].index(row[time_col]) + 1
    f_idx = column_offset + index_of[row.ITEMID]
    X_feats[p_idx][t_idx][f_idx] = row.VALUE
    X_present[p_idx][t_idx][f_idx] = True
    X_masks[p_idx][t_idx] = True

# Feature columns are laid out as [chart | lab | surgery].
_fill_features(chartevents, chart_feats, 'CHARTTIME', 0, chart_index_of)
_fill_features(lab_events, lab_feats, 'CHARTTIME', len(chart_feats), lab_index_of)
_fill_features(surgery_vital_signs, surgery_feats, 'MONITORTIME',
               len(chart_feats) + len(lab_feats), surgery_index_of)

# Convert each patient's timestamps to hours since admission and store them in X_times.
# Slot 0 is the [CLS] position (time -1); slots past the last observation get PAD_TIME.
for i in range(len(patient_set)):
  # Normalise exactly once. The previous version called normalize_time a second
  # time on the already-numeric hours, which made it return its 100000 sentinel
  # for every real timestamp.
  hours = [normalize_time(patient_set[i], stamp) for stamp in time_lists[i]]

  # Prepend [CLS] without re-sorting, so that slot j keeps matching the slot the
  # features were written to (position in the sorted timestamp list + 1).
  time_lists[i] = [-1] + hours

  n_valid = min(len(time_lists[i]), MAX_TIMESTAMPS)
  X_times[i, :] = PAD_TIME
  X_times[i, :n_valid] = time_lists[i][:n_valid]
  X_masks[i][0] = True


In [0]:
# Append the static covariates to every real (non-[CLS], non-padding) time step.
print(X_feats.shape)

# Repeat each patient's static vector along the time axis: (P, 3) -> (P, T, 3).
static_add = np.tile(static_vec[:, np.newaxis, :], (1, X_feats.shape[1], 1))
static_msk = np.ones(static_add.shape, dtype=bool)

# Padding slots and the [CLS] slot (index 0) carry no static information.
_no_static = (X_times == PAD_TIME)
_no_static[:, 0] = True
static_msk[_no_static] = False
static_add[_no_static] = 0


X_feats = np.concatenate((X_feats, static_add), axis=2)
X_present = np.concatenate((X_present, static_msk), axis=2)

In [0]:
# Sanity check: feature vector of the first patient at time slot 0.
print(X_feats[0][0])

In [0]:
# Prepend a [CLS] indicator column: 1 at time slot 0 of every patient, 0 elsewhere.
new_row = np.zeros((X_feats.shape[0], MAX_TIMESTAMPS, 1))
new_row[:, 0, 0] = 1

X_feats = np.concatenate((new_row, X_feats), axis=2)
X_present = np.concatenate((new_row == 1, X_present), axis=2)

In [0]:
# Sanity check: shape of the [CLS] indicator column.
print(new_row.shape)

In [0]:
# Sanity check: X_feats and X_present should have the same shape.
print(X_feats.shape)
print(X_present.shape)

In [0]:
# X_feats gained 3 static columns and 1 [CLS] column. Read the width from the
# array instead of incrementing, so re-running this cell cannot over-count.
LEN_FEATS = X_feats.shape[2]

Generate Labels 

In [0]:
# Boolean list over patient_set (False = excluded) and the ids that survive it.
good_mask = get_mask(subjects_to_remove)
patients_to_include = np.asarray(patient_set)[np.asarray(good_mask, dtype=bool)]

In [0]:
# Mortality label per patient: 1 where EXPIRE_FLAG == 1, else 0.
Y_mort = np.zeros(len(patient_set))
for i, subject in enumerate(patient_set):
  Y_mort[i] = 1 if mort_icu[subject] == 1 else 0

In [0]:
# Length-of-stay labels: 1 if discharge is more than 3 (resp. 7) days after admission.
Y_3d = np.zeros(len(patient_set))
Y_7d = np.zeros(len(patient_set))
for i, subject in enumerate(patient_set):
  disch = admissions.loc[admissions['SUBJECT_ID'] == subject, 'DISCHTIME'].item()
  stay_hours = normalize_time(subject, disch)
  Y_3d[i] = 1 if stay_hours > (3 * 24) else 0
  Y_7d[i] = 1 if stay_hours > (7 * 24) else 0

Alright now here the stuff is set up. feats, times, masks need to be passed in to the transformer.

In [0]:
# X_feats = np.zeros((len(patient_set), MAX_TIMESTAMPS, LEN_FEATS))
# X_present = np.zeros((len(patient_set), MAX_TIMESTAMPS, LEN_FEATS)) == 1
# X_times = np.zeros((len(patient_set), MAX_TIMESTAMPS))
# X_masks = np.zeros((len(patient_set), MAX_TIMESTAMPS)) == 1

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the retained patients 80/20 into train and test.
train_patients, test_patients, _, _ = train_test_split(list(patients_to_include), Y_mort[good_mask], test_size=0.2, random_state=1)

train_indices = [patient_index_of[i] for i in train_patients]
test_indices = [patient_index_of[i] for i in test_patients]

# Row masks over patient_set. The test mask is built from test_indices: taking
# "everything that is not train" would also pull in the excluded subjects.
train_mask = np.zeros(len(patient_set), dtype=bool)
test_mask = np.zeros(len(patient_set), dtype=bool)
train_mask[train_indices] = True
test_mask[test_indices] = True

### Let's manually normalize.
# Statistics come from observed TRAINING entries only, so nothing about the test
# set leaks into the features. Column 0 is the [CLS] indicator and is left as is.

n_feats = X_feats.shape[2]
global_means = np.zeros(n_feats)
global_std = np.zeros(n_feats)

for i in range(1, n_feats):
  present = X_present[:, :, i].astype(bool)
  train_obs = X_feats[train_mask, :, i][present[train_mask]]

  if train_obs.size > 0:
    global_means[i] = train_obs.mean()
    global_std[i] = train_obs.std()
  # A feature that is never observed in training, or is constant there, would
  # give a 0/0 or x/0 and fill the column with NaN/inf; use a unit scale instead.
  if train_obs.size == 0 or global_std[i] == 0:
    global_std[i] = 1.0

  # Standardise, then zero the unobserved entries again.
  X_feats[:, :, i] = (X_feats[:, :, i] - global_means[i]) / global_std[i]
  X_feats[:, :, i] = X_feats[:, :, i] * present


X_train_feats = X_feats[train_mask, ...]
X_train_present = X_present[train_mask, ...]
X_train_times = X_times[train_mask, ...]
X_train_masks = X_masks[train_mask, ...]
Y_train_mort = Y_mort[train_mask, ...]

X_test_feats = X_feats[test_mask, ...]
X_test_present = X_present[test_mask, ...]
X_test_times = X_times[test_mask, ...]
X_test_masks = X_masks[test_mask, ...]
Y_test_mort = Y_mort[test_mask, ...]


In [0]:
# Split the length-of-stay labels with the same patient masks as the features.
Y_train_3d, Y_test_3d = Y_3d[train_mask], Y_3d[test_mask]
Y_train_7d, Y_test_7d = Y_7d[train_mask], Y_7d[test_mask]

In [0]:
# Sanity check: presence flags of the first training patient at time slot 0.
print(X_train_present[0][0])

In [0]:
# Sanity checks on the first training patient at time slot 0:
# presence flags, their shape, and the stored time value.
print(X_train_present[0][0])
print(X_train_present[0][0].shape)
print(X_train_times[0][0])

Dimensions: 
*   X_feats : (|P|, T, D)
*   X_times : (|P|, T)
*   X_masks : (|P|, T)
*   Y_mort : (|P|) 








In [0]:
def _float_tensor(array):
  """numpy array -> float32 tensor."""
  return torch.from_numpy(array).float()

# Features and times become float32; masks and labels keep their numpy dtype.
X_train_feats, X_train_times = _float_tensor(X_train_feats), _float_tensor(X_train_times)
X_train_masks = torch.from_numpy(X_train_masks)
Y_train_mort, Y_train_3d, Y_train_7d = (
    torch.from_numpy(labels) for labels in (Y_train_mort, Y_train_3d, Y_train_7d))

X_test_feats, X_test_times = _float_tensor(X_test_feats), _float_tensor(X_test_times)
X_test_masks = torch.from_numpy(X_test_masks)
Y_test_mort, Y_test_3d, Y_test_7d = (
    torch.from_numpy(labels) for labels in (Y_test_mort, Y_test_3d, Y_test_7d))

# Run Model

In [0]:
from torch.utils.data import DataLoader, Dataset

BATCH_SIZE = 64

class TimeDataset(Dataset):
  """Dataset of aligned (features, times, mask, target) tensors, indexed along dim 0."""

  def __init__(self, xf, xt, xm, ym):
    self.src = xf
    self.times = xt
    self.src_mask = xm
    self.trg = ym

  def __len__(self):
    # Number of samples = size of the first dimension of the features.
    return self.src.size(0)

  def __getitem__(self, index):
    sample = (self.src[index], self.times[index], self.src_mask[index], self.trg[index])
    return sample

# The 7-day length-of-stay label is the prediction target.
train_dataset = TimeDataset(X_train_feats, X_train_times, X_train_masks, Y_train_7d)
test_dataset = TimeDataset(X_test_feats, X_test_times, X_test_masks, Y_test_7d)

# Training: shuffled, full-size batches only.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Evaluation: keep every sample (drop_last=True silently discarded up to
# BATCH_SIZE - 1 test patients) and keep a fixed order.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [0]:
# Build the model with the position-0 ('CLS') read-out and its Noam-scheduled optimizer.
# make_model signature, for reference:
# make_model(d_input, d_target, N=6, M=12, d_model=512, d_hidden=256, d_ff=2048, h=8, max_observation_time=24.0, max_encoding_time=1000.0, dropout=0.1):
model = make_model(
    X_train_feats.size(2),
    d_target=1,
    N=4,
    d_model=128,
    d_hidden=128,
    d_ff=512,
    dropout=0.5,
    type='CLS',
)
model_opt = get_std_opt(model)
model.cpu()



In [0]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def evaluate_result(true_tag_list, predicted_tag_list, probs):
  """Compute accuracy and ROC AUC.

  Args:
    true_tag_list: per-sample targets (one-element tensors or plain numbers).
    predicted_tag_list: per-sample 0/1 predictions, same form.
    probs: per-sample predicted probabilities, same form.

  Returns:
    ``(accuracy, auc)``.

  The inputs are converted to float arrays without modifying the caller's
  lists (the previous version overwrote them in place and handed `probs` to
  scikit-learn as a list of tensors).
  """
  def _as_floats(values):
    return np.array([float(v) for v in values], dtype=float)

  y_true = _as_floats(true_tag_list)
  y_pred = _as_floats(predicted_tag_list)
  y_prob = _as_floats(probs)

  acc = 1.0 - np.mean(np.abs(y_true - y_pred))
  return acc, roc_auc_score(y_true, y_prob)

N_EPOCHS = 40

criterion = torch.nn.BCEWithLogitsLoss()
# Keep the loss on the same device as the model (which the cell above places on
# the CPU) instead of unconditionally requesting CUDA.
criterion.to(next(model.parameters()).device)

for epoch in range(N_EPOCHS):
    train_epoch(train_loader, model, criterion, model_opt)
    probs, preds, trgs = valid_epoch(test_loader, model, criterion)
    acc, auc = evaluate_result(trgs, preds, probs)
    print("Epoch ", epoch + 1, " Acc: ", acc, " AUC: ", auc)