In [2]:
# setup drive
import os
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import pickle

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
# from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import f1_score

device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu' )

In [4]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 24.2MB/s eta 0:00:01[K     |█▏                              | 20kB 1.7MB/s eta 0:00:01[K     |█▊                              | 30kB 2.6MB/s eta 0:00:01[K     |██▎                             | 40kB 3.4MB/s eta 0:00:01[K     |███                             | 51kB 2.1MB/s eta 0:00:01[K     |███▌                            | 61kB 2.5MB/s eta 0:00:01[K     |████                            | 71kB 2.9MB/s eta 0:00:01[K     |████▋                           | 81kB 3.3MB/s eta 0:00:01[K     |█████▎                          | 92kB 2.5MB/s eta 0:00:01[K     |█████▉                          | 102kB 2.8MB/s eta 0:00:01[K     |██████▍                         | 112kB 2.8MB/s eta 0:00:01[K     |███████                         | 122kB 2.8M

In [123]:
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model.eval()
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
  model = nn.DataParallel(model)
model.to(device);

HBox(children=(IntProgress(value=0, description='Downloading', max=435778770, style=ProgressStyle(description_…




# Import files

In [0]:
# import files
x_train = pd.read_pickle('/content/drive/My Drive/cis522/project/x_train.pkl')
x_test = pd.read_pickle('/content/drive/My Drive/cis522/project/x_test.pkl')
x_val = pd.read_pickle('/content/drive/My Drive/cis522/project/x_val.pkl')
y_train = pd.read_pickle('/content/drive/My Drive/cis522/project/y_train.pkl')
y_test = pd.read_pickle('/content/drive/My Drive/cis522/project/y_test.pkl')
y_val = pd.read_pickle('/content/drive/My Drive/cis522/project/y_val.pkl')

with open('/content/drive/My Drive/cis522/project/diagnoses.pkl', 'rb') as dict_file:
    diagnoses_dict = pickle.load(dict_file)

# Preprocessing

In [0]:
x_train = x_train.str.replace("\n", " ").str.split("<SECTION>")
x_test = x_test.str.replace("\n", " ").str.split("<SECTION>")
x_val = x_val.str.replace("\n", " ").str.split("<SECTION>")

In [0]:
def tokenize(document):
  ids = []
  mask = []
  for sent in document:
    encoded = tokenizer.encode_plus(sent, max_length=256, add_special_tokens=True, pad_to_max_length=True, return_token_type_ids=False)
    ids.append(torch.tensor(encoded['input_ids']))
    mask.append(torch.tensor(encoded['attention_mask']))
  ids = torch.stack(ids)
  mask = torch.stack(mask)
  return (ids, mask)

In [150]:
x_ids, x_mask = tokenize(x_train.iloc[0])
x_ids, x_mask = x_ids.to(device), x_mask.to(device)
x_embed = model(x_ids, token_type_ids=None, attention_mask=x_mask) # second to last hidden layer - best for classification? https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb
print(x_embed[1].shape) # should be the hidden layer

torch.Size([17, 256])
torch.Size([17, 768])


# Dataset/dataloader

In [0]:
# build a mapping from y (diagnostic codes) to classes based on the number of elements in the classs
def buildMap(y, diagnosis_dict, min_class_size, start_index = 1):
  codeToClassMap = {}
  classToCodeMap = {}
  # i = 0 should be reserved for padding
  # i = 1 should be reserved for classes to catch the rest
  i = start_index
  for row in y:
    for code in row:
      if diagnosis_dict[code] >= min_class_size:
        if code not in codeToClassMap:
          codeToClassMap[code] = i
          classToCodeMap[i] = code
          i += 1
  return (codeToClassMap, classToCodeMap, i+1) # return(map, class_size). add 2 extra classes
  
(codeToClassMap, classToCodeMap, num_classes) = buildMap(pd.concat([y_train, y_test, y_val]), diagnoses_dict, 200)

# create the dataset
# all classes not in codeToClassMap should be lumped into a single class 0
class MIMICDataset(Dataset):
    def __init__(self, x, y, codeToClassMap, classToCodeMap, num_classes):
      # import maps
      self.num_classes = num_classes
      self.codeToClassMap = codeToClassMap
      self.classToCodeMap = classToCodeMap

      # if there are y's, i.e. we are training
      if y is not None:
        # map the y's to a class
        self.y = self.mapCodeToClasses(y)
        self.frequency_dict = self.buildDict(self.y)

      # for the x's
      self.x = x

      # weights for the loss function
      self.classLossWeights = self.getClassLossWeights()
      pass


    def __len__(self):
      return len(self.x)
      pass


    def __getitem__(self, idx):
      # get the embeddings
      with torch.no_grad():    
        x_id, x_mask = tokenize(self.x.iloc[idx])
        x_id, x_mask = x_id.to(device), x_mask.to(device)
        x_embed = model(x_id, token_type_ids=None, attention_mask=x_mask)[1] # second to last hidden layer - best for classification? https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb
        length = torch.tensor(len(x_embed))


      # get y into one-hot
      if self.y is not None:
        y = torch.tensor(self.y[idx])
        y = y.unsqueeze(0)
        y = torch.zeros(y.size(0), self.num_classes).scatter(1, y, 1.)[0]
        return x_embed, y, length

      return x_embed, length
      pass


    # get the class of of ys
    def mapCodeToClasses(self, y):
      classes = []
      for i, row in enumerate(y):
        classes.append([])
        for code in row:
          if code in self.codeToClassMap: # this means we are trying to predict the class
            classes[i].append(self.codeToClassMap[code])
          elif 1 not in classes[i]: # we are not trying to predict the class - add a class 1 if not yet in the list
            classes[i].append(1)
      return classes


    # Build a dictionary of frequencies. Add 1 to the dict for each class appearance.
    def buildDict(self, y):
      frequency_dict = {}
      # iterate through all items in y
      for row in y:
        for class_label in row:
          # if the class is in the map, add the class frequency to the dictionary
          if class_label in frequency_dict:
            frequency_dict[class_label] += 1
          else:
            frequency_dict[class_label] = 1
      return frequency_dict
      pass


    def getClassLossWeights(self):
      weights = np.empty(self.num_classes, dtype='float32')
      for key, value in self.frequency_dict.items():
        weights[key] = 1 / value
      return torch.tensor(weights)

In [0]:
def collate(batch):
    batch_size = len(batch)
    batch_split = list(zip(*batch))
    seqs, targs, lengths = batch_split[0], batch_split[1], batch_split[2]
    seqs = pad_sequence(seqs, batch_first=True)
    targs = torch.stack(targs)
    lengths = torch.stack(lengths)
    return seqs, targs, lengths

In [0]:
train_dataset = MIMICDataset(x_train, y_train, codeToClassMap, classToCodeMap, num_classes)
train_loader = DataLoader(train_dataset, collate_fn=collate, batch_size=32)

val_dataset = MIMICDataset(x_val, y_val, codeToClassMap, classToCodeMap, num_classes)
val_loader = DataLoader(val_dataset, collate_fn=collate, batch_size=32)

test_dataset = MIMICDataset(x_val, y_val, codeToClassMap, classToCodeMap, num_classes)
test_loader = DataLoader(val_dataset, collate_fn=collate, batch_size=32)

In [157]:
for batch in val_loader:
  x, y, lengths = batch
  print(x.shape)
  print(y.shape)
  print(lengths)
  break

torch.Size([32, 35, 768])
torch.Size([32, 475])
tensor([23, 18, 16, 21, 19, 20, 23,  9, 21, 32, 15, 34, 18, 23,  7, 22, 25, 13,
        17, 22, 17, 20, 10, 18, 20, 22, 11, 35, 12, 22, 14, 22])


# Logging

#Training and Testing

In [0]:
def compute_f1(target, pred):
    ones = torch.ones(target.shape).to(device)
    tp = ((pred == target).float() == target).float().sum().item()
    fp = ((pred == (ones - target)).float() == (ones - target)).float().sum().item()
    fn = ((pred == (ones - target)).float() == target).float().sum().item()
    if tp is 0 and fp is 0:
      precision = 0
    else:
      precision = tp / (tp + fp)
    if tp is 0 and fn is 0:
      recall = 0
    else:
      recall = tp / (tp + fn)
    if precision is 0 and recall is 0:
      f1 is 0
    else:
      f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1
# set up the train/test infrastructure
# ALERT ALERT ALERT: make sure you have a folder named checkpoints within your hw directory
def train_model(iterator, model, criterion, optimizer, num_epochs, log_file=None):
  # set up logging infrastructure
  if log_file:
    logger = SummaryWriter(os.path.join(ROOT_LOG_DIR, log_file))

  # for each epoch
  for e in range(num_epochs):

    # save the model every 10 epochs. 
    # to reload, execute the following:
    #   model = MODEL()
    #   model.load_state_dict(torch.load(PATH))
    #   model.eval()
    if log_file and e is not 0 and e % 10 is 0 and e is not num_epochs - 1:
      CHECKPOINT_PATH = '/content/drive/My Drive/cis522/HW4/checkpoints'
      CHECKPOINT_PATH = os.path.join(CHECKPOINT_PATH,log_file+'_epoch'+str(e))
      torch.save(model.state_dict(), CHECKPOINT_PATH)

    # define loss and f1 for this epoch
    epoch_loss = 0
    epoch_f1 = 0

    # for each batch
    for idx, batch in enumerate(iterator):
      if (idx % 50 == 0):
        print("Epoch: "+str(e)+"\t on batch "+str(idx) + " of "+str(len(iterator)))

      # zero the gradients
      optimizer.zero_grad()

      # get the batch
      x, y, lengths = batch
      x, y, lengths = x.to(device), y.to(device), lengths.to(device)

      # run the model
      output = model(x, lengths)

      # calculate the loss
      if (output.dim() == 1 ):
        output.unsqueeze(0)
      loss = criterion(output, y)

      # backprop
      loss.backward()

      # update step
      optimizer.step()

      # update the loss and accuracy
      epoch_loss += loss.data.item()
      precision, recall, f1 = compute_f1(y.detach(), torch.round(torch.sigmoid(output.detach())))
      epoch_f1 += f1

      # clean up for memory
      del batch
      del x
      del y
      del lengths
      del output
      del loss
      while (gc.collect() != 0):
        x = 1
        del x
      

    # print and log
    epoch_loss = epoch_loss / (idx+1)
    epoch_f1 = epoch_f1 / (idx+1)

    print('Epoch:', e, '\tLoss:', epoch_loss, '\tF1:', epoch_f1)
    if log_file:
      logger.add_scalar('loss', epoch_loss, e)
      logger.add_scalar('f1', epoch_f1, e)

  # save the final model
  if log_file:
    CHECKPOINT_PATH = '/content/drive/My Drive/cis522/project/models/checkpoints'
    CHECKPOINT_PATH = os.path.join(CHECKPOINT_PATH,log_file+'_epoch'+str(e)+'_final')
    torch.save(model.state_dict(), CHECKPOINT_PATH)
  pass 

# haven't run - still need to debug.
# def test_model(iterator, model, returnResults = False):
#   with torch.no_grad():
#     labels = []
#     targets = []
#     for idx, batch in enumerate(iterator):
#       # get the batch
#       (text, text_lengths), score = batch
#       text, text_lengths, score = text.to(device), text_lengths.to(device), score.to(device)

#       # run the model
#       output = model(text, text_lengths)
#       labels = labels + (torch.argmax(output, dim=1)).tolist()
#       targets = targets + score.tolist()
    
#     print(f1_score(labels, targets, average='macro'))
#     if returnResults:
#       return targets, labels

# Model

In [0]:
import gc
from torch.nn.utils.rnn import pack_padded_sequence

In [0]:
class MIMICClassifier(nn.Module):
  """ 
  Mimic Classifier

  Parameters: 
  mode (string): Type of recurrent layer being used. Types are ['rnn', 'lstm', 'gru', 'bilstm']
  output_size (int): Size of the last layer for classification (hint: how many classes do you have?)
  hidden_size (int): Length of your hidden state vector
  vocab_size (int): Length of your vocab (can get this by doing len(TEXT.vocab))
  embedding_length (int): Dimension of your word embedding vector (hint: look at part 2c)
  word_embeddings (Tensor): All of the word embeddings generated. Can get this from TEXT.vocab.vectors
  """
  def __init__(self, mode, output_size, embedding_length, hidden_size):
    super(MIMICClassifier, self).__init__()

    self.mode = mode

    if mode is 'rnn':
      self.recurrent_layer = nn.RNN(embedding_length, hidden_size, nonlinearity='relu', batch_first=True)
    elif mode is 'lstm':
      self.recurrent_layer = nn.LSTM(embedding_length, hidden_size, batch_first=True)
    elif mode is 'gru':
      self.recurrent_layer = nn.GRU(embedding_length, hidden_size, batch_first=True)
    elif mode is 'bilstm':
      self.recurrent_layer = nn.LSTM(embedding_length, hidden_size, bidirectional=True, batch_first=True)
    else:
      raise ValueError("Choose a mode from - rnn / lstm / gru / bilstm")
    
    self.fc = nn.Linear(hidden_size, output_size)

  def getHidden(self, sequence):
    if self.mode is 'rnn':
      output, hidden = sequence
      return hidden.squeeze()
    elif self.mode is 'lstm':
      output, (hidden, cell) = sequence
      return hidden.squeeze()
    elif self.mode is 'gru':
      output, hidden = sequence
      return hidden.squeeze()
    elif self.mode is 'bilstm':
      output, (hidden, cell) = sequence
      hidden = (hidden[0, :, :] + hidden[1, : , :])
      return hidden.squeeze()

  def forward(self, embedding_sequence, lengths):
    packed = pack_padded_sequence(embedding_sequence, lengths, enforce_sorted = False, batch_first=True)
    sequence = self.recurrent_layer(packed)
    hidden = self.getHidden(sequence)
    return self.fc(hidden)

In [160]:
rnn = MIMICClassifier(
    mode='rnn', 
    output_size=num_classes, 
    hidden_size=512, 
    embedding_length=768, 
    )
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
  rnn = nn.DataParallel(rnn)
rnn.to(device)
criterion = nn.BCEWithLogitsLoss(weight=val_dataset.getClassLossWeights().to(device))
optimizer = torch.optim.Adam(rnn.parameters()) # need low learning rate - gradients are funky here.
train_model(val_loader, rnn, criterion, optimizer, 5)

Epoch: 0	 on batch 0 of 294


KeyboardInterrupt: ignored

In [0]:
del rnn
while (gc.collect() != 0):
  x = 1
  del x

In [112]:
x=1
del x
x = gc.collect()
print(x)

0
