In [2]:
import multiprocessing
import numpy as np
import pandas as pd
import torch
from torch import optim
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from lightgbm import LGBMClassifier
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
import os
os.chdir("/gdrive/MyDrive/ml-for-healthcare")

In [6]:
from encoder import AutoEncoderCnn

In [7]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Q4: Finetuning Strategies using Encoder from Q2

## 1. Classic ML
Similar to Q3,
- obtain representations for the PTB dataset by feeding the dataset through the pre-trained encoder
- Use a classic ML method from Part 1 to train and test for the PTB task using these representations

### Encode dataset

In [None]:
# load data
train = pd.read_csv("ptbdb_train.csv", header=None)
test = pd.read_csv("ptbdb_test.csv", header=None)

X_train = train.drop(187, axis=1)
X_test = test.drop(187, axis=1)

y_train = train[187]
y_test = test[187]

In [9]:
# load autoencoder model
ae = AutoEncoderCnn()
ae.to(DEVICE)
ae.load_state_dict(torch.load("encoder_model.pth"))

In [None]:
# feed data through encoder model to get embeddings

def get_encodings(model, X, device, batchsize=64):
  X_tensor = torch.tensor(X.values, dtype=torch.float32)
  X_dataloader = DataLoader(X_tensor, batch_size=batchsize, shuffle=False)

  encodings = []

  model.eval()
  with torch.no_grad():
    for x_batch in X_dataloader:
      x_batch = x_batch.unsqueeze(1)
      x_batch = x_batch.to(device)
      x_batch_encoding = model(x_batch)
      encodings.append(x_batch_encoding.detach().cpu().numpy())

  return encodings

encoder = ae.encoder
X_train_encodings = get_encodings(model=encoder, X=X_train, device=DEVICE)
X_test_encodings = get_encodings(model=encoder, X=X_test, device=DEVICE)

In [None]:
# create numpy matrices from list of batches
X_train_encodings = np.vstack(X_train_encodings)
X_test_encodings = np.vstack(X_test_encodings)

### Train and Test classic ML model

In [None]:
# train classic ML model
clf = LGBMClassifier(verbose=-1, n_estimators=500, learning_rate=0.1, max_depth=9) # choose parameters from part 1
clf.fit(X_train_encodings, y_train)

In [None]:
# test performance
y_pred = clf.predict(X_test_encodings)
accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), balanced_accuracy_score(y_pred, y_test)

In [None]:
# train classic ML model
param_grid = {'n_estimators': [100, 200, 300, 400, 500],
              'learning_rate': [0.01, 0.1, 1],
              'max_depth': [3, 5, 7, 9]}

clf = LGBMClassifier(verbose=-1)

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_encodings, y_train)

In [None]:
print("Best parameters: {}".format(grid_search.best_params_))

Best parameters: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500}


In [None]:
# test performance
y_pred = grid_search.predict(X_test_encodings)
accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), balanced_accuracy_score(y_pred, y_test)

(0.971830985915493, 0.9805410536307546, 0.9665816882845981)

## 2. ANNs
Add output layer(s) for the PTB binary class to your encoder model. Implement the following finetuning strategies
- Train the output layer(s) only on the PTB dataset, freezing the encoder
- Train the entire model on the PTB dataset (encoder + output layers).
- First, train the output layers, then unfreeze and train the entire joint model in two
separate stages.

### Add output layer(s) for the PTB binary class to your encoder model.

In [None]:
class TimeSeriesClassifier(torch.nn.Module):
  def __init__(self, encoder, hidden_size, layer_sizes, dropout_prob):
    super(TimeSeriesClassifier, self).__init__()
    self.encoder = encoder
    self.classifier = torch.nn.Sequential(
        torch.nn.Linear(hidden_size, layer_sizes[0]),
        torch.nn.ReLU(),
        torch.nn.Dropout(dropout_prob),
        torch.nn.Linear(layer_sizes[0], layer_sizes[1]),
        torch.nn.ReLU(),
        torch.nn.Dropout(dropout_prob),
        torch.nn.Linear(layer_sizes[1], 1),
        torch.nn.Sigmoid()
    )

  def forward(self, x):
    return self.classifier(self.encoder(x))

In [None]:
def train_clf(model, epochs, batch_size, train_loader, lr, device):
  criterion = torch.nn.BCELoss()
  optimizer = optim.Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):

      model.train()

      epoch_loss = 0

      for X, y in train_loader:

          X = X.to(DEVICE)
          y = y.to(DEVICE)

          y_pred = model(X.unsqueeze(1))

          # loss = criterion(y_pred, batch["score"])
          loss = criterion(y_pred.squeeze(1), y)

          # backward pass
          optimizer.zero_grad()
          loss.backward()

          # update weights
          optimizer.step()

          epoch_loss += loss.item()

      print(f"epoch {epoch} train loss {epoch_loss}")

In [None]:
def get_predictions(model, test_loader):
  y_preds = []
  model.eval()
  with torch.no_grad():
    for X, y in test_loader:
      X = X.to(DEVICE)
      y = y.to(DEVICE)
      y_pred = model(X.unsqueeze(1))
      y_preds.append(y_pred.detach().cpu().numpy())
  return np.round(np.vstack(y_preds))

In [None]:
train_dataset = TensorDataset(torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(X_test.to_numpy(), dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))

BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=BATCH_SIZE, pin_memory=True)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE, pin_memory=True)

In [None]:
N_EPOCHS = 30
LR = 0.0003
hidden_size = 128 # bottleneck dimension of encoder

### 1 Train the output layer(s) only on the PTB dataset, freezing the encoder

In [None]:
model1 = TimeSeriesClassifier(encoder=encoder, hidden_size=hidden_size, layer_sizes=[hidden_size, int(hidden_size/2)], dropout_prob=0.05)
model1 = model1.to(DEVICE)

# freeze encoder
for param in model1.encoder.parameters():
    param.requires_grad = False

In [None]:
train_clf(model=model1, epochs=N_EPOCHS, batch_size=BATCH_SIZE, train_loader=train_loader, lr=LR, device=DEVICE)

epoch 0 train loss 96.75105077028275
epoch 1 train loss 82.85167887806892
epoch 2 train loss 76.78185647726059
epoch 3 train loss 73.41295690834522
epoch 4 train loss 71.32681079208851
epoch 5 train loss 69.34938235580921
epoch 6 train loss 67.92075580358505
epoch 7 train loss 65.6915245205164
epoch 8 train loss 64.0440423488617
epoch 9 train loss 62.470800653100014
epoch 10 train loss 60.63560292124748
epoch 11 train loss 58.99235409498215
epoch 12 train loss 57.70361717045307
epoch 13 train loss 56.13388539850712
epoch 14 train loss 54.60328558087349
epoch 15 train loss 53.588440611958504
epoch 16 train loss 52.16248382627964
epoch 17 train loss 51.70906560122967
epoch 18 train loss 50.67980144917965
epoch 19 train loss 49.854457914829254
epoch 20 train loss 48.64652144908905
epoch 21 train loss 48.43485701829195
epoch 22 train loss 47.89348563551903
epoch 23 train loss 47.477348893880844
epoch 24 train loss 46.03241886943579
epoch 25 train loss 46.11780732125044
epoch 26 train loss 

In [None]:
y_pred = get_predictions(model1, test_loader)
accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), balanced_accuracy_score(y_pred, y_test)

(0.9089659910683614, 0.9374262101534829, 0.8901142771745746)

### 2 Train the entire model on the PTB dataset (encoder + output layers).


In [None]:
model2 = TimeSeriesClassifier(encoder=encoder, hidden_size=hidden_size, layer_sizes=[hidden_size, int(hidden_size/2)], dropout_prob=0.05)
model2 = model2.to(DEVICE)

In [None]:
train_clf(model=model2, epochs=N_EPOCHS, batch_size=BATCH_SIZE, train_loader=train_loader, lr=LR, device=DEVICE)

epoch 0 train loss 93.90019416809082
epoch 1 train loss 81.07586461305618
epoch 2 train loss 75.7562313079834
epoch 3 train loss 72.7193745970726
epoch 4 train loss 70.04418049752712
epoch 5 train loss 67.75436583161354
epoch 6 train loss 65.6602114289999
epoch 7 train loss 63.72325348854065
epoch 8 train loss 62.15292306244373
epoch 9 train loss 60.66056151688099
epoch 10 train loss 59.041644752025604
epoch 11 train loss 57.73614148795605
epoch 12 train loss 56.61657218635082
epoch 13 train loss 54.83641530573368
epoch 14 train loss 53.98317164182663
epoch 15 train loss 52.626456037163734
epoch 16 train loss 52.05014856159687
epoch 17 train loss 50.891385301947594
epoch 18 train loss 49.78858503699303
epoch 19 train loss 49.4273085296154
epoch 20 train loss 48.27242286503315
epoch 21 train loss 47.96456269919872
epoch 22 train loss 47.34878271818161
epoch 23 train loss 46.62107890844345
epoch 24 train loss 45.81991498917341
epoch 25 train loss 45.03987929970026
epoch 26 train loss 45.

In [None]:
y_pred = get_predictions(model2, test_loader)
accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), balanced_accuracy_score(y_pred, y_test)

(0.9089659910683614, 0.9365877004067958, 0.8841817253948405)

### 3 First, train the output layers, then unfreeze and train the entire joint model in two separate stages.

In [None]:
model3 = TimeSeriesClassifier(encoder=encoder, hidden_size=hidden_size, layer_sizes=[hidden_size, int(hidden_size/2)], dropout_prob=0.05)
model3 = model3.to(DEVICE)

# freeze encoder
for param in model3.encoder.parameters():
    param.requires_grad = False

In [None]:
train_clf(model=model3, epochs=N_EPOCHS, batch_size=BATCH_SIZE, train_loader=train_loader, lr=LR, device=DEVICE)

epoch 0 train loss 94.42457100749016
epoch 1 train loss 79.47241824865341
epoch 2 train loss 74.53904396295547
epoch 3 train loss 71.53491115570068
epoch 4 train loss 69.51486666500568
epoch 5 train loss 67.66246801614761
epoch 6 train loss 65.78127066791058
epoch 7 train loss 64.057232812047
epoch 8 train loss 62.46819940209389
epoch 9 train loss 61.24090501666069
epoch 10 train loss 60.1319864243269
epoch 11 train loss 58.83323009312153
epoch 12 train loss 58.01156145334244
epoch 13 train loss 56.44874532520771
epoch 14 train loss 55.8611836284399
epoch 15 train loss 54.71998347342014
epoch 16 train loss 54.296542167663574
epoch 17 train loss 53.40027917921543
epoch 18 train loss 52.50051248073578
epoch 19 train loss 51.83314363658428
epoch 20 train loss 50.68542338907719
epoch 21 train loss 50.211225643754005
epoch 22 train loss 49.482344046235085
epoch 23 train loss 48.64254926145077
epoch 24 train loss 47.87079732120037
epoch 25 train loss 47.70423060655594
epoch 26 train loss 47.

In [None]:
# unfreeze encoder
for param in model3.encoder.parameters():
    param.requires_grad = True

train_clf(model=model3, epochs=N_EPOCHS, batch_size=BATCH_SIZE, train_loader=train_loader, lr=LR, device=DEVICE)

epoch 0 train loss 43.77062365412712
epoch 1 train loss 39.25150250643492
epoch 2 train loss 36.64535838365555
epoch 3 train loss 34.47888941317797
epoch 4 train loss 33.02504141628742
epoch 5 train loss 30.55866066366434
epoch 6 train loss 29.768264915794134
epoch 7 train loss 28.224283169955015
epoch 8 train loss 25.375430420041084
epoch 9 train loss 25.268023643642664
epoch 10 train loss 24.12083588168025
epoch 11 train loss 23.509501680731773
epoch 12 train loss 22.748831935226917
epoch 13 train loss 21.27926165983081
epoch 14 train loss 20.231194971129298
epoch 15 train loss 20.172702809795737
epoch 16 train loss 19.447189010679722
epoch 17 train loss 18.060569409281015
epoch 18 train loss 17.55083186738193
epoch 19 train loss 17.16678594239056
epoch 20 train loss 17.135951979085803
epoch 21 train loss 15.627752775326371
epoch 22 train loss 15.311705376952887
epoch 23 train loss 15.306591225787997
epoch 24 train loss 14.94433636777103
epoch 25 train loss 14.0496222153306
epoch 26 

In [None]:
y_pred = get_predictions(model3, test_loader)
accuracy_score(y_pred, y_test), f1_score(y_pred, y_test), balanced_accuracy_score(y_pred, y_test)

(0.9725180350395053, 0.9810066476733144, 0.9671015496032802)