In [None]:
# Import utility libraries for evaluating model results
import csv
import json
import os
import time

# Import necessary libraries
import Levenshtein
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.ticker import PercentFormatter
from sklearn.metrics import accuracy_score

In [None]:
# Check if cuda is available

# Automatic mixed precision is switched off; this flag is passed as `enabled=`
# to autocast and GradScaler further down.
use_amp = False

# Probe for a CUDA device and pick the worker count accordingly:
# 8 workers with CUDA, 0 without.
cuda = torch.cuda.is_available()
if cuda:
    num_workers = 8
else:
    num_workers = 0

print("Cuda = %s with num_workers = %d" % (cuda, num_workers))

In [None]:
# Load data into numpy

def load_data(in_filename):
    """Load an .npy file and cast each top-level entry to float32.

    Args:
        in_filename: Path of the .npy file to read.

    Returns:
        The loaded array. Every entry along axis 0 has been replaced by its
        float32 conversion (assigned back into the same array).
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary objects, which can
    # execute code. Only load files from a trusted source.
    sequences = np.load(in_filename, allow_pickle=True)
    num_sequences = sequences.shape[0]
    for idx in range(num_sequences):
        entry = sequences[idx]
        sequences[idx] = entry.astype(np.float32)
    return sequences

def load_labels(in_filename):
    """Load a label array from an .npy file and return it unchanged.

    Args:
        in_filename: Path of the .npy file to read.

    Returns:
        The array exactly as produced by np.load.
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary objects, which can
    # execute code. Only load files from a trusted source.
    return np.load(in_filename, allow_pickle=True)

In [None]:
# Load data into numpy

# Training split: sequences and their labels.
(x_train, labels_train) = (
    load_data("../../project/data/train_data_balanced.npy"),
    load_labels("../../project/data/train_label_balanced.npy"),
)

# Validation split: sequences and their labels.
(x_val, labels_val) = (
    load_data("../../project/data/validation_data_balanced.npy"),
    load_labels("../../project/data/validation_label_balanced.npy"),
)

In [None]:
# Get smaller subset of data to verify code correctness

def data_subset(x, labels, subset_size, seed=None):
    """Draw a random sample of (x, labels) pairs, with replacement.

    The same random indices are applied to both arrays, so every returned
    sample stays paired with its label. Because indices are drawn
    independently, the result may contain duplicates and subset_size may
    exceed the number of available samples.

    Args:
        x: Array of samples, indexed along axis 0.
        labels: Array of labels; must have the same length as x along axis 0.
        subset_size: Number of pairs to draw.
        seed: Optional integer seed. When given, the draw is reproducible and
            does not touch NumPy's global random state. When None (default),
            the global NumPy random state is used, as before.

    Returns:
        Tuple (x_sampled, labels_sampled), each of length subset_size.

    Raises:
        AssertionError: If x and labels differ in length along axis 0.
    """
    assert(x.shape[0] == labels.shape[0])
    if seed is None:
        indices = np.random.randint(x.shape[0], size=subset_size)
    else:
        # A private RandomState gives a repeatable draw without reseeding globally.
        indices = np.random.RandomState(seed).randint(x.shape[0], size=subset_size)
    return (x[indices], labels[indices])

# The training subset is sampled from the training split. It was previously
# sampled from the validation split, which meant the model trained on the same
# pool of samples that the validation subset below is drawn from.
(x_subset, labels_subset) = data_subset(x_train, labels_train, 43432)
(x_subset_val, labels_subset_val) = data_subset(x_val, labels_val, 4842)

In [None]:
# Print data shape

# Report the shape and dtype of every array that feeds a dataloader.
print("Training data shape: ", x_train.shape, x_train.dtype)
print("Training labels shape: ", labels_train.shape, labels_train.dtype)

print("Validation data shape: ", x_val.shape, x_val.dtype)
print("Validation labels shape: ", labels_val.shape, labels_val.dtype)

print("Subset data shape: ", x_subset.shape, x_subset.dtype)
print("Subset labels shape: ", labels_subset.shape, labels_subset.dtype)

# Report the dtype of the validation subset itself (this line used to print x_subset.dtype).
print("Subset validation data shape: ", x_subset_val.shape, x_subset_val.dtype)
print("Subset validation labels shape: ", labels_subset_val.shape, labels_subset_val.dtype)

In [None]:
# Dataset definition

class LSTMDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each sequence in `x` with its entry in `y`."""

    def __init__(self, x, y):
        # Both containers are kept by reference; nothing is copied or converted here.
        self.x, self.y = x, y

    def __len__(self):
        # The label container defines the dataset size.
        return len(self.y)

    def __getitem__(self, index):
        # Conversion happens per item: build a tensor from the stored sequence,
        # then cast it to int64. The label is returned as stored.
        sequence = torch.Tensor(self.x[index]).to(torch.int64)
        label = self.y[index]
        return (sequence, label)
    

def pad_collate(batch):
    """Collate variable-length (x, y) samples into one zero-padded batch.

    Args:
        batch: A list of length batch_size, where each entry is an (x, y)
            tuple: x is a tensor of sequence values, y is a scalar label.

    Returns:
        Tuple (xs, x_lengths, y):
            xs: The x tensors padded with 0 to the longest sequence and
                stacked time-major (batch_first=False), i.e. the first axis
                is the padded sequence length and the second is the batch.
            x_lengths: int64 tensor with the original length of each x.
            y: int64 tensor with one label per sample.
    """
    (xs, y) = zip(*batch)

    # Build the integer tensors directly as int64. The legacy
    # torch.Tensor(...).long() route creates a float32 tensor first, which
    # cannot represent integers above 2**24 exactly.
    x_lengths = torch.tensor([len(x) for x in xs], dtype=torch.long)
    xs = torch.nn.utils.rnn.pad_sequence(xs, batch_first=False, padding_value=0)
    y = torch.tensor(y, dtype=torch.long)

    return (xs, x_lengths, y)

In [None]:
# Dataloaders

# Training loaders: shuffle every epoch and drop the last partial batch.
dataloader_args = {
    "shuffle": True,
    "batch_size": 64,
    "drop_last": True,
    "pin_memory": cuda,  # pinned host memory only helps when copying to a GPU
    "num_workers": 0,
    "collate_fn": pad_collate,
}

# Evaluation loaders: fixed order and no dropped samples, so metrics cover
# every validation example.
testloader_args = {
    "shuffle": False,
    "batch_size": 64,
    "drop_last": False,
    "pin_memory": cuda,
    "num_workers": 0,
    "collate_fn": pad_collate,
}

# Training dataloader
train_data = LSTMDataset(x_train, labels_train)
train_loader = torch.utils.data.DataLoader(train_data, **dataloader_args)

# Validation dataloader (uses the evaluation settings; it previously reused
# the training settings and silently dropped the last partial batch)
val_data = LSTMDataset(x_val, labels_val)
val_loader = torch.utils.data.DataLoader(val_data, **testloader_args)

# Subset dataloader
subset_data = LSTMDataset(x_subset, labels_subset)
subset_loader = torch.utils.data.DataLoader(subset_data, **dataloader_args)

# Subset validation dataloader (evaluation settings, as above)
subset_val_data = LSTMDataset(x_subset_val, labels_subset_val)
subset_val_loader = torch.utils.data.DataLoader(subset_val_data, **testloader_args)

In [None]:
# Save model

def save_model(model, optimizer, scaler, timestamp_str):
    """Write a checkpoint to models/model_<timestamp_str>.pt.

    The checkpoint is a dict with the state dicts of the model, the optimizer
    and the gradient scaler under the keys "model", "optimizer" and "scaler".

    Args:
        model: Object exposing state_dict() (the network).
        optimizer: Object exposing state_dict() (the optimizer).
        scaler: Object exposing state_dict() (the AMP gradient scaler).
        timestamp_str: String inserted into the checkpoint file name.
    """
    out_path = "models/model_%s.pt" % timestamp_str
    # Create the target directory on first use so saving cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    torch.save({
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scaler": scaler.state_dict()
    }, out_path)

In [None]:
# Train the model

def train_model(train_loader, model, optimizer, criterion, scaler):
    """Run one training epoch and return its accuracy, mean loss and outputs.

    Reads the module-level flags `cuda` (move batches to the GPU) and
    `use_amp` (enable autocast).

    Args:
        train_loader: Iterable yielding (inputs, input_lengths, targets) batches.
        model: Network called as model(inputs, input_lengths); returns
            (out, out_lengths), with out indexed as (seq_len, batch, classes).
        optimizer: Optimizer updating the model parameters.
        criterion: Loss taking (per-sample class scores, targets).
        scaler: AMP gradient scaler used for backward/step/update.

    Returns:
        Tuple (acc, training_loss, predictions, actuals):
            acc: Accuracy over all samples seen in the epoch.
            training_loss: Loss averaged over the number of batches.
            predictions: NumPy array of shape (N, 1) with predicted classes.
            actuals: NumPy array of shape (N, 1) with target classes.
    """
    training_loss = 0
    predictions = []
    actuals = []

    model.train()
    for (inputs, input_lengths, targets) in train_loader:
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            (out, out_lengths) = model(inputs, input_lengths)

        # Pick each sample's own last valid time step. Using only
        # out_lengths[0] would read every sample at the first sample's length,
        # i.e. inside the padding or mid-sequence for all other samples.
        last_steps = (out_lengths - 1).to(out.device)
        batch_indices = torch.arange(out.shape[1], device=out.device)
        out = out[last_steps, batch_indices, :]  # (batch, classes)

        loss = criterion(out, targets)
        training_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Keep per-batch predictions and targets as column vectors for later concatenation.
        out = out.detach()
        out = torch.argmax(out, 1)
        out = torch.reshape(out, (out.numel(), 1))
        predictions.append(out)

        actual = targets.detach()
        actual = torch.reshape(actual, (actual.numel(), 1))
        actuals.append(actual)

    predictions = torch.cat(predictions, 0).cpu().numpy()
    actuals = torch.cat(actuals, 0).cpu().numpy()
    training_loss /= len(train_loader)

    acc = accuracy_score(actuals, predictions)
    return (acc, training_loss, predictions, actuals)

In [None]:
# Evaluate the model

def evaluate_model(val_loader, model, criterion):
    """Evaluate the model on a loader without updating any parameters.

    Reads the module-level flags `cuda` (move batches to the GPU) and
    `use_amp` (enable autocast). Runs in eval mode under torch.no_grad().

    Args:
        val_loader: Iterable yielding (inputs, input_lengths, targets) batches.
        model: Network called as model(inputs, input_lengths); returns
            (out, out_lengths), with out indexed as (seq_len, batch, classes).
        criterion: Loss taking (per-sample class scores, targets).

    Returns:
        Tuple (acc, validation_loss, predictions, actuals):
            acc: Accuracy over all evaluated samples.
            validation_loss: Loss averaged over the number of batches.
            predictions: NumPy array of shape (N, 1) with predicted classes.
            actuals: NumPy array of shape (N, 1) with target classes.
    """
    validation_loss = 0
    predictions = []
    actuals = []

    model.eval()
    with torch.no_grad():
        for (inputs, input_lengths, targets) in val_loader:
            if cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            with torch.cuda.amp.autocast(enabled=use_amp):
                (out, out_lengths) = model(inputs, input_lengths)

            # Pick each sample's own last valid time step. Using only
            # out_lengths[0] would read every sample at the first sample's
            # length, i.e. inside the padding or mid-sequence for the others.
            last_steps = (out_lengths - 1).to(out.device)
            batch_indices = torch.arange(out.shape[1], device=out.device)
            out = out[last_steps, batch_indices, :]  # (batch, classes)

            loss = criterion(out, targets)
            validation_loss += loss.item()

            # Keep per-batch predictions and targets as column vectors.
            out = out.detach()
            out = torch.argmax(out, 1)
            out = torch.reshape(out, (len(out), 1))
            predictions.append(out)

            actual = targets.detach()
            actual = torch.reshape(actual, (len(actual), 1))
            actuals.append(actual)

        predictions = torch.cat(predictions, 0).cpu().numpy()
        actuals = torch.cat(actuals, 0).cpu().numpy()
        validation_loss /= len(val_loader)

    acc = accuracy_score(actuals, predictions)
    return (acc, validation_loss, predictions, actuals)

In [None]:
# Define model architecture

# Pyramidal BiLSTM
# Each layer reduces time resolution by a given reduction factor
class pBLSTM(torch.nn.Module):
    """Pyramidal bidirectional LSTM layer.

    Groups every `reduction_factor` consecutive time steps into one step by
    stacking their features, then runs a single-layer bidirectional LSTM over
    the shortened sequence.
    """

    def __init__(self, input_size, hidden_size, reduction_factor):
        super().__init__()
        self.reduction_factor = reduction_factor
        # The LSTM consumes the stacked features of one group of time steps.
        self.blstm = torch.nn.LSTM(
            input_size * reduction_factor,
            hidden_size,
            num_layers=1,
            batch_first=False,
            bidirectional=True,
        )

    def forward(self, x):
        """Reduce x of shape (seq_len, batch_size, input_size) and run the BiLSTM.

        Returns whatever the wrapped torch.nn.LSTM returns: (output, (h_n, c_n)).
        """
        factor = self.reduction_factor
        (seq_len, batch_size, input_size) = x.shape
        reduced_len = seq_len // factor

        # Drop trailing steps that do not fill a complete group.
        x = x[:reduced_len * factor]
        # Split time into (group, step-within-group).
        x = x.reshape(reduced_len, factor, batch_size, input_size)
        # Move the within-group step next to the feature axis so that features
        # are laid out feature-major: index = feature * factor + step.
        x = x.permute(0, 2, 3, 1)
        x = x.reshape(reduced_len, batch_size, input_size * factor)

        return self.blstm(x)

In [None]:
# Define model architecture

class LSTM(torch.nn.Module):
    """Conv1d front-end, 3-layer bidirectional LSTM and a per-step classifier head.

    forward() returns log-probabilities for every time step together with the
    sequence lengths it was given.
    """

    def __init__(self, in_channels, in_classes, out_classes, embedding_size, hidden_size):
        super().__init__()

        # NOTE: in_channels is accepted but not used; the convolution is fixed
        # to a single input channel.
        conv_layers = [
            torch.nn.Conv1d(1, hidden_size, kernel_size=3, stride=1, padding=1, bias=False),
            torch.nn.BatchNorm1d(hidden_size),
            torch.nn.ReLU(inplace=True),
        ]
        self.conv = torch.nn.Sequential(*conv_layers)

        # Registered (and therefore part of the state dict) but not applied in forward().
        self.embedding = torch.nn.Embedding(in_classes, embedding_size)

        # A stack of pBLSTM layers was tried here as an alternative recurrent block.
        self.lstm = torch.nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=3,
            batch_first=False,
            bidirectional=True,
        )

        # The bidirectional output carries 2 * hidden_size features per step.
        head_layers = [
            torch.nn.Linear(2 * hidden_size, hidden_size),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(hidden_size, out_classes),
            torch.nn.LogSoftmax(dim=2),
        ]
        self.linear = torch.nn.Sequential(*head_layers)

    def forward(self, x, x_lengths):
        # Treat the raw values as one float feature per step: (seq_len, batch_size, 1).
        features = x.unsqueeze(2).float()

        # Conv1d wants (batch_size, channels, seq_len); switch back to time-major afterwards.
        features = features.permute(1, 2, 0)
        features = self.conv(features)
        features = features.permute(2, 0, 1)

        # Only the per-step outputs are needed; the final hidden/cell states are discarded.
        (features, _) = self.lstm(features)

        log_probs = self.linear(features)
        # Floor division by 1 leaves the lengths unchanged (no time reduction in this model).
        return (log_probs, x_lengths // 1)

In [None]:
# Model
in_channels = 1
in_classes = 25 # Length of amino mapping
out_classes = np.max(labels_train) + 1 # Add 1 to account for blank
model = LSTM(in_channels, in_classes, out_classes, 32, 128)

# Criterion / loss function
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=0)

# The training loop steps this scheduler with validation accuracy, where higher
# is better, so it must run in mode="max". With the default mode="min" the
# learning rate would be halved when accuracy stops *decreasing*.
# NOTE(review): patience=2500 is far larger than the 20 epochs the loop runs
# with one scheduler step per epoch, so the LR never drops in practice --
# confirm whether a per-epoch patience was intended.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2500)

# The scaler is a pass-through while use_amp is False.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
if cuda:
    model = model.cuda()
    
# Print model
# print(model)
# print(criterion)
# print(optimizer)

In [None]:
epochs = 20
lr_stop = 0.00005

# with torch.autograd.profiler.profile(record_shapes=True, use_cuda=True) as prof:
#     with torch.autograd.profiler.record_function("model_inference"):

# Per-epoch metric histories, consumed by the plotting cells.
train_accs = []
train_losses = []
val_accs = []
val_losses = []

# Create the output directories up front so the run cannot fail on a fresh
# checkout: the log goes to logs/, and save_model writes under models/.
os.makedirs("logs", exist_ok=True)
os.makedirs("models", exist_ok=True)

start_timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
out_filename = "logs/output_%s.json" % start_timestamp
with open(out_filename, "w") as f:
    # First line of the log: a description of the run configuration.
    # Every later line is one JSON object per epoch.
    json.dump({
        "model": repr(model),
        "criterion": repr(criterion),
        "optimizer": repr(optimizer)
    }, f)
    f.write("\n")
    f.flush()

    for epoch in range(epochs):
        # Start timer
        start_time = time.perf_counter()

        # Training and validation
        # lr is read before the epoch, so it is the rate this epoch trained with.
        lr = optimizer.param_groups[0]['lr']
        (train_acc, train_loss, predictions, actuals) = train_model(subset_loader, model, optimizer, criterion, scaler)
        (val_acc, val_loss, _, _) = evaluate_model(subset_val_loader, model, criterion)
        # NOTE(review): the scheduler is fed accuracy (higher is better);
        # confirm it was constructed with mode="max".
        scheduler.step(val_acc)

        # Print log
        time_elapsed = time.perf_counter() - start_time
        timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
        print("Epoch: %d, Training loss: %.3f, Training accuracy: %.2f%%, Validation loss: %.3f, Validation accuracy: %.2f%%, Learning rate: %f, Time elapsed: %.3f, Timestamp: %s" %
              (epoch, train_loss, 100*train_acc, val_loss, 100*val_acc, lr, time_elapsed, timestamp))

        # Save log to disk
        json.dump({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc,
            "learning_rate": lr,
            "time_elapsed": time_elapsed,
            "timestamp": timestamp
        }, f)
        f.write("\n")
        f.flush()

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        # Early stopping once the learning rate has decayed below the threshold
        if lr < lr_stop:
            break

# Save the final weights whether training stopped early or ran through all
# epochs. Previously the checkpoint was only written on early stopping, so a
# run that completed every epoch discarded the trained model.
save_model(model, optimizer, scaler, start_timestamp)
            
# performance = prof.key_averages().table(sort_by="cpu_time_total", row_limit=20)
# print(performance)

In [None]:
# Plot training curves
# Accuracy per epoch, shown as percentages.
plt.figure()
plt.plot(range(len(train_accs)), train_accs, label="Training accuracy")
plt.plot(range(len(val_accs)), val_accs, label="Validation accuracy")
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.title("Model Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# Loss per epoch. The x-range of each curve is taken from the series being plotted.
plt.figure()
plt.plot(range(len(train_losses)), train_losses, label="Training losses")
plt.plot(range(len(val_losses)), val_losses, label="Validation losses")
plt.title("Model Losses")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Class distribution of targets vs. model outputs. `actuals` and `predictions`
# are the arrays returned by the most recent train_model call. The bin edges
# 0..out_classes give one unit-width bin per class.
plt.figure()
plt.hist(actuals, range(out_classes+1), density=True, alpha=0.5, label="Ground truth")
plt.hist(predictions, range(out_classes+1), density=True, alpha=0.5, label="Model output")
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.title("Label Frequencies")
plt.xlabel("Output class")
plt.ylabel("Frequency")
plt.legend()
plt.show()

In [None]:
# Compute confusion matrix

# confusion[p, a] counts the samples predicted as class p whose true class is a:
# rows are model outputs, columns are ground-truth labels.
confusion = np.zeros((out_classes, out_classes))
# np.add.at accumulates correctly when the same (row, column) pair occurs
# more than once, replacing the per-sample Python loop.
np.add.at(confusion, (predictions.ravel(), actuals.ravel()), 1)

plt.figure()
plt.imshow(confusion, cmap="gist_heat", interpolation="nearest")
plt.title("Confusion Matrix")
# imshow draws matrix columns along x and rows along y, so x is the ground
# truth and y is the model output (the two labels were previously swapped).
plt.xlabel("Ground truth label")
plt.ylabel("Model output")
plt.show()