In [1]:
# Import necessary libraries
import numpy as np
import torch
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from sklearn.metrics import accuracy_score
from torchsummaryX import summary

# Import utility libraries for evaluating model results
import csv
import json
import time

In [2]:
# Detect GPU support and derive the suggested DataLoader worker count from it

cuda = torch.cuda.is_available()
if cuda:
    num_workers = 8
else:
    num_workers = 0
print("Cuda = %s with num_workers = %d" % (cuda, num_workers))

Cuda = True with num_workers = 8


In [3]:
# Load data into numpy

def load_data(in_filename):
    """Load an array of sequences from a .npy file and cast each entry to float32.

    Args:
        in_filename: Path of the .npy file to read.

    Returns:
        The loaded array, with every first-axis entry replaced by its
        float32 version.
    """
    # NOTE: allow_pickle=True can execute arbitrary code from the file;
    # only load files from a trusted source.
    sequences = np.load(in_filename, allow_pickle=True)
    for idx, sequence in enumerate(sequences):
        sequences[idx] = sequence.astype(np.float32)
    return sequences

def load_labels(in_filename):
    """Load a label array from a .npy file and return it unchanged.

    Args:
        in_filename: Path of the .npy file to read.

    Returns:
        The array exactly as stored in the file.
    """
    # NOTE: allow_pickle=True can execute arbitrary code from the file;
    # only load files from a trusted source.
    return np.load(in_filename, allow_pickle=True)

In [4]:
# Read the balanced training and validation splits from disk

x_train, labels_train = (
    load_data("data/train_data_balanced.npy"),
    load_labels("data/train_label_balanced.npy"),
)

x_val, labels_val = (
    load_data("data/validation_data_balanced.npy"),
    load_labels("data/validation_label_balanced.npy"),
)

In [5]:
# Report shape and dtype of every loaded array

for caption, array in (
    ("Training data shape: ", x_train),
    ("Training labels shape: ", labels_train),
    ("Validation data shape: ", x_val),
    ("Validation labels shape: ", labels_val),
):
    print(caption, array.shape, array.dtype)

Training data shape:  (43434,) object
Training labels shape:  (43434,) int64
Validation data shape:  (4842,) object
Validation labels shape:  (4842,) int64


In [6]:
# Dataset definition

class LSTMDataset(torch.utils.data.Dataset):
    """Dataset pairing each stored sequence with its label.

    Args:
        x: Indexable collection of sequences.
        y: Indexable collection of labels; its length defines the dataset size.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.y)

    def __getitem__(self, index):
        """Return (sequence converted to an int64 tensor, raw label) for `index`."""
        sequence = torch.Tensor(self.x[index]).to(torch.int64)
        label = self.y[index]
        return (sequence, label)
    

def pad_collate(batch):
    """Collate (sequence, label) samples into one zero-padded, time-major batch.

    Args:
        batch: List of (x, y) tuples, one per sample, where x is a 1-D tensor.

    Returns:
        Tuple (padded, lengths, labels):
            padded  - sequences stacked along dim 1 and padded with 0 up to
                      the longest one (time is dim 0 because batch_first=False)
            lengths - int64 tensor holding each sequence's unpadded length
            labels  - int64 tensor of the labels
    """
    sequences, labels = zip(*batch)

    lengths = torch.Tensor(list(map(len, sequences))).long()
    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=False, padding_value=0)
    labels = torch.Tensor(labels).long()

    return (padded, lengths, labels)

In [18]:
# Dataloaders

# Settings used for the training loader: batches are reshuffled on every pass
# and a trailing incomplete batch is dropped.
dataloader_args = {
    "shuffle": True,
    "batch_size": 32,
    "drop_last": True,
    "pin_memory": True,
    "num_workers": 0,
    "collate_fn": pad_collate,
}

# Validation must see every sample in a fixed order: with drop_last=True the
# samples that do not fill a final batch would be excluded from the metrics,
# and with shuffle=True the excluded subset would differ on every pass.
val_dataloader_args = {
    **dataloader_args,
    "shuffle": False,
    "drop_last": False,
}

# Training dataloader
train_data = LSTMDataset(x_train, labels_train)
train_loader = torch.utils.data.DataLoader(train_data, **dataloader_args)

# Validation dataloader
val_data = LSTMDataset(x_val, labels_val)
val_loader = torch.utils.data.DataLoader(val_data, **val_dataloader_args)

In [19]:
# Train the model

def train_model(train_loader, model, optimizer, criterion):
    """Run one training pass over `train_loader` and collect metrics.

    Args:
        train_loader: Iterable yielding (inputs, input_lengths, targets) batches.
        model: Module called as model(inputs, input_lengths) and returning
            (out, out_lengths); `out` is indexed as (time, batch, classes).
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as criterion(scores, targets) with scores of
            shape (batch, classes).

    Returns:
        Tuple (acc, training_loss, predictions, actuals) where training_loss is
        the mean per-batch loss and predictions/actuals are (N, 1) numpy arrays.
    """
    training_loss = 0
    predictions = []
    actuals = []

    model.train()
    for (inputs, input_lengths, targets) in train_loader:
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        optimizer.zero_grad()

        (out, out_lengths) = model(inputs, input_lengths)

        # Select, for every sequence, the output at ITS OWN last valid time
        # step. Using the first sample's length for the whole batch reads
        # padded positions for shorter sequences and truncates longer ones.
        # The clamp keeps the index valid when a reduced length is 0.
        # Assumes out_lengths is a 1-D integer tensor with one entry per sample.
        last_steps = (out_lengths - 1).clamp(min=0, max=out.size(0) - 1).to(out.device)
        batch_index = torch.arange(out.size(1), device=out.device)
        out = out[last_steps, batch_index, :]

        loss = criterion(out, targets)
        training_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Keep predictions and targets as (batch, 1) columns for concatenation
        predictions.append(torch.argmax(out.detach(), 1).reshape(-1, 1))
        actuals.append(targets.detach().reshape(-1, 1))

    predictions = torch.cat(predictions, 0).cpu().numpy()
    actuals = torch.cat(actuals, 0).cpu().numpy()
    training_loss /= len(train_loader)

    acc = accuracy_score(actuals, predictions)
    return (acc, training_loss, predictions, actuals)

In [20]:
# Evaluate the model

def evaluate_model(val_loader, model, criterion):
    """Evaluate `model` on `val_loader` without updating any parameters.

    Args:
        val_loader: Iterable yielding (inputs, input_lengths, targets) batches.
        model: Module called as model(inputs, input_lengths) and returning
            (out, out_lengths); `out` is indexed as (time, batch, classes).
        criterion: Loss called as criterion(scores, targets) with scores of
            shape (batch, classes).

    Returns:
        Tuple (acc, validation_loss, predictions, actuals) where validation_loss
        is the mean per-batch loss and predictions/actuals are (N, 1) numpy arrays.
    """
    validation_loss = 0
    predictions = []
    actuals = []

    model.eval()
    with torch.no_grad():
        for (inputs, input_lengths, targets) in val_loader:
            if cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            (out, out_lengths) = model(inputs, input_lengths)

            # Select, for every sequence, the output at ITS OWN last valid time
            # step. Using the first sample's length for the whole batch reads
            # padded positions for shorter sequences and truncates longer ones.
            # The clamp keeps the index valid when a reduced length is 0.
            # Assumes out_lengths is a 1-D integer tensor with one entry per sample.
            last_steps = (out_lengths - 1).clamp(min=0, max=out.size(0) - 1).to(out.device)
            batch_index = torch.arange(out.size(1), device=out.device)
            out = out[last_steps, batch_index, :]

            loss = criterion(out, targets)
            validation_loss += loss.item()

            # Keep predictions and targets as (batch, 1) columns for concatenation
            predictions.append(torch.argmax(out.detach(), 1).reshape(-1, 1))
            actuals.append(targets.detach().reshape(-1, 1))

        predictions = torch.cat(predictions, 0).cpu().numpy()
        actuals = torch.cat(actuals, 0).cpu().numpy()
        validation_loss /= len(val_loader)

    acc = accuracy_score(actuals, predictions)
    return (acc, validation_loss, predictions, actuals)

In [21]:
# Define model architecture

# Pyramidal BiLSTM
# Each layer reduces time resolution by a given reduction factor
class pBLSTM(torch.nn.Module):
    """Bidirectional LSTM layer that first shortens the time axis.

    Groups of `reduction_factor` consecutive time steps are merged into a
    single step whose feature vector holds the features of all merged steps,
    then the result is fed to a one-layer bidirectional LSTM.

    Args:
        input_size: Feature size of the incoming sequence.
        hidden_size: Hidden size of each LSTM direction.
        reduction_factor: Number of consecutive time steps merged into one.
    """

    def __init__(self, input_size, hidden_size, reduction_factor):
        super().__init__()
        self.reduction_factor = reduction_factor
        self.blstm = torch.nn.LSTM(
            input_size * reduction_factor,
            hidden_size,
            num_layers=1,
            batch_first=False,
            bidirectional=True,
        )

    def forward(self, x):
        """Merge time steps of a (seq_len, batch_size, input_size) tensor and run the LSTM.

        Returns whatever the wrapped torch.nn.LSTM returns: (output, (h_n, c_n)).
        """
        factor = self.reduction_factor
        (seq_len, batch_size, input_size) = x.shape
        steps = seq_len // factor

        # Drop trailing steps that do not fill a complete group
        x = x[: steps * factor]
        # (steps, factor, batch_size, input_size): split time into groups
        x = x.reshape(steps, factor, batch_size, input_size)
        # (steps, batch_size, input_size, factor): group position varies fastest
        x = x.permute(0, 2, 3, 1)
        # (steps, batch_size, input_size * factor): fold each group into the features
        x = x.reshape(steps, batch_size, input_size * factor)

        return self.blstm(x)

In [25]:
# Define model architecture

class LSTM(torch.nn.Module):
    """Sequence model: embedding -> 1-D conv -> BiLSTM -> three pBLSTMs -> linear head.

    Args:
        in_channels: Accepted for interface compatibility; not used by the model.
        in_classes: Size of the input vocabulary for the embedding.
        out_classes: Number of output classes.
        embedding_size: Dimension of the token embedding.
        hidden_size: Conv output channels and per-direction LSTM hidden size.
    """

    def __init__(self, in_channels, in_classes, out_classes, embedding_size, hidden_size):
        super().__init__()

        bidirectional_size = 2 * hidden_size

        self.embedding = torch.nn.Embedding(in_classes, embedding_size)

        self.conv = torch.nn.Sequential(
            torch.nn.Conv1d(embedding_size, hidden_size, kernel_size=3, stride=1, padding=1, bias=False),
            torch.nn.BatchNorm1d(hidden_size),
            torch.nn.ReLU(inplace=True),
        )

        self.lstm = torch.nn.LSTM(hidden_size, hidden_size, num_layers=1, batch_first=False, bidirectional=True)
        self.pblstm1 = pBLSTM(bidirectional_size, hidden_size, 2)
        self.pblstm2 = pBLSTM(bidirectional_size, hidden_size, 2)
        self.pblstm3 = pBLSTM(bidirectional_size, hidden_size, 2)

        self.linear = torch.nn.Sequential(
            torch.nn.Linear(bidirectional_size, hidden_size),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(hidden_size, out_classes),
            torch.nn.LogSoftmax(dim=2),
        )

    def forward(self, x, x_lengths):
        """Return per-step log-probabilities and the reduced sequence lengths.

        Args:
            x: Token indices laid out as (seq_len, batch_size).
            x_lengths: Unpadded length of every sequence in the batch.

        Returns:
            Tuple (scores, x_lengths // 8); the division mirrors the three
            factor-2 pyramidal layers.
        """
        embedded = self.embedding(x)

        # Conv1d expects (batch_size, channels, seq_len); go there and back
        features = self.conv(embedded.permute(1, 2, 0)).permute(2, 0, 1)

        # Every recurrent layer returns (output, state); only the output is kept
        for recurrent in (self.lstm, self.pblstm1, self.pblstm2, self.pblstm3):
            (features, _) = recurrent(features)

        scores = self.linear(features)
        return (scores, x_lengths // 8)

In [26]:
# Model
in_channels = 1
in_classes = 25 # Length of amino mapping
# +1 turns the largest label index into a class count (labels are assumed to
# start at 0); int() gives torch a plain Python int instead of a numpy scalar.
out_classes = int(np.max(labels_train)) + 1
model = LSTM(in_channels, in_classes, out_classes, 32, 128)

# Criterion / loss function
# NOTE: the model already ends in LogSoftmax. CrossEntropyLoss applies
# log-softmax again, which leaves log-probabilities unchanged, so the loss
# value is unaffected.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=0)
# The training loop steps the scheduler with validation ACCURACY, which should
# increase, so the plateau detector must run in "max" mode. With the default
# "min" mode every improvement in accuracy is counted as a bad epoch.
# NOTE(review): the scheduler is stepped once per epoch, so patience=2500 can
# never elapse within 80 epochs - confirm the intended value.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2500)

if cuda:
    model = model.cuda()
    
# Print model
# print(model)
# print(criterion)
# print(optimizer)

In [24]:
# Maximum number of epochs, and the learning rate below which training stops
epochs = 80
lr_stop = 0.00005

# Per-epoch metric histories (consumed by the plotting cells)
train_accs, train_losses = [], []
val_accs, val_losses = [], []

start_timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
for epoch in range(epochs):
    # Start timer
    start_time = time.perf_counter()

    # Learning rate in effect for this epoch (read before the scheduler steps)
    lr = optimizer.param_groups[0]['lr']

    # Training and validation
    train_acc, train_loss, train_predictions, train_actuals = train_model(
        train_loader, model, optimizer, criterion
    )
    val_acc, val_loss, val_predictions, val_actuals = evaluate_model(
        val_loader, model, criterion
    )
    scheduler.step(val_acc)

    # Print log
    time_elapsed = time.perf_counter() - start_time
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    log_fields = (epoch, train_loss, 100*train_acc, val_loss, 100*val_acc, lr, time_elapsed, timestamp)
    print("Epoch: %d, Training loss: %.3f, Training accuracy: %.2f%%, Validation loss: %.3f, Validation accuracy: %.2f%%, Learning rate: %f, Time elapsed: %.3f, Timestamp: %s" % log_fields)

    # Record this epoch's metrics
    for history, value in (
        (train_losses, train_loss),
        (train_accs, train_acc),
        (val_losses, val_loss),
        (val_accs, val_acc),
    ):
        history.append(value)

    # Early stopping
    if lr < lr_stop:
        break

KeyboardInterrupt: 

In [27]:
# Print the torchsummaryX table using the shape of the first training batch
for batch in train_loader:
    (inputs, input_lengths, targets) = batch
    if cuda:
        inputs = inputs.cuda()
    # Only shape, dtype and device of the batch matter here, so feed zeros
    summary(model, torch.zeros_like(inputs), x_lengths=input_lengths)
    break

                        Kernel Shape     Output Shape    Params   Mult-Adds
Layer                                                                      
0_embedding                 [32, 25]   [1274, 32, 32]     800.0       800.0
1_conv.Conv1d_0         [32, 128, 3]  [32, 128, 1274]   12.288k  15.654912M
2_conv.BatchNorm1d_1           [128]  [32, 128, 1274]     256.0       128.0
3_conv.ReLU_2                      -  [32, 128, 1274]         -           -
4_lstm                             -  [1274, 32, 256]  264.192k    262.144k
5_pblstm1.LSTM_blstm               -   [637, 32, 256]  657.408k     655.36k
6_pblstm2.LSTM_blstm               -   [318, 32, 256]  657.408k     655.36k
7_pblstm3.LSTM_blstm               -   [159, 32, 256]  657.408k     655.36k
8_linear.Linear_0         [256, 128]   [159, 32, 128]   32.896k     32.768k
9_linear.Dropout_1                 -   [159, 32, 128]         -           -
10_linear.Linear_2          [128, 9]     [159, 32, 9]    1.161k      1.152k
11_linear.Lo

In [None]:
# Plot training curves

def _plot_history(train_values, val_values, train_label, val_label, title, ylabel, as_percent=False):
    """Plot one per-epoch metric for training and validation on a fresh figure."""
    plt.figure()
    # Each curve gets an x-axis derived from its own length
    plt.plot(range(len(train_values)), train_values, label=train_label)
    plt.plot(range(len(val_values)), val_values, label=val_label)
    if as_percent:
        plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.title(title)
    plt.xticks(range(0, len(train_values), 2))
    plt.xlabel("Epochs")
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()


def _plot_label_frequencies(actuals, predictions, num_classes, title):
    """Overlay normalized histograms of ground-truth and predicted labels."""
    # One bin per class: edges at 0, 1, ..., num_classes
    bin_edges = range(num_classes + 1)
    plt.figure()
    plt.hist(actuals, bin_edges, density=True, alpha=0.5, label="Ground truth")
    plt.hist(predictions, bin_edges, density=True, alpha=0.5, label="Model output")
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.title(title)
    plt.xticks(range(num_classes))
    plt.xlabel("Output class")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()


_plot_history(train_accs, val_accs, "Training accuracy", "Validation accuracy",
              "Model Accuracy", "Accuracy", as_percent=True)
_plot_history(train_losses, val_losses, "Training losses", "Validation losses",
              "Model Losses", "Loss")

# Label distributions use the predictions from the most recent epoch
_plot_label_frequencies(train_actuals, train_predictions, out_classes, "Label Frequencies (Training)")
_plot_label_frequencies(val_actuals, val_predictions, out_classes, "Label Frequencies (Validation)")

In [None]:
# Compute confusion matrix

def _count_confusions(actuals, predictions, num_classes):
    """Return a (num_classes, num_classes) count matrix indexed [actual, prediction]."""
    counts = np.zeros((num_classes, num_classes))
    # np.add.at accumulates repeated index pairs, matching a per-sample "+= 1"
    np.add.at(counts, (np.ravel(actuals), np.ravel(predictions)), 1)
    return counts


def _show_confusion(counts, title, num_classes):
    """Render a confusion matrix as a heat map on a fresh figure."""
    plt.figure()
    plt.imshow(counts, cmap="gist_heat", interpolation="nearest")
    plt.title(title)
    plt.xticks(range(num_classes))
    plt.xlabel("Model output")
    plt.ylabel("Ground truth label")
    plt.show()


train_confusion = _count_confusions(train_actuals, train_predictions, out_classes)
_show_confusion(train_confusion, "Confusion Matrix (Training)", out_classes)

val_confusion = _count_confusions(val_actuals, val_predictions, out_classes)
_show_confusion(val_confusion, "Confusion Matrix (Validation)", out_classes)