In [3]:
import json
import os
import mlflow

In [4]:
# Make tokenizer for amino acids

# Special symbols that sit alongside the letter codes in the alphabet.
STOP = "*"
GAP = "-"

# Full token alphabet: the 26 upper-case letters (with 'X' listed after 'Z'),
# followed by the two special symbols.
TOKENS = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'X',
    STOP, GAP,
]

# Reverse lookup: token -> position of that token in TOKENS.
TOKEN_DICT = {token: position for position, token in enumerate(TOKENS)}

def tokenizer(seq):
    """Split a sequence string into single-character tokens.

    The STOP symbol is appended to the string before splitting, so the
    returned list always ends with STOP.

    Args:
        seq: The sequence as a string.

    Returns:
        A list with one single-character string per symbol of ``seq``,
        followed by STOP.
    """
    terminated = seq + STOP
    return list(terminated)


In [5]:
# Get torch ready by setting device
# Note: a GPU is highly recommended for this example
import torch

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Bare expression so the notebook echoes the selected device.
device

device(type='cpu')

In [9]:
# Get the data ready for torch
# This assumes that you have run the data transform script deepsol_transform.py

from torchtext import data

# Length passed to the sequence field as `fix_length`.
fix_len = 100

# `seq` is tokenized with `tokenizer` and padded with GAP; unknown symbols map to 'X'.
# `tgt` bypasses the vocabulary (presumably the label column is already numeric — confirm
# against the output of deepsol_transform.py).
seq = data.Field(sequential=True, tokenize=tokenizer, fix_length=fix_len, pad_token=GAP, unk_token='X')
tgt = data.Field(sequential=False, use_vocab=False, is_target=True)

# NOTE: the splits are deliberately NOT named `train`/`val`/`test`. Another cell in this
# notebook defines a function called `train`; binding a dataset to that name means that
# re-running this cell after the function cell replaces the function with a dataset, and
# the experiment runner then fails when it calls `train(...)`.
train_ds, val_ds, test_ds = data.TabularDataset.splits(
    path='.data/deepsol',
    train='train.tsv',
    validation='val.tsv',
    test='test.tsv',
    format='tsv',
    fields=[('seq', seq), ('tgt', tgt)],
)

# The vocabulary is built from the training split only.
seq.build_vocab(train_ds)
seq_vocab = seq.vocab

batch_size = 32
train_iter, val_iter, test_iter = data.Iterator.splits(
    (train_ds, val_ds, test_ds),
    device=device,
    batch_size=batch_size,
    sort=False,
)

# Pull a single batch so the cell output shows what the iterators produce.
sample_batch = next(iter(train_iter))

sample_batch


[torchtext.data.batch.Batch of size 32]
	[.seq]:[torch.LongTensor of size 100x32]
	[.tgt]:[torch.LongTensor of size 32]

In [7]:
# Make whatever model you want
# This is an LSTM classifier that doesn't seem to work very well
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    """Multi-layer LSTM followed by a linear layer applied at every time step.

    The LSTM is created with PyTorch's default layout (``batch_first`` is not
    set), so ``forward`` expects ``(seq_len, batch, vocab_size)`` input and
    returns ``(seq_len, batch, output_size)``.

    Args:
        vocab_size: Size of each input feature vector fed to the LSTM.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of features produced by the final linear layer.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, output_size)

        # (h, c) recurrent state. None makes nn.LSTM start from zeros, so
        # forward() also works when make_hidden() has not been called yet.
        self.hidden = None

    def make_hidden(self, batch_size, device):
        """Reset the recurrent state to zeros for a batch of ``batch_size``.

        Args:
            batch_size: Number of sequences in the upcoming batch.
            device: Device on which the state tensors are allocated.
        """
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        # Allocate directly on the target device; plain tensors replace the
        # deprecated autograd.Variable wrapper.
        self.hidden = (
            torch.zeros(state_shape, device=device),
            torch.zeros(state_shape, device=device),
        )

    def forward(self, seq):
        """Run the LSTM over ``seq`` and project every time step.

        The state left behind by the LSTM is stored back on ``self.hidden``;
        call ``make_hidden`` before the next independent batch to reset it.

        Args:
            seq: Float tensor shaped ``(seq_len, batch, vocab_size)``.

        Returns:
            Tensor shaped ``(seq_len, batch, output_size)``.
        """
        out, self.hidden = self.lstm(seq, self.hidden)
        return self.out(out)

In [8]:
# Build your training and validation functions

from torch import optim
import mlflow
import numpy as np

def train_batch(model, optimizer, train_iter, update_freq=128):
    """Train ``model`` for one pass over ``train_iter`` and log the mean loss.

    Reads the notebook-level globals ``seq_vocab`` (size of the one-hot
    encoding), ``device`` and ``loss_func``; they must be defined before this
    function is called.

    Args:
        model: Model exposing ``make_hidden(batch_size, device)`` and returning
            one prediction per time step when called.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_iter: Iterable of batches with ``.seq`` and ``.tgt`` attributes;
            must support ``len()``.
        update_freq: Print a progress line every ``update_freq`` batches.
    """
    losses = []
    num_batches = len(train_iter)

    # Make sure the model is in training mode, whatever mode it was left in.
    model.train()

    for i, batch in enumerate(train_iter, start=1):
        # Prepare data
        optimizer.zero_grad()
        embed = nn.functional.one_hot(batch.seq, len(seq_vocab)).float()
        target = batch.tgt
        model.make_hidden(len(target), device)

        # Get loss and perform gradient descent.
        # Only the last time step is used. No squeeze() here: indexing already
        # removes the time dimension, and squeeze() would also drop the batch
        # dimension when a batch holds a single example, breaking the loss.
        pred = model(embed)[-1]
        loss = loss_func(pred, target)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        # Print to console sometimes so we can tell if we die
        if i % update_freq == 0:
            print("Batch " + str(i) + "/" + str(num_batches) + " Epoch " + str(100*i/num_batches) + "% complete")
            print("Current Loss: " + str(np.mean(losses)))

    # Get average loss for epoch
    avg_loss = np.mean(losses)
    print("Average loss: " + str(avg_loss))
    # Log to MLFlow
    mlflow.log_metric('Train Loss', float(avg_loss))
        
def validate(model, val_iter, is_val):
    """Evaluate ``model`` on ``val_iter`` and log the results to MLflow.

    Reads the notebook-level globals ``seq_vocab``, ``device`` and
    ``loss_func``; they must be defined before this function is called.

    Args:
        model: Model exposing ``make_hidden(batch_size, device)`` and returning
            one prediction per time step when called.
        val_iter: Iterable of batches with ``.seq`` and ``.tgt`` attributes.
        is_val: True logs 'Val Acc' and 'Val Loss' (validation set); False
            logs 'Test Acc' only (test set).
    """
    losses = []
    total = 0
    correct = 0

    # Evaluate in eval mode, then restore the previous mode so a following
    # training epoch is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in val_iter:
                # Prepare data
                embed = nn.functional.one_hot(batch.seq, len(seq_vocab)).float()
                target = batch.tgt
                model.make_hidden(len(target), device)

                # Only the last time step is used. No squeeze(): it would drop
                # the batch dimension for a single-example batch and break both
                # the loss and argmax(axis=1) below.
                pred = model(embed)[-1]

                # Get loss value
                losses.append(loss_func(pred, target).item())

                # Get actual error number
                tgt_np = target.cpu().numpy()
                pred_np = pred.cpu().numpy().argmax(axis=1)
                total += len(pred_np)
                correct += int((pred_np == tgt_np).sum())
    finally:
        model.train(was_training)

    # Calculate average loss (plain float so MLflow gets the same type in every branch)
    avg_loss = float(np.mean(losses))
    # Calculate accuracy; an empty iterator yields NaN instead of a ZeroDivisionError
    accuracy = correct / total if total else float('nan')
    if is_val:
        # For validation set
        print("Validation Accuracy: " + str(accuracy) + ". " + str(correct) + " correct out of " + str(total))
        mlflow.log_metric('Val Acc', accuracy)
        mlflow.log_metric('Val Loss', avg_loss)
    else:
        # For test set
        print("Test: " + str(correct) + " correct out of " + str(total))
        mlflow.log_metric('Test Acc', accuracy)
            

def train(model, optimizer, epochs, train_iter, val_iter, test_iter):
    """Run the full training schedule, then score the test set once.

    Each epoch is one ``train_batch`` pass over ``train_iter`` followed by a
    ``validate`` pass over ``val_iter``. After the last epoch, ``validate`` is
    called on ``test_iter`` with ``is_val=False``.

    Args:
        model: Model to optimize.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of training epochs.
        train_iter: Batches used for training.
        val_iter: Batches evaluated after every epoch.
        test_iter: Batches evaluated once at the end.
    """
    for epoch_idx in range(epochs):
        # One optimization pass, then a validation pass.
        train_batch(model, optimizer, train_iter)
        validate(model, val_iter, True)

        print("Epoch " + str(epoch_idx) + " complete")

    # Final, one-off evaluation on the held-out test set.
    validate(model, test_iter, False)

In [None]:
# This is our experiment runner
import mlflow.pytorch

mlflow.set_experiment('deepsol')

# End run in case we have one hanging
mlflow.end_run()

# Settings shared by every run.
learning_rate = 4e-3
epochs = 12
# train_batch() and validate() read `loss_func` as a notebook-level global,
# so it has to be bound at this level before training starts.
loss_func = torch.nn.CrossEntropyLoss().to(device)

# One MLflow run per (hidden size, number of layers) configuration.
for hidden_size, num_layers in [(512, 3), (656, 2)]:
    # The context manager ends the run even when training raises, so a failed
    # configuration no longer leaves a run hanging.
    with mlflow.start_run():
        model = LSTMClassifier(len(seq_vocab), hidden_size, num_layers, 2)
        model.to(device)

        # Save this notebook so we know exactly what code ran
        mlflow.log_artifact('deepsol.ipynb')

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Log model params to MLFlow
        mlflow.log_param('Model Type', 'LSTM')
        mlflow.log_param('Hidden Size', hidden_size)
        mlflow.log_param('Number of Layers', num_layers)
        mlflow.log_param('Learning Rate Init', learning_rate)
        mlflow.log_param('Fix Length', fix_len)

        # Call Training loop
        train(model, optimizer, epochs, train_iter, val_iter, test_iter)

        # Log final Model
        mlflow.pytorch.log_model(model, "models")

In [None]:
# Manual cleanup cell: ends the currently active MLflow run, e.g. after the
# experiment cell was interrupted part-way through a run.
mlflow.end_run()