REQUIREMENTS + UTILS

In [7]:
from scripts.read_data import read_data_sequences
from scripts.seq_preprocess import tf_idf
import numpy as np

LOAD DATA

In [2]:
sequences_train, sequences_test, proteins_test, y_train = read_data_sequences()

In [8]:
X_train, X_test = tf_idf(sequences_train, sequences_test)

In [46]:
amino_acids = list(set("".join(sequences_train)))
aa_dict = {aa:i for i, aa in enumerate(amino_acids)}
print(aa_dict)

{'V': 0, 'S': 1, 'Y': 2, 'F': 3, 'A': 4, 'W': 5, 'K': 6, 'I': 7, 'N': 8, 'C': 9, 'L': 10, 'R': 11, 'T': 12, 'X': 13, 'M': 14, 'G': 15, 'E': 16, 'D': 17, 'Q': 18, 'P': 19, 'H': 20}


In [200]:
from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y = train_test_split(np.array(sequences_train), np.array(y_train), test_size=0.1)
print ('Training:', train_x.shape, train_y.shape)
print ('Validation:', val_x.shape, val_y.shape)

Training: (4399,) (4399,)
Validation: (489,) (489,)


In [229]:
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import one_hot
class ProteinSeqdataset(Dataset):
    def __init__(self, 
                 proteins_seq,
                 y= None, 
                 vocab = None,
                 padding_mark = "PAD"):

        self.y = y
        self.padding_mark = padding_mark
        self.proteins_seq = proteins_seq      
        #Maximum length of sequences
        self.max_len_sequence= max([len(proteins_seq[i]) for i in range(len(proteins_seq))])

        #length of each sequence
        self.lengths = torch.Tensor([len(prot) for prot in self.proteins_seq])

        # Allow to import a vocabulary (for valid/test datasets, that will use the training vocabulary)
        if vocab is not None:
            self.aa2id, self.id2aa = vocab
        else:
            # If no vocabulary imported, build it (and reverse)
            self.aa2id, self.id2aa = self.build_vocab()
        self.vocab_size = len(self.aa2id)
        #print(self.aa2id)
        #print(self.id2aa)
        #Convert to Tensor and apply the vocabulary
        sequences_tensor = list(map(lambda prot: torch.Tensor([self.aa2id[aa] for aa in prot]),self.proteins_seq))
        #Pad the sequence
        sequences_padded = pad_sequence(sequences_tensor,batch_first = True, padding_value =0)

        #One Hot encoding
        sequences_encoded = one_hot(sequences_padded.to(torch.int64),num_classes=self.vocab_size)

        #Convert to Torch 
        self.X = sequences_encoded
        self.y = torch.Tensor(self.y)

    def get_data(self):
        return self.X,self.y
    
    def get_lengths(self):
        return self.lengths

    def __len__(self):
        return len(self.proteins_seq)

    def __getitem__(self, idx):
        # The iterator just gets one particular example with its category
        # The dataloader will take care of the shuffling and batching
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.X[idx], self.y[idx], self.lengths[idx]
    
    def build_vocab(self):
        """
          Function to build the Amino Acids vocabulary
          
          Returns
          ----------------------------
            aa2id : <dict{str:int}> 
                Dictionary to go from an Amino Acids to an id
            id2aa : <dict{int:str}> 
                Dictionary to go from an id to an Amino Acids
        """
        #Set of the Amino Acids
        amino_acids = list(set("".join(self.proteins_seq)))
        #print(amino_acids)
        #Build Vocab (+1 for the padding)
        aa2id = {aa:i+1 for i, aa in enumerate(amino_acids)}
        id2aa = {i+1:aa for i, aa in enumerate(amino_acids)}

        #Add the Padding id 
        aa2id = {"PAD":self.padding_mark,**aa2id}
        id2aa = {**id2aa,self.padding_mark:"PAD"}

        return aa2id, id2aa
    
    def get_vocab(self):
        # A simple way to get the training vocab when building the valid/test 
        return self.aa2id, self.id2aa

In [230]:
train_dataset = ProteinSeqdataset(train_x,train_y)
val_dataset = ProteinSeqdataset(val_x,val_y)
#X,y = train_dataset.get_data()
aa2id,id2aa = train_dataset.get_vocab()

In [208]:
lengths = train_dataset.get_lengths()
lengths

tensor([ 96., 369., 183.,  ...,  81., 779., 836.])

In [209]:
nb_class = len(set(y_train))
nb_class

18

In [210]:
vocab_size = len(aa2id)
vocab_size

22

In [232]:
training_dataloader = DataLoader(train_dataset, batch_size = 200, shuffle=True)
valid_dataloader = DataLoader(val_dataset, batch_size = 25)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class ProteinLSTM(nn.Module):
    def __init__(self, hidden_size, num_layers, num_classes, vocab_size, embedding_dim):
        super(ProteinLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x, lengths):
        x = self.embedding(x)
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        print(x.shape)
        out, _ = self.lstm(x)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        out = self.fc(out[:, -1, :])
        return out


In [197]:
# Instantiate the model
model = ProteinLSTM(hidden_size=64, num_layers=2, num_classes=nb_class, vocab_size=vocab_size, embedding_dim=64)
# Create an optimizer
opt = optim.Adam(model.parameters(), lr=0.0025, betas=(0.9, 0.999))
# The criterion is a binary cross entropy loss based on logits - meaning that the sigmoid is integrated into the criterion
criterion = nn.BCEWithLogitsLoss()

In [233]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def train_epoch(model, opt, criterion, dataloader):
    model.train()
    losses = []
    for i, (x, y, len_) in enumerate(dataloader):
        x,y = x.to(device),y.to(device)
        len_ = len_.to(device)
        opt.zero_grad()
        # (1) Forward
        pred = model.forward(x, len_)
        # (2) Compute the loss 
        loss = criterion(pred,y)
        # (3) Compute gradients with the criterion
        loss.backward()
        # (4) Update weights with the optimizer
        opt.step()    
        losses.append(loss.item())
        # Count the number of correct predictions in the batch - here, you'll need to use the sigmoid
        num_corrects = (torch.round(torch.sigmoid(pred)) == y).float().sum()
        acc = 100.0 * num_corrects/len(y)
        
        if (i%20 == 0):
            print("Batch " + str(i) + " : training loss = " + str(loss.item()) + "; training acc = " + str(acc.item()))
    return losses

In [234]:
# Same for the evaluation ! We don't need the optimizer here. 
def eval_model(model, criterion, evalloader):
    model.eval()
    total_epoch_loss = 0
    total_epoch_acc = 0
    with torch.no_grad():
        for i, (x, y, len_) in enumerate(evalloader):
            x,y = x.to(device),y.to(device)
            len_ = len_.to(device)
            pred =  model.forward(x, len_)
            loss = criterion(pred,y)
            num_corrects =(torch.round(torch.sigmoid(pred)) == y).float().sum()
            acc = 100.0 * num_corrects/len(y)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/(i+1), total_epoch_acc/(i+1)

In [235]:
# A function which will help you execute experiments rapidly - with a early_stopping option when necessary. 
def experiment(model, opt, criterion, num_epochs = 5, early_stopping = True):
    train_losses = []
    if early_stopping: 
        best_valid_loss = 10. 
    print("Beginning training...")
    for e in range(num_epochs):
        print("Epoch " + str(e+1) + ":")
        train_losses += train_epoch(model, opt, criterion, training_dataloader)
        valid_loss, valid_acc = eval_model(model, criterion, valid_dataloader)
        print("Epoch " + str(e+1) + " : Validation loss = " + str(valid_loss) + "; Validation acc = " + str(valid_acc))
        if early_stopping:
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
            else:
                print("Early stopping.")
                break  
    return train_losses

In [236]:
train_losses = experiment(model, opt, criterion,num_epochs=10)

Beginning training...
Epoch 1:


RuntimeError: input must have 2 dimensions, got 3