In [37]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
from sklearn.metrics import accuracy_score,f1_score

In this notebook I decided to test if the results can improve if the sequences were padded only to the maximal length in a batch and not in the entire dataset. I wanted to train the network in batches sorted by length with the aim of gathering together similar length sequences and padding them as necessary only to the length of longest sequence in a batch. I was hoping that it would minimise the effect of padding on the results.

Preprocessing was similar to the previous notebook but I have to turn the preprocessing stage into a function and call it during training loop.

### Preprocessing

In [38]:
proteins = pd.read_csv("./Data/2018-06-06-ss.cleaned.csv")

In [40]:
def remove_empty(sequence):
    s = set(list(sequence))
    if len(s)==1:
        letter = s.pop()
        if letter == "*": 
            return 1
        else: 
            return 0
    return 0 

In [41]:
sample = proteins[
    (proteins["len"]>=1) &
    (proteins["len"]<=100)]

In [42]:
MAX_LENGTH = sample["len"].max()

In [43]:
sample = sample[["seq","sst3","sst8"]]

In [44]:
sample = sample.drop_duplicates()

In [45]:
sample["remove"] = sample["seq"].apply(remove_empty) 
sample = sample[sample["remove"]==0].copy()

In [46]:
# sample = sample.sample(10000)

In [47]:
sample["len"] = sample["seq"].apply(len)
sample

Unnamed: 0,seq,sst3,sst8,remove,len
0,EDL,CEC,CBC,0,3
1,KCK,CEC,CBC,0,3
2,KAK,CEC,CBC,0,3
3,KFK,CEC,CBC,0,3
5,KMK,CEC,CBC,0,3
...,...,...,...,...,...
61918,MAVKTGIAIGLNKGKKVTQMTPAPKISYKKGAASNRTKFVRSLVRE...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHH...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCSCCCCCHHHHHHHHHHHH...,0,100
61920,MAVKTGIAIGLNKGKKVTQMTPAPKISYKKGAASNRTKFVRSLVRE...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHH...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCHHHHHHHHHHHH...,0,100
61921,RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...,CCCCHHHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...,CCCCHHHHHHHHHHHHHHHHTSSCHHHHHHHCHHHHHHHHHHHHHH...,0,100
61922,RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...,CCCCCHHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...,CCCCCHHHHHHHHHHHHHHHTSSCHHHHHHHCHHHHHHHHHHHHHH...,0,100


In [48]:
sample = sample.sample(frac=1)

In [49]:
SOS_token = 0

class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS"}
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        for word in list(sentence):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [50]:
def prepareData(lang1, lang2, reverse=False):

    input_lang = Lang()
    output_lang = Lang() 

    pairs = list(zip(lang1,lang2))

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    print("Counted words:")
    print(f"Sequence: {input_lang.n_words}")
    print(f"Structure: {output_lang.n_words}")
    return input_lang, output_lang, pairs

In [51]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in list(sentence)]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    return torch.tensor(indexes, dtype=torch.long).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [52]:
def preprocess_dataset(input_lang,output_lang,pairs, max_len):

    MAX_LENGTH = max_len

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    X = torch.tensor(input_ids, dtype=torch.long)
    y = torch.tensor(target_ids, dtype=torch.long)

    return X,y

In [53]:
class LSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers=1):
        super().__init__()

        self.inpit_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embed = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, output_size)

    def forward(self, x):
        x = self.embed(x)
        x = F.relu(x)
        x , hidden = self.lstm(x)
        x = self.fc(x)

        x = F.log_softmax(x, dim = -1)
        return x


In [54]:
learning_rate=0.01
batch_size = 128
hidden_size = 256
n_epochs = 100

patience = 5

In [55]:
train_size = int(len(sample)*0.6)
test_size = int(len(sample)*0.2)

In [56]:
train = sample[:train_size].sort_values("len")
test = sample[train_size:train_size+test_size].sort_values("len")
val = sample[train_size+test_size:]

X_train = train[["seq","len"]]
y_train = train["sst3"]

X_test = test[["seq","len"]]
y_test = test["sst3"]

X_val = val[["seq","len"]]
y_val = val["sst3"]

In [57]:
pairs = list(zip(sample["seq"],sample["sst3"]))
input_lang, output_lang, pairs = prepareData(sample["seq"], sample["sst3"])

Counted words:
Sequence: 22
Structure: 4


### Weights

A problem that appeared with dynamic length approach was to define the weight of SOS token for the loss function. I wanted to keep using weights but it depends on the SOS padding token frequency which varies in the batch. 

I decided to calculate the average value of SOS frequency by iterating over prepared batches and collecting the results on the list. I used mean value as the SOS token frequency. Using weights like that may result in some inconsistency in the training but I decided to try it. 

In [58]:
def calculate_weights(word2index, word2count, sos_freq):
        
    vocab = word2index
    word_freq = word2count

    vocab.update({"SOS":0})
    word_freq.update({"SOS":int(sos_freq)})

    vocab_size = len(vocab)

    weights = torch.zeros(vocab_size)

    for word, idx in vocab.items():
        weights[idx] = 1.0 / (word_freq[word]) 
        
    weights = weights / weights.sum()

    return weights

In [59]:
batches = len(X_train) // batch_size

sos_freq = []

for batch in range(batches):
    i = batch * batch_size
    X_batch = X_train[i:i+batch_size]
    y_batch = y_train[i:i+batch_size]

    MAX_LENGTH = (X_batch["len"].max()+1)

    pairs = list(zip(X_batch["seq"],y_batch))
    X,y = preprocess_dataset(input_lang, output_lang, pairs, MAX_LENGTH)
    SOS_freq = (y.shape[0] * y.shape[1]) - torch.count_nonzero(y)
    sos_freq.append(int(SOS_freq))


In [60]:
weights = calculate_weights(output_lang.word2index, output_lang.word2count,int(np.mean(sos_freq)))

In [61]:
model = LSTM(input_lang.n_words, 64, hidden_size, output_lang.n_words)

In [62]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

In [64]:
test_loss_array = []
best_result = np.inf

for epoch in range(n_epochs):

    total_loss = 0
    batches = len(X_train) // batch_size

    for batch in range(batches):
        
        optimizer.zero_grad()
        i = batch * batch_size

        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        MAX_LENGTH = (X_batch["len"].max()+1)

        pairs = list(zip(X_batch["seq"],y_batch))
        X,y = preprocess_dataset(input_lang, output_lang, pairs, MAX_LENGTH)

        output = model(X)

        loss = loss_fn(output.permute(1,2,0), y.permute(1,0))
        
        loss.backward()

        optimizer.step()

        total_loss += loss.item() 


    batches = len(X_test) // batch_size
    test_loss = 0
    with torch.no_grad():  

        for batch in range(batches):
            i = batch * batch_size

            X_batch = X_test[i:i+batch_size]
            y_batch = y_test[i:i+batch_size]

            MAX_LENGTH = (X_batch["len"].max()+1)

            pairs = list(zip(X_batch["seq"],y_batch))
            X,y = preprocess_dataset(input_lang, output_lang, pairs, MAX_LENGTH)

            output = model(X)
            loss = loss_fn(output.permute(1,2,0), y.permute(1,0))

            test_loss+=loss

    loss = total_loss / (len(X_train) // batch_size)
    loss_test = test_loss / (len(y_test) // batch_size)

    test_loss_array.append(loss_test)

    if loss_test < best_result:
        torch.save(model.state_dict(), "./lstm_var_len.pth")

    print(f"Epoch: {epoch}, Train loss: {loss}, Test loss: {loss_test}")

    if len(test_loss_array)>patience+1:
        if not (any(x > (test_loss_array[-1]+0.015) for x in test_loss_array[len(test_loss_array)-patience-1:-1])):
            break

Epoch: 0, Train loss: 0.698674623562832, Test loss: 0.7236223220825195
Epoch: 1, Train loss: 0.6567023270057909, Test loss: 0.6830551624298096
Epoch: 2, Train loss: 0.628772540224923, Test loss: 0.6652363538742065
Epoch: 3, Train loss: 0.6041244745555551, Test loss: 0.6249672174453735
Epoch: 4, Train loss: 0.5804501353490232, Test loss: 0.6082342863082886
Epoch: 5, Train loss: 0.5631513526343336, Test loss: 0.5988062024116516
Epoch: 6, Train loss: 0.5476748418025296, Test loss: 0.5607547163963318
Epoch: 7, Train loss: 0.5311279977963428, Test loss: 0.5503723621368408
Epoch: 8, Train loss: 0.5147879834126945, Test loss: 0.5407018065452576
Epoch: 9, Train loss: 0.5063648992265114, Test loss: 0.5263572931289673
Epoch: 10, Train loss: 0.4952049803252172, Test loss: 0.5239729285240173
Epoch: 11, Train loss: 0.48769929696514147, Test loss: 0.5175638198852539
Epoch: 12, Train loss: 0.4878351463362424, Test loss: 0.5342221856117249
Epoch: 13, Train loss: 0.4917685898265453, Test loss: 0.519970

In [66]:
model.load_state_dict(torch.load("./lstm_var_len.pth"))

<All keys matched successfully>

In [67]:
MAX_LENGTH = (X_val["len"].max()+1)
pairs = list(zip(X_val["seq"],y_val))
X,y = preprocess_dataset(input_lang, output_lang, pairs, MAX_LENGTH)

In [68]:
with torch.no_grad():

    outputs_pred = model(X)

    _, topi = outputs_pred.topk(1)
    decoded_ids = topi.squeeze()

    pred = []
    for idx in decoded_ids:
        decoded_structure = []
        for id in idx:
            if id.item() == SOS_token:
                break
            decoded_structure.append(output_lang.index2word[id.item()])
        pred.append("".join(decoded_structure))
    
    print(pred)

['CCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCHHHHHHHHHHHHHHHHHCCCCCHHHCHHHHCCCCCCC', 'CCCCCCCECCCCCCCECCCCEECCCCCEEEEEECCCEEEEEEECCCCCCCHEEECECCCCEECEEECC', 'CCCCHCHHHHHHHCCCCHHHHHHHHHHHHCHHHHHCHHHHHHHHHHHHHHCHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHCCHHHCHCCCCCCCC', 'CCCCCCCHCCCCC', 'CCCCCCHHHHHHHHCCHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCEHCCHCCC', 'CCCCCCCCCCHHHHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHHCCHHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHCCCCCCC', 'CCECCHCCCCCCCCCCC', 'CCCCCCCCCCCCCCCCCCCCCCCEECHCHHHCEHEEECCCCCHHHHHHHHHHHCCCCCHHHHHHCCCCCCCCCC', 'CCCCCHHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHHHHCCCCHCCHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHHCCCCCCCCCC', 'CCCCCCCCCCCEEHHCCHHHHHHHHHHHHHHHCHHEEEECCCCHHHHHHHHHHHHHHCCCCCCCCCHCEEEECC', 'CCCCCCCCHHHHHHHHHHHHHHHHHCCHHHHHHHHHHCCCCCCCHHHHHHHHHHHHHHCCCCCECCCCCCCCC', 'CCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCHHHHHHHHCCC', 'CCCCCHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCECHHHHHHHHHHHCCHHHHHHHHCCCCHHHEHCCCC', 'CCCHHHHHHCCCCCCCHCCCCCCCCCCCCCCCCCCCCCCEHHHHHHHHHC

In [69]:
target=[]
for idx in y:
    decoded_structure = []
    for id in idx:
        if id.item() == SOS_token:
            break
        decoded_structure.append(output_lang.index2word[id.item()])
    target.append("".join(decoded_structure))

print(target)

['CCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHCCCC', 'CCCCCEEEEECCCCCEEEEEEEEEECCEEEEECCCCCEEEEECCCECHHHHHHHHHHHCCCCHHHCCC', 'CCCCCCCCHHHHHHCCHHHHHHHHHHHHHHHHHCCCCCCHHHHHHHHCCCCHHHHHHHHHCHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCC', 'CCHHHHHHHHHCC', 'CCCCCCCHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCHHHHCCCC', 'CCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCC', 'CCCCHHHHCHHHCCCCC', 'CCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHCCCCCCCHHHHHHCCCCCCCHHHHCCCCCCCCCCCCC', 'CCCCHHHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHCCCCCCCCCHHHHHHHHHHHHHHHHHHCCCHHHHHHCCCCCCCCCCCCCCC', 'CCCCCEEEEEEEEECCCHHHHHHHHHHHHCCCCCCCEEEEECCCEEEECHHHHHHHHHHHHHCCCCCCCCCCCC', 'CCHHHHHHHHHHHHHHHHHHCCCCCCEECHHHHHHHHHCCCCCCHHHHHHHHHHHHHHCCCCEECHHHHHCCC', 'CCCCCCCCCHHHHHHHHHHCCCHHHCCCCCCCCCCCHHHHHHHHHHC', 'CCCCCCCCHHHHHHHHHHHCCCCEECHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCEECHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHCC', 'CCCCCCCHHHHCCCCCCCCCCCCCCCCCCCCCEECCCCEEHHHHHHHHHH

In [70]:
def char_level_metrics(predictions, targets):
    accuracy = 0
    f1 = 0
    
    for pred, target in zip(list(predictions), list(targets)):
        if len(pred)<len(target):
            pred = pred + ("$" * (len(target)-len(pred)))
        if len(pred)>len(target):
            target = target + ("$" * (len(pred)-len(target)))

        accuracy += accuracy_score(list(pred),list(target))
        f1 += f1_score(list(pred),list(target), average="macro")

    return accuracy/len(predictions), f1/len(predictions)

ac, f1 = char_level_metrics(pred, target)

print(f'Character-level accuracy: {ac*100}%')
print(f'Character-level f1: {f1*100}%')
print(f'Exact match: {accuracy_score(pred,target)*100}%')

Character-level accuracy: 76.57748000063567%
Character-level f1: 64.41910728061583%
Exact match: 3.9749941023826376%
