In [4]:
import numpy  as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# Tensor type that the hand-built weight/bias parameters of the model are cast to via `.type(dtype)`.
dtype = torch.FloatTensor

### Data

In [6]:
# Load the sequence table indexed by its "Uniprot Code" column and add a
# "Longitud" column holding the length of each sequence string.
df = pd.read_csv("../data/macros.csv", index_col="Uniprot Code").assign(
    Longitud=lambda frame: frame["Secuencia"].str.len()
)
df.head()

Unnamed: 0_level_0,Tipo de Macro,Secuencia,Longitud
Uniprot Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
O28751,AF-1521-like,MEVLFEAKVGDITLKLAQGDITQYPAKAIVNAANKRLEHGGGVAYA...,192
D3RWS7,AF-1521-like,MEVEVVRELEMDKLKVKLAGGDITKYPAEAIVNAANKYLEHGGGVA...,193
D2RH24,AF-1521-like,MVVKKFGSVEVVLEKGDITKYPAEAIVNAANKYLEHGGGVALAIAK...,193
A0A0F7ICE9,AF-1521-like,MKPEVVLRFSGVEVRLVQGDITKYPAEAIVNAANRHLEHGGGVAYA...,194
A0A075LQ95,AF-1521-like,MNLTELTFGNLTFKLAQGDITKLPAEAIVNAANKYLEHGGGVALAI...,190


In [46]:
# Sum of the per-sequence lengths, i.e. the total number of residues in the table.
total_num_aminoacids = df.Longitud.sum()
total_num_aminoacids

182079

### Tokenizer

In [41]:
# Vocabulary: every distinct character that occurs in the sequences, sorted so
# the id assignment is stable across runs, followed by the special tokens.
# A set union avoids concatenating per-sequence lists (quadratic) and also
# works when the table is empty.
special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]"]
aminoacids = sorted(set().union(*df["Secuencia"])) + special_tokens
vocab_size = len(aminoacids)  # size of the full token list, special tokens included

# Lookup tables in both directions: token -> id and id -> token.
amin_dict = {a: i for i, a in enumerate(aminoacids)}
numb_dict = {i: a for i, a in enumerate(aminoacids)}
amin_dict

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'X': 19,
 'Y': 20,
 '[PAD]': 21,
 '[CLS]': 22,
 '[SEP]': 23,
 '[MASK]': 24}

### DataLoader

In [None]:
class SequencesDataset(torch.utils.data.Dataset):
    """Map-style dataset over records of the form ``(key_a, key_b, label)``.

    The first two entries of a record are keys into the module-level ``images``
    lookup; indexing the dataset returns the two looked-up items together with
    the record's third entry.

    NOTE(review): ``images`` is not defined in this part of the notebook; it
    must exist at module level before any item is fetched. The naming looks
    carried over from an image-pair dataset -- confirm what the records are
    meant to hold for sequences.

    Args:
        images: Indexable collection of records; each record is indexed at
            positions 0, 1 and 2.
    """

    def __init__(self, images):
        self.images_fn = images

    def __getitem__(self, index):
        """Return ``([item_a, item_b], label)`` for the record at ``index``."""
        record = self.images_fn[index]
        # `images` is only read here, so no `global` declaration is required.
        file1 = images[record[0]]
        file2 = images[record[1]]
        val = record[2]
        return [file1, file2], val

    def __len__(self):
        """Number of records, so samplers only draw indices `__getitem__` can serve."""
        return len(self.images_fn)

# Training loader: one shuffled record per batch, collated by `my_collate`.
# Uses the SequencesDataset class defined in this cell.
# NOTE(review): `trainset` and `my_collate` are not defined in this part of the
# notebook; both must exist before this statement runs.
loader_train = torch.utils.data.DataLoader(
    SequencesDataset(images=trainset),
    batch_size=1,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    collate_fn=my_collate,
)

### Model

In [2]:
# Hyperparameters
emb_size = 8  # width of each embedding vector
n_step = 2  # number of context tokens whose embeddings are concatenated
n_hidden = 2  # width of the hidden layer (h in paper)

def make_batch(sentences):
    """Turn whitespace-separated sentences into (context ids, target id) pairs.

    For each sentence, every word except the last is mapped through the
    module-level ``word_dict`` to form the context, and the last word is
    mapped to form the target.

    Args:
        sentences: Iterable of strings.

    Returns:
        Tuple ``(input_batch, target_batch)``: a list of id lists and a list
        of ids, in the same order as ``sentences``.
    """
    # One pass per sentence, context first and then target, so lookups happen
    # in the same order as a plain loop would perform them.
    encoded = [
        ([word_dict[tok] for tok in words[:-1]], word_dict[words[-1]])
        for words in (sen.split() for sen in sentences)
    ]
    input_batch = [context for context, _ in encoded]
    target_batch = [target for _, target in encoded]
    return input_batch, target_batch

# Model
class LM(nn.Module):
    """Feed-forward language model with a direct input-to-output connection.

    Context token ids are embedded, the embeddings are concatenated into one
    vector ``X`` per sample, and the logits are computed as
    ``B2 + X @ RW + tanh(B1 + X @ W1) @ W2``.

    All sizes are captured on the instance at construction time, so changing
    the notebook-level hyperparameters afterwards cannot desynchronise
    ``forward`` from the weights that were already allocated.

    Args:
        num_classes: Number of embedding rows / output logits. Defaults to the
            module-level ``n_class``.
        embedding_dim: Width of each embedding. Defaults to ``emb_size``.
        context_size: Number of context tokens per sample. Defaults to ``n_step``.
        hidden_dim: Hidden-layer width. Defaults to ``n_hidden``.
    """

    def __init__(self, num_classes=None, embedding_dim=None, context_size=None, hidden_dim=None):
        super().__init__()
        # Fall back to the notebook-level hyperparameters when not given explicitly.
        self.n_class = n_class if num_classes is None else num_classes
        self.emb_size = emb_size if embedding_dim is None else embedding_dim
        self.n_step = n_step if context_size is None else context_size
        self.n_hidden = n_hidden if hidden_dim is None else hidden_dim

        in_features = self.n_step * self.emb_size  # width of the concatenated embeddings

        self.E = nn.Embedding(self.n_class, self.emb_size)                                # Embedding
        self.W1 = nn.Parameter(torch.randn(in_features, self.n_hidden).type(dtype))       # Dense 1 weights
        self.B1 = nn.Parameter(torch.randn(self.n_hidden).type(dtype))                    # Dense 1 bias
        self.W2 = nn.Parameter(torch.randn(self.n_hidden, self.n_class).type(dtype))      # Dense 2 weights
        self.RW = nn.Parameter(torch.randn(in_features, self.n_class).type(dtype))        # Dense 2 residual weights
        self.B2 = nn.Parameter(torch.randn(self.n_class).type(dtype))                     # Dense 2 bias

    def forward(self, X):
        """Map a batch of context ids ``[bs, n_step]`` to logits ``[bs, n_class]``."""
        X = self.E(X)                                # Embedding layer          [bs, n_step, emb_size]
        X = X.view(-1, self.n_step * self.emb_size)  # Embeddings concatenation [bs, n_step * emb_size]
        hidden = torch.tanh(self.B1 + torch.mm(X, self.W1))  # Dense layer 1 [bs, n_hidden]
        # Dense layer 2 plus the direct (residual) projection of the input [bs, n_class]
        return self.B2 + torch.mm(X, self.RW) + torch.mm(hidden, self.W2)

model = LM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Encode the sentences as id tensors. Plain tensors are passed to the model
# directly: wrapping them in `Variable` is deprecated and no longer needed
# for autograd.
input_batch, target_batch = make_batch(sentences)
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

# Training: every epoch is one optimizer step on the full batch.
for epoch in range(5000):

    optimizer.zero_grad()
    output = model(input_batch)

    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        # .item() yields a Python float, so formatting does not depend on tensor formatting.
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))

    loss.backward()
    optimizer.step()

# Predict: index of the highest-scoring class per sample, kept as [batch_size, 1].
# no_grad() skips autograd bookkeeping, which inference does not need.
with torch.no_grad():
    predict = model(input_batch).max(1, keepdim=True)[1]

# Test: print each sentence's context words next to the predicted word.
# view(-1) always gives a 1-D tensor, whereas squeeze() collapses a single
# prediction to a 0-d tensor that cannot be iterated. The context slice uses
# [:-1], the same split make_batch applies.
print([sen.split()[:-1] for sen in sentences], '->', [number_dict[n.item()] for n in predict.view(-1)])

Epoch: 1000 cost = 0.241908
Epoch: 2000 cost = 0.033025
Epoch: 3000 cost = 0.010251
Epoch: 4000 cost = 0.004399
Epoch: 5000 cost = 0.002176
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']
