In [1]:
import torch 
import torch.nn
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, AdamW, WarmupLinearSchedule
import logging
import pandas as pd
from biopandas.pdb import PandasPdb
import numpy as np


In [2]:
# Every input file lives in the same directory. Keeping that directory in one
# place means relocating the dataset is a one-line change instead of five.
DATA_DIR = "/srv01/technion/morant/Storage"

# Tabular inputs (CSV).
sample_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv")
train_updates = pd.read_csv(f"{DATA_DIR}/train_updates_20220929.csv")

# Structure file (PDB). `read_pdb` is called on a fresh PandasPdb instance and
# whatever it returns is kept as `wt_structure_pred`.
ppdb = PandasPdb()
wt_structure_pred = ppdb.read_pdb(f"{DATA_DIR}/wildtype_structure_prediction_af2.pdb")

In [3]:
# Tokenization of train.
# Integer vocabulary for the residue letters: the id of a letter is its 1-based
# position in the string below (A -> 1, R -> 2, ..., J -> 26).
aa2num = {letter: idx for idx, letter in enumerate("ARNDCQEGHILKMFPOSUTWYVBZXJ", start=1)}


def _encode_sequence(sequence):
    """Return the list of `aa2num` ids for the characters of `sequence`.

    A character that is not a key of `aa2num` raises KeyError.
    """
    return [aa2num[residue] for residue in sequence]


# The ids go into a new column; the raw strings stay in `protein_sequence`.
train['protein_sequence_tokenized'] = train['protein_sequence'].apply(_encode_sequence)
# Character count of every raw sequence.
len_Before_tokenization = train['protein_sequence'].apply(len)

In [4]:
# Keep the per-sequence character counts (taken from the raw
# `protein_sequence` strings) as a column of `train`.
train['len_Before_tokenization'] = len_Before_tokenization

In [5]:
# Length of the longest token list; every row is padded out to this length.
token_lists = train['protein_sequence_tokenized']
max_len = token_lists.apply(len).max()


def _right_pad(tokens):
    """Append zeros to `tokens` until it holds `max_len` entries."""
    missing = max_len - len(tokens)
    return np.pad(tokens, (0, missing))


# Overwrites the column in place with the padded arrays.
train['protein_sequence_tokenized'] = token_lists.apply(_right_pad)

In [23]:
# Scratch check: cut the sequence-length column into consecutive groups of 5
# and display the resulting tuple of tensors.
lengths_tensor = torch.tensor(train['len_Before_tokenization'])
tmp = lengths_tensor.split(5)
tmp

(tensor([ 341,  286,  497,  265, 1451]),
 tensor([380, 380, 301, 287, 163]),
 tensor([ 217,  265,   55, 1643,   81]),
 tensor([228, 114, 380, 904, 284]),
 tensor([203, 645, 213, 192, 341]),
 tensor([501, 400, 206, 313, 109]),
 tensor([329, 354, 324, 278, 506]),
 tensor([150, 448, 155, 477, 352]),
 tensor([449, 448, 345, 346, 330]),
 tensor([455, 448, 286, 446, 530]),
 tensor([169,  88, 210, 352, 499]),
 tensor([ 461, 1417,  530, 1539,  676]),
 tensor([341, 247, 203, 792, 299]),
 tensor([ 56, 415, 225, 631, 106]),
 tensor([106, 106, 106, 106, 106]),
 tensor([106, 106, 106, 106, 106]),
 tensor([106, 106, 106, 384, 495]),
 tensor([436, 194, 566, 189, 219]),
 tensor([518, 480, 636, 332, 679]),
 tensor([621, 530, 686, 406, 181]),
 tensor([1515,  194,  308,  306,  516]),
 tensor([514, 348, 291, 168, 867]),
 tensor([761, 255, 389, 301, 402]),
 tensor([ 469,  322,  247, 1047, 1041]),
 tensor([1019, 1006,  340,  722,  789]),
 tensor([329, 462,  90,  57, 942]),
 tensor([339, 319, 487, 300, 223])

In [24]:
# Number of token positions kept per sequence; anything beyond is truncated.
MAX_SEQ_LEN = 512

# Token ids. Each padded row is truncated *before* stacking, so the full
# (num_sequences, max_len) matrix is never materialised.
tokens_tensor = torch.tensor(
    np.stack([np.asarray(row)[:MAX_SEQ_LEN] for row in train['protein_sequence_tokenized'].values])
)

# Attention mask: 1.0 where a position holds a real residue, 0.0 where it is
# padding. Comparing a row of position indices against each sequence length
# yields the same values as padding a vector of ones per sequence and then
# truncating, without building a Python list of max_len-wide float arrays.
seq_lengths = torch.tensor(train['len_Before_tokenization'].values)
# The token rows are max_len wide before truncation, so the mask must not be
# wider than that either.
mask_width = min(MAX_SEQ_LEN, int(max_len))
tokens_mskd_tensor = (torch.arange(mask_width)[None, :] < seq_lengths[:, None]).float()

In [26]:
# Sanity check: show the shapes of the attention mask and of the token ids
# side by side; they are expected to be identical.
tokens_mskd_tensor.shape, tokens_tensor.shape

(torch.Size([31390, 512]), torch.Size([31390, 512]))

In [31]:
# Cut the three row-aligned tensors into consecutive chunks of 20 rows, so that
# chunk k of each tuple covers the same rows of `train`.
rows_per_batch = 20
tm_tensor = torch.tensor(train['tm'])
batched_tok_mskd = tokens_mskd_tensor.split(rows_per_batch)
batched_tok_ten = tokens_tensor.split(rows_per_batch)
batched_train_tm = tm_tensor.split(rows_per_batch)

In [9]:
# Sanity check of the bare pre-trained encoder on a few sequences.
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model in evaluation mode to deactivate the Dropout modules.
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# Encode only a handful of rows. Passing all of `tokens_tensor` would push
# every sequence through the encoder as one single batch, which is unlikely to
# fit in memory for a dataset of the size shown in this notebook.
sanity_rows = 8
sample_ids = tokens_tensor[:sanity_rows]
sample_mask = tokens_mskd_tensor[:sanity_rows]

# This forward pass is for inspection only, so no autograd graph is needed.
with torch.no_grad():
    # See the models docstrings for the detail of the inputs.
    # The attention mask keeps padded positions from being attended to, the
    # same way `Model.forward` calls the encoder.
    outputs = model(
        sample_ids,
        token_type_ids=torch.zeros_like(sample_ids),
        attention_mask=sample_mask,
    )

# PyTorch-Transformers models always output tuples.
# See the models docstrings for the detail of all the outputs.
# In our case, the first element is the hidden state of the last layer of the Bert model
encoded_layers = outputs[0]
# We have encoded our input sequences in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (sample_ids.shape[0], sample_ids.shape[1], model.config.hidden_size)

In [17]:
class Model(torch.nn.Module):
    """Pre-trained BERT encoder followed by an MLP head giving one value per sequence.

    The encoder's last hidden state is flattened over the sequence and hidden
    dimensions and passed through three linear layers with ReLU in between.

    Args:
        max_seq_len: Number of token positions in every input row. The first
            linear layer is sized ``max_seq_len * hidden_size``, so the tensors
            given to ``forward`` must have exactly this many columns.
        pretrained_name: Identifier passed to ``BertModel.from_pretrained``.

    With the defaults and an encoder hidden size of 768 the flattened width is
    512 * 768 = 393216, i.e. the value that was previously hard-coded.
    """

    def __init__(self, max_seq_len=512, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.model = BertModel.from_pretrained(pretrained_name)
        # Derive the head's input width from the encoder configuration instead
        # of hard-coding the product, so a different sequence length or
        # checkpoint does not silently require editing a magic number.
        flat_features = max_seq_len * self.model.config.hidden_size
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=flat_features, out_features=10000),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=10000, out_features=1000),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=1000, out_features=1))

    def forward(self, batch, attention_mask):
        """Run the encoder and the MLP head.

        Args:
            batch: Token-id tensor with one row per sequence.
            attention_mask: Tensor passed to the encoder as ``attention_mask``;
                the notebook builds it with 1 on real tokens and 0 on padding.

        Returns:
            The output of the final linear layer (last dimension of size 1).
        """
        # All-zero segment ids: every token belongs to the same segment.
        hidden_states = self.model(batch, token_type_ids=torch.zeros_like(batch), attention_mask=attention_mask)[0]
        # Keep the batch dimension and merge everything else into one axis.
        flattened = torch.flatten(hidden_states, start_dim=1)
        return self.linear(flattened)

In [18]:
# Instantiate the regression network. This rebinds `model`, which an earlier
# cell bound to the bare BertModel.
model = Model()

In [19]:
# Mean-squared-error criterion; the training loop calls it on the model output
# and the `tm` targets.
loss = torch.nn.MSELoss()

In [None]:
# Training (when we'll get there)
# Parameters:
lr = 1e-5
max_grad_norm = 0.7
num_total_steps = 1000  # total number of optimizer updates (the scheduler's t_total)
num_warmup_steps = 500
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.5

### In PyTorch-Transformers, optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler

### and used like this:
# The scheduler is advanced once per batch, so its learning rate reaches zero
# after `num_total_steps` batches. The loop therefore stops after exactly that
# many updates (cycling over the data if one pass is shorter) instead of
# continuing to run forward/backward passes that can no longer change the weights.
model.train()
steps_done = 0
while steps_done < num_total_steps:
    steps_before_pass = steps_done
    for batch, attention_mask, train_tm in zip(batched_tok_ten, batched_tok_mskd, batched_train_tm):
        if steps_done >= num_total_steps:
            break
        # Gradients accumulate across backward() calls unless they are reset,
        # so clear them before every update.
        optimizer.zero_grad()
        # Targets get a trailing axis of size 1 to line up with the model output.
        loss_new = loss(model(batch, attention_mask), train_tm.float()[:, None])
        loss_new.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
        optimizer.step()
        scheduler.step()
        steps_done += 1
        print(f"loss_new={loss_new}")
    if steps_done == steps_before_pass:
        # No batch was available in this pass; stop rather than spin forever.
        break


loss_new=398.10546875
loss_new=454.2421875
loss_new=526.1185302734375
loss_new=421.476806640625
loss_new=600.7506103515625
loss_new=402.9046936035156
loss_new=428.60791015625
loss_new=199.71939086914062
loss_new=306.4364929199219
loss_new=268.25079345703125
loss_new=385.46124267578125
loss_new=184.8218231201172
loss_new=117.76461029052734


In [192]:
# Preview the first rows of `train`.
# NOTE(review): this cell's execution count is far out of sequence with the
# cells above, so the saved output may not match a fresh top-to-bottom run.
train.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm,protein_sequence_tokenized
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7,"[1, 1, 1, 1, 12, 1, 1, 1, 11, 1, 11, 11, 8, 7,..."
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5,"[1, 1, 1, 4, 8, 7, 15, 11, 9, 3, 7, 7, 7, 2, 1..."
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5,"[1, 1, 1, 14, 17, 19, 15, 2, 1, 19, 17, 21, 2,..."
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2,"[1, 1, 1, 17, 8, 11, 2, 19, 1, 10, 15, 1, 6, 1..."
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5,"[1, 1, 1, 19, 12, 17, 8, 15, 2, 2, 6, 17, 6, 8..."


In [44]:
# Preview the first rows of `train`.
# NOTE(review): the saved output shows token lists inside `protein_sequence`,
# whereas the tokenization cell writes them to `protein_sequence_tokenized` —
# presumably a stale output from an earlier version; re-run or remove.
train.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,"[1, 1, 1, 1, 12, 1, 1, 1, 11, 1, 11, 11, 8, 7,...",7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,"[1, 1, 1, 4, 8, 7, 15, 11, 9, 3, 7, 7, 7, 2, 1...",7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,"[1, 1, 1, 14, 17, 19, 15, 2, 1, 19, 17, 21, 2,...",7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,"[1, 1, 1, 17, 8, 11, 2, 19, 1, 10, 15, 1, 6, 1...",7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,"[1, 1, 1, 19, 12, 17, 8, 15, 2, 2, 6, 17, 6, 8...",7.0,doi.org/10.1038/s41592-020-0801-4,49.5


In [32]:
# Display the `train` frame (pandas elides the middle rows of a long frame).
# NOTE(review): largely repeats the `train.head()` previews in this notebook.
train

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5
...,...,...,...,...,...
31385,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,doi.org/10.1038/s41592-020-0801-4,51.8
31386,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,doi.org/10.1038/s41592-020-0801-4,37.2
31387,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,doi.org/10.1038/s41592-020-0801-4,64.6
31388,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.7


In [5]:
# Preview the first rows of `test`; unlike `train`, the saved output has no
# `tm` column.
test.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


In [6]:
# Preview the first rows of `train`.
# NOTE(review): the saved output shows only the raw columns, i.e. it appears
# to predate the tokenization cells — confirm the intended execution order.
train.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


In [26]:
# Display the frame read from train_updates_20220929.csv. In the saved output
# the visible rows carry a `seq_id` but no values in the other columns.
train_updates

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,69,,,,
1,70,,,,
2,71,,,,
3,72,,,,
4,73,,,,
...,...,...,...,...,...
2429,30738,,,,
2430,30739,,,,
2431,30740,,,,
2432,30741,,,,
