Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# force_remount=True is passed so the call also goes through when a mount
# from an earlier run is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Copy everything under MyDrive/ESM/esm on the mounted Drive into the current
# working directory — presumably so that `import esm` and `extract.py` used
# further down resolve locally (confirm the folder layout on Drive).
! cp -r ./drive/MyDrive/ESM/esm/* .

In [None]:
import random
from collections import Counter
from tqdm import tqdm

import torch
from torch import nn 
from torch.utils.data import Dataset,DataLoader,TensorDataset


import numpy as np
import pandas as pd
import seaborn as sns

import esm

import matplotlib.pyplot as plt
from scipy.stats import pearsonr

Load the training dataset...

In [None]:


# Load both mutation training sets and stack them into one frame with a fresh
# 0..n-1 index (ignore_index=True), so positional lookups via .loc are valid.
data_mul = pd.read_csv("./multiple_muts_train.csv")
data_single = pd.read_csv("./single_muts_train.csv")

data = pd.concat((data_mul, data_single), axis=0, ignore_index=True)
# Synthetic per-row identifier: used as the FASTA header here and as the stem
# of the embedding file name when the dataset is read back.
data["name"] = ["protein" + str(i) for i in range(data.shape[0])]

# Store the proteins as .fasta file.
# This has to happen BEFORE the column selection below, because that selection
# drops the "sequence" column the FASTA records are built from.
with open("dataset.fasta", "w") as f:
    for name, sequence in zip(data["name"], data["sequence"]):
        f.write(">" + name + "\n")
        f.write(sequence + "\n")

# Keep only the columns the rest of the notebook reads.
data = data.loc[:, ["name", "secondary_structure", "stabilityscore"]]

In [None]:
# Row labels whose stability score is NaN.
nan_list = [
    row
    for row in range(data.shape[0])
    if np.isnan(np.array(data.loc[row, "stabilityscore"]))
]

# Remove those rows and renumber the index so it is contiguous again.
data = data.drop(nan_list).reset_index(drop=True)
data.shape

Extract the ESM embeddings, then build and train the model...

In [None]:
# Prefer the GPU but fall back to the CPU, so the notebook still runs on a
# runtime without CUDA. Kept as a plain string: it is passed straight to `.to()`.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Run extract.py with the esm1b_t33_650M_UR50S model on dataset.fasta, writing
# its output under ./train_all_reprs/ with the `--include per_tok` option
# (presumably per-residue embeddings, one .pt file per FASTA record — confirm
# against the extract.py in use).
! python extract.py esm1b_t33_650M_UR50S dataset.fasta ./train_all_reprs/ --include per_tok

In [None]:
# Integer code for each secondary-structure symbol. The three codes line up
# with the three rows of the nn.Embedding(3, 5) table used by the model.
# (Named mapping instead of rebinding the built-in `dict`.)
SS_TO_INDEX = {"H": 0, "E": 1, "T": 2}

# SS[i] is a 1-D int64 tensor holding one code per character of the
# secondary-structure string in row i of `data`. An unknown symbol raises
# KeyError rather than being silently mis-encoded.
SS = [
    torch.tensor([SS_TO_INDEX[symbol] for symbol in structure], dtype=torch.long)
    for structure in data["secondary_structure"]
]

In [None]:

import torch  # redundant with the import cell; kept so this cell also runs on its own

# Directory prefix of the per-protein .pt files (the output folder given to extract.py).
EMB_PATH = "./train_all_reprs/"
# Key into each file's "representations" dict that selects the layer to use.
EMB_LAYER = 33


class ProteinData(Dataset):
    """Map-style dataset yielding ``(embedding, stability score, ss codes)``.

    Called without arguments it reads the notebook-level ``data``, ``SS``,
    ``EMB_PATH`` and ``EMB_LAYER`` at access time, exactly like the original
    implementation. Each argument overrides one of those sources, which makes
    the class reusable for another frame or embedding folder.

    Args:
        frame: DataFrame with ``"name"`` and ``"stabilityscore"`` columns whose
            index labels are the integers passed to ``__getitem__``.
        ss_labels: sequence indexable by the same integers, giving the
            secondary-structure code tensor of each row.
        emb_path: string prefix prepended to ``<name>.pt`` (include the
            trailing path separator).
        emb_layer: key into the ``"representations"`` dict of each file.
    """

    def __init__(self, frame=None, ss_labels=None, emb_path=None, emb_layer=None):
        super().__init__()
        # None means "look the notebook global up lazily on every access".
        self._frame = frame
        self._ss_labels = ss_labels
        self._emb_path = emb_path
        self._emb_layer = emb_layer

    def __len__(self):
        frame = data if self._frame is None else self._frame
        return frame.shape[0]

    def __getitem__(self, i):
        frame = data if self._frame is None else self._frame
        ss_labels = SS if self._ss_labels is None else self._ss_labels
        emb_path = EMB_PATH if self._emb_path is None else self._emb_path
        emb_layer = EMB_LAYER if self._emb_layer is None else self._emb_layer

        record = torch.load(emb_path + frame.loc[i, "name"] + ".pt")
        # Use the configured layer instead of a hard-coded 33.
        x = record["representations"][emb_layer]
        ss = ss_labels[i]
        label = torch.tensor(frame.loc[i, "stabilityscore"]).float()
        return (x, label, ss)


ds_train = ProteinData()



In [None]:
# 70/30 train/test split; the seeded generator makes the partition reproducible.
train_size = int(len(ds_train) * 0.7)
test_size = len(ds_train) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    ds_train,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Training batches are reshuffled every epoch and the incomplete last batch is dropped.
dl_train = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
# Evaluation has to see every held-out sample, in a fixed order. With
# shuffle/drop_last enabled each epoch would be scored on a different,
# incomplete subset, making the per-epoch metrics (and the "best model"
# decision based on them) incomparable.
dl_test = DataLoader(test_dataset, batch_size=128, shuffle=False, drop_last=False)

In [None]:
class LSTMs(nn.Module):
    """BiLSTM regressor over per-token embeddings and secondary-structure codes.

    Data flow of ``forward``:
      * ``x`` (batch-first, 1280 features per position) -> lstm1 -> lstm2,
        giving 20 features per position;
      * ``ss`` (integer codes 0..2) -> embedding -> lstm4, giving 20 features
        per position;
      * both are concatenated on the feature axis (40), passed through lstm3,
        averaged over dim=1 and mapped by two linear layers to one value.
    """

    @staticmethod
    def _bilstm(input_size, hidden_size):
        """Build a 2-layer, bidirectional, batch-first LSTM with dropout 0.1."""
        return nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.1,
        )

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are deliberately unchanged: they
        # determine the state_dict keys and the order in which the RNG is
        # consumed during default initialisation.
        self.lstm1 = self._bilstm(1280, 100)
        self.lstm2 = self._bilstm(200, 10)
        self.embedding = nn.Embedding(3, 5)
        self.lstm4 = self._bilstm(5, 10)
        self.lstm3 = self._bilstm(40, 10)

        self.linear1 = nn.Linear(20, 20)
        self.linear2 = nn.Linear(20, 1)

    def forward(self, x, ss):
        # Secondary-structure branch.
        ss_features, _ = self.lstm4(self.embedding(ss))

        # Embedding branch: two stacked BiLSTM blocks.
        seq_features, _ = self.lstm1(x)
        seq_features, _ = self.lstm2(seq_features)

        # Fuse both branches per position, then pool over the sequence axis.
        merged = torch.cat((seq_features, ss_features), dim=2)
        merged, _ = self.lstm3(merged)
        pooled = merged.mean(dim=1)

        return self.linear2(self.linear1(pooled))

In [None]:
model = LSTMs().to(device)

# Kaiming and orthogonal initialization: Kaiming-normal for the weights of
# every Linear layer, orthogonal for every LSTM parameter whose name contains
# "weight" (biases keep their default initialisation).
for m in model.modules():
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(param)

# Resume from the checkpoint written by the training loop when one exists.
# On a first run there is no file yet; in that case training simply starts
# from the initialisation above instead of aborting the notebook.
CHECKPOINT_PATH = "./drive/MyDrive/ESM/LSTM_with_SS_no_Contact_Map.pkl"
try:
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    state_dict = torch.load(CHECKPOINT_PATH, map_location=device)
except FileNotFoundError:
    print("No checkpoint at", CHECKPOINT_PATH, "- starting from the fresh initialization.")
else:
    # Printing the result keeps the "<All keys matched successfully>" feedback.
    print(model.load_state_dict(state_dict))

<All keys matched successfully>

In [None]:
def clip_gradient(optimizer, grad_clip):
    """Clamp, in place, every gradient held by ``optimizer`` to [-grad_clip, grad_clip].

    Walks all parameters of all ``optimizer.param_groups``; parameters whose
    ``grad`` is ``None`` are skipped. Returns ``None``.
    """
    tracked_params = (
        param
        for group in optimizer.param_groups
        for param in group["params"]
    )
    for param in tracked_params:
        grad = param.grad
        if grad is None:
            continue
        grad.data.clamp_(min=-grad_clip, max=grad_clip)

In [None]:
# Mean-squared-error objective for the scalar regression target.
Loss = nn.MSELoss()
# Adam over all model parameters; weight decay is explicitly switched off.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2, weight_decay=0.0)

In [None]:
def train_step(model, features, labels, ss):
    """Run one optimisation step on a batch.

    Relies on the notebook-level ``optimizer``, ``Loss``, ``device`` and
    ``clip_gradient``.

    Returns:
        tuple: (batch loss as a Python float, Pearson correlation between the
        batch predictions and ``labels``).
    """
    model.train()
    optimizer.zero_grad()

    inputs = features.to(device)
    ss_codes = ss.to(device)
    # Column vector so the target shape matches the (batch, 1) model output.
    targets = labels.reshape(-1, 1).to(device)

    predictions = model(inputs, ss_codes)
    loss = Loss(predictions, targets)
    loss.backward()
    # Element-wise clamp of every gradient to [-10, 10] before the update.
    clip_gradient(optimizer, 10)
    optimizer.step()

    predicted = predictions.reshape(-1).detach().cpu().numpy()
    observed = labels.reshape(-1).detach().cpu().numpy()
    correlation = pearsonr(predicted, observed)[0]
    return loss.item(), correlation

def valid_step(model, features, labels, ss):
    """Score one batch without updating the model.

    Puts ``model`` in eval mode and runs the forward pass and the loss under
    ``torch.no_grad()``. Relies on the notebook-level ``Loss`` and ``device``.

    Returns:
        tuple: (batch loss as a Python float, prediction tensor as returned by
        the model, still on ``device``).
    """
    model.eval()
    targets = labels.reshape(-1, 1)

    with torch.no_grad():
        predictions = model(features.to(device), ss.to(device))
        batch_loss = Loss(predictions, targets.to(device))

    return batch_loss.item(), predictions


In [None]:

def _evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` via ``valid_step``.

    Returns:
        tuple: (mean of the per-batch losses, Pearson r over all collected
        samples, list of predictions, list of labels).
    """
    batch_losses, preds, labels = [], [], []
    for features, label, ss in loader:
        batch_loss, predictions = valid_step(model, features, label, ss)
        batch_losses.append(batch_loss)
        # One conversion per batch instead of one per element.
        preds.extend(predictions.reshape(-1).cpu().tolist())
        labels.extend(label.reshape(-1).tolist())
    corr = pearsonr(np.array(preds), np.array(labels))[0]
    return sum(batch_losses) / len(batch_losses), corr, preds, labels


epochs = 200
TRAIN_LOSS = []
TEST_LOSS = []
TRAIN_CORR = []
TEST_CORR = []
plt.rcParams['figure.figsize'] = (15, 4)

# Score the model as it stands BEFORE training. The model may have been
# restored from the saved checkpoint, and comparing only against the epochs of
# this run would let the very first epoch overwrite that checkpoint even when
# it is worse. A NaN baseline (undefined correlation) counts as "no baseline".
_, best_corr, _, _ = _evaluate(model, dl_test)
if np.isnan(best_corr):
    best_corr = float("-inf")

for epoch in range(epochs):
    # ---- training pass -------------------------------------------------
    losses = []
    pcces = []
    for features, label, ss in dl_train:
        loss1, pcc = train_step(model, features, label, ss)
        losses.append(loss1)
        pcces.append(pcc)
    TRAIN_LOSS.append(sum(losses) / len(losses))
    TRAIN_CORR.append(sum(pcces) / len(pcces))

    fig, (ax_loss, ax_corr, ax_scatter) = plt.subplots(1, 3)
    ax_loss.plot(losses)
    ax_loss.set_title("train loss per batch")
    ax_corr.plot(pcces)
    ax_corr.set_title("train correlation per batch")

    # ---- validation pass -----------------------------------------------
    valid_loss, pccs, preds, labels = _evaluate(model, dl_test)
    TEST_LOSS.append(valid_loss)
    TEST_CORR.append(pccs)
    ax_scatter.scatter(preds, labels, s=1)
    ax_scatter.axis("equal")
    ax_scatter.set_title("validation: predicted vs. label")
    plt.show()
    plt.close(fig)  # avoid piling up 200 open figures
    print("valid loss: ", valid_loss)
    print("valid correlation: ", pccs)

    # Persist only a strict improvement over the best score seen so far,
    # including the pre-training baseline.
    if pccs > best_corr:
        best_corr = pccs
        print("sota!")
        torch.save(model.state_dict(), "./drive/MyDrive/ESM/LSTM_with_SS_no_Contact_Map.pkl")
    

    





Plot the results on the training set:

In [None]:
# Iterate the training split in order and without dropping the last partial
# batch, so the plot covers every training sample (dl_train shuffles and uses
# drop_last=True, which would silently omit some of them).
dl_train_eval = DataLoader(train_dataset, batch_size=128, shuffle=False, drop_last=False)

preds = []
labels = []

model.eval()  # inference mode does not change between batches; set it once
with torch.no_grad():
    for features, label, ss in dl_train_eval:
        predictions = model(features.to(device), ss.to(device)).reshape(-1)
        # One conversion per batch instead of one per element.
        preds.extend(predictions.cpu().tolist())
        labels.extend(label.reshape(-1).tolist())

correlation = pearsonr(np.array(preds), np.array(labels))[0]

plt.rcParams['figure.figsize'] = (6, 6)
plt.scatter(preds, labels, s=20, alpha=0.5, edgecolors="black")
plt.text(x=0, y=2, s="Correlation Coefficient:"+str(correlation))
# Identity line as a visual reference for perfect predictions.
plt.plot((-0.6, 1.7), (-0.6, 1.7), "red", alpha=0.4, linewidth=3)
plt.ylabel("Stability Score")
plt.xlabel("Predicted Score")
plt.title("Single & Multiple Mutations (on Training Dataset)")
plt.show()

In [None]:
# Disabled earlier variant of LSTMs, kept inside a bare string literal so it is
# never defined: two BiLSTM blocks on the embeddings only (the `ss` argument is
# accepted but unused), a mean over dim=1, then two linear layers.
# NOTE(review): as the last expression of the cell this string is echoed as the
# cell output; consider deleting it and relying on version control instead.
"""
class LSTMs(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size = 1280, 
                             hidden_size = 100,
                             num_layers = 2,
                             batch_first = True,
                             bidirectional = True,
                             dropout = 0.3
                            )
        self.lstm2 = nn.LSTM(input_size = 200, 
                             hidden_size = 50,
                             num_layers = 2,
                             batch_first = True,
                             bidirectional = True,
                             dropout = 0.3
                            )
        self.linear1 = nn.Linear(100, 20)
        self.linear2 = nn.Linear(20, 1)

    def forward(self, x, ss):
        #x = torch.cat((x, ss), dim=2) 
        y = self.lstm1(x)[0]
        y = self.lstm2(y)[0]
        y = torch.mean(y, dim=1)
        y = self.linear1(y)
        y = self.linear2(y)

        return y
"""      
