REQUIREMENTS + UTILS

In [1]:
from scripts.read_data import read_data_sequences
from scripts.seq_preprocess import tf_idf
import numpy as np

LOAD DATA

In [2]:
sequences_train, sequences_test, proteins_test, y_train = read_data_sequences()

In [3]:
X_train, X_test = tf_idf(sequences_train, sequences_test)

In [4]:
amino_acids = list(set("".join(sequences_train)))
aa_dict = {aa:i for i, aa in enumerate(amino_acids)}
print(aa_dict)

{'E': 0, 'M': 1, 'V': 2, 'W': 3, 'X': 4, 'T': 5, 'D': 6, 'G': 7, 'C': 8, 'I': 9, 'Y': 10, 'A': 11, 'Q': 12, 'N': 13, 'F': 14, 'S': 15, 'R': 16, 'H': 17, 'P': 18, 'L': 19, 'K': 20}


In [5]:
from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y = train_test_split(np.array(sequences_train), np.array(y_train), test_size=0.1)
print ('Training:', train_x.shape, train_y.shape)
print ('Validation:', val_x.shape, val_y.shape)

Training: (4399,) (4399,)
Validation: (489,) (489,)


In [6]:
%load_ext autoreload

In [7]:
%autoreload 2

In [8]:
from scripts.seq_dataset import ProteinSeqdataset
train_dataset = ProteinSeqdataset(train_x,train_y)
val_dataset = ProteinSeqdataset(val_x,val_y)
#X,y = train_dataset.get_data()
aa2id,id2aa = train_dataset.get_vocab()

In [10]:
a,b = train_dataset.get_data()
a[0]

tensor([21,  9, 16,  4, 12, 16, 11,  2,  6, 14, 16, 19,  6, 20, 10,  3,  2, 10,
         8, 20, 19, 12, 17,  8, 21,  6, 11,  3, 16, 21, 21, 20,  6, 17, 11, 20,
        14,  4, 10,  8,  3, 19,  6, 21,  3, 15, 14, 20,  8,  3, 11, 17, 17, 13,
        12,  3, 21, 16, 11, 21, 16, 11,  7, 15, 15, 17, 18,  7, 14,  1,  1, 12,
         2, 21, 10, 17, 21, 13,  9, 12, 20,  3, 12, 20, 21,  7,  3, 21, 12, 11,
        20,  6,  1,  1, 16,  8, 13, 10, 12,  3, 15,  7, 12,  6, 14,  6,  6, 17,
         1, 17, 17,  7, 20, 10, 20, 14, 15, 12,  1,  1, 14, 16, 15, 21,  3, 15,
        15,  3,  1, 16,  3,  9,  7,  7, 19,  7,  3, 10, 12, 12, 14, 10, 20,  1,
         3, 21,  3, 16, 16, 19,  7, 11, 19,  1, 17, 14, 17,  1, 14,  3,  2,  7,
         7, 15, 20, 21, 17, 10,  1,  9, 11, 21,  3,  6, 11, 13, 19, 20,  7, 19,
         7, 16, 18,  7, 21,  7, 20, 16, 15, 10, 21,  3, 10, 14,  3,  8, 13, 17,
        15, 20,  3, 14, 21,  3, 13,  7, 11, 10, 13, 16, 21, 10,  3, 11, 11, 20,
         2, 14, 10, 18,  3, 18, 19, 17, 

In [10]:
lengths = train_dataset.get_lengths()
lengths

tensor([330., 194., 121.,  ..., 263., 116., 971.])

In [11]:
nb_class = len(set(y_train))
nb_class

18

In [12]:
vocab_size = len(aa2id)
vocab_size

22

In [13]:
from torch.utils.data import DataLoader
training_dataloader = DataLoader(train_dataset, batch_size = 200, shuffle=True)
valid_dataloader = DataLoader(val_dataset, batch_size = 25)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

class ProteinLSTM(nn.Module):
    def __init__(self, hidden_size, num_layers, num_classes, vocab_size, embedding_dim, lstm_dropout):
        super(ProteinLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(lstm_dropout)
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, inputs):
        embedded_sequence = self.dropout(self.embedding(inputs)) # Inputs:[batch_size,len of sequences]
        embedded_sequence = embedded_sequence.permute(1,0,2) #Output: [len of sequences, batch_size, hidden_dim] 
        lstm_output,(hidden_state,cell_state) = self.lstm(embedded_sequence) 
        #take the last hidden state for each element in the sequence + Dropout 
        fc_input = self.dropout(hidden_state[0,:,:]) # [batch_size, len of sequences, hidden state] -> [batch size, last hidden state]
        out = self.linear(fc_input) # [batch size, hidden_dim] -> [batch_size, 1 (pred)]
        return out.ravel() 


In [15]:
from scripts.seq_train_eval import experiment
# Instantiate the model
model = ProteinLSTM(
    hidden_size=64,
    num_layers=2,
    num_classes=nb_class,
    vocab_size=vocab_size,
    embedding_dim=64,
    lstm_dropout=.4
)
# Create an optimizer
opt = optim.Adam(model.parameters(), lr=0.0025, betas=(0.9, 0.999))
# The criterion is a binary cross entropy loss based on logits - meaning that the sigmoid is integrated into the criterion
criterion = nn.BCEWithLogitsLoss()

train_losses = experiment(
    model, training_dataloader, valid_dataloader, opt, criterion, num_epochs=10
)

ModuleNotFoundError: No module named 'read_data'

In [197]:
# Instantiate the model
model = ProteinLSTM(hidden_size=64, num_layers=2, num_classes=nb_class, vocab_size=vocab_size, embedding_dim=64)
# Create an optimizer
opt = optim.Adam(model.parameters(), lr=0.0025, betas=(0.9, 0.999))
# The criterion is a binary cross entropy loss based on logits - meaning that the sigmoid is integrated into the criterion
criterion = nn.BCEWithLogitsLoss()

In [233]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def train_epoch(model, opt, criterion, dataloader):
    model.train()
    losses = []
    for i, (x, y, len_) in enumerate(dataloader):
        x,y = x.to(device),y.to(device)
        len_ = len_.to(device)
        opt.zero_grad()
        # (1) Forward
        pred = model.forward(x, len_)
        # (2) Compute the loss 
        loss = criterion(pred,y)
        # (3) Compute gradients with the criterion
        loss.backward()
        # (4) Update weights with the optimizer
        opt.step()    
        losses.append(loss.item())
        # Count the number of correct predictions in the batch - here, you'll need to use the sigmoid
        num_corrects = (torch.round(torch.sigmoid(pred)) == y).float().sum()
        acc = 100.0 * num_corrects/len(y)
        
        if (i%20 == 0):
            print("Batch " + str(i) + " : training loss = " + str(loss.item()) + "; training acc = " + str(acc.item()))
    return losses

In [234]:
# Same for the evaluation ! We don't need the optimizer here. 
def eval_model(model, criterion, evalloader):
    model.eval()
    total_epoch_loss = 0
    total_epoch_acc = 0
    with torch.no_grad():
        for i, (x, y, len_) in enumerate(evalloader):
            x,y = x.to(device),y.to(device)
            len_ = len_.to(device)
            pred =  model.forward(x, len_)
            loss = criterion(pred,y)
            num_corrects =(torch.round(torch.sigmoid(pred)) == y).float().sum()
            acc = 100.0 * num_corrects/len(y)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/(i+1), total_epoch_acc/(i+1)

In [235]:
# A function which will help you execute experiments rapidly - with a early_stopping option when necessary. 
def experiment(model, opt, criterion, num_epochs = 5, early_stopping = True):
    train_losses = []
    if early_stopping: 
        best_valid_loss = 10. 
    print("Beginning training...")
    for e in range(num_epochs):
        print("Epoch " + str(e+1) + ":")
        train_losses += train_epoch(model, opt, criterion, training_dataloader)
        valid_loss, valid_acc = eval_model(model, criterion, valid_dataloader)
        print("Epoch " + str(e+1) + " : Validation loss = " + str(valid_loss) + "; Validation acc = " + str(valid_acc))
        if early_stopping:
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
            else:
                print("Early stopping.")
                break  
    return train_losses

In [236]:
train_losses = experiment(model, opt, criterion,num_epochs=10)

Beginning training...
Epoch 1:


RuntimeError: input must have 2 dimensions, got 3

In [3]:
import csv
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Read sequences
sequences = list()
with open("data/sequences.txt", "r") as f:
    for line in f:
        sequences.append(line[:-1])

# Split data into training and test sets
sequences_train = list()
sequences_test = list()
proteins_test = list()
y_train = list()
with open("data/graph_labels.txt", "r") as f:
    for i, line in enumerate(f):
        t = line.split(",")
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(t[1][:-1]))

# Map sequences to
vec = TfidfVectorizer(analyzer="char", ngram_range=(1, 3))
X_train = vec.fit_transform(sequences_train)
X_test = vec.transform(sequences_test)

# Train a logistic regression classifier and use the classifier to
# make predictions
clf = LogisticRegression(solver="liblinear")
clf.fit(X_train, y_train)
y_pred_proba_seq = clf.predict_proba(X_test)


In [14]:
import pandas as pd
seq = pd.read_csv("sample_submission.csv")
seq.head()
#np.sum(seq,axis=1)

Unnamed: 0,name,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12,class13,class14,class15,class16,class17
0,11as,0.013421,0.010125,0.466346,0.0096,0.01556,0.110409,0.040535,0.015619,0.055511,0.010177,0.009209,0.04463,0.006132,0.005926,0.082652,0.076744,0.01749,0.009913
1,16pk,0.021742,0.016125,0.417447,0.009342,0.013803,0.0564,0.063944,0.019685,0.088272,0.010683,0.008819,0.081549,0.008722,0.009492,0.081467,0.069259,0.015457,0.007792
2,19hc,0.058075,0.012037,0.224616,0.004999,0.011772,0.043618,0.369082,0.011394,0.122394,0.008381,0.006151,0.020255,0.005745,0.009169,0.030656,0.04407,0.012128,0.005458
3,1ag9,0.017705,0.009405,0.45672,0.010811,0.011597,0.09742,0.034768,0.012282,0.097511,0.013075,0.01256,0.034495,0.007457,0.008537,0.095374,0.054399,0.018305,0.00758
4,1agx,0.104038,0.013102,0.274372,0.023038,0.014659,0.019519,0.089925,0.012469,0.140387,0.0143,0.006727,0.034096,0.006613,0.019848,0.174117,0.032383,0.013295,0.007112


In [None]:
struct = pd.read_csv("sample_submission.csv")
struct.head()
#np.sum(struct,axis=1)

In [8]:
from numpy import genfromtxt
my_data_seq = genfromtxt('sample_submission.csv', delimiter=',')
my_data_seq

array([[       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan, 0.01342103, 0.01012509, ..., 0.07674363, 0.0174905 ,
        0.00991321],
       [       nan, 0.02174239, 0.01612491, ..., 0.06925897, 0.01545664,
        0.00779209],
       ...,
       [       nan, 0.02276303, 0.01242107, ..., 0.04877692, 0.01632783,
        0.00632687],
       [       nan, 0.02810728, 0.01194174, ..., 0.04407273, 0.01484368,
        0.02052648],
       [       nan, 0.13940033, 0.00669199, ..., 0.01033002, 0.00609641,
        0.00639728]])

In [None]:
y_pred_proba_seq,y_pred_proba_struct

In [2]:
import pandas as pd
seq_res = pd.read_csv("sample_submission.csv",delimiter= ",")
struct_res = pd.read_csv("sample_submission_struct.csv", delimiter=",")

In [3]:
seq_res

Unnamed: 0,name,class0,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12,class13,class14,class15,class16,class17
0,11as,0.013421,0.010125,0.466346,0.009600,0.015560,0.110409,0.040535,0.015619,0.055511,0.010177,0.009209,0.044630,0.006132,0.005926,0.082652,0.076744,0.017490,0.009913
1,16pk,0.021742,0.016125,0.417447,0.009342,0.013803,0.056400,0.063944,0.019685,0.088272,0.010683,0.008819,0.081549,0.008722,0.009492,0.081467,0.069259,0.015457,0.007792
2,19hc,0.058075,0.012037,0.224616,0.004999,0.011772,0.043618,0.369082,0.011394,0.122394,0.008381,0.006151,0.020255,0.005745,0.009169,0.030656,0.044070,0.012128,0.005458
3,1ag9,0.017705,0.009405,0.456720,0.010811,0.011597,0.097420,0.034768,0.012282,0.097511,0.013075,0.012560,0.034495,0.007457,0.008537,0.095374,0.054399,0.018305,0.007580
4,1agx,0.104038,0.013102,0.274372,0.023038,0.014659,0.019519,0.089925,0.012469,0.140387,0.014300,0.006727,0.034096,0.006613,0.019848,0.174117,0.032383,0.013295,0.007112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,6o81,0.061883,0.013655,0.436305,0.005872,0.018467,0.079179,0.013444,0.015116,0.022340,0.009787,0.006971,0.075634,0.007252,0.005836,0.106479,0.103218,0.009734,0.008830
1219,6o93,0.090354,0.011040,0.160371,0.013107,0.027886,0.180979,0.055051,0.022332,0.233293,0.011121,0.011052,0.037230,0.008301,0.007795,0.076882,0.027992,0.015508,0.009706
1220,6ogd,0.022763,0.012421,0.241626,0.013899,0.012017,0.042000,0.055565,0.017378,0.371745,0.010099,0.009096,0.014772,0.004268,0.009343,0.091575,0.048777,0.016328,0.006327
1221,6okk,0.028107,0.011942,0.059683,0.020317,0.028020,0.080133,0.030134,0.026982,0.161717,0.012803,0.014893,0.324253,0.026413,0.018223,0.076939,0.044073,0.014844,0.020526


In [4]:
from scripts.write_data import write_sub

In [6]:
import numpy as np 
res = struct_res.iloc[:,1:]*0.4+seq_res.iloc[:,1:]*0.6
write_sub(np.array(res),"submissions_avg.csv")
#res['name'] = struct_res['name']
#res.to_csv("submissions_avg.csv",sep = ",")

In [2]:
from transformers import BertModel, BertTokenizer
import re
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertModel.from_pretrained("Rostlab/prot_bert")
sequence_Example = "A E T C Z A O"
sequence_Example = re.sub(r"[UZOB]", "X", sequence_Example)
encoded_input = tokenizer(sequence_Example, return_tensors='pt')
output = model(**encoded_input)

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 81.0/81.0 [00:00<00:00, 5.56kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 29.7kB/s]
Downloading: 100%|██████████| 86.0/86.0 [00:00<00:00, 19.2kB/s]
Downloading: 100%|██████████| 361/361 [00:00<00:00, 26.1kB/s]
Downloading: 100%|██████████| 1.68G/1.68G [02:44<00:00, 10.3MB/s]
Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertFor

In [5]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0454,  0.1140, -0.0117,  ..., -0.0875, -0.1143,  0.0204],
         [ 0.0923,  0.1391, -0.0524,  ..., -0.1395, -0.0428,  0.0743],
         [ 0.1151,  0.0200, -0.0863,  ..., -0.0095, -0.1873,  0.1317],
         ...,
         [ 0.1079,  0.0977, -0.0583,  ..., -0.1277, -0.0649,  0.1289],
         [ 0.0546,  0.0364, -0.0782,  ..., -0.0302, -0.0602,  0.0890],
         [ 0.0515,  0.0571, -0.0693,  ..., -0.0394, -0.0663,  0.0977]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.2487,  0.2626, -0.2367,  ...,  0.2503,  0.2339, -0.2556]],
       grad_fn=<TanhBackward0>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

In [6]:
output[0].shape

torch.Size([1, 9, 1024])

In [1]:
from transformers import T5Tokenizer, T5Model
import re
import torch

tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)

model = T5Model.from_pretrained("Rostlab/prot_t5_xl_uniref50")

sequences_Example = ["A E T C Z A O","S K T Z P"]

sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]

ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True)

input_ids = torch.tensor(ids['input_ids'])
attention_mask = torch.tensor(ids['attention_mask'])

with torch.no_grad():
    embedding = model(input_ids=input_ids,attention_mask=attention_mask,decoder_input_ids=None)

# For feature extraction we recommend to use the encoder embedding
encoder_embedding = embedding[2].cpu().numpy()
decoder_embedding = embedding[0].cpu().numpy()


  from .autonotebook import tqdm as notebook_tqdm
Downloading:   1%|          | 133M/11.3G [00:05<11:12, 16.6MB/s] 

KeyboardInterrupt: 

Downloading:   1%|          | 133M/11.3G [00:20<11:12, 16.6MB/s]