In [60]:
import numpy as np
from numpy import vstack
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import re
import gc
import pickle
import psutil
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import Adam
from torch.nn import MultiLabelMarginLoss
from torch.nn.init import kaiming_uniform_, xavier_uniform_
from sklearn.metrics import f1_score
import joblib
#from transformers import T5EncoderModel, T5Tokenizer
# from tape import ProteinBertModel, UniRepModel, TAPETokenizer

#### Extract protein embeddings

In [2]:
train_embeddings = np.load(".../train_embeddings.npy")

In [3]:
train_ids = np.load(".../train_ids.npy")

In [4]:
train_embeddings = pd.DataFrame(train_embeddings)
train_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.155392,0.035353,0.089697,-0.250368,0.248972,-0.032696,-0.073959,0.011419,-0.064566,-0.149671,...,-0.031426,-0.067552,-0.070645,-0.10829,0.036037,-0.180188,-0.099818,-0.039517,-0.073583,0.045931
1,-0.072053,0.093417,-0.002659,-0.00569,0.113906,-0.010813,-0.101786,0.021196,-0.012389,-0.028101,...,0.064644,-0.020896,-0.120009,-0.104604,-0.033123,0.030718,0.002714,-0.019167,-0.022672,0.072778
2,0.14378,0.019153,0.086995,-0.215061,0.219807,-0.028687,-0.101207,0.031871,-0.043015,-0.121149,...,-0.037286,-0.064872,-0.08231,-0.189828,0.027312,-0.189434,-0.097486,-0.054416,-0.042178,0.059392
3,-0.037358,0.007036,0.083136,-0.116788,-0.000758,-0.025243,-0.105427,0.070486,0.069643,-0.031957,...,0.10348,-0.023106,-0.105887,-0.105809,0.004708,-0.051759,-0.020586,-0.078935,-0.056303,0.01184
4,0.134157,0.035627,0.092638,-0.181336,0.196127,-0.028752,-0.089066,0.031362,-0.039287,-0.114085,...,-0.079311,-0.003805,-0.094771,-0.124049,-0.015014,-0.155916,-0.068633,-0.071746,-0.022954,0.034192


In [5]:
train_embeddings["id"] = train_ids
train_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,id
0,0.155392,0.035353,0.089697,-0.250368,0.248972,-0.032696,-0.073959,0.011419,-0.064566,-0.149671,...,-0.067552,-0.070645,-0.10829,0.036037,-0.180188,-0.099818,-0.039517,-0.073583,0.045931,P20536
1,-0.072053,0.093417,-0.002659,-0.00569,0.113906,-0.010813,-0.101786,0.021196,-0.012389,-0.028101,...,-0.020896,-0.120009,-0.104604,-0.033123,0.030718,0.002714,-0.019167,-0.022672,0.072778,O73864
2,0.14378,0.019153,0.086995,-0.215061,0.219807,-0.028687,-0.101207,0.031871,-0.043015,-0.121149,...,-0.064872,-0.08231,-0.189828,0.027312,-0.189434,-0.097486,-0.054416,-0.042178,0.059392,O95231
3,-0.037358,0.007036,0.083136,-0.116788,-0.000758,-0.025243,-0.105427,0.070486,0.069643,-0.031957,...,-0.023106,-0.105887,-0.105809,0.004708,-0.051759,-0.020586,-0.078935,-0.056303,0.01184,A0A0B4J1F4
4,0.134157,0.035627,0.092638,-0.181336,0.196127,-0.028752,-0.089066,0.031362,-0.039287,-0.114085,...,-0.003805,-0.094771,-0.124049,-0.015014,-0.155916,-0.068633,-0.071746,-0.022954,0.034192,P54366


In [6]:
train_embeddings = train_embeddings.sort_values("id").reset_index().drop("index", axis = 1)

In [7]:
X = train_embeddings.drop("id", axis = 1).values
train_ids = train_embeddings["id"]

In [8]:
del train_embeddings

#### Get label for each protein

In [9]:
train_terms = pd.read_table(".../train_terms.tsv")
train_terms.head()

Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,GO:0008152,BPO
1,A0A009IHW8,GO:0034655,BPO
2,A0A009IHW8,GO:0072523,BPO
3,A0A009IHW8,GO:0044270,BPO
4,A0A009IHW8,GO:0006753,BPO


In [10]:
# get most occured label
num_label = 1500

freqCount = (train_terms['term'].value_counts())
print(freqCount)
considered_one = list(freqCount.index[:num_label])

GO:0005575    92912
GO:0008150    92210
GO:0110165    91286
GO:0003674    78637
GO:0005622    70785
              ...  
GO:0031772        1
GO:0042324        1
GO:0031771        1
GO:0051041        1
GO:0102628        1
Name: term, Length: 31466, dtype: int64


In [11]:
# check if these features are enough to cover all cases
train_terms[train_terms["term"].isin(considered_one)]["EntryID"].nunique() - 142246 # need to be 0
# good for case of 1000+

0

In [12]:
# make multilabel data
train_size = len(train_ids)
Y = np.zeros((train_size ,num_label))
train_terms_smaller = train_terms[train_terms["term"].isin(considered_one)]
for i in tqdm(range(Y.shape[1])):
    m = train_terms_smaller['term'] ==  considered_one[i]
    Y[:,i] =  train_ids.isin( set(train_terms_smaller[m]['EntryID'] ) ).astype(float )
Y

100%|██████████| 1500/1500 [07:13<00:00,  3.46it/s]


array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

In [16]:
Y = Y.astype(int)

In [17]:
print(X.shape)
print(Y.shape)

(142246, 1024)
(142246, 1500)


In [14]:
del train_terms
del freqCount

#### Data preprocessing for Torch model

In [18]:
# prepare the dataset
class ProtDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X 
        self.Y = Y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.Y[idx]]

In [19]:
prot_data = ProtDataset(X, Y)

In [20]:
train_size = int(0.8 * len(prot_data))
test_size = len(prot_data) - train_size

train_data, test_data = random_split(prot_data, [train_size, test_size])

In [21]:
del prot_data

In [22]:
train_dl = DataLoader(train_data, batch_size=32, shuffle=True, num_workers = 2)
test_dl = DataLoader(test_data, batch_size=1024, shuffle=False, num_workers = 2)

#### Train using neural networks

In [50]:
class MLP(Module):
    # define model elements
    def __init__(self, n_inputs, n_labels):
        super(MLP, self).__init__()
        # first hiddern
        self.hidden1 = Linear(n_inputs, 64)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.activation1 = ReLU()
        # second hidden
        self.hidden2 = Linear(64, 128)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.activation2 = ReLU()
        # third hidden
        self.hidden3 = Linear(128, n_labels)
        xavier_uniform_(self.hidden3.weight) 
        self.activation3 = Sigmoid()
        
 
    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.activation1(X)
        # second hidden layer
        X = self.hidden2(X)
        X = self.activation2(X)
        # third hidden layer
        X = self.hidden3(X)
        X = self.activation3(X)
        return X

In [65]:
# train the model
def train_model(train_dl, model):
    # define the optimization
    criterion = MultiLabelMarginLoss()
    optimizer = Adam(model.parameters(), lr=0.01)
    # enumerate epochs
    for epoch in tqdm(range(50)):
        # enumerate mini batches
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
            yhat = yhat.round()
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()

In [66]:
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1500))
        # round to class values
        yhat = yhat.round()
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate accuracy
    f1_micro = f1_score(actuals, predictions, average = "micro")
    print("Micro f1-score: " + str(f1_micro))
    f1_macro = f1_score(actuals, predictions, average = "macro")
    print("Macro f1-score: " + str(f1_micro))

In [67]:
model = MLP(1024, 1500)

In [68]:
train_model(train_dl, model)

  0%|          | 0/50 [02:44<?, ?it/s]


KeyboardInterrupt: 

In [59]:
evaluate_model(test_dl, model)

Micro f1-score: 0.04211101748284646
Macro f1-score: 0.04211101748284646


In [None]:
del X
del Y
del X_train
del Y_train
del X_test
del Y_test
del pred_br_gnb

Try on test data

In [None]:
test_id = []
test_seq = []

for seq_record in SeqIO.parse(".../testsuperset.fasta", "fasta"):
  test_id.append(seq_record.id)
  test_seq.append(str(seq_record.seq))

print(len(test_id))

In [None]:
test_df = pd.DataFrame({"id": test_id, "sequence": test_seq})
del test_id
del test_seq
test_df.head()

In [None]:
test_df = test_df.sort_values("id").reset_index().drop("index", axis = 1)
test_df.head()

In [None]:
id_lst = test_df["id"].tolist()

Make test data and remove all those unneeded or write the needed into disk in order to prevent ram overloading

In [None]:
test_lst = []
for s in test_df["sequence"]:
  test_lst.append(ProteinAnalysis(s).get_amino_acids_percent())
test = pd.DataFrame(test_lst)
test = test.values

In [None]:
del test_df
del test_lst

In [None]:
test.shape

In [None]:
test_sub1 = test[:30000]
test_sub2 = test[30000:60000]
test_sub3 = test[60000:90000]
test_sub4 = test[90000:120000]
test_sub5 = test[120000:]
del test

In [None]:
prob_1 = br_gnb.predict(test_sub1)
del test_sub1
with open('prob_1.pickle', 'wb') as f:
    pickle.dump(prob_1, f)
del prob_1

In [None]:
gc.collect()

In [None]:
prob_2 = br_gnb.predict_proba(test_sub2)
del test_sub2
with open('prob_2.pickle', 'wb') as f:
    pickle.dump(prob_2, f)
del prob_2

In [None]:
gc.collect()

In [None]:
prob_3 = br_gnb.predict_proba(test_sub3)
del test_sub3
with open('prob_3.pickle', 'wb') as f:
    pickle.dump(prob_3, f)
del prob_3

In [None]:
gc.collect()

In [None]:
prob_4 = br_gnb.predict_proba(test_sub4)
del test_sub4
with open('prob_4.pickle', 'wb') as f:
    pickle.dump(prob_4, f)
del prob_4

In [None]:
gc.collect()

In [None]:
prob_5 = br_gnb.predict_proba(test_sub5)
del test_sub5
with open('prob_5.pickle', 'wb') as f:
    pickle.dump(prob_5, f)
del prob_5

In [None]:
gc.collect()

Now concat the data with the label

In [None]:
final_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
final_df["id"] = [id_lst[i] for i in range(30000) for _ in range(1500)]

In [None]:
final_df["terms"] = considered_one * 30000

In [None]:
with open('prob_1.pickle', 'rb') as f:
    prob_1 = pickle.load(f)
prob_1.shape

In [None]:
prob_1 = prob_1.toarray().ravel()

In [None]:
final_df["prob"] = prob_1

In [None]:
del prob_1

In [None]:
final_df = final_df[final_df["prob"] >= 0.6]

In [None]:
final_df.shape

In [None]:
gc.collect()

In [None]:
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
temp_df["id"] = [id_lst[i] for i in range(30000, 60000) for _ in range(1500)]

In [None]:
temp_df["terms"] = considered_one * 30000

In [None]:
with open('prob_2.pickle', 'rb') as f:
    prob_2 = pickle.load(f)

In [None]:
prob_2 = prob_2.toarray().ravel()

In [None]:
temp_df["prob"] = prob_2

In [None]:
del prob_2

In [None]:
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
temp_df.shape

In [None]:
final_df = pd.concat([final_df, temp_df])
del temp_df
final_df = final_df.reset_index().drop("index", axis = 1)

In [None]:
gc.collect()

In [None]:
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
temp_df["id"] = [id_lst[i] for i in range(60000, 90000) for _ in range(1500)]

In [None]:
temp_df["terms"] = considered_one * 30000

In [None]:
with open('prob_3.pickle', 'rb') as f:
    prob_3 = pickle.load(f)

In [None]:
prob_3 = prob_3.toarray().ravel()

In [None]:
temp_df["prob"] = prob_3

In [None]:
del prob_3

In [None]:
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
temp_df.shape

In [None]:
final_df = pd.concat([final_df, temp_df])
del temp_df
final_df = final_df.reset_index().drop("index", axis = 1)

In [None]:
gc.collect()

In [None]:
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
temp_df["id"] = [id_lst[i] for i in range(90000, 120000) for _ in range(1500)]

In [None]:
temp_df["terms"] = considered_one * 30000

In [None]:
with open('prob_4.pickle', 'rb') as f:
    prob_4 = pickle.load(f)

In [None]:
prob_4 = prob_4.toarray().ravel()

In [None]:
temp_df["prob"] = prob_4

In [None]:
del prob_4

In [None]:
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
temp_df.shape

In [None]:
final_df = pd.concat([final_df, temp_df])
del temp_df
final_df = final_df.reset_index().drop("index", axis = 1)

In [None]:
gc.collect()

In [None]:
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
temp_df["id"] = [id_lst[i] for i in range(120000, 141865) for _ in range(1500)]

In [None]:
temp_df["terms"] = considered_one * 21865

In [None]:
with open('prob_5.pickle', 'rb') as f:
    prob_5 = pickle.load(f)

In [None]:
prob_5 = prob_5.toarray().ravel()

In [None]:
temp_df["prob"] = prob_5

In [None]:
del prob_5

In [None]:
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
temp_df.shape

In [None]:
final_df = pd.concat([final_df, temp_df])
del temp_df
final_df = final_df.reset_index().drop("index", axis = 1)

In [None]:
final_df.shape

In [None]:
# make the submission
final_df.to_csv(".../submission.tsv", index = False, sep = "\t")