In [1]:
import os
import gc
import time
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
from torchmetrics.classification import MultilabelAccuracy

from sklearn.metrics import roc_auc_score

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Central run configuration. Keys that the cells shown here do not read are
# presumably training-time settings kept for reference.
CONFIG = {
    # Competition data directory; 'Train/train_terms.tsv' and 'IA.txt' are read from it.
    "root_data": '/kaggle/input/cafa-5-protein-function-prediction',
    # Directory holding the ESM2 id/embedding .npy files.
    "root_embed": '/kaggle/input/23468234',
    # Free-text description of the experiment.
    "exp_name": 'all, 600 label, 250 iter, sigmoid cosine, seed : 42',
    # Number of most frequent GO terms kept as labels; also the model output width.
    "n_labels": 600,
    "seeds": [42],
    # Rows of IA.txt with a value below this threshold are dropped.
    "min_ia": 1,
    # When True, train terms are restricted to the IA-filtered term list.
    "truncate_ia": False,
    "epoch": 250,
    "lr": 1e-3,
    "batch_size": 256,
    "n_accumulate": 1,
    "loss": 'sigmoidf1',  # 'cce', 'sigmoidf1' 'softf1'
    "scheduler": 'cosine',  # 'cosine', 'onecycle'
    # Use the GPU when one is visible, otherwise fall back to CPU.
    "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
}

In [3]:
class ProteinDataset(Dataset):
    """Map-style dataset of per-protein embedding vectors, optionally with targets.

    Args:
        ids_source: Path to a ``.npy`` file with one protein ID per row.
        embedding_source1: Path to a ``.npy`` file with one embedding row per ID.
        embedding_source2: Optional path to a second embedding ``.npy`` file. When
            given, its rows are concatenated to the first file's rows along axis 1.
        target_source: Path to a ``.npy`` file with one target row per ID.
            Required when ``train`` is True, ignored otherwise.
        train: If True, items carry ``'target'``; if False, items carry ``'id'``.

    Raises:
        ValueError: If ``train`` is True but ``target_source`` is None, or if the
            number of IDs, embedding rows and target rows do not agree.

    Attributes:
        train: The ``train`` flag passed to the constructor.
        df: DataFrame with columns ``EntryID``, ``embed`` and, in train mode, ``target``.
    """

    def __init__(self, ids_source, embedding_source1, embedding_source2, target_source, train=True):
        if train and target_source is None:
            raise ValueError("target_source must be provided when train=True")

        ids = np.load(ids_source)
        if embedding_source2 is not None:
            embeds1 = np.load(embedding_source1)
            embeds2 = np.load(embedding_source2)
            embeds = np.concatenate((embeds1, embeds2), axis=1)

            # Drop the un-concatenated copies before anything else is allocated.
            del embeds1, embeds2
            gc.collect()

            print(embeds.shape)
        else:
            embeds = np.load(embedding_source1)

        if len(ids) != embeds.shape[0]:
            raise ValueError(
                f"ids and embeddings disagree on row count: {len(ids)} vs {embeds.shape[0]}"
            )

        self.train = train

        # One object cell per protein; each cell is a row view into `embeds`.
        columns = {"EntryID": ids, "embed": list(embeds)}

        if self.train:
            target = np.load(target_source)
            if len(target) != len(ids):
                raise ValueError(
                    f"ids and targets disagree on row count: {len(ids)} vs {len(target)}"
                )
            # Targets are row-aligned with `ids`, so attach them by position.
            # Joining on EntryID would multiply rows whenever an ID repeats.
            columns["target"] = list(target)

        self.df = pd.DataFrame(data=columns)

    def __len__(self):
        """Return the number of proteins in the dataset."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the sample at integer position ``index``.

        Returns:
            ``{'embed': float32 tensor, 'target': float32 tensor}`` in train mode,
            otherwise ``{'embed': float32 tensor, 'id': EntryID value}``.
        """
        # `.iat` reads a single cell without building a Series for the whole row.
        embed = torch.tensor(self.df["embed"].iat[index], dtype=torch.float32)
        if self.train:
            target = torch.tensor(self.df["target"].iat[index], dtype=torch.float32)
            return {
                'embed': embed,
                'target': target,
            }

        entry_id = self.df["EntryID"].iat[index]
        return {
            'embed': embed,
            'id': entry_id,
        }

In [4]:
class LinearModel(nn.Module):
    """Feed-forward classifier: input block, two hidden stages, output block.

    The position of every layer inside each ``nn.Sequential`` fixes its
    state_dict key, so layer order and count are kept exactly as built here,
    including the two consecutive LayerNorm modules that appear in
    ``hidden_block1`` when ``num_emb_layers >= 2``.

    Args:
        embed_size: Width of the input embedding.
        hidden_size1: Width of the first hidden stage.
        hidden_size2: Width of the second hidden stage.
        target_size: Number of output logits.
        dropout: Drop probability applied after each hidden stage in ``forward``.
        num_emb_layers: Each hidden stage holds ``num_emb_layers - 1`` repeated
            LayerNorm/Dropout/Linear/LeakyReLU groups.
    """

    def __init__(self, embed_size, hidden_size1, hidden_size2, target_size, dropout=0.8, num_emb_layers=2):
        super().__init__()
        n_groups = num_emb_layers - 1

        self.input_block = nn.Sequential(
            nn.LayerNorm(embed_size, eps=1e-6),
            nn.Linear(embed_size, hidden_size1),
            nn.LeakyReLU(),
        )

        # First stage: repeated groups at width hidden_size1, then a projection
        # down to hidden_size2.
        stage_one = self._repeated_groups(hidden_size1, n_groups)
        stage_one += [
            nn.LayerNorm(hidden_size1, eps=1e-6),
            nn.Dropout(0.15),
            nn.Linear(hidden_size1, hidden_size2),
        ]
        self.hidden_block1 = nn.Sequential(*stage_one)

        # Second stage: repeated groups only (an empty Sequential when n_groups <= 0).
        self.hidden_block2 = nn.Sequential(*self._repeated_groups(hidden_size2, n_groups))

        self.output_block = nn.Sequential(
            nn.LayerNorm(hidden_size2, eps=1e-6),
            nn.Dropout(0.15),
            nn.Linear(hidden_size2, target_size),
        )

        self.dropout = nn.Dropout(dropout)

        # Xavier-initialise every parameter with more than one dimension;
        # 1-D parameters keep their default initialisation.
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    @staticmethod
    def _repeated_groups(width, n_groups):
        """Build ``n_groups`` LayerNorm/Dropout/Linear/LeakyReLU groups plus a closing LayerNorm.

        The closing LayerNorm is added only when at least one group was built.
        """
        layers = []
        for _ in range(n_groups):
            layers += [
                nn.LayerNorm(width, eps=1e-6),
                nn.Dropout(0.15),
                nn.Linear(width, width),
                nn.LeakyReLU(),
            ]
        if n_groups > 0:
            layers.append(nn.LayerNorm(width, eps=1e-6))
        return layers

    def forward(self, x):
        """Return the output-block logits for input ``x``."""
        hidden = self.input_block(x)
        for stage in (self.hidden_block1, self.hidden_block2):
            hidden = self.dropout(stage(hidden))
        return self.output_block(hidden)

In [5]:
# Test-set dataset: ESM2 embeddings are read from CONFIG["root_embed"] instead of
# repeating that directory as a literal, so the config stays the single source of
# truth. The T5 embeddings live in a separate input directory that has no config
# entry, so that path stays explicit. train=False makes each item carry the
# protein id instead of a target.
dataset = ProteinDataset(ids_source=os.path.join(CONFIG["root_embed"], 'test_ids_esm2_t33_650M_UR50D.npy'),
                         embedding_source1=os.path.join(CONFIG["root_embed"], 'test_embeds_esm2_t33_650M_UR50D.npy'),
                         embedding_source2='/kaggle/input/t5embeds/test_embeds.npy',
                         target_source=None,
                         train=False)
# batch_size=1 yields one protein per batch; shuffle=False keeps dataset order.
test_dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, num_workers=2, shuffle=False)

(141865, 2304)


In [6]:
# Training annotations table (tab-separated).
train_data = pd.read_csv(
    os.path.join(CONFIG["root_data"], 'Train', 'train_terms.tsv'),
    sep='\t',
)

# IA.txt has no header row: column 0 is the term, column 1 its value.
# Keep only the terms whose value reaches CONFIG["min_ia"], as an array of term names.
IA = (
    pd.read_csv(
        os.path.join(CONFIG["root_data"], 'IA.txt'),
        sep='\t',
        header=None,
        names=["term", "val"],
    )
    .loc[lambda frame: frame["val"] >= CONFIG["min_ia"], "term"]
    .values
)

In [7]:
# Build the network with the layer sizes hard-coded for this run.
model = LinearModel(
    embed_size=2304,  # equals the concatenated embedding width printed when the test dataset is built
    hidden_size1=1912,
    hidden_size2=1024,
    target_size=CONFIG['n_labels'],  # one logit per kept GO term
    dropout=0.8,
    num_emb_layers=3,
)
print(model)

LinearModel(
  (input_block): Sequential(
    (0): LayerNorm((2304,), eps=1e-06, elementwise_affine=True)
    (1): Linear(in_features=2304, out_features=1912, bias=True)
    (2): LeakyReLU(negative_slope=0.01)
  )
  (hidden_block1): Sequential(
    (0): LayerNorm((1912,), eps=1e-06, elementwise_affine=True)
    (1): Dropout(p=0.15, inplace=False)
    (2): Linear(in_features=1912, out_features=1912, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): LayerNorm((1912,), eps=1e-06, elementwise_affine=True)
    (5): Dropout(p=0.15, inplace=False)
    (6): Linear(in_features=1912, out_features=1912, bias=True)
    (7): LeakyReLU(negative_slope=0.01)
    (8): LayerNorm((1912,), eps=1e-06, elementwise_affine=True)
    (9): LayerNorm((1912,), eps=1e-06, elementwise_affine=True)
    (10): Dropout(p=0.15, inplace=False)
    (11): Linear(in_features=1912, out_features=1024, bias=True)
  )
  (hidden_block2): Sequential(
    (0): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
    (1

In [8]:
# Restore the trained weights. map_location lets a checkpoint that was saved on a
# GPU load on a CPU-only machine too.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('/kaggle/input/cafa-model-new/model_42.pt',
                                 map_location=CONFIG['device']))
model.to(CONFIG['device'])
model.eval()


# Rebuild the label vocabulary: the CONFIG["n_labels"] most frequent terms.
# NOTE(review): this order presumably has to match the one used when the
# checkpoint was trained; verify against the training notebook.
labels = pd.read_csv(os.path.join(CONFIG["root_data"], 'Train', 'train_terms.tsv'), sep='\t')
if CONFIG["truncate_ia"]:
    labels = labels[labels['term'].isin(IA)]

top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
labels_names = top_terms[:CONFIG["n_labels"]].index.values

n_labels = CONFIG['n_labels']
if len(labels_names) != n_labels:
    raise ValueError(f"expected {n_labels} label names, found {len(labels_names)}")

# Size the outputs from the number of proteins, not the number of batches, so
# the loop stays correct for any DataLoader batch size.
n_rows = len(test_dataloader.dataset) * n_labels
ids_ = np.empty(shape=(n_rows,), dtype=object)
go_terms_ = np.empty(shape=(n_rows,), dtype=object)
confs_ = np.empty(shape=(n_rows,), dtype=np.float32)

offset = 0
bar = tqdm(test_dataloader, total=len(test_dataloader))
with torch.no_grad():  # inference only: skip building the autograd graph
    for data in bar:
        embed = data['embed'].to(CONFIG['device'])
        batch_ids = np.asarray(data['id'], dtype=object)

        # One row of probabilities per protein in the batch.
        probs = torch.sigmoid(model(embed)).cpu().numpy().reshape(len(batch_ids), n_labels)

        stop = offset + probs.size
        confs_[offset:stop] = probs.reshape(-1)
        # Each protein id is repeated once per label; the label list is repeated once per protein.
        ids_[offset:stop] = np.repeat(batch_ids, n_labels)
        go_terms_[offset:stop] = np.tile(labels_names, len(batch_ids))
        offset = stop

# Every pre-allocated slot must have been written.
assert offset == n_rows, f"filled {offset} of {n_rows} prediction rows"

submission_df = pd.DataFrame(data={"Id" : ids_, "GO term" : go_terms_, "Confidence" : confs_})

100%|██████████| 141865/141865 [06:27<00:00, 365.94it/s]


In [9]:
# Preview the first rows of the prediction table (columns: Id, GO term, Confidence).
submission_df.head(50)

Unnamed: 0,Id,GO term,Confidence
0,Q9CQV8,GO:0005575,0.999314
1,Q9CQV8,GO:0008150,0.998685
2,Q9CQV8,GO:0110165,0.999305
3,Q9CQV8,GO:0003674,0.999605
4,Q9CQV8,GO:0005622,0.999265
5,Q9CQV8,GO:0009987,0.996265
6,Q9CQV8,GO:0043226,0.998219
7,Q9CQV8,GO:0043229,0.997871
8,Q9CQV8,GO:0005488,0.999793
9,Q9CQV8,GO:0043227,0.997979


In [10]:
# Total number of prediction rows.
submission_df.shape[0]

85119000

In [11]:
# Write the predictions as a tab-separated file without the DataFrame index.
# NOTE(review): a header row is written (pandas default); confirm the consumer
# of submission.tsv expects one.
submission_df.to_csv('submission.tsv', index=False, sep='\t')