In [None]:
!pip install Bio


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle tree and print the full path of every file found.
kaggle_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle')
    for name in names
)
for path in kaggle_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, gc, re, csv
from collections import defaultdict

import pandas as pd
import numpy as np

from Bio import SeqIO

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset, DataLoader
from torch.cuda.amp import autocast, GradScaler


In [None]:
# Data root: the Kaggle competition mount when is_kaggle is set, otherwise
# the current directory.
is_kaggle = True
ROOT_DIR = '/kaggle/input/cafa-6-protein-function-prediction/' if is_kaggle else './'

In [None]:
# https://www.kaggle.com/datasets/letuano5/prott5
# Two parallel arrays: ids and the embedding matrix (row i belongs to id i).
# NOTE(review): both files are named test_*, yet embeddings_dict is later
# queried with training ids as well — confirm the training proteins are
# covered, otherwise their features fall back to zero vectors.
embeddings = np.load('/kaggle/input/prott5/test_embeddings.npy')
protein_ids = np.load('/kaggle/input/prott5/test_ids.npy')
embeddings_dict = dict(zip(protein_ids, embeddings))
n_loaded, emb_dim = len(protein_ids), embeddings.shape[1]
print(f"Loaded {n_loaded} embeddings of dimension {emb_dim}")

In [None]:
# NOTE(review): this list is not referenced by any other visible cell; its
# purpose cannot be determined from this notebook — confirm before relying on it.
bad_ids = (
    "Q09165 A2ASS6 Q09164 Q9H195 Q8WXI7 A0A0S6XHH0 "
    "Q9I7U4 Q8WZ42 A0A348AXX4 M9MRD1 G4SLH0"
).split()

In [None]:
# Training annotations: one row per (EntryID, term) pair.
# Path components are joined individually so ROOT_DIR works with or without
# a trailing slash.
terms_df = pd.read_csv(
    os.path.join(ROOT_DIR, "Train", "train_terms.tsv"),
    sep="\t",
    usecols=["EntryID", "term"],
)
# EntryID -> list of its terms. Built once; `terms_to_answer` is kept as a
# second name for the same mapping because the original cell defined both.
train_annotations = terms_df.groupby("EntryID")["term"].apply(list).to_dict()
terms_to_answer = train_annotations

train_sq = []
train_answer = []  # not filled by any visible cell; kept so the namespace is unchanged

cnt = 0  # not used by any visible cell; kept so the namespace is unchanged

seq_of_id = {}

# Why each skipped training record was dropped, so nothing disappears silently.
skipped = {"empty id": 0, "no OX= field": 0, "no annotations": 0}

for record in SeqIO.parse(os.path.join(ROOT_DIR, "Train", "train_sequences.fasta"), "fasta"):
    # Ids containing "|" are reduced to their second "|"-separated field.
    clean_id = record.id.split("|")[1] if "|" in record.id else record.id
    if not clean_id:
        skipped["empty id"] += 1
        continue

    # The taxon is the token that follows "OX=" in the header line.
    ox_parts = record.description.split("OX=")
    if len(ox_parts) < 2:
        skipped["no OX= field"] += 1
        continue
    tax = ox_parts[1].split(" ")[0]

    # A sequence without labels cannot be used for supervised training; skip
    # it instead of aborting the whole cell with a KeyError.
    answer = train_annotations.get(clean_id)
    if answer is None:
        skipped["no annotations"] += 1
        continue

    seq = str(record.seq)
    train_sq.append({
        "id": clean_id,
        "tax": tax,
        "seq": seq,
        "answer": answer,
    })
    seq_of_id[clean_id] = seq

if any(skipped.values()):
    print(f"Skipped training records: {skipped}")

test_sq = []

# Path components are joined individually so ROOT_DIR works with or without
# a trailing slash.
for record in SeqIO.parse(os.path.join(ROOT_DIR, "Test", "testsuperset.fasta"), "fasta"):
    # The taxon is read from the second space-separated header field. A header
    # without that field must not abort the run (every test protein still
    # needs a prediction row), so the taxon is recorded as None instead.
    header_fields = record.description.split(" ")
    tax = header_fields[1] if len(header_fields) > 1 else None
    seq = str(record.seq)
    test_sq.append({
        "id": record.id,
        "tax": tax,
        "seq": seq,
    })

    seq_of_id[record.id] = seq

# One row per protein, in FASTA order; later cells rely on this row order.
test_df = pd.DataFrame(test_sq)
train_df = pd.DataFrame(train_sq)

In [None]:
def get_embds(ids):
    """Return the embedding matrix for ``ids``, one row per id, in order.

    Looks each id up in the module-level ``embeddings_dict``. Ids that are not
    present get an all-zero row, and the number of such ids is printed so the
    fallback is visible.

    The row width and dtype are taken from the stored embeddings rather than
    being hard-coded, so the result is always a rectangular 2-D array (also
    for an empty ``ids``). If ``embeddings_dict`` is empty the width falls
    back to 1024.

    Args:
        ids: iterable of protein ids (e.g. a pandas Series or a list).

    Returns:
        numpy array of shape ``(len(ids), embedding_dim)``.
    """
    ids = list(ids)
    if embeddings_dict:
        template = np.asarray(next(iter(embeddings_dict.values())))
        dim, dtype = template.shape[-1], template.dtype
    else:
        dim, dtype = 1024, np.float64

    out = np.zeros((len(ids), dim), dtype=dtype)
    missing = 0
    for row, pid in enumerate(ids):
        emb = embeddings_dict.get(pid)
        if emb is None:
            missing += 1  # row stays all-zero
        else:
            out[row] = emb

    if missing:
        print(f"get_embds: {missing} of {len(ids)} ids have no embedding; using zero vectors")
    return out

In [None]:
# Feature matrices, row-aligned with train_df and test_df respectively.
X_train, X_test = (get_embds(frame["id"]) for frame in (train_df, test_df))

In [None]:
# Display both feature-matrix shapes as the cell output.
(X_train.shape, X_test.shape)

In [None]:
# Turn each protein's list of terms into one multi-hot row; `mlb.classes_`
# maps column indices back to term names.
label_lists = train_df["answer"]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(label_lists)
print(y_train.shape)

In [None]:
class ProteinDataset(Dataset):
    """In-memory dataset that pairs each feature row with its label row."""

    def __init__(self, X, Y):
        # Features are stored in half precision when CUDA is available and in
        # single precision otherwise; labels are always float32.
        if torch.cuda.is_available():
            feature_dtype = torch.float16
        else:
            feature_dtype = torch.float32
        self.X = torch.tensor(X, dtype=feature_dtype)
        self.Y = torch.tensor(Y, dtype=torch.float32)

    def __len__(self):
        # Number of samples = number of feature rows.
        return len(self.X)

    def __getitem__(self, i):
        features, labels = self.X[i], self.Y[i]
        return features, labels

# https://www.kaggle.com/code/alexandervc/pytorch-keras-etc-3-blend-cafa-metric-etc#Optimizer-%2522Sophia%2522-sometimes-better-than-Adam

class MLP(nn.Module):
    """Three-block MLP with a skip connection that returns raw logits.

    Each hidden block is BatchNorm -> Linear -> LayerNorm -> PReLU, with widths
    input_dim -> 800 -> 600 -> 400. The output of the first block (800) is
    concatenated with the output of the third block (400) and passed through a
    final BatchNorm -> Linear -> LayerNorm to produce ``output_dim`` logits.
    A single PReLU module (one shared learnable slope) is used by all blocks.

    Args:
        input_dim: width of the input feature vectors.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.activation = nn.PReLU()

        self.bn1 = nn.BatchNorm1d(input_dim)
        self.fc1 = nn.Linear(input_dim, 800)
        self.ln1 = nn.LayerNorm(800, elementwise_affine=True)

        self.bn2 = nn.BatchNorm1d(800)
        self.fc2 = nn.Linear(800, 600)
        self.ln2 = nn.LayerNorm(600, elementwise_affine=True)

        self.bn3 = nn.BatchNorm1d(600)
        self.fc3 = nn.Linear(600, 400)
        self.ln3 = nn.LayerNorm(400, elementwise_affine=True)

        # skip connection concat: 400 + 800 = 1200
        self.bn4 = nn.BatchNorm1d(1200)
        self.fc4 = nn.Linear(1200, output_dim)
        self.ln4 = nn.LayerNorm(output_dim, elementwise_affine=True)

    def forward(self, inputs):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, output_dim)`` logits."""
        # First block. The batch-normalised input must be what feeds fc1;
        # previously the bn1 result was overwritten and fc1 saw the raw input.
        x = self.bn1(inputs)
        fc1_out = self.ln1(self.fc1(x))
        fc1_out = self.activation(fc1_out)

        # Second block
        x = self.bn2(fc1_out)
        x = self.ln2(self.fc2(x))
        x = self.activation(x)

        # Third block
        x = self.bn3(x)
        x = self.ln3(self.fc3(x))
        x = self.activation(x)

        # Skip concat: x (400) + fc1_out (800)
        x = torch.cat([x, fc1_out], dim=-1)

        # Final block (LayerNorm is applied to the logits; no sigmoid here)
        x = self.bn4(x)
        x = self.ln4(self.fc4(x))

        # Returns LOGITS
        return x
        

In [None]:
K = 5                  # number of cross-validation folds
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 128       # batch size of the training / validation loaders
infer_batch_size = 32  # chunk size for test-set inference
epochs = 30            # training epochs per fold
lr = 1e-3              # initial AdamW learning rate
min_prob = 0.02        # predictions below this probability are dropped

# Seed torch so weight initialisation and DataLoader shuffling are repeatable
# across "Restart & Run All" (GPU kernels may still be non-deterministic).
SEED = 42
torch.manual_seed(SEED)


In [None]:
# --- helper: training per fold (returns trained model) ---
def train_one_fold(train_idx, val_idx, X, Y, fold_id):
    """Train one MLP on a single cross-validation split and return it.

    Reads the module-level ``dataset``, ``device``, ``batch_size``, ``epochs``
    and ``lr``; ``dataset`` must exist by the time this function is called.

    Args:
        train_idx: indices into ``dataset`` used for training.
        val_idx: indices into ``dataset`` used for validation.
        X: feature matrix; only ``X.shape[1]`` is read (model input width).
        Y: label matrix; only ``Y.shape[1]`` is read (model output width).
        fold_id: fold number, used only in the progress log.

    Returns:
        The model carrying the weights of the epoch with the lowest validation
        loss. If no epoch produced a comparable validation loss (e.g. the loss
        was NaN throughout), the last-epoch weights are kept.
    """
    train_ds = Subset(dataset, train_idx)
    val_ds = Subset(dataset, val_idx)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    # Mixed precision only applies on CUDA; asking for it elsewhere just
    # produces warnings, so it is switched off explicitly.
    use_amp = str(device).startswith("cuda")

    model = MLP(input_dim=X.shape[1], output_dim=Y.shape[1]).to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # `verbose` is intentionally not passed: False is the default, and the
    # argument is deprecated and rejected by newer PyTorch releases.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val = float('inf')
    best_state = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for Xb, Yb in train_loader:
            Xb, Yb = Xb.to(device), Yb.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(Xb)
                loss = criterion(logits, Yb)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            # Weight by batch size so the epoch figure is a per-sample mean.
            train_loss += loss.item() * Xb.size(0)
        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for Xb, Yb in val_loader:
                Xb, Yb = Xb.to(device), Yb.to(device)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    logits = model(Xb)
                    loss = criterion(logits, Yb)
                val_loss += loss.item() * Xb.size(0)
        val_loss /= len(val_loader.dataset)
        scheduler.step(val_loss)

        if val_loss < best_val:
            best_val = val_loss
            # Snapshot on CPU so the checkpoint does not occupy device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        print(f"[Fold {fold_id}] Epoch {epoch+1}/{epochs} train_loss={train_loss:.6f} val_loss={val_loss:.6f}")

    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        # No epoch beat +inf (e.g. NaN losses or epochs == 0): nothing to restore.
        print(f"[Fold {fold_id}] no finite validation loss recorded; keeping last-epoch weights")
    return model

In [None]:
# Whole training set as tensors; each fold takes Subsets of it by index.
dataset = ProteinDataset(X=X_train, Y=y_train)

# Shuffled K-fold splitter with a fixed random_state.
kf = KFold(K, shuffle=True, random_state=42)

In [None]:
# test id -> class index -> list of probabilities collected across folds
test_preds_probs = defaultdict(lambda: defaultdict(list))
# training row index -> {class index: out-of-fold probability}
oof_probs = defaultdict(dict)
sigmoid = nn.Sigmoid()

# Test features use the same dtype rule as ProteinDataset: half precision
# when CUDA is available, single precision otherwise.
if torch.cuda.is_available():
    test_dtype = torch.float16
else:
    test_dtype = torch.float32
X_test_tensor = torch.tensor(X_test, dtype=test_dtype)

In [None]:
# Mixed precision only applies on CUDA; elsewhere run in full precision.
use_amp = str(device).startswith("cuda")
# Test ids by row position, materialised once instead of doing one
# DataFrame.iloc lookup per test row per fold.
test_ids = list(test_df["id"])
n_test = X_test_tensor.shape[0]
n_batches = int(np.ceil(n_test / infer_batch_size))

for fold_id, (train_idx, val_idx) in enumerate(kf.split(np.arange(len(dataset))), 1):
    print(f"=== Fold {fold_id}/{K} ===")
    model = train_one_fold(train_idx, val_idx, X_train, y_train, fold_id)

    model.eval()

    # --- Out-of-fold predictions for this fold's validation rows ---
    with torch.no_grad():
        val_ds = Subset(dataset, val_idx)
        # Not shuffled, so batch b / row i corresponds to val_idx[b * batch_size + i].
        val_loader = DataLoader(val_ds, batch_size=batch_size)
        for batch_offset, (Xb, Yb) in enumerate(val_loader):
            Xb = Xb.to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(Xb)
                probs = sigmoid(logits).cpu().numpy()  # shape (bs, n_classes)
            for i in range(probs.shape[0]):
                global_idx = val_idx[batch_offset * batch_size + i]
                row_probs = probs[i]
                nz = np.where(row_probs >= min_prob)[0]
                if len(nz) == 0:
                    # Nothing passed the threshold: keep the single best class.
                    top = int(np.argmax(row_probs))
                    oof_probs[global_idx][top] = float(row_probs[top])
                else:
                    for k in nz:
                        oof_probs[global_idx][k] = float(row_probs[k])

    # --- Test-set predictions, stored sparsely (only probs >= min_prob) ---
    with torch.no_grad():
        for b in range(n_batches):
            s = b * infer_batch_size
            e = min((b+1) * infer_batch_size, n_test)
            batch_X = X_test_tensor[s:e].to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(batch_X)
                probs = sigmoid(logits).cpu().numpy()  # (bs, n_classes)
            for i in range(probs.shape[0]):
                orig_test_id = test_ids[s + i]
                row_probs = probs[i]
                keep_idx = np.where(row_probs >= min_prob)[0]
                for cls in keep_idx:
                    test_preds_probs[orig_test_id][int(cls)].append(float(row_probs[cls]))

    # Release the fold's model before the next one is trained.
    del model
    torch.cuda.empty_cache()

In [None]:
final_test_preds = []

for tid, cls_dict in test_preds_probs.items():
    agg_list = []
    for cls_idx, prob_list in cls_dict.items():
        # prob_list holds only the folds whose probability reached min_prob;
        # the other folds stored nothing. Averaging the list alone would
        # ignore those folds and inflate the score (and would make the
        # threshold check below always true). Divide by the total number of
        # folds K instead, counting each missing fold as 0.
        mean_p = float(np.sum(prob_list)) / K
        if mean_p >= min_prob:
            agg_list.append((cls_idx, mean_p))
    # Highest-probability terms first.
    agg_list.sort(key=lambda x: x[1], reverse=True)
    # Map class indices back to term names.
    preds = [(mlb.classes_[c], p) for c, p in agg_list]
    final_test_preds.append({"id": tid, "preds": preds})

In [None]:
# Write one "<id>\t<term>\t<probability>" line per prediction; proteins with
# no predictions contribute no lines.
output_file = "submission.tsv"
num_lines = 0
with open(output_file, "w") as f:
    for mp in final_test_preds:
        rows = [
            f'{mp["id"]}\t{pred}\t{round(prob, 3)}\n'
            for pred, prob in mp["preds"]
        ]
        f.writelines(rows)
        num_lines += len(rows)

print(f"Wrote {output_file} with {len(final_test_preds)} entries and {num_lines} lines (some may have zero preds skipped).")