In [None]:
!pip install Bio

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, gc, re, csv
from collections import defaultdict

import pandas as pd
from tqdm.auto import tqdm
import numpy as np

from collections import Counter
import math

from Bio import SeqIO

from lightgbm import LGBMClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import KFold
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.cuda.amp import autocast, GradScaler

from tqdm import tqdm


import math

In [None]:
# Dataset location: the Kaggle competition mount when `is_kaggle` is set,
# otherwise the current working directory.
is_kaggle = True
ROOT_DIR = '/kaggle/input/cafa-6-protein-function-prediction/' if is_kaggle else './'

In [None]:
# Single-letter residue codes, in the column order used for the per-residue features.
AA_LIST = [*"ACDEFGHIKLMNPQRSTVWY"]

# Dipeptide vocabulary: one feature column is produced per entry, in this order.
TOP_DI = "AL LA LE EA AA AS SA EL LL AE SE ES GA AG VA AV LV VL LS SL".split()

# Tripeptide vocabulary: one feature column is produced per entry, in this order.
# NOTE(review): 'VLA' is listed twice, so the two columns built from it carry the same
# value — confirm whether a different tripeptide was intended for one of them.
TOP_TRI = "ALA LEA EAL LAL AAA LLE ELE ALE GAL ASA VLA LAV SLS LSL GLA LAG AVL VLA SLE LES".split()


def compute_idf_aa(sequences):
    """Return a smoothed inverse document frequency for every residue in AA_LIST.

    Each sequence counts as one document; a residue's document frequency is the
    number of sequences that contain it at least once. Characters outside
    AA_LIST are ignored.

    Args:
        sequences: sized iterable of sequence strings.

    Returns:
        dict mapping each residue of AA_LIST to ``log((n + 1) / (df + 1)) + 1``.
    """
    total = len(sequences)
    doc_freq = Counter()

    for sequence in sequences:
        # A set, so every residue is counted at most once per sequence.
        doc_freq.update(set(sequence).intersection(AA_LIST))

    return {
        residue: math.log((total + 1) / (doc_freq[residue] + 1)) + 1
        for residue in AA_LIST
    }


def compute_idf_di(sequences):
    """Return a smoothed inverse document frequency for every dipeptide in TOP_DI.

    Each sequence counts as one document; a dipeptide's document frequency is
    the number of sequences in which it occurs (overlapping windows of width 2).

    Args:
        sequences: sized iterable of sequence strings.

    Returns:
        dict mapping each TOP_DI entry to ``log((n + 1) / (df + 1)) + 1``.
    """
    total = len(sequences)
    vocabulary = set(TOP_DI)
    doc_freq = Counter()

    for sequence in sequences:
        # Distinct dipeptides of this sequence; only vocabulary members are counted.
        seen = {sequence[pos:pos + 2] for pos in range(len(sequence) - 1)}
        doc_freq.update(seen & vocabulary)

    return {
        pair: math.log((total + 1) / (doc_freq[pair] + 1)) + 1
        for pair in TOP_DI
    }


def compute_idf_tri(sequences):
    """Return a smoothed inverse document frequency for every tripeptide in TOP_TRI.

    Each sequence counts as one document; a tripeptide's document frequency is
    the number of sequences in which it occurs (overlapping windows of width 3).

    Args:
        sequences: sized iterable of sequence strings.

    Returns:
        dict mapping each TOP_TRI entry to ``log((n + 1) / (df + 1)) + 1``.
    """
    total = len(sequences)
    vocabulary = set(TOP_TRI)
    doc_freq = Counter()

    for sequence in sequences:
        # Distinct tripeptides of this sequence; only vocabulary members are counted.
        seen = {sequence[pos:pos + 3] for pos in range(len(sequence) - 2)}
        doc_freq.update(seen & vocabulary)

    return {
        triple: math.log((total + 1) / (doc_freq[triple] + 1)) + 1
        for triple in TOP_TRI
    }


def compute_all_idf(sequences):
    """Compute the residue, dipeptide and tripeptide IDF tables for `sequences`.

    Returns:
        dict with the keys "idf_aa", "idf_di" and "idf_tri", each holding the
        mapping returned by the corresponding compute_idf_* function.
    """
    idf_aa = compute_idf_aa(sequences)
    idf_di = compute_idf_di(sequences)
    idf_tri = compute_idf_tri(sequences)
    return dict(idf_aa=idf_aa, idf_di=idf_di, idf_tri=idf_tri)

In [None]:
def extract_sequence_features_tf_idf(
    seq,
    idf_aa=None,
    idf_di=None,
    idf_tri=None
):
    """Build the fixed-length TF / TF-IDF feature vector of one sequence.

    Column layout, in order:
      * 20 residue frequencies (order "ACDEFGHIKLMNPQRSTVWY"),
      * 4 global descriptors: log1p(length), hydrophobic fraction,
        charged fraction, log1p(sum of per-residue weights),
      * 7 residue-group fractions (hydrophobic, polar, positive, negative,
        aromatic, aliphatic, small),
      * one frequency per TOP_DI entry,
      * one frequency per TOP_TRI entry.

    Args:
        seq: sequence string; ``None`` or ``""`` gives an all-zero vector.
        idf_aa: optional dict residue -> weight; when given, residue
            frequencies are multiplied by it (missing keys use 1.0).
        idf_di: optional dict dipeptide -> weight, applied the same way.
        idf_tri: optional dict tripeptide -> weight, applied the same way.

    Returns:
        1-D float numpy array of length
        ``20 + 4 + 7 + len(TOP_DI) + len(TOP_TRI)`` for every input.
    """
    aa_list = list("ACDEFGHIKLMNPQRSTVWY")
    groups = {'hydrophobic': set('AILMFWYV'),'polar': set('STNQ'),'positive': set('RK'),'negative': set('DE'),'aromatic': set('FWY'),'aliphatic': set('ILV'),'small': set('ACDGNPSTV')}
    n_basic = 4

    if not seq:
        # The zero vector must have exactly as many columns as a populated one,
        # otherwise stacking the rows into a matrix fails; derive the width from
        # the components instead of hard-coding it.
        return np.zeros(len(aa_list) + n_basic + len(groups) + len(TOP_DI) + len(TOP_TRI))

    length = len(seq)
    aa_counts = Counter(seq)

    # --- residue frequencies -------------------------------------------------
    tf_aa = np.array([aa_counts.get(a, 0) / length for a in aa_list])

    if idf_aa is not None:
        idf_vec = np.array([idf_aa.get(a, 1.0) for a in aa_list])
        tfidf_aa = tf_aa * idf_vec
    else:
        tfidf_aa = tf_aa

    # --- global descriptors --------------------------------------------------
    hydrophobic = sum(aa_counts.get(a, 0) for a in 'AILMFWYV') / length
    charged     = sum(aa_counts.get(a, 0) for a in 'DEKR') / length

    aa_weights = {'A': 89, 'C': 121, 'D': 133, 'E': 147, 'F': 165, 'G': 75, 'H': 155, 'I': 131, 'K': 146, 'L': 131, 'M': 149, 'N': 132, 'P': 115, 'Q': 146, 'R': 174, 'S': 105, 'T': 119, 'V': 117, 'W': 204, 'Y': 181}

    # Characters without an entry in aa_weights contribute 0.
    mol_weight = sum(count * aa_weights.get(a, 0) for a, count in aa_counts.items())

    basic_features = np.array([
        np.log1p(length),
        hydrophobic,
        charged,
        np.log1p(mol_weight)
    ])

    # --- residue-group fractions ---------------------------------------------
    # Summing the per-residue counts gives the same totals as scanning the
    # sequence once per group, without the repeated passes.
    ctd_features = np.array([
        sum(aa_counts[a] for a in members) / length for members in groups.values()
    ])

    # --- dipeptide frequencies (overlapping windows) -------------------------
    di_counts = Counter(seq[i:i+2] for i in range(length - 1))
    denom_di = max(length - 1, 1)  # guards the division for 1-residue sequences

    tf_di = np.array([di_counts.get(dp, 0) / denom_di for dp in TOP_DI])

    if idf_di is not None:
        idf_vec_di = np.array([idf_di.get(dp, 1.0) for dp in TOP_DI])
        tfidf_di = tf_di * idf_vec_di
    else:
        tfidf_di = tf_di

    # --- tripeptide frequencies (overlapping windows) ------------------------
    tri_counts = Counter(seq[i:i+3] for i in range(length - 2))
    denom_tri = max(length - 2, 1)  # guards the division for sequences shorter than 3

    tf_tri = np.array([tri_counts.get(tp, 0) / denom_tri for tp in TOP_TRI])

    if idf_tri is not None:
        idf_vec_tri = np.array([idf_tri.get(tp, 1.0) for tp in TOP_TRI])
        tfidf_tri = tf_tri * idf_vec_tri
    else:
        tfidf_tri = tf_tri

    return np.concatenate([
        tfidf_aa,
        basic_features,
        ctd_features,
        tfidf_di,
        tfidf_tri
    ])


In [None]:
def build_feature_matrix(seqs, idf_aa, idf_di, idf_tri):
    """Stack the feature vector of every sequence in `seqs` into one array.

    Args:
        seqs: iterable of sequence strings.
        idf_aa, idf_di, idf_tri: IDF tables forwarded unchanged to
            extract_sequence_features_tf_idf.

    Returns:
        numpy array with one row per sequence, in input order.
    """
    rows = []
    for sequence in seqs:
        rows.append(extract_sequence_features_tf_idf(sequence, idf_aa, idf_di, idf_tri))
    return np.array(rows)

In [None]:
# Dataset location used by the loading cell below: the Kaggle competition mount
# when `is_kaggle` is set, otherwise the current working directory.
is_kaggle = True
ROOT_DIR = '/kaggle/input/cafa-6-protein-function-prediction/' if is_kaggle else './'

In [None]:
# --- Annotations: EntryID -> list of terms ----------------------------------
terms_df = pd.read_csv(os.path.join(ROOT_DIR, "Train/train_terms.tsv"), sep="\t", usecols=["EntryID", "term"])
train_annotations = terms_df.groupby("EntryID")["term"].apply(list).to_dict()
# Same mapping under the second name used below; grouping once is enough.
terms_to_answer = train_annotations

train_sq = []
train_answer = []

cnt = 0  # training records skipped because the taxon or the annotations were missing

# --- Training sequences ------------------------------------------------------
for record in SeqIO.parse(os.path.join(ROOT_DIR, "Train/train_sequences.fasta"), "fasta"):
    # Use the middle field of "db|ACCESSION|name" style ids; other ids are used as they are.
    clean_id = record.id.split("|")[1] if "|" in record.id else record.id

    # The taxon id is the token that follows "OX=" in the header; records without it are skipped.
    ox_parts = record.description.split("OX=")
    if len(ox_parts) < 2:
        cnt += 1
        continue
    tax = ox_parts[1].split(" ")[0]

    if not clean_id:
        print("123")
        continue

    # A sequence without any annotation has no label to train on, so skip it
    # (and count it) instead of failing with a KeyError.
    answer = terms_to_answer.get(clean_id)
    if answer is None:
        cnt += 1
        continue

    train_sq.append({
        "id": clean_id,
        "tax": tax,
        "seq": str(record.seq),
        "answer": answer
    })

if cnt:
    print(f"Skipped {cnt} training records (no 'OX=' field or no annotations).")

# --- Test sequences ----------------------------------------------------------
test_sq = []

for record in SeqIO.parse(os.path.join(ROOT_DIR, "Test/testsuperset.fasta"), "fasta"):
    # The second space-separated header token is taken as the taxon; fall back to ""
    # rather than aborting the whole load when a header has no such token.
    header_fields = record.description.split(" ")
    tax = header_fields[1] if len(header_fields) > 1 else ""
    test_sq.append({
        "id": record.id,
        "tax": tax,
        "seq": str(record.seq)
    })

test_df = pd.DataFrame(test_sq)
train_df = pd.DataFrame(train_sq)

In [None]:
# List of two-letter strings (dipeptide-like tokens); its length is printed below.
# NOTE(review): no other cell shown here reads `features` — the feature extractor uses
# TOP_DI instead — so this looks like a leftover vocabulary; confirm before relying on it.
features = ['LL', 'SS', 'SL', 'LS', 'AA', 'LA', 'AL', 'EE', 'LE', 'VL', 'EL', 'LV', 'LG', 'GL', 'LK', 'AS', 'GS', 'LR', 'SG', 'KL', 'GG', 'LP', 'TL', 'SA', 'IL', 'RL', 'LT', 'DL', 'LD', 'EK', 'EA', 'AV', 'KK', 'VS', 'AG', 'SV', 'PS', 'SP', 'VA', 'LQ', 'LI', 'KE', 'ST', 'TS', 'PL', 'SE', 'AE', 'GA', 'VV', 'PP', 'SK', 'IS', 'QL', 'FL', 'ES', 'SR', 'ED', 'KS', 'NL', 'KA', 'SI', 'GV', 'DS', 'PA', 'VE', 'TA', 'AT', 'EV', 'SD', 'PG', 'RS', 'RR', 'GK',
            'DE', 'GE', 'TV', 'AK', 'VG', 'PE', 'TG', 'EG', 'AR', 'ER', 'AP', 'GT', 'TT', 'RE', 'EI', 'IA', 'NS', 'DG', 'GR', 'RK', 'FS', 'PV', 'DV', 'VD', 'SN', 'VI', 'ET', 'VP', 'TP', 'RG', 'GD', 'SQ', 'AD', 'DD', 'SF', 'KI', 'KT', 'GP', 'RV', 'IE', 'QQ', 'IG', 'VR', 'QA', 'TI', 'IP', 'KP', 'DK', 'KN', 'RI', 'DP', 'GF', 'GN', 'IR', 'EP', 'PD', 'NV', 'RP', 'QK', 'NE', 'HL', 'FV', 'GQ', 'DR', 'DF', 'IF', 'SY', 'RF', 'GY', 'TQ', 'FP', 'HP', 'VC', 'GC', 'GW']

print(len(features))

In [None]:
# Fit the IDF tables on the training sequences, then featurise the training set with them.
train_sequences = train_df["seq"]
idf_all = compute_all_idf(train_sequences)
idf_aa, idf_di, idf_tri = (idf_all[key] for key in ("idf_aa", "idf_di", "idf_tri"))

X_train = build_feature_matrix(train_sequences, idf_aa, idf_di, idf_tri)

In [None]:
# Featurise the test sequences with the IDF tables fitted on the training set.
X_test = build_feature_matrix(
    seqs=test_df["seq"],
    idf_aa=idf_aa,
    idf_di=idf_di,
    idf_tri=idf_tri,
)


In [None]:
# Display both matrix shapes; the second dimension (feature count) should agree.
tuple(matrix.shape for matrix in (X_train, X_test))

In [None]:
# Turn each protein's list of terms into one multi-hot row (one column per distinct term);
# `mlb.classes_` later maps column indices back to term names.
label_lists = train_df["answer"]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(label_lists)
print(y_train.shape)

In [None]:
class ProteinDataset(Dataset):
    """In-memory dataset pairing feature rows with multi-hot label rows.

    Features are stored as float16 when CUDA is available and float32
    otherwise; labels are always stored as float32.
    """

    def __init__(self, X, Y):
        feature_dtype = torch.float32
        if torch.cuda.is_available():
            feature_dtype = torch.float16
        self.X = torch.tensor(X, dtype=feature_dtype)
        self.Y = torch.tensor(Y, dtype=torch.float32)

    def __len__(self):
        # Number of samples = number of feature rows.
        return len(self.X)

    def __getitem__(self, i):
        sample = (self.X[i], self.Y[i])
        return sample

class MLP(nn.Module):
    """Feed-forward network: input -> 1024 -> 512 -> output logits.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); the
    final Linear layer returns raw logits (no activation).

    Args:
        input_dim: number of input features.
        output_dim: number of output logits; must be given by the caller.
    """

    def __init__(self, input_dim=1280, output_dim=None):
        super().__init__()
        layers = []
        in_features = input_dim
        # Layers are appended in a fixed order so the module indices inside
        # `net` (and therefore the state-dict keys) stay net.0, net.1, ... net.8.
        for hidden_units in (1024, 512):
            layers.append(nn.Linear(in_features, hidden_units))
            layers.append(nn.BatchNorm1d(hidden_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))
            in_features = hidden_units
        layers.append(nn.Linear(in_features, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits

In [None]:
# --- Training configuration ---------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 128
epochs = 30
lr = 1e-3

# Seed torch's global RNG so the train/validation split, the weight
# initialisation and the shuffling order are the same on every run.
torch.manual_seed(42)

# --- Data: hold out 10% of the samples for validation ------------------------
dataset = ProteinDataset(X_train, y_train)
val_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)

# --- Model, loss, optimiser ---------------------------------------------------
model = MLP(input_dim=X_train.shape[1], output_dim=y_train.shape[1]).to(device)

# The model outputs raw logits; this loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
# Halve the learning rate after `patience` epochs without a lower monitored value.
# The deprecated `verbose` argument is intentionally not passed.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2
)
# Loss scaling is only meaningful for CUDA mixed precision; disable it explicitly on CPU.
scaler = GradScaler(enabled=(device == "cuda"))



In [None]:
# The checkpoint is named "...best...", so keep the weights of the epoch with the
# lowest validation loss rather than whatever the last epoch produced.
best_val_loss = float("inf")
checkpoint_saved = False

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0

    for Xb, Yb in train_loader:
        Xb, Yb = Xb.to(device), Yb.to(device)

        optimizer.zero_grad()

        with autocast():
            logits = model(Xb)
            loss = criterion(logits, Yb)

        # Scaled backward pass + optimiser step through the gradient scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight each batch loss by its batch size so the epoch figure is a per-sample average.
        train_loss += loss.item() * Xb.size(0)

    train_loss /= len(train_loader.dataset)

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for Xb, Yb in val_loader:
            Xb, Yb = Xb.to(device), Yb.to(device)
            with autocast():
                logits = model(Xb)
                loss = criterion(logits, Yb)
            val_loss += loss.item() * Xb.size(0)

    val_loss /= len(val_loader.dataset)
    # The scheduler monitors the validation loss.
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.6f} | Val Loss: {val_loss:.6f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "protein_mlp_best.pt")
        checkpoint_saved = True

# Fallback: if no epoch ever improved (e.g. zero epochs or a NaN validation loss),
# still write the current weights so the file the inference cells load exists.
if not checkpoint_saved:
    torch.save(model.state_dict(), "protein_mlp_best.pt")

In [None]:
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
import math
import numpy as np
from tqdm.auto import tqdm


In [None]:
# Inference settings: batch size, minimum probability for a term to be kept,
# and an optional cap on the number of terms per protein (None = no cap).
infer_batch_size, min_prob, top_k = 32, 0.02, None

model_path = "protein_mlp_best.pt"

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [None]:
# Rebuild the network with the training dimensions, load the saved weights and
# switch to evaluation mode (the last expression also displays the module).
n_features, n_labels = X_train.shape[1], y_train.shape[1]
model_inf = MLP(input_dim=n_features, output_dim=n_labels)
model_inf = model_inf.to(device)

state = torch.load(model_path, map_location=device)
model_inf.load_state_dict(state)

model_inf.eval()

In [None]:
def clean_id_for_matching(s):
    """Return the second "|"-separated field of `s`, or `s` itself if it has no "|".

    Example: "sp|P12345|NAME" -> "P12345"; "P12345" -> "P12345".
    """
    _head, separator, remainder = s.partition("|")
    if not separator:
        return s
    # Everything up to the next "|" (or the end) is the second field.
    return remainder.split("|", 1)[0]

# Cleaned training id -> list of terms for that protein (later duplicates overwrite earlier ones).
train_id_to_terms = {
    clean_id_for_matching(protein_id): terms
    for protein_id, terms in zip(train_df["id"], train_df["answer"])
}

In [None]:
# Same dtype rule as the training dataset: float16 with CUDA, float32 without.
inference_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
X_test_tensor = torch.tensor(X_test, dtype=inference_dtype)

In [None]:
# Batch bookkeeping for the inference loop.
n_test = X_test_tensor.size(0)
n_batches = math.ceil(n_test / infer_batch_size)

# Collected per-protein predictions, and the activation that maps logits to probabilities.
all_probs = list()
sigmoid = nn.Sigmoid()

In [None]:
# Start from an empty list so re-running this cell does not append a second copy
# of every prediction.
all_probs = []
# Read the ids once; indexing a plain list is far cheaper than a per-row
# `test_df.iloc[...]` lookup inside the loop.
test_ids = test_df["id"].tolist()

with torch.no_grad():
    for i in tqdm(range(n_batches), desc="Inference Batches"):
        start = i * infer_batch_size
        end = min(start + infer_batch_size, n_test)
        batch_X = X_test_tensor[start:end].to(device)

        with autocast():
            logits = model_inf(batch_X)
            probs = sigmoid(logits)

        probs = probs.cpu().numpy()

        for j, row_probs in enumerate(probs):
            # Keep the labels at or above the threshold, highest probability first.
            ge_idx = np.where(row_probs >= min_prob)[0]
            sorted_idx = ge_idx[np.argsort(row_probs[ge_idx])[::-1]]
            # Honour the optional per-protein cap from the config cell (None = keep all).
            if top_k is not None:
                sorted_idx = sorted_idx[:top_k]
            selected = [(mlb.classes_[k], float(row_probs[k])) for k in sorted_idx]

            all_probs.append({"id": test_ids[start + j], "preds": selected})


In [None]:
# Write one "<id>\t<term>\t<probability rounded to 3 decimals>" line per kept prediction;
# proteins without any prediction contribute no lines.
output_file = "submission.tsv"
num_lines = 0
with open(output_file, "w") as f:
    for entry in all_probs:
        rows = [
            f'{entry["id"]}\t{term}\t{round(score, 3)}\n'
            for term, score in entry["preds"]
        ]
        f.writelines(rows)
        num_lines += len(rows)

print(f"Wrote {output_file} with {len(all_probs)} entries and {num_lines} lines (some may have zero preds skipped).")