In [1]:
# Settings for the pipeline: label-space size, KNN neighbourhood size, the two
# blend weights, and the input file locations.
CONFIG = dict(
    n_terms=5000,
    knn_k=10,
    knn_weight=0.4,
    model_weight=0.6,
    paths=dict(
        train_emb="/kaggle/input/esm-dataset/train_embeds.npy",
        train_ids="/kaggle/input/esm-dataset/train_ids.npy",
        test_emb="/kaggle/input/esm-dataset/test_embeds.npy",
        test_ids="/kaggle/input/esm-dataset/test_ids.npy",
        train_terms="/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
        train_tax="/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
        test_tax="/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv",
        go_obo="/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo",
    ),
)


In [2]:
# === SENIOR SCIENTIST PIPELINE: TARGET 0.30+ ===
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import gc

# ============ CONFIG ============
# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# Settings read by the rest of this cell.
CONFIG = dict(
    n_terms=5000,       # INCREASED from 1500
    knn_k=10,           # Number of neighbors
    knn_weight=0.4,     # Blend weight for KNN
    model_weight=0.6,
    paths=dict(
        train_emb="/kaggle/input/esm-dataset/train_embeds.npy",
        train_ids="/kaggle/input/esm-dataset/train_ids.npy",
        test_emb="/kaggle/input/esm-dataset/test_embeds.npy",
        test_ids="/kaggle/input/esm-dataset/test_ids.npy",
        train_terms="/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
        train_tax="/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
        test_tax="/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv",
        go_obo="/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo",
    ),
)

# ============ 1. LOAD DATA ============
print("\n=== 1. Loading Data ===")
# Embedding matrices are cast to float32; the ID arrays are used as stored.
train_emb = np.load(CONFIG["paths"]["train_emb"]).astype(np.float32)
train_ids = np.load(CONFIG["paths"]["train_ids"])
test_emb = np.load(CONFIG["paths"]["test_emb"]).astype(np.float32)
test_ids = np.load(CONFIG["paths"]["test_ids"])

print(f"Train: {train_emb.shape}, Test: {test_emb.shape}")

# Row-wise L2 normalisation; these copies feed the cosine KNN search.
print("Normalizing embeddings...")
train_emb_norm = normalize(train_emb, norm="l2", axis=1)
test_emb_norm = normalize(test_emb, norm="l2", axis=1)

# Per-feature standardisation for the neural network. The statistics come from
# the training matrix only and are applied to both splits.
mean = np.mean(train_emb, axis=0)
std = np.std(train_emb, axis=0) + 1e-6
train_emb_scaled = np.divide(train_emb - mean, std)
test_emb_scaled = np.divide(test_emb - mean, std)

# Annotation table, read as three positional columns (no header row is consumed).
print("Loading terms...")
terms_df = pd.read_csv(
    CONFIG["paths"]["train_terms"],
    sep="\t",
    header=None,
    names=["id", "term", "aspect"],
)
# Row position of every training ID inside train_emb / train_ids.
id_to_idx = dict(zip(train_ids, range(len(train_ids))))

# ============ 2. KNN INDEX ============
print("\n=== 2. Building KNN Index ===")
# Brute-force cosine-distance index over the L2-normalised training embeddings.
knn_model = NearestNeighbors(
    n_neighbors=CONFIG["knn_k"],
    metric="cosine",
    algorithm="brute",
    n_jobs=-1,
)
knn_model.fit(train_emb_norm)
print("KNN Index Ready.")

print("Finding neighbors for test set...")
# kneighbors returns (distances, indices); distances are turned into
# similarities by subtracting from one.
distances, indices = knn_model.kneighbors(test_emb_norm, return_distance=True)
similarities = np.subtract(1, distances)
print(f"KNN search complete. Shape: {indices.shape}")

# ============ 3. TAXONOMY ============
print("\n=== 3. Preparing Taxonomy ===")
# Both tables are read as two positional columns; the taxon column is cast to
# str so train and test values compare as the same type.
train_tax_df = pd.read_csv(
    CONFIG["paths"]["train_tax"], sep="\t", header=None, names=["id", "tax_id"]
).assign(tax_id=lambda frame: frame["tax_id"].astype(str))
test_tax_df = pd.read_csv(
    CONFIG["paths"]["test_tax"], sep="\t", header=None, names=["id", "tax_id"]
).assign(tax_id=lambda frame: frame["tax_id"].astype(str))

# One embedding slot per distinct taxon string found in either table, numbered
# in sorted order.
all_taxons = set(train_tax_df["tax_id"]) | set(test_tax_df["tax_id"])
tax_list = sorted(all_taxons)
tax_map = dict(zip(tax_list, range(len(tax_list))))
num_taxons = len(tax_list)


def get_tax_indices(id_list, df):
    """Return the taxon slot of every ID in ``id_list`` as an int32 array.

    ``df`` supplies the ``id`` -> ``tax_id`` lookup; ``tax_map`` (module level)
    turns the taxon string into a slot number.

    NOTE(review): an ID missing from ``df`` is looked up as taxon "0", and any
    taxon missing from ``tax_map`` resolves to slot 0 -- the same slot as the
    first taxon in sorted order. Confirm every ID is expected to be present.
    """
    taxon_of = dict(zip(df["id"], df["tax_id"]))
    slots = []
    for pid in id_list:
        taxon = taxon_of.get(pid, "0")
        slots.append(tax_map.get(taxon, 0))
    return np.array(slots, dtype=np.int32)


train_tax_idx = get_tax_indices(train_ids, train_tax_df)
test_tax_idx = get_tax_indices(test_ids, test_tax_df)
print(f"Taxonomy Ready. {num_taxons} species.")

# ============ 4. MODEL DEFINITION ============
class DeepTaxModel(nn.Module):
    """MLP over a feature vector concatenated with a learned taxon embedding.

    Args:
        n_feat: Width of the per-sample feature vector passed as ``x``.
        n_taxons: Number of rows in the taxon embedding table.
        n_class: Number of output logits.
    """

    def __init__(self, n_feat, n_taxons, n_class):
        super().__init__()
        tax_dim = 128
        self.tax_emb = nn.Embedding(n_taxons, tax_dim)

        # Layers are created in network order and unpacked into a Sequential.
        layers = [
            nn.Linear(n_feat + tax_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, n_class),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x, t):
        """Return logits for features ``x`` and taxon indices ``t``."""
        return self.net(torch.cat((x, self.tax_emb(t)), dim=1))

class MultiInputData(Dataset):
    """Dataset over NumPy features, taxon indices and optional labels.

    Args:
        X: NumPy array of features, wrapped with ``torch.from_numpy``.
        T: NumPy array of taxon indices, converted to int64.
        y: Optional NumPy array of labels; omit for inference.
    """

    def __init__(self, X, T, y=None):
        self.X = torch.from_numpy(X)
        self.T = torch.from_numpy(T).long()
        self.y = None if y is None else torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # Two-tuple without labels, three-tuple with them.
        if self.y is None:
            return (self.X[i], self.T[i])
        return (self.X[i], self.T[i], self.y[i])

# ============ 5. TRAINING FUNCTION ============
def train_aspect(aspect_char, aspect_name):
    """Fit and save a DeepTaxModel for one aspect of the term table.

    The label space is the ``CONFIG["n_terms"]`` most frequent terms whose
    ``aspect`` column equals ``aspect_char``. A multi-hot matrix with one row
    per entry of ``train_ids`` is built from ``terms_df``, the model is trained
    for 15 epochs, and its weights go to ``model_senior_{aspect_char}.pth``.

    Args:
        aspect_char: Value matched against ``terms_df['aspect']``.
        aspect_name: Only used in the progress message.

    Returns:
        ``(top_terms, term_map, label_matrix)``: the kept terms, each term's
        column index, and the float32 multi-hot label matrix.
    """
    print(f"\n>>> Training for {aspect_name} ({aspect_char})...")

    # Restrict to this aspect and keep its most frequent terms.
    aspect_terms = terms_df.loc[terms_df['aspect'] == aspect_char]
    term_counts = aspect_terms['term'].value_counts()
    top_terms = term_counts.index[:CONFIG["n_terms"]].tolist()
    term_map = dict(zip(top_terms, range(len(top_terms))))
    num_classes = len(top_terms)
    print(f"Using {num_classes} GO terms")

    # One row per training ID, one column per kept term.
    label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]
    id_term_pairs = zip(relevant['id'], relevant['term'])
    for pid, term in tqdm(id_term_pairs, total=len(relevant), desc="Labels"):
        row_idx = id_to_idx.get(pid)
        if row_idx is not None:  # annotations for IDs outside train_ids are skipped
            label_matrix[row_idx, term_map[term]] = 1.0

    ds = MultiInputData(train_emb_scaled, train_tax_idx, label_matrix)
    loader = DataLoader(ds, batch_size=512, shuffle=True, num_workers=2)

    n_epochs = 15
    model = DeepTaxModel(1280, num_taxons, num_classes).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=n_epochs)
    loss_fn = nn.BCEWithLogitsLoss()

    for epoch_no in range(1, n_epochs + 1):
        model.train()
        total_loss = 0
        for x, t, y in tqdm(loader, desc=f"Epoch {epoch_no}", leave=False):
            x = x.to(device)
            t = t.to(device)
            y = y.to(device)
            opt.zero_grad()
            logits = model(x, t)
            loss = loss_fn(logits, y)
            loss.backward()
            # Gradient norm is clipped to 1.0 before every optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            total_loss += loss.item()
        scheduler.step()  # learning rate is stepped once per epoch
        print(f"Epoch {epoch_no} Loss: {total_loss/len(loader):.4f}")

    torch.save(model.state_dict(), f"model_senior_{aspect_char}.pth")
    return top_terms, term_map, label_matrix

# ============ 6. EXECUTE TRAINING ============
# Train one model per aspect and keep what the prediction stage reads back.
results = {}
for aspect_char, aspect_name in (('F', 'Function'), ('P', 'Process'), ('C', 'Component')):
    top_terms, term_map, label_matrix = train_aspect(aspect_char, aspect_name)
    results[aspect_char] = dict(
        terms=top_terms,
        term_map=term_map,
        labels=label_matrix,
    )

print("\n✅ All Models Trained!")

# ============ 7. PREDICTION WITH KNN BLEND ============
print("\n=== 7. Generating Blended Predictions ===")

# For every aspect: score the test set with the saved network, score it again
# from the labels of its nearest training neighbours, blend the two with the
# CONFIG weights, and append the best rows to the submission file.
top_k = 100        # at most this many terms are written per protein and aspect
min_score = 0.005  # blended scores at or below this are not written

with open("submission_senior.tsv", "w") as f:

    for aspect_char in ['F', 'P', 'C']:
        print(f"\nPredicting {aspect_char}...")
        terms = results[aspect_char]['terms']
        term_map = results[aspect_char]['term_map']
        label_matrix = results[aspect_char]['labels']
        num_classes = len(terms)

        model = DeepTaxModel(1280, num_taxons, num_classes).to(device)
        # map_location keeps the load working when the checkpoint was written
        # on a different device than the one in use now.
        model.load_state_dict(
            torch.load(f"model_senior_{aspect_char}.pth", map_location=device)
        )
        model.eval()

        ds = MultiInputData(test_emb_scaled, test_tax_idx)
        loader = DataLoader(ds, batch_size=1024)

        nn_preds = []
        with torch.no_grad():
            for x, t in tqdm(loader, desc="NN Pred"):
                x, t = x.to(device), t.to(device)
                probs = torch.sigmoid(model(x, t)).cpu().numpy()
                nn_preds.append(probs)
        nn_preds = np.vstack(nn_preds)

        print("Computing KNN predictions...")
        knn_preds = np.zeros((len(test_ids), num_classes), dtype=np.float32)

        for i in tqdm(range(len(test_ids)), desc="KNN Pred"):
            neighbor_indices = indices[i]
            neighbor_sims = similarities[i]

            # Similarity-weighted average of the neighbours' label rows.
            for n_idx, sim in zip(neighbor_indices, neighbor_sims):
                knn_preds[i] += sim * label_matrix[n_idx]

            knn_preds[i] /= (neighbor_sims.sum() + 1e-8)

        # BLEND
        final_preds = (CONFIG["model_weight"] * nn_preds) + (CONFIG["knn_weight"] * knn_preds)

        print(f"Writing {aspect_char} predictions...")
        for i, pid in enumerate(tqdm(test_ids)):
            row = final_preds[i]
            # argpartition raises when asked for more elements than the row
            # holds, so an aspect with fewer than top_k terms keeps every column.
            if num_classes >= top_k:
                top_idx = np.argpartition(row, -top_k)[-top_k:]
            else:
                top_idx = np.arange(num_classes)

            for idx in top_idx:
                score = row[idx]
                if score > min_score:
                    f.write(f"{pid}\t{terms[idx]}\t{score:.4f}\n")

        # Release the per-aspect buffers before the next aspect is scored.
        del model, nn_preds, knn_preds, final_preds
        torch.cuda.empty_cache()
        gc.collect()

print("\n✅ submission_senior.tsv created!")


Device: cpu

=== 1. Loading Data ===
Train: (82404, 1280), Test: (224309, 1280)
Normalizing embeddings...
Loading terms...

=== 2. Building KNN Index ===
KNN Index Ready.
Finding neighbors for test set...
KNN search complete. Shape: (224309, 10)

=== 3. Preparing Taxonomy ===
Taxonomy Ready. 9835 species.

>>> Training for Function (F)...
Using 5000 GO terms


Labels: 100%|██████████| 126836/126836 [00:04<00:00, 28658.09it/s]
                                                          

Epoch 1 Loss: 0.0199


                                                          

Epoch 2 Loss: 0.0020


                                                          

Epoch 3 Loss: 0.0019


                                                          

Epoch 4 Loss: 0.0018


                                                          

Epoch 5 Loss: 0.0017


                                                          

Epoch 6 Loss: 0.0016


                                                          

Epoch 7 Loss: 0.0016


                                                          

Epoch 8 Loss: 0.0015


                                                          

Epoch 9 Loss: 0.0015


                                                           

Epoch 10 Loss: 0.0014


                                                           

Epoch 11 Loss: 0.0014


                                                           

Epoch 12 Loss: 0.0014


                                                           

Epoch 13 Loss: 0.0014


                                                           

Epoch 14 Loss: 0.0013


                                                           

Epoch 15 Loss: 0.0013

>>> Training for Process (P)...
Using 5000 GO terms


Labels: 100%|██████████| 208567/208567 [00:07<00:00, 29131.93it/s]
                                                          

Epoch 1 Loss: 0.0226


                                                          

Epoch 2 Loss: 0.0043


                                                          

Epoch 3 Loss: 0.0040


                                                          

Epoch 4 Loss: 0.0039


                                                          

Epoch 5 Loss: 0.0037


                                                          

Epoch 6 Loss: 0.0036


                                                          

Epoch 7 Loss: 0.0035


                                                          

Epoch 8 Loss: 0.0034


                                                          

Epoch 9 Loss: 0.0033


                                                           

Epoch 10 Loss: 0.0032


                                                           

Epoch 11 Loss: 0.0032


                                                           

Epoch 12 Loss: 0.0031


                                                           

Epoch 13 Loss: 0.0031


                                                           

Epoch 14 Loss: 0.0031


                                                           

Epoch 15 Loss: 0.0031

>>> Training for Component (C)...
Using 2651 GO terms


Labels: 100%|██████████| 157770/157770 [00:05<00:00, 28843.96it/s]
                                                          

Epoch 1 Loss: 0.0214


                                                          

Epoch 2 Loss: 0.0037


                                                          

Epoch 3 Loss: 0.0033


                                                          

Epoch 4 Loss: 0.0031


                                                          

Epoch 5 Loss: 0.0030


                                                          

Epoch 6 Loss: 0.0029


                                                          

Epoch 7 Loss: 0.0028


                                                          

Epoch 8 Loss: 0.0027


                                                          

Epoch 9 Loss: 0.0027


                                                           

Epoch 10 Loss: 0.0026


                                                           

Epoch 11 Loss: 0.0026


                                                           

Epoch 12 Loss: 0.0025


                                                           

Epoch 13 Loss: 0.0025


                                                           

Epoch 14 Loss: 0.0025


                                                           

Epoch 15 Loss: 0.0025

✅ All Models Trained!

=== 7. Generating Blended Predictions ===

Predicting F...


NN Pred: 100%|██████████| 220/220 [00:03<00:00, 62.31it/s]


Computing KNN predictions...


KNN Pred: 100%|██████████| 224309/224309 [00:11<00:00, 20117.13it/s]


Writing F predictions...


100%|██████████| 224309/224309 [00:11<00:00, 18999.86it/s]



Predicting P...


NN Pred: 100%|██████████| 220/220 [00:03<00:00, 58.56it/s]


Computing KNN predictions...


KNN Pred: 100%|██████████| 224309/224309 [00:10<00:00, 20484.54it/s]


Writing P predictions...


100%|██████████| 224309/224309 [00:12<00:00, 18470.82it/s]



Predicting C...


NN Pred: 100%|██████████| 220/220 [00:03<00:00, 66.48it/s]


Computing KNN predictions...


KNN Pred: 100%|██████████| 224309/224309 [00:09<00:00, 23200.38it/s]


Writing C predictions...


100%|██████████| 224309/224309 [00:11<00:00, 19201.00it/s]



✅ submission_senior.tsv created!


In [3]:
# --- GRAPH PROPAGATION ---
!pip install obonet networkx -q

import obonet
from tqdm import tqdm

print("Loading Graph...")
graph = obonet.read_obo(CONFIG["paths"]["go_obo"])

# parent_map[node] lists the targets of the node's outgoing edges.
# NOTE(review): successors() does not filter by edge key, so every relationship
# type that obonet loads as an edge is followed here -- confirm that is the
# intended propagation rule.
parent_map = {}
for node in graph.nodes():
    parents = [p for p in graph.successors(node) if p in graph.nodes()]
    if parents:
        parent_map[node] = parents


def _propagate_to_ancestors(preds, parent_map):
    """Return a copy of ``preds`` where no ancestor scores below a descendant.

    Every term's score is pushed along ``parent_map`` and a parent takes the
    maximum of its own score and the scores reaching it. A parent is put back
    on the worklist each time its score is raised, so the raise also reaches
    the parent's own ancestors regardless of the order the terms are visited.

    Args:
        preds: Mapping of term -> score for one protein.
        parent_map: Mapping of term -> list of parent terms.

    Returns:
        New dict of term -> score containing the original terms plus every
        ancestor that received a score.
    """
    final_scores = dict(preds)
    # Ascending sort + pop() visits the highest-scoring terms first, so most
    # ancestors get their final value on the first visit.
    pending = sorted(preds, key=preds.get)
    while pending:
        term = pending.pop()
        score = final_scores[term]
        for parent in parent_map.get(term, ()):
            if score > final_scores.get(parent, 0.0):
                final_scores[parent] = score
                pending.append(parent)
    return final_scores


print("Loading predictions...")
# submission_dict[pid][term] = score, one inner dict per protein.
submission_dict = {}
with open("submission_senior.tsv", "r") as f:
    for line in tqdm(f):
        parts = line.strip().split("\t")
        if len(parts) == 3:  # anything that is not exactly three fields is ignored
            pid, term, score = parts
            submission_dict.setdefault(pid, {})[term] = float(score)

print("Propagating...")
with open("submission_final.tsv", "w") as f_out:
    for pid, preds in tqdm(submission_dict.items()):
        final_scores = _propagate_to_ancestors(preds, parent_map)

        # Keep the 100 best-scoring terms per protein, then drop scores at or
        # below 0.001.
        sorted_terms = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:100]
        for term, score in sorted_terms:
            if score > 0.001:
                f_out.write(f"{pid}\t{term}\t{score:.4f}\n")

print("✅ submission_final.tsv ready! Rename to submission.tsv and submit.")


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Loading Graph...
Loading predictions...


10310571it [00:09, 1071658.36it/s]


Propagating...


100%|██████████| 224309/224309 [00:48<00:00, 4604.41it/s]

✅ submission_final.tsv ready! Rename to submission.tsv and submit.





In [4]:
# === OPTIMIZED PIPELINE (NO KNN) ===
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import gc

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# Settings read by the rest of this cell.
CONFIG = dict(
    n_terms=3000,  # Sweet spot between coverage and sparsity
    paths=dict(
        train_emb="/kaggle/input/esm-dataset/train_embeds.npy",
        train_ids="/kaggle/input/esm-dataset/train_ids.npy",
        test_emb="/kaggle/input/esm-dataset/test_embeds.npy",
        test_ids="/kaggle/input/esm-dataset/test_ids.npy",
        train_terms="/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
        train_tax="/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
        test_tax="/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv",
        go_obo="/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo",
    ),
)

# ============ LOAD DATA ============
print("Loading data...")
# Embedding matrices are cast to float32; the ID arrays are used as stored.
train_emb = np.load(CONFIG["paths"]["train_emb"]).astype(np.float32)
train_ids = np.load(CONFIG["paths"]["train_ids"])
test_emb = np.load(CONFIG["paths"]["test_emb"]).astype(np.float32)
test_ids = np.load(CONFIG["paths"]["test_ids"])

# Standard scale: statistics come from the training matrix only and are applied
# to both splits. The raw arrays are replaced by their scaled versions.
mean = np.mean(train_emb, axis=0)
std = np.std(train_emb, axis=0) + 1e-6
train_emb = np.divide(train_emb - mean, std)
test_emb = np.divide(test_emb - mean, std)

# Annotation table, read as three positional columns (no header row is consumed).
terms_df = pd.read_csv(
    CONFIG["paths"]["train_terms"],
    sep="\t",
    header=None,
    names=["id", "term", "aspect"],
)
# Row position of every training ID inside train_emb / train_ids.
id_to_idx = dict(zip(train_ids, range(len(train_ids))))

# Taxonomy
# Both tables are read as two positional columns; the taxon column is cast to
# str so train and test values compare as the same type.
train_tax_df = pd.read_csv(
    CONFIG["paths"]["train_tax"], sep="\t", header=None, names=["id", "tax_id"]
).assign(tax_id=lambda frame: frame["tax_id"].astype(str))
test_tax_df = pd.read_csv(
    CONFIG["paths"]["test_tax"], sep="\t", header=None, names=["id", "tax_id"]
).assign(tax_id=lambda frame: frame["tax_id"].astype(str))

# One embedding slot per distinct taxon string, numbered in sorted order.
all_taxons = set(train_tax_df["tax_id"]) | set(test_tax_df["tax_id"])
tax_map = dict(zip(sorted(all_taxons), range(len(all_taxons))))
num_taxons = len(tax_map)


def get_tax_indices(id_list, df):
    """Return the taxon slot of every ID in ``id_list`` as an int32 array.

    ``df`` supplies the ``id`` -> ``tax_id`` lookup; ``tax_map`` (module level)
    turns the taxon string into a slot number.

    NOTE(review): an ID missing from ``df`` is looked up as taxon "0", and any
    taxon missing from ``tax_map`` resolves to slot 0 -- the same slot as the
    first taxon in sorted order. Confirm every ID is expected to be present.
    """
    taxon_of = dict(zip(df["id"], df["tax_id"]))
    slots = []
    for pid in id_list:
        taxon = taxon_of.get(pid, "0")
        slots.append(tax_map.get(taxon, 0))
    return np.array(slots, dtype=np.int32)


train_tax_idx = get_tax_indices(train_ids, train_tax_df)
test_tax_idx = get_tax_indices(test_ids, test_tax_df)

# ============ OPTIMIZED MODEL ============
class OptimizedModel(nn.Module):
    """Two-hidden-layer MLP over features concatenated with a taxon embedding.

    Args:
        n_feat: Width of the per-sample feature vector passed as ``x``.
        n_taxons: Number of rows in the taxon embedding table.
        n_class: Number of output logits.
    """

    def __init__(self, n_feat, n_taxons, n_class):
        super().__init__()
        tax_dim = 64  # Smaller tax embedding
        self.tax_emb = nn.Embedding(n_taxons, tax_dim)

        # Layers are created in network order and unpacked into a Sequential.
        layers = [
            nn.Linear(n_feat + tax_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),  # Higher dropout for generalization
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, n_class),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x, t):
        """Return logits for features ``x`` and taxon indices ``t``."""
        joined = torch.cat((x, self.tax_emb(t)), dim=1)
        return self.net(joined)

class MultiInputData(Dataset):
    """Dataset over NumPy features, taxon indices and optional labels.

    Args:
        X: NumPy array of features, wrapped with ``torch.from_numpy``.
        T: NumPy array of taxon indices, converted to int64.
        y: Optional NumPy array of labels; omit for inference.
    """

    def __init__(self, X, T, y=None):
        self.X = torch.from_numpy(X)
        self.T = torch.from_numpy(T).long()
        self.y = None if y is None else torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # Two-tuple without labels, three-tuple with them.
        if self.y is None:
            return (self.X[i], self.T[i])
        return (self.X[i], self.T[i], self.y[i])

# ============ TRAINING ============
def train_aspect(aspect_char, aspect_name):
    """Fit and save an OptimizedModel for one aspect of the term table.

    The label space is the ``CONFIG["n_terms"]`` most frequent terms whose
    ``aspect`` column equals ``aspect_char``. A multi-hot matrix with one row
    per entry of ``train_ids`` is built from ``terms_df``, the model is trained
    for 12 epochs, and its weights go to ``model_opt_{aspect_char}.pth``.

    Args:
        aspect_char: Value matched against ``terms_df['aspect']``.
        aspect_name: Only used in the progress message.

    Returns:
        The list of kept terms; position in the list is the model's output
        column for that term.
    """
    print(f"\n>>> Training {aspect_name}...")

    aspect_terms = terms_df[terms_df['aspect'] == aspect_char]
    top_terms = aspect_terms['term'].value_counts().index[:CONFIG["n_terms"]].tolist()
    num_classes = len(top_terms)
    print(f"Terms: {num_classes}")

    # Column index of every kept term. Built once, outside the row loop, so the
    # loop does a constant-time lookup per annotation row.
    term_map = {t: i for i, t in enumerate(top_terms)}

    label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]

    for pid, term in tqdm(zip(relevant['id'], relevant['term']), total=len(relevant)):
        if pid in id_to_idx:  # annotations for IDs outside train_ids are skipped
            label_matrix[id_to_idx[pid], term_map[term]] = 1.0

    ds = MultiInputData(train_emb, train_tax_idx, label_matrix)
    loader = DataLoader(ds, batch_size=256, shuffle=True)

    model = OptimizedModel(1280, num_taxons, num_classes).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)  # Higher weight decay
    loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(12):
        model.train()
        total = 0
        for x, t, y in tqdm(loader, leave=False):
            x, t, y = x.to(device), t.to(device), y.to(device)
            opt.zero_grad()
            loss = loss_fn(model(x, t), y)
            loss.backward()
            opt.step()
            total += loss.item()
        # Mean batch loss for the epoch.
        print(f"Epoch {epoch+1}: {total/len(loader):.4f}")

    torch.save(model.state_dict(), f"model_opt_{aspect_char}.pth")
    return top_terms

# Train
# results[aspect letter] = list of terms the saved model for that aspect predicts.
results = {}
for char, name in (('F', 'Function'), ('P', 'Process'), ('C', 'Component')):
    results.update({char: train_aspect(char, name)})

# ============ PREDICT ============
print("\n>>> Predicting...")

# For every aspect: reload the saved weights, score the whole test set, and
# append the best-scoring terms of each protein to the submission file.
top_k = 80        # at most this many terms are written per protein and aspect
min_score = 0.01  # scores at or below this are not written

with open("submission_optimized.tsv", "w") as f:
    for char in ['F', 'P', 'C']:
        terms = results[char]
        model = OptimizedModel(1280, num_taxons, len(terms)).to(device)
        # map_location keeps the load working when the checkpoint was written
        # on a different device than the one in use now.
        model.load_state_dict(torch.load(f"model_opt_{char}.pth", map_location=device))
        model.eval()

        ds = MultiInputData(test_emb, test_tax_idx)
        loader = DataLoader(ds, batch_size=1024)

        preds = []
        with torch.no_grad():
            for x, t in tqdm(loader):
                x, t = x.to(device), t.to(device)
                preds.append(torch.sigmoid(model(x, t)).cpu().numpy())

        all_preds = np.vstack(preds)

        for i, pid in enumerate(test_ids):
            # argpartition raises when asked for more elements than the row
            # holds, so an aspect with fewer than top_k terms keeps every column.
            if len(terms) >= top_k:
                top_idx = np.argpartition(all_preds[i], -top_k)[-top_k:]
            else:
                top_idx = np.arange(len(terms))
            for idx in top_idx:
                if all_preds[i, idx] > min_score:
                    f.write(f"{pid}\t{terms[idx]}\t{all_preds[i, idx]:.4f}\n")

        # Release every per-aspect buffer, including the stacked score matrix.
        del model, preds, all_preds
        gc.collect()

print("Done!")


Device: cpu
Loading data...

>>> Training Function...
Terms: 3000


100%|██████████| 121614/121614 [00:32<00:00, 3727.86it/s]
                                                 

Epoch 1: 0.0176


                                                 

Epoch 2: 0.0067


                                                 

Epoch 3: 0.0068


                                                 

Epoch 4: 0.0068


                                                 

Epoch 5: 0.0069


                                                 

Epoch 6: 0.0069


                                                 

Epoch 7: 0.0069


                                                 

Epoch 8: 0.0069


                                                 

Epoch 9: 0.0070


                                                 

Epoch 10: 0.0070


                                                 

Epoch 11: 0.0070


                                                 

Epoch 12: 0.0070

>>> Training Process...
Terms: 3000


100%|██████████| 181213/181213 [00:47<00:00, 3789.84it/s]
                                                 

Epoch 1: 0.0202


                                                 

Epoch 2: 0.0092


                                                 

Epoch 3: 0.0092


                                                 

Epoch 4: 0.0093


                                                 

Epoch 5: 0.0093


                                                 

Epoch 6: 0.0093


                                                 

Epoch 7: 0.0094


                                                 

Epoch 8: 0.0094


                                                 

Epoch 9: 0.0094


                                                 

Epoch 10: 0.0094


                                                 

Epoch 11: 0.0094


                                                 

Epoch 12: 0.0094

>>> Training Component...
Terms: 2651


100%|██████████| 157770/157770 [00:36<00:00, 4293.32it/s]
                                                 

Epoch 1: 0.0182


                                                 

Epoch 2: 0.0077


                                                 

Epoch 3: 0.0078


                                                 

Epoch 4: 0.0078


                                                 

Epoch 5: 0.0079


                                                 

Epoch 6: 0.0079


                                                 

Epoch 7: 0.0079


                                                 

Epoch 8: 0.0079


                                                 

Epoch 9: 0.0079


                                                 

Epoch 10: 0.0079


                                                 

Epoch 11: 0.0079


                                                 

Epoch 12: 0.0079

>>> Predicting...


100%|██████████| 220/220 [00:02<00:00, 86.57it/s]
100%|██████████| 220/220 [00:02<00:00, 97.76it/s] 
100%|██████████| 220/220 [00:02<00:00, 97.10it/s] 


Done!


In [5]:
# === GRAPH PROPAGATION ===
!pip install obonet networkx -q

import obonet
from tqdm import tqdm

print("Loading Graph...")
graph = obonet.read_obo(CONFIG["paths"]["go_obo"])

# parent_map[node] lists the targets of the node's outgoing edges.
# NOTE(review): successors() does not filter by edge key, so every relationship
# type that obonet loads as an edge is followed here -- confirm that is the
# intended propagation rule.
parent_map = {}
for node in graph.nodes():
    parents = [p for p in graph.successors(node) if p in graph.nodes()]
    if parents:
        parent_map[node] = parents


def _propagate_to_ancestors(preds, parent_map):
    """Return a copy of ``preds`` where no ancestor scores below a descendant.

    Every term's score is pushed along ``parent_map`` and a parent takes the
    maximum of its own score and the scores reaching it. A parent is put back
    on the worklist each time its score is raised, so the raise also reaches
    the parent's own ancestors regardless of the order the terms are visited.

    Args:
        preds: Mapping of term -> score for one protein.
        parent_map: Mapping of term -> list of parent terms.

    Returns:
        New dict of term -> score containing the original terms plus every
        ancestor that received a score.
    """
    final_scores = dict(preds)
    # Ascending sort + pop() visits the highest-scoring terms first, so most
    # ancestors get their final value on the first visit.
    pending = sorted(preds, key=preds.get)
    while pending:
        term = pending.pop()
        score = final_scores[term]
        for parent in parent_map.get(term, ()):
            if score > final_scores.get(parent, 0.0):
                final_scores[parent] = score
                pending.append(parent)
    return final_scores


print("Loading predictions...")
# submission_dict[pid][term] = score, one inner dict per protein.
submission_dict = {}
with open("submission_optimized.tsv") as f:
    for line in tqdm(f):
        parts = line.strip().split("\t")
        if len(parts) == 3:  # anything that is not exactly three fields is ignored
            pid, term, score = parts
            submission_dict.setdefault(pid, {})[term] = float(score)

print("Propagating...")
with open("submission_v2_final.tsv", "w") as f_out:
    for pid, preds in tqdm(submission_dict.items()):
        final_scores = _propagate_to_ancestors(preds, parent_map)

        # Keep the 100 best-scoring terms per protein, then drop scores at or
        # below 0.001.
        sorted_terms = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:100]
        for term, score in sorted_terms:
            if score > 0.001:
                f_out.write(f"{pid}\t{term}\t{score:.4f}\n")

print("\n submission_v2_final.tsv ready!")


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Loading Graph...
Loading predictions...


17361793it [00:15, 1097479.40it/s]


Propagating...


100%|██████████| 224309/224309 [01:13<00:00, 3058.19it/s]


 submission_v2_final.tsv ready!





In [6]:
# === RECREATE 0.188 BASELINE ===
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import gc

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Settings read by the rest of this cell.
CONFIG = dict(
    n_terms=1500,  # BACK TO WHAT WORKED
    paths=dict(
        train_emb="/kaggle/input/esm-dataset/train_embeds.npy",
        train_ids="/kaggle/input/esm-dataset/train_ids.npy",
        test_emb="/kaggle/input/esm-dataset/test_embeds.npy",
        test_ids="/kaggle/input/esm-dataset/test_ids.npy",
        train_terms="/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
        go_obo="/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo",
    ),
)

# LOAD DATA
print("Loading...")
# Embedding matrices are cast to float32; the ID arrays are used as stored.
train_emb = np.load(CONFIG["paths"]["train_emb"]).astype(np.float32)
train_ids = np.load(CONFIG["paths"]["train_ids"])
test_emb = np.load(CONFIG["paths"]["test_emb"]).astype(np.float32)
test_ids = np.load(CONFIG["paths"]["test_ids"])

# Per-feature standardisation: statistics come from the training matrix only
# and are applied to both splits. The raw arrays are replaced by scaled ones.
mean = np.mean(train_emb, axis=0)
std = np.std(train_emb, axis=0) + 1e-6
train_emb = np.divide(train_emb - mean, std)
test_emb = np.divide(test_emb - mean, std)

# Annotation table, read as three positional columns (no header row is consumed).
terms_df = pd.read_csv(
    CONFIG["paths"]["train_terms"],
    sep="\t",
    header=None,
    names=["id", "term", "aspect"],
)
# Row position of every training ID inside train_emb / train_ids.
id_to_idx = dict(zip(train_ids, range(len(train_ids))))

# SIMPLE MODEL (What got you 0.188)
class SimpleModel(nn.Module):
    """Single-hidden-layer MLP mapping a feature vector to per-class logits.

    Args:
        n_feat: Width of the input feature vector.
        n_class: Number of output logits.
    """

    def __init__(self, n_feat, n_class):
        super().__init__()
        hidden = 512
        layers = [
            nn.Linear(n_feat, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, n_class),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits for the feature batch ``x``."""
        return self.net(x)

class SimpleData(Dataset):
    """Dataset over NumPy features with optional labels.

    Args:
        X: NumPy array of features, wrapped with ``torch.from_numpy``.
        y: Optional NumPy array of labels; omit for inference.
    """

    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X)
        self.y = None if y is None else torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # Bare feature tensor without labels, (features, labels) pair with them.
        if self.y is None:
            return self.X[i]
        return (self.X[i], self.y[i])

# TRAIN
def train_aspect(aspect_char, aspect_name, n_epochs=10, batch_size=256, lr=1e-3):
    """Train a SimpleModel on one ontology aspect and save its weights to disk.

    The CONFIG["n_terms"] most frequent terms of the aspect become the label
    columns; every (protein, term) row of ``terms_df`` whose protein appears in
    ``train_ids`` sets the corresponding cell of the label matrix to 1.

    Args:
        aspect_char: value matched against the ``aspect`` column of ``terms_df``;
            also used in the checkpoint file name.
        aspect_name: human-readable name, used only for the progress message.
        n_epochs: number of passes over the training set.
        batch_size: mini-batch size for the training DataLoader.
        lr: Adam learning rate.

    Returns:
        The list of terms in label-column order (index i of the model output
        corresponds to ``top_terms[i]``).

    Side effects:
        Writes ``model_simple_{aspect_char}.pth`` in the working directory.
    """
    print(f"\n>>> {aspect_name}...")
    aspect_terms = terms_df[terms_df['aspect'] == aspect_char]
    top_terms = aspect_terms['term'].value_counts().index[:CONFIG["n_terms"]].tolist()
    term_map = {t: i for i, t in enumerate(top_terms)}

    # Build the multi-hot label matrix with one vectorised assignment instead
    # of a Python-level loop over rows.
    label_matrix = np.zeros((len(train_ids), len(top_terms)), dtype=np.float32)
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]
    row_idx = relevant['id'].map(id_to_idx)    # NaN for proteins without an embedding
    col_idx = relevant['term'].map(term_map)
    known = row_idx.notna().to_numpy()
    label_matrix[
        row_idx.to_numpy()[known].astype(np.int64),
        col_idx.to_numpy()[known].astype(np.int64),
    ] = 1.0

    ds = SimpleData(train_emb, label_matrix)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True)

    # Take the input width from the data rather than assuming a fixed size.
    model = SimpleModel(train_emb.shape[1], len(top_terms)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(n_epochs):
        model.train()
        total = 0
        for x, y in tqdm(loader, leave=False):
            x, y = x.to(device), y.to(device)
            opt.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()
            total += loss.item()
        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch+1}: {total/len(loader):.4f}")

    torch.save(model.state_dict(), f"model_simple_{aspect_char}.pth")
    return top_terms

# Train one model per aspect; results maps aspect code -> ordered term list.
results = {c: train_aspect(c, n) for c, n in [('F', 'Function'), ('P', 'Process'), ('C', 'Component')]}

# PREDICT
TOP_K_PER_ASPECT = 70    # at most this many terms are written per protein and aspect
MIN_WRITE_SCORE = 0.01   # predictions at or below this probability are dropped

print("\n>>> Predicting...")
# The test batches are identical for every aspect, so build the loader once.
loader = DataLoader(torch.from_numpy(test_emb), batch_size=1024)
with open("submission_simple.tsv", "w") as f:
    for char in ['F', 'P', 'C']:
        terms = results[char]
        model = SimpleModel(test_emb.shape[1], len(terms)).to(device)
        # map_location keeps the checkpoint loadable when the current device
        # differs from the one the weights were saved on.
        model.load_state_dict(torch.load(f"model_simple_{char}.pth", map_location=device))
        model.eval()

        preds = []
        with torch.no_grad():
            for x in tqdm(loader):
                preds.append(torch.sigmoid(model(x.to(device))).cpu().numpy())

        all_preds = np.vstack(preds)
        # argpartition raises if asked for more elements than exist, so clamp
        # k to the number of label columns for this aspect.
        top_k = min(TOP_K_PER_ASPECT, all_preds.shape[1])
        if top_k > 0:
            for i, pid in enumerate(test_ids):
                # Indices of the top_k highest scores (in no particular order).
                top_idx = np.argpartition(all_preds[i], -top_k)[-top_k:]
                for idx in top_idx:
                    if all_preds[i, idx] > MIN_WRITE_SCORE:
                        f.write(f"{pid}\t{terms[idx]}\t{all_preds[i, idx]:.3f}\n")

        # Release the per-aspect model and prediction buffers before the next aspect.
        del model, preds, all_preds
        gc.collect()

print("Done!")


Loading...

>>> Function...


100%|██████████| 112061/112061 [00:03<00:00, 30319.87it/s]
                                                  

Epoch 1: 0.0174


                                                  

Epoch 2: 0.0044


                                                  

Epoch 3: 0.0040


                                                  

Epoch 4: 0.0037


                                                  

Epoch 5: 0.0035


                                                  

Epoch 6: 0.0033


                                                  

Epoch 7: 0.0032


                                                  

Epoch 8: 0.0031


                                                  

Epoch 9: 0.0030


                                                  

Epoch 10: 0.0029

>>> Process...


100%|██████████| 143554/143554 [00:05<00:00, 28520.60it/s]
                                                  

Epoch 1: 0.0211


                                                  

Epoch 2: 0.0077


                                                  

Epoch 3: 0.0073


                                                  

Epoch 4: 0.0070


                                                  

Epoch 5: 0.0068


                                                  

Epoch 6: 0.0066


                                                  

Epoch 7: 0.0064


                                                  

Epoch 8: 0.0063


                                                  

Epoch 9: 0.0062


                                                  

Epoch 10: 0.0061

>>> Component...


100%|██████████| 154977/154977 [00:05<00:00, 29307.59it/s]
                                                  

Epoch 1: 0.0188


                                                  

Epoch 2: 0.0057


                                                  

Epoch 3: 0.0053


                                                  

Epoch 4: 0.0050


                                                  

Epoch 5: 0.0049


                                                  

Epoch 6: 0.0047


                                                  

Epoch 7: 0.0046


                                                  

Epoch 8: 0.0045


                                                  

Epoch 9: 0.0044


                                                  

Epoch 10: 0.0043

>>> Predicting...


100%|██████████| 220/220 [00:01<00:00, 218.01it/s]
100%|██████████| 220/220 [00:01<00:00, 217.01it/s]
100%|██████████| 220/220 [00:00<00:00, 220.64it/s]


Done!


In [7]:
!pip install obonet -q
import obonet
# === PROPAGATE SCORES UP THE GO HIERARCHY ===
# Only these relationship types carry a prediction from a term to its ancestors.
# NOTE(review): is_a/part_of is the set used by the standard CAFA evaluator;
# widen this tuple if the competition's propagation rules differ - confirm.
PROPAGATE_RELATIONS = ("is_a", "part_of")
MAX_TERMS_PER_PROTEIN = 70   # cutoff applied per protein across all aspects combined
MIN_PROPAGATED_SCORE = 0.001

graph = obonet.read_obo(CONFIG["paths"]["go_obo"])
# obonet's documented convention: edges point from a term to the term it
# references (child -> parent) and the edge key is the relationship type.
parent_map = {
    node: sorted({parent for _, parent, rel in graph.out_edges(node, keys=True)
                  if rel in PROPAGATE_RELATIONS})
    for node in graph.nodes()
}


def propagate_scores(preds, parent_map):
    """Return a copy of ``preds`` in which every ancestor scores >= its descendants.

    Args:
        preds: dict mapping term id -> score for one protein.
        parent_map: dict mapping term id -> iterable of direct parent term ids.

    Returns:
        A new dict containing the original terms plus every reachable ancestor,
        each holding the maximum score found among itself and its descendants.
    """
    final = dict(preds)
    pending = list(preds)
    while pending:
        term = pending.pop()
        score = final[term]
        for par in parent_map.get(term, ()):
            if final.get(par, 0.0) < score:
                final[par] = score
                # Re-visit the parent whenever its score rises, even if it was
                # one of the original predictions, so the increase also
                # reaches its own ancestors.
                pending.append(par)
    return final


# Collect the raw per-aspect predictions into {protein: {term: score}}.
sub = {}
with open("submission_simple.tsv") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        p, t, s = line.split("\t")
        sub.setdefault(p, {})[t] = float(s)

with open("submission_v3_baseline.tsv", "w") as f:
    for pid, preds in tqdm(sub.items()):
        final = propagate_scores(preds, parent_map)
        # Highest score first; ties are broken by term id so the cutoff is deterministic.
        ranked = sorted(final.items(), key=lambda kv: (-kv[1], kv[0]))
        for t, s in ranked[:MAX_TERMS_PER_PROTEIN]:
            if s > MIN_PROPAGATED_SCORE:
                f.write(f"{pid}\t{t}\t{s:.3f}\n")


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


100%|██████████| 224309/224309 [00:55<00:00, 4061.82it/s]
