In [None]:
# --- MASTER TRAINING CELL (MFO - Molecular Function) ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd

# 1. LOAD & CLEAN INPUTS
print("1. Loading & Cleaning Inputs...")
train_emb = np.load("/kaggle/input/emb-models-ttt/train_embeds.npy").astype(np.float32)
train_ids = np.load("/kaggle/input/emb-models-ttt/train_ids.npy")
test_emb = np.load("/kaggle/input/emb-models-ttt/test_embeds.npy").astype(np.float32)
test_ids = np.load("/kaggle/input/emb-models-ttt/test_ids.npy")

# Standard Scale Inputs (Crucial for convergence)
# The per-feature statistics come from the training matrix only and are reused
# unchanged for the test matrix.
mean = train_emb.mean(axis=0)
std = train_emb.std(axis=0) + 1e-6  # epsilon keeps the division finite for constant features
train_emb = (train_emb - mean) / std
test_emb = (test_emb - mean) / std
print(f"   Inputs ready. Shape: {train_emb.shape}")

# 2. LOAD & BUILD LABELS
print("2. Building Labels (Target: Function 'F')...")
# The path is spelled out here: CONFIG is only created by a later cell, so
# reading it in this cell fails with a NameError on a fresh kernel.
TRAIN_TERMS_PATH = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv"
terms_df = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", header=None, names=["id", "term", "aspect"])

# Filter for Molecular Function (F)
# Note: In this file, aspect is 'F', not 'MFO'
TARGET_ASPECT = 'F'
aspect_terms = terms_df[terms_df['aspect'] == TARGET_ASPECT]

# Get Top 1500 Terms (value_counts orders terms from most to least frequent)
top_terms = aspect_terms['term'].value_counts().index[:1500].tolist()
term_map = {t: i for i, t in enumerate(top_terms)}  # term -> label column
num_classes = len(top_terms)

# Build Matrix: one row per entry of train_ids, one column per selected term.
label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
id_map = {pid: i for i, pid in enumerate(train_ids)}  # protein id -> label row
relevant_rows = aspect_terms[aspect_terms['term'].isin(top_terms)]

# Walking the two columns directly avoids building a Series per row, which is
# what DataFrame.iterrows() does.
for pid, term in tqdm(zip(relevant_rows['id'], relevant_rows['term']),
                      total=len(relevant_rows), desc="Mapping Labels"):
    row_idx = id_map.get(pid)
    if row_idx is not None:  # annotations for ids absent from train_ids are skipped
        label_matrix[row_idx, term_map[term]] = 1.0

print(f"   Labels ready. Shape: {label_matrix.shape}")

# 3. DEFINE MODEL (Simple & Robust)
class CAFA_MLP(nn.Module):
    """Single-hidden-layer perceptron that emits one raw logit per class.

    Layer stack: Linear(n_features -> 512), ReLU, Dropout(p=0.3),
    Linear(512 -> n_classes). No sigmoid is applied inside the module.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        hidden_width = 512
        stack = [
            nn.Linear(n_features, hidden_width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_width, n_classes),
        ]
        # Kept under the attribute name `net` so state_dict keys stay the same.
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits produced by the layer stack for input `x`."""
        logits = self.net(x)
        return logits

# 4. TRAIN
print("3. Starting Training...")
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Dataset
class ProteinData(Dataset):
    """Dataset pairing each input row with the label row at the same index.

    Both NumPy arrays are wrapped with ``torch.from_numpy``, so the tensors
    share memory with the arrays that were passed in.
    """

    def __init__(self, X, y):
        self.X, self.y = (torch.from_numpy(arr) for arr in (X, y))

    def __len__(self):
        """Number of rows in the input tensor."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(input, label)`` pair stored at position ``i``."""
        features = self.X[i]
        targets = self.y[i]
        return features, targets

# Split 90/10
perm = np.random.permutation(len(train_ids))
split = int(len(train_ids) * 0.9)
train_ds = ProteinData(train_emb[perm[:split]], label_matrix[perm[:split]])
val_ds = ProteinData(train_emb[perm[split:]], label_matrix[perm[split:]])

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256)

# Init
# The input width is read from the embedding matrix rather than hard-coded.
model = CAFA_MLP(train_emb.shape[1], num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()  # expects raw logits; the model applies no sigmoid

# Loop
for epoch in range(15):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        pred = model(x)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()

    # Score the held-out 10% so the split is actually used: a validation loss
    # that rises while the training loss keeps falling signals over-fitting.
    model.eval()  # disables dropout for evaluation
    val_loss = 0.0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            val_loss += loss_fn(model(x), y).item()
    val_loss /= max(len(val_loader), 1)  # guard against an empty validation split

    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss:.4f}")

# Save
torch.save(model.state_dict(), "model_MFO.pth")
print("Training Complete. Model Saved!")


In [None]:
# --- GENERATE SUBMISSION ---
print("Generating Submission...")
model.eval()

TOP_K = 50        # at most this many terms are written per protein
MIN_SCORE = 0.01  # predictions at or below this probability are not written

preds = []
batch_size = 1024 # Fast inference
test_loader = DataLoader(torch.from_numpy(test_emb), batch_size=batch_size)

with torch.no_grad():
    for x in tqdm(test_loader, desc="Predicting"):
        x = x.to(device)
        # Forward pass
        logits = model(x)
        # Sigmoid to get probabilities (0 to 1)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds.append(probs)

# Combine all batches
all_probs = np.vstack(preds)
print(f"Predictions Shape: {all_probs.shape}") # (Test_Size, number of classes)

# np.argpartition raises when asked for more elements than a row holds, so the
# requested count is clamped to the number of classes the model predicts.
top_k = min(TOP_K, all_probs.shape[1])

# Write to TSV file
print("Writing submission.tsv...")
with open("submission.tsv", "w") as f:
    # No header row is written.
    # Format: ProteinID <tab> GO_Term <tab> Score

    for i, pid in enumerate(tqdm(test_ids)):
        # Keep only the highest-scoring terms per protein to save space
        row_probs = all_probs[i]
        # Indices of the top_k largest scores (not sorted among themselves)
        top_indices = np.argpartition(row_probs, -top_k)[-top_k:]

        for idx in top_indices:
            score = row_probs[idx]
            # Only keep scores above the confidence floor
            if score > MIN_SCORE:
                term = top_terms[idx]
                # Format: <ProteinID> <GO_Term> <Score>
                f.write(f"{pid}\t{term}\t{score:.3f}\n")

print("✅ submission.tsv created!")


In [None]:
# --- MASTER SCRIPT: TRAIN ALL 3 MODELS & SUBMIT ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import gc
import os

# CONFIGURATION
# Hyper-parameters, compute device and input file locations used by this cell.
CONFIG = dict(
    batch_size=256,
    lr=1e-3,
    epochs=10,  # 10 Epochs per model is enough
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    paths=dict(
        train_emb="/kaggle/input/emb-models-ttt/train_embeds.npy",
        train_ids="/kaggle/input/emb-models-ttt/train_ids.npy",
        test_emb="/kaggle/input/emb-models-ttt/test_embeds.npy",
        test_ids="/kaggle/input/emb-models-ttt/test_ids.npy",
        train_terms="/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    ),
)

# --- 1. LOAD & CLEAN INPUTS (Do this ONCE) ---
print("=== 1. Loading & Cleaning Data ===")
# Embedding matrices are cast to float32; the id arrays are loaded as stored.
train_emb, test_emb = (
    np.load(CONFIG["paths"][key]).astype(np.float32) for key in ("train_emb", "test_emb")
)
train_ids, test_ids = (
    np.load(CONFIG["paths"][key]) for key in ("train_ids", "test_ids")
)

# Standard Scale
# Per-feature statistics are taken from the training matrix and applied to both.
mean = train_emb.mean(axis=0)
std = train_emb.std(axis=0) + 1e-6
train_emb, test_emb = ((matrix - mean) / std for matrix in (train_emb, test_emb))

print(f"Data Ready: {train_emb.shape}")

# Load Terms File
print("Loading Terms File...")
terms_df = pd.read_csv(
    CONFIG["paths"]["train_terms"],
    sep="\t",
    header=None,
    names=["id", "term", "aspect"],
)

# --- MODEL DEFINITION ---
class CAFA_MLP(nn.Module):
    """One-hidden-layer MLP returning a raw logit for every class.

    Layer stack: Linear(n_features -> 512), ReLU, Dropout(p=0.3),
    Linear(512 -> n_classes). The module itself applies no sigmoid.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        hidden_width = 512
        blocks = (
            nn.Linear(n_features, hidden_width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_width, n_classes),
        )
        # Attribute name `net` is preserved so state_dict keys do not change.
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Apply the layer stack to `x` and return the logits."""
        return self.net(x)

class ProteinData(Dataset):
    """Index-aligned pairs of input rows and label rows.

    The NumPy arrays are wrapped with ``torch.from_numpy`` (shared memory, no
    copy).
    """

    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        """Number of rows in the input tensor."""
        n_rows = len(self.X)
        return n_rows

    def __getitem__(self, i):
        """Return the ``(input, label)`` tuple for position ``i``."""
        return (self.X[i], self.y[i])

# --- TRAINING FUNCTION ---
def train_aspect(aspect_char, aspect_name):
    """Train one CAFA_MLP on the most frequent terms of a single aspect.

    Reads the module-level names ``terms_df``, ``train_ids``, ``train_emb`` and
    ``CONFIG``. The 1500 most frequent terms of the aspect become the label
    columns. The model is fit on 90% of the rows, and the remaining 10% are
    used to report a validation loss after every epoch.

    Args:
        aspect_char: Value matched against the ``aspect`` column of
            ``terms_df`` (the calls in this notebook pass 'F', 'P' and 'C').
        aspect_name: Human-readable aspect name, used only in log messages.

    Returns:
        ``(top_terms, save_name)``: the list of terms in label-column order and
        the file name the model's state_dict was saved under.
    """
    print(f"\n=== TRAINING MODEL FOR: {aspect_name} ({aspect_char}) ===")

    # 1. Filter Terms
    aspect_terms = terms_df[terms_df['aspect'] == aspect_char]
    # Take Top 1500 terms (value_counts orders them from most to least frequent)
    top_terms = aspect_terms['term'].value_counts().index[:1500].tolist()
    term_map = {t: i for i, t in enumerate(top_terms)}
    num_classes = len(top_terms)
    print(f"Selected {num_classes} most common terms for {aspect_name}")

    # 2. Build Labels
    print("Building Label Matrix...")
    label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
    id_map = {pid: i for i, pid in enumerate(train_ids)}
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]

    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.iterrows() constructs.
    for pid, term in tqdm(zip(relevant['id'], relevant['term']),
                          total=len(relevant), desc="Mapping"):
        row_idx = id_map.get(pid)
        if row_idx is not None:  # ids missing from train_ids are skipped
            label_matrix[row_idx, term_map[term]] = 1.0

    # 3. Dataloaders (90% train / 10% validation)
    perm = np.random.permutation(len(train_ids))
    split = int(len(train_ids) * 0.9)
    train_ds = ProteinData(train_emb[perm[:split]], label_matrix[perm[:split]])
    val_ds = ProteinData(train_emb[perm[split:]], label_matrix[perm[split:]])
    train_loader = DataLoader(train_ds, batch_size=CONFIG["batch_size"], shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=CONFIG["batch_size"])

    # 4. Train
    # Input width comes from the embedding matrix instead of a hard-coded 1280.
    model = CAFA_MLP(train_emb.shape[1], num_classes).to(CONFIG["device"])
    opt = torch.optim.Adam(model.parameters(), lr=CONFIG["lr"])
    loss_fn = nn.BCEWithLogitsLoss()  # operates on the raw logits the model returns

    for epoch in range(CONFIG["epochs"]):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            x, y = x.to(CONFIG["device"]), y.to(CONFIG["device"])
            opt.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()
            total_loss += loss.item()

        # Evaluate on the held-out rows with dropout disabled.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(CONFIG["device"]), y.to(CONFIG["device"])
                val_loss += loss_fn(model(x), y).item()
        val_loss /= max(len(val_loader), 1)  # guard against an empty validation split

        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss:.4f}")

    # Save Model
    save_name = f"model_{aspect_char}.pth"
    torch.save(model.state_dict(), save_name)
    print(f"Saved {save_name}")

    return top_terms, save_name

# --- RUN TRAINING FOR ALL 3 ASPECTS ---
# Train MFO (Function)
top_terms_F, model_path_F = train_aspect('F', 'Molecular Function')
# Train BPO (Process)
top_terms_P, model_path_P = train_aspect('P', 'Biological Process')
# Train CCO (Component)
top_terms_C, model_path_C = train_aspect('C', 'Cellular Component')

# --- GENERATE SUBMISSION ---
print("\n=== GENERATING FINAL SUBMISSION ===")

# (aspect code, saved state_dict path, terms in output-column order)
models_info = [
    ('F', model_path_F, top_terms_F),
    ('P', model_path_P, top_terms_P),
    ('C', model_path_C, top_terms_C)
]

TOP_K = 50        # at most this many terms are written per protein and aspect
MIN_SCORE = 0.01  # predictions at or below this probability are not written

# The test inputs are the same for every aspect, so the loader is built once.
test_loader = DataLoader(torch.from_numpy(test_emb), batch_size=1024)

with open("submission_combined.tsv", "w") as f:
    # Loop through each trained model
    for aspect, path, terms in models_info:
        print(f"Predicting {aspect}...")

        # Load Model
        num_classes = len(terms)
        model = CAFA_MLP(test_emb.shape[1], num_classes).to(CONFIG["device"])
        # map_location lets the weights load even when the current device
        # differs from the one they were saved from.
        model.load_state_dict(torch.load(path, map_location=CONFIG["device"]))
        model.eval()

        # Predict
        preds = []
        with torch.no_grad():
            for x in tqdm(test_loader):
                x = x.to(CONFIG["device"])
                probs = torch.sigmoid(model(x)).cpu().numpy()
                preds.append(probs)

        all_probs = np.vstack(preds)

        # np.argpartition raises when asked for more elements than a row
        # holds, so the count is clamped to this aspect's number of classes.
        top_k = min(TOP_K, num_classes)

        # Write High Confidence Predictions
        print(f"Writing predictions for {aspect}...")
        for i, pid in enumerate(tqdm(test_ids)):
            row_probs = all_probs[i]
            top_indices = np.argpartition(row_probs, -top_k)[-top_k:]

            for idx in top_indices:
                score = row_probs[idx]
                if score > MIN_SCORE:
                    f.write(f"{pid}\t{terms[idx]}\t{score:.3f}\n")

        # Clean RAM
        del model, all_probs, preds
        torch.cuda.empty_cache()
        gc.collect()

print("\n✅ DONE! Download 'submission_combined.tsv' and submit!")


In [None]:
!pip install obonet networkx


In [None]:
# --- ADVANCED POST-PROCESSING: GRAPH PROPAGATION ---
import networkx
import obonet
import pandas as pd
from tqdm import tqdm

# Relationship types that a score is propagated along (child -> parent).
# NOTE(review): only 'is_a' is followed, as originally intended; add "part_of"
# here if the evaluation's ancestor definition includes it — TODO confirm.
PROPAGATE_RELATIONS = ("is_a",)
MAX_TERMS_PER_PROTEIN = 70   # rows written per protein after propagation
MIN_WRITTEN_SCORE = 0.001    # scores at or below this value are not written

print("Loading Gene Ontology Graph (go-basic.obo)...")
# Load the graph
graph = obonet.read_obo("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")

# Create a mapping of Child -> Parents
# obonet returns a MultiDiGraph whose edges point from a term to the term it
# references, with the relationship type as the edge key. graph.successors()
# would follow every relationship type, so the edges are filtered by key.
print("Building Parent Map...")
parent_map = {}
for child, parent, relation in graph.edges(keys=True):
    if relation in PROPAGATE_RELATIONS:
        parent_map.setdefault(child, []).append(parent)

print(f"Graph loaded. {len(parent_map)} terms have parents.")

_ancestor_cache = {}


def get_ancestors(term):
    """Return every term reachable from `term` through `parent_map`.

    Results are memoised in `_ancestor_cache` because the same predicted terms
    recur across many proteins. Terms without an entry in `parent_map` yield
    an empty set.
    """
    cached = _ancestor_cache.get(term)
    if cached is not None:
        return cached
    found = set()
    stack = list(parent_map.get(term, ()))
    while stack:
        node = stack.pop()
        if node in found:
            continue
        found.add(node)
        stack.extend(parent_map.get(node, ()))
    _ancestor_cache[term] = found
    return found


def propagate_scores(preds):
    """Raise every ancestor's score to at least the score of its descendants.

    Args:
        preds: ``{term: score}`` for one protein.

    Returns:
        A new dict containing the original terms plus all their ancestors,
        where each ancestor holds the maximum of its own score and the scores
        of the predicted terms below it. Because every predicted term updates
        its full ancestor set, the result does not depend on processing order.
    """
    final_scores = dict(preds)
    for term, score in preds.items():
        for ancestor in get_ancestors(term):
            if score > final_scores.get(ancestor, 0.0):
                final_scores[ancestor] = score
    return final_scores


# Load your current submission
print("Loading your submission...")
# We read it into a Dictionary for fast lookup: {ProteinID: {Term: Score}}
submission_dict = {}

# Read line by line to save RAM
with open("submission_combined.tsv", "r") as f:
    for line in tqdm(f):
        parts = line.strip().split("\t")
        if len(parts) == 3:  # lines without exactly three fields are ignored
            pid, term, score = parts
            submission_dict.setdefault(pid, {})[term] = float(score)

print(f"Loaded predictions for {len(submission_dict)} proteins.")

# PROPAGATION LOGIC
print("Propagating Scores (This is the Magic Step)...")
# Open output file
with open("submission_propagated.tsv", "w") as f_out:

    for pid, preds in tqdm(submission_dict.items()):
        # preds is {Term: Score}; propagate scores UP the tree so that a
        # parent never scores below any of its predicted descendants.
        final_scores = propagate_scores(preds)

        # Write the highest-scoring terms (propagated parents included)
        sorted_terms = sorted(
            final_scores.items(), key=lambda x: x[1], reverse=True
        )[:MAX_TERMS_PER_PROTEIN]

        for term, score in sorted_terms:
            # Only confident ones
            if score > MIN_WRITTEN_SCORE:
                f_out.write(f"{pid}\t{term}\t{score:.3f}\n")

print("✅ submission_propagated.tsv created! This is your 'Senior Scientist' submission.")


In [None]:
# --- DIAMOND ENSEMBLE TRAINING ---
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import gc

# Define 3 Different Architectures
class Model_Standard(nn.Module):
    """Baseline ensemble member: one 512-unit hidden layer with dropout 0.3.

    Returns raw logits of width `n_class`.
    """

    def __init__(self, n_feat, n_class):
        super().__init__()
        hidden_width = 512
        layers = [
            nn.Linear(n_feat, hidden_width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_width, n_class),
        ]
        # Attribute name `net` is kept so state_dict keys stay the same.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for input `x`."""
        return self.net(x)

class Model_Deep(nn.Module):
    """Deeper ensemble member with hidden layers of 1024 and 512 units.

    Each hidden layer is followed by ReLU and Dropout(p=0.3); the final linear
    layer returns raw logits of width `n_class`.
    """

    def __init__(self, n_feat, n_class):
        super().__init__()
        layers = []
        in_width = n_feat
        # Hidden blocks are created in order so parameter indices inside
        # `net` (0, 3, 6) are unchanged.
        for out_width in (1024, 512):
            layers += [nn.Linear(in_width, out_width), nn.ReLU(), nn.Dropout(0.3)]
            in_width = out_width
        layers.append(nn.Linear(in_width, n_class))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for input `x`."""
        return self.net(x)

class Model_HighDropout(nn.Module):
    """Ensemble member with one 512-unit hidden layer and dropout of 0.6.

    Returns raw logits of width `n_class`.
    """

    def __init__(self, n_feat, n_class):
        super().__init__()
        hidden_width = 512
        drop_rate = 0.6  # High Dropout
        layers = (
            nn.Linear(n_feat, hidden_width),
            nn.ReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(hidden_width, n_class),
        )
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for input `x`."""
        return self.net(x)

# Helper to train one specific model variant
def train_variant(model_class, aspect_char, aspect_name, variant_name,
                  epochs=8, batch_size=512, lr=1e-3):
    """Train one ensemble member on every training row of one aspect.

    Relies on names created by earlier cells: ``terms_df``, ``train_ids``,
    ``train_emb``, ``ProteinData`` and ``CONFIG["device"]``.

    Args:
        model_class: ``nn.Module`` subclass constructed as
            ``model_class(n_features, n_classes)``.
        aspect_char: Value matched against the ``aspect`` column of ``terms_df``.
        aspect_name: Human-readable aspect name, used only in the log message.
        variant_name: Tag that becomes part of the saved file name.
        epochs: Number of passes over the data (default 8, as before).
        batch_size: Mini-batch size (default 512, as before).
        lr: Adam learning rate (default 1e-3, as before).

    Returns:
        The list of terms in label-column order. The state_dict is saved to
        ``model_{aspect_char}_{variant_name}.pth``.
    """
    print(f"\n>>> Training {variant_name} for {aspect_name}...")

    # Get Terms/Labels (the 1500 most frequent terms of the aspect)
    aspect_terms = terms_df[terms_df['aspect'] == aspect_char]
    top_terms = aspect_terms['term'].value_counts().index[:1500].tolist()
    term_map = {t: i for i, t in enumerate(top_terms)}
    num_classes = len(top_terms)

    # Build Labels
    label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
    id_map = {pid: i for i, pid in enumerate(train_ids)}
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]
    # Iterating the two columns avoids the per-row Series built by iterrows().
    for pid, term in zip(relevant['id'], relevant['term']):
        row_idx = id_map.get(pid)
        if row_idx is not None:  # ids missing from train_ids are skipped
            label_matrix[row_idx, term_map[term]] = 1.0

    # Dataloader
    # shuffle=True already randomises the order every epoch, so the arrays are
    # wrapped as they are instead of first materialising a permuted copy of
    # the whole embedding matrix.
    ds = ProteinData(train_emb, label_matrix)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True)

    # Train
    # Input width is read from the embedding matrix rather than hard-coded.
    model = model_class(train_emb.shape[1], num_classes).to(CONFIG["device"])
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        for x, y in loader:
            x, y = x.to(CONFIG["device"]), y.to(CONFIG["device"])
            opt.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()

    # Save
    torch.save(model.state_dict(), f"model_{aspect_char}_{variant_name}.pth")
    return top_terms

# --- EXECUTE ENSEMBLE ---
# (variant tag used in file names, model class)
variants = [
    ("Standard", Model_Standard),
    ("Deep", Model_Deep),
    ("Dropout", Model_HighDropout)
]

aspects = [('F', 'Function'), ('P', 'Process'), ('C', 'Component')]

# Train everything (one model per aspect and variant)
all_terms = {}
for aspect_char, aspect_name in aspects:
    for var_name, var_class in variants:
        terms = train_variant(var_class, aspect_char, aspect_name, var_name)
        all_terms[aspect_char] = terms # Save term list

# --- PREDICT & AVERAGE ---
print("\n>>> Generating Ensemble Predictions...")

TOP_K = 50        # at most this many terms are written per protein and aspect
MIN_SCORE = 0.01  # averaged probabilities at or below this value are not written

with open("submission_ensemble.tsv", "w") as f:
    test_loader = DataLoader(torch.from_numpy(test_emb), batch_size=1024)

    for aspect_char, aspect_name in aspects:
        print(f"Ensembling {aspect_name}...")
        terms = all_terms[aspect_char]
        num_classes = len(terms)

        # Load every variant trained for this aspect
        models = []
        for var_name, var_class in variants:
            m = var_class(test_emb.shape[1], num_classes).to(CONFIG["device"])
            m.load_state_dict(
                torch.load(f"model_{aspect_char}_{var_name}.pth", map_location=CONFIG["device"])
            )
            m.eval()
            models.append(m)

        # Predict and Average
        all_probs = []
        with torch.no_grad():
            for x in tqdm(test_loader):
                x = x.to(CONFIG["device"])
                # Average the sigmoid outputs of ALL loaded models, so that
                # changing `variants` changes the ensemble accordingly.
                member_probs = torch.stack([torch.sigmoid(m(x)) for m in models])
                avg_p = member_probs.mean(dim=0)
                all_probs.append(avg_p.cpu().numpy())

        final_probs = np.vstack(all_probs)

        # np.argpartition raises when asked for more elements than a row
        # holds, so the count is clamped to this aspect's number of classes.
        top_k = min(TOP_K, num_classes)

        # Write to file
        for i, pid in enumerate(test_ids):
            row = final_probs[i]
            top_idx = np.argpartition(row, -top_k)[-top_k:]
            for idx in top_idx:
                score = row[idx]
                if score > MIN_SCORE:
                    f.write(f"{pid}\t{terms[idx]}\t{score:.3f}\n")

        # Release this aspect's models and predictions before the next aspect.
        del models, all_probs, final_probs
        torch.cuda.empty_cache()
        gc.collect()

print("✅ submission_ensemble.tsv created! Submit this to crush 0.20!")


In [2]:
# --- COMPLETE RESTART: TAXONOMY-AWARE TRAINING (FIXED) ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

# 1. CONFIG & DEVICE
# Prefer the GPU when PyTorch can see one.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Running on: {device}")

# File locations for embeddings, ids, term annotations and taxonomy tables.
CONFIG = dict(
    paths=dict(
        train_emb="/kaggle/input/emb-models-ttt/train_embeds.npy",
        train_ids="/kaggle/input/emb-models-ttt/train_ids.npy",
        test_emb="/kaggle/input/emb-models-ttt/test_embeds.npy",
        test_ids="/kaggle/input/emb-models-ttt/test_ids.npy",
        train_terms="/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
        train_tax="/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
        test_tax="/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv",
    )
)

# 2. LOAD & CLEAN EMBEDDINGS
print("\n=== 1. Loading Embeddings ===")
# Embedding matrices are cast to float32; the id arrays are loaded as stored.
train_emb, test_emb = (
    np.load(CONFIG["paths"][key]).astype(np.float32) for key in ("train_emb", "test_emb")
)
train_ids, test_ids = (
    np.load(CONFIG["paths"][key]) for key in ("train_ids", "test_ids")
)

# Standard Scale (Critical for fast convergence)
print("Normalizing data...")
# Statistics come from the training matrix and are applied to both matrices.
mean = train_emb.mean(axis=0)
std = train_emb.std(axis=0) + 1e-6
train_emb, test_emb = ((matrix - mean) / std for matrix in (train_emb, test_emb))
print("Embeddings Ready.")

# 3. PREPARE TAXONOMY (FIXED)
print("\n=== 2. Preparing Taxonomy Indices ===")
# NOTE(review): both files are read as header-less two-column tables of
# (protein id, taxon id) — TODO confirm this matches the actual file layout,
# in particular for the test taxon list.
train_tax_df = pd.read_csv(CONFIG["paths"]["train_tax"], sep="\t", header=None, names=["id", "tax_id"])
test_tax_df = pd.read_csv(CONFIG["paths"]["test_tax"], sep="\t", header=None, names=["id", "tax_id"])

# Force to String to avoid sorting errors
train_tax_df['tax_id'] = train_tax_df['tax_id'].astype(str)
test_tax_df['tax_id'] = test_tax_df['tax_id'].astype(str)

# Map unique taxons to integer indices. Index 0 is reserved for proteins whose
# taxon is unknown, so that falling back to 0 never aliases a real species.
UNKNOWN_TAXON = "<UNK>"
all_taxons = set(train_tax_df['tax_id'].unique()) | set(test_tax_df['tax_id'].unique())
all_taxons.discard(UNKNOWN_TAXON)
tax_list = [UNKNOWN_TAXON] + sorted(all_taxons)
tax_map = {t: i for i, t in enumerate(tax_list)}
num_taxons = len(tax_list)  # embedding-table size, including the unknown slot
print(f"Total Unique Species: {len(all_taxons)}")

def get_tax_indices(id_list, df):
    """Translate protein ids into taxon indices.

    Args:
        id_list: Iterable of protein ids, in the order the indices are needed.
        df: DataFrame with an ``id`` column and a string ``tax_id`` column.

    Returns:
        ``np.int32`` array with one entry per id. Ids that are missing from
        ``df``, or whose taxon is missing from ``tax_map``, get index 0 (the
        reserved unknown slot). The number of such fallbacks is printed so a
        mismatch between the ids and the table does not go unnoticed.
    """
    mapping = dict(zip(df["id"], df["tax_id"]))
    indices = np.array(
        [tax_map.get(mapping.get(pid), 0) for pid in id_list], dtype=np.int32
    )
    n_unknown = int((indices == 0).sum())
    if n_unknown:
        print(f"   Warning: {n_unknown}/{len(indices)} proteins have no known taxon (index 0).")
    return indices

train_tax_idx = get_tax_indices(train_ids, train_tax_df)
test_tax_idx = get_tax_indices(test_ids, test_tax_df)
print("Taxonomy Indices Ready.")

# 4. DEFINE TAXONOMY MODEL
class TaxModel(nn.Module):
    """MLP over input features concatenated with a learned taxon embedding.

    Every taxon index is looked up in a 64-dimensional embedding table. The
    embedding is concatenated to the feature vector along dim 1 and passed
    through two hidden layers (1024 and 512 units, each followed by ReLU and
    Dropout(p=0.3)) and a final linear layer that returns raw logits of width
    `n_class`.
    """

    def __init__(self, n_feat, n_taxons, n_class):
        super().__init__()
        tax_dim = 64
        # Learnable embedding for each species
        self.tax_emb = nn.Embedding(n_taxons, tax_dim)

        # Input is the feature vector (n_feat) + species embedding (tax_dim)
        layers = []
        in_width = n_feat + tax_dim
        for out_width in (1024, 512):
            layers += [nn.Linear(in_width, out_width), nn.ReLU(), nn.Dropout(0.3)]
            in_width = out_width
        layers.append(nn.Linear(in_width, n_class))
        self.net = nn.Sequential(*layers)

    def forward(self, x, t):
        """Return logits for features `x` and taxon indices `t`."""
        return self.net(torch.cat((x, self.tax_emb(t)), dim=1))

class MultiInputData(Dataset):
    """Dataset of (features, taxon index[, labels]) items.

    `X` and `y` are wrapped with ``torch.from_numpy``; `T` is additionally
    converted to int64. When `y` is omitted the items are 2-tuples without a
    label.
    """

    def __init__(self, X, T, y=None):
        self.X = torch.from_numpy(X)
        self.T = torch.from_numpy(T).long()
        self.y = None if y is None else torch.from_numpy(y)

    def __len__(self):
        """Number of rows in the feature tensor."""
        return len(self.X)

    def __getitem__(self, i):
        """Return ``(X[i], T[i], y[i])``, or ``(X[i], T[i])`` without labels."""
        if self.y is None:
            return (self.X[i], self.T[i])
        return (self.X[i], self.T[i], self.y[i])

# 5. TRAINING LOOP
def train_tax_aware(aspect_char, aspect_name):
    """Train a TaxModel on one aspect and save its weights.

    Uses the module-level names ``CONFIG``, ``train_ids``, ``train_emb``,
    ``train_tax_idx``, ``num_taxons`` and ``device``. The term table is read
    from ``CONFIG["paths"]["train_terms"]`` on every call; the 1500 most
    frequent terms of the aspect become the label columns. All rows are used
    for training.

    Args:
        aspect_char: Value matched against the ``aspect`` column.
        aspect_name: Human-readable aspect name, used only in the log message.

    Returns:
        The list of terms in label-column order. The state_dict is written to
        ``model_tax_{aspect_char}.pth``.
    """
    print(f"\n>>> Training Tax-Aware Model for {aspect_name} ({aspect_char})...")

    # Load Labels
    terms_df = pd.read_csv(
        CONFIG["paths"]["train_terms"],
        sep="\t",
        header=None,
        names=["id", "term", "aspect"],
    )
    aspect_terms = terms_df[terms_df['aspect'] == aspect_char]
    term_counts = aspect_terms['term'].value_counts()
    top_terms = term_counts.index[:1500].tolist()
    num_classes = len(top_terms)
    term_map = dict(zip(top_terms, range(num_classes)))  # term -> label column

    # Build Matrix
    id_map = {pid: row for row, pid in enumerate(train_ids)}  # protein id -> label row
    label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]

    # Walk the id/term columns in lockstep and flag each known (protein, term) pair.
    pairs = zip(relevant['id'], relevant['term'])
    for pid, term in tqdm(pairs, total=len(relevant), desc="Mapping Labels"):
        if pid not in id_map:
            continue
        label_matrix[id_map[pid], term_map[term]] = 1.0

    # Dataloader
    loader = DataLoader(
        MultiInputData(train_emb, train_tax_idx, label_matrix),
        batch_size=256,
        shuffle=True,
    )

    # Model Init
    model = TaxModel(1280, num_taxons, num_classes).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.BCEWithLogitsLoss()

    # Train
    for epoch in range(10):
        model.train()
        total_loss = 0
        for x, t, y in tqdm(loader, desc=f"Epoch {epoch+1}", leave=False):
            x = x.to(device)
            t = t.to(device)
            y = y.to(device)
            opt.zero_grad()
            logits = model(x, t)
            loss = loss_fn(logits, y)
            loss.backward()
            opt.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {total_loss/len(loader):.4f}")

    torch.save(model.state_dict(), f"model_tax_{aspect_char}.pth")
    return top_terms

# 6. EXECUTE
# Train one taxonomy-aware model per aspect code. Each call saves the weights
# to model_tax_<code>.pth and returns the term list that defines the column
# order of that model's outputs.
top_terms_F = train_tax_aware('F', 'Molecular Function')
top_terms_P = train_tax_aware('P', 'Biological Process')
top_terms_C = train_tax_aware('C', 'Cellular Component')

print("\n✅ All Taxonomy-Aware Models Trained!")


Running on: cuda

=== 1. Loading Embeddings ===
Normalizing data...
Embeddings Ready.

=== 2. Preparing Taxonomy Indices ===
Total Unique Species: 9835
Taxonomy Indices Ready.

>>> Training Tax-Aware Model for Molecular Function (F)...


Mapping Labels: 100%|██████████| 112061/112061 [00:04<00:00, 24683.77it/s]
                                                           

Epoch 1 Loss: 0.0152


                                                           

Epoch 2 Loss: 0.0043


                                                           

Epoch 3 Loss: 0.0038


                                                           

Epoch 4 Loss: 0.0035


                                                           

Epoch 5 Loss: 0.0033


                                                           

Epoch 6 Loss: 0.0032


                                                           

Epoch 7 Loss: 0.0031


                                                           

Epoch 8 Loss: 0.0030


                                                           

Epoch 9 Loss: 0.0029


                                                            

Epoch 10 Loss: 0.0028

>>> Training Tax-Aware Model for Biological Process (P)...


Mapping Labels: 100%|██████████| 143554/143554 [00:05<00:00, 24737.39it/s]
                                                           

Epoch 1 Loss: 0.0196


                                                           

Epoch 2 Loss: 0.0077


                                                           

Epoch 3 Loss: 0.0071


                                                           

Epoch 4 Loss: 0.0068


                                                           

Epoch 5 Loss: 0.0066


                                                           

Epoch 6 Loss: 0.0064


                                                           

Epoch 7 Loss: 0.0063


                                                           

Epoch 8 Loss: 0.0062


                                                           

Epoch 9 Loss: 0.0061


                                                            

Epoch 10 Loss: 0.0060

>>> Training Tax-Aware Model for Cellular Component (C)...


Mapping Labels: 100%|██████████| 154977/154977 [00:06<00:00, 24623.12it/s]
                                                           

Epoch 1 Loss: 0.0169


                                                           

Epoch 2 Loss: 0.0054


                                                           

Epoch 3 Loss: 0.0049


                                                           

Epoch 4 Loss: 0.0047


                                                           

Epoch 5 Loss: 0.0045


                                                           

Epoch 6 Loss: 0.0043


                                                           

Epoch 7 Loss: 0.0042


                                                           

Epoch 8 Loss: 0.0041


                                                           

Epoch 9 Loss: 0.0040


                                                            

Epoch 10 Loss: 0.0040

✅ All Taxonomy-Aware Models Trained!




In [3]:
# --- GENERATE TAX-AWARE SUBMISSION ---
import gc

print("\n=== Generating Taxonomy-Aware Submission ===")

# (aspect code, saved state_dict path, terms in output-column order)
models_info = [
    ('F', f"model_tax_F.pth", top_terms_F),
    ('P', f"model_tax_P.pth", top_terms_P),
    ('C', f"model_tax_C.pth", top_terms_C)
]

TOP_K = 50        # at most this many terms are written per protein and aspect
MIN_SCORE = 0.01  # predictions at or below this probability are not written

# Dataset for Inference
# Note: We need both Embeddings AND Tax Indices. The inputs are the same for
# every aspect, so the dataset and loader are built once.
ds = MultiInputData(test_emb, test_tax_idx)
loader = DataLoader(ds, batch_size=1024, shuffle=False)

# Create final file
with open("submission_tax_aware.tsv", "w") as f:

    for aspect, path, terms in models_info:
        print(f"Predicting {aspect}...")
        num_classes = len(terms)

        # Load Model
        model = TaxModel(test_emb.shape[1], num_taxons, num_classes).to(device)
        # map_location lets the weights load even when the current device
        # differs from the one they were saved from.
        model.load_state_dict(torch.load(path, map_location=device))
        model.eval()

        preds = []
        with torch.no_grad():
            for x, t in tqdm(loader):
                x, t = x.to(device), t.to(device)
                logits = model(x, t)
                probs = torch.sigmoid(logits).cpu().numpy()
                preds.append(probs)

        all_probs = np.vstack(preds)

        # np.argpartition raises when asked for more elements than a row
        # holds, so the count is clamped to this aspect's number of classes.
        top_k = min(TOP_K, num_classes)

        print(f"Writing predictions for {aspect}...")
        for i, pid in enumerate(tqdm(test_ids)):
            row_probs = all_probs[i]
            # Indices of the top_k largest scores (not sorted among themselves)
            top_indices = np.argpartition(row_probs, -top_k)[-top_k:]

            for idx in top_indices:
                score = row_probs[idx]
                if score > MIN_SCORE: # Only write confident predictions
                    term = terms[idx]
                    f.write(f"{pid}\t{term}\t{score:.3f}\n")

        # Clean RAM
        del model, all_probs, preds
        torch.cuda.empty_cache()
        gc.collect()

print("\n✅ submission_tax_aware.tsv created!")



=== Generating Taxonomy-Aware Submission ===
Predicting F...


100%|██████████| 220/220 [00:03<00:00, 61.64it/s]


Writing predictions for F...


100%|██████████| 224309/224309 [00:25<00:00, 8710.35it/s]


Predicting P...


100%|██████████| 220/220 [00:02<00:00, 92.29it/s] 


Writing predictions for P...


100%|██████████| 224309/224309 [00:25<00:00, 8744.99it/s]


Predicting C...


100%|██████████| 220/220 [00:02<00:00, 92.81it/s] 


Writing predictions for C...


100%|██████████| 224309/224309 [00:29<00:00, 7734.04it/s]



✅ submission_tax_aware.tsv created!


In [5]:
!pip install obonet networkx


Collecting obonet
  Downloading obonet-1.1.1-py3-none-any.whl.metadata (6.7 kB)
Downloading obonet-1.1.1-py3-none-any.whl (9.2 kB)
Installing collected packages: obonet
Successfully installed obonet-1.1.1


In [6]:
# --- PROPAGATION FOR TAXONOMY MODEL ---
import networkx
import obonet
import pandas as pd
from tqdm import tqdm

# Relationship types that a score is propagated along (child -> parent).
# NOTE(review): only 'is_a' is followed; add "part_of" here if the
# evaluation's ancestor definition includes it — TODO confirm.
PROPAGATE_RELATIONS = ("is_a",)
MAX_TERMS_PER_PROTEIN = 70   # rows written per protein after propagation
MIN_WRITTEN_SCORE = 0.001    # scores at or below this value are not written

print("Loading Graph...")
graph = obonet.read_obo("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")

# Child -> parents map. obonet returns a MultiDiGraph whose edge key is the
# relationship type; graph.successors() would follow every relationship type,
# so the edges are filtered by key instead.
parent_map = {}
for child, parent, relation in graph.edges(keys=True):
    if relation in PROPAGATE_RELATIONS:
        parent_map.setdefault(child, []).append(parent)

_ancestor_cache = {}


def get_ancestors(term):
    """Return every term reachable from `term` through `parent_map`.

    Results are memoised in `_ancestor_cache` because the same predicted terms
    recur across many proteins. Terms without an entry in `parent_map` yield
    an empty set.
    """
    cached = _ancestor_cache.get(term)
    if cached is not None:
        return cached
    found = set()
    stack = list(parent_map.get(term, ()))
    while stack:
        node = stack.pop()
        if node in found:
            continue
        found.add(node)
        stack.extend(parent_map.get(node, ()))
    _ancestor_cache[term] = found
    return found


def propagate_scores(preds):
    """Raise every ancestor's score to at least the score of its descendants.

    Args:
        preds: ``{term: score}`` for one protein.

    Returns:
        A new dict containing the original terms plus all their ancestors,
        where each ancestor holds the maximum of its own score and the scores
        of the predicted terms below it. Because every predicted term updates
        its full ancestor set, the result does not depend on processing order.
    """
    final_scores = dict(preds)
    for term, score in preds.items():
        for ancestor in get_ancestors(term):
            if score > final_scores.get(ancestor, 0.0):
                final_scores[ancestor] = score
    return final_scores


print("Loading Tax-Aware Predictions...")
submission_dict = {}  # {ProteinID: {Term: Score}}

# USE THE FILE YOU JUST GENERATED
input_filename = "submission_tax_aware.tsv"

with open(input_filename, "r") as f:
    for line in tqdm(f):
        parts = line.strip().split("\t")
        if len(parts) == 3:  # lines without exactly three fields are ignored
            pid, term, score = parts
            submission_dict.setdefault(pid, {})[term] = float(score)

print("Propagating...")
with open("submission_tax_prop.tsv", "w") as f_out:
    for pid, preds in tqdm(submission_dict.items()):
        final_scores = propagate_scores(preds)

        # Write the highest-scoring terms (propagated parents included)
        sorted_terms = sorted(
            final_scores.items(), key=lambda x: x[1], reverse=True
        )[:MAX_TERMS_PER_PROTEIN]
        for term, score in sorted_terms:
            if score > MIN_WRITTEN_SCORE:
                f_out.write(f"{pid}\t{term}\t{score:.3f}\n")

print("✅ submission_tax_prop.tsv created!")


Loading Graph...
Loading Tax-Aware Predictions...


7571663it [00:08, 914039.00it/s]


Propagating...


100%|██████████| 224299/224299 [00:48<00:00, 4637.87it/s] 

✅ submission_tax_prop.tsv created!



