In [11]:
# --- MASTER TRAINING CELL (MFO - Molecular Function) ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd

# 1. LOAD & CLEAN INPUTS
print("1. Loading & Cleaning Inputs...")
train_emb = np.load("/kaggle/input/emb-models-ttt/train_embeds.npy").astype(np.float32)
train_ids = np.load("/kaggle/input/emb-models-ttt/train_ids.npy")
test_emb = np.load("/kaggle/input/emb-models-ttt/test_embeds.npy").astype(np.float32)
test_ids = np.load("/kaggle/input/emb-models-ttt/test_ids.npy")

# Standard Scale Inputs (Crucial for convergence)
mean = train_emb.mean(axis=0)
std = train_emb.std(axis=0) + 1e-6
train_emb = (train_emb - mean) / std
test_emb = (test_emb - mean) / std
print(f"   Inputs ready. Shape: {train_emb.shape}")

# 2. LOAD & BUILD LABELS
print("2. Building Labels (Target: Function 'F')...")
terms_df = pd.read_csv(CONFIG["paths"]["train_terms"], sep="\t", header=None, names=["id", "term", "aspect"])

# Filter for Molecular Function (F)
# Note: In this file, aspect is 'F', not 'MFO'
TARGET_ASPECT = 'F' 
aspect_terms = terms_df[terms_df['aspect'] == TARGET_ASPECT]

# Get Top 1500 Terms
top_terms = aspect_terms['term'].value_counts().index[:1500].tolist()
term_map = {t: i for i, t in enumerate(top_terms)}
num_classes = len(top_terms)

# Build Matrix
label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
id_map = {pid: i for i, pid in enumerate(train_ids)}
relevant_rows = aspect_terms[aspect_terms['term'].isin(top_terms)]

for _, row in tqdm(relevant_rows.iterrows(), total=len(relevant_rows), desc="Mapping Labels"):
    if row['id'] in id_map:
        label_matrix[id_map[row['id']], term_map[row['term']]] = 1.0

print(f"   Labels ready. Shape: {label_matrix.shape}")

# 3. DEFINE MODEL (Simple & Robust)
class CAFA_MLP(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, n_classes)
        )
    def forward(self, x):
        return self.net(x)

# 4. TRAIN
print("3. Starting Training...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset
class ProteinData(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

# Split 90/10
perm = np.random.permutation(len(train_ids))
split = int(len(train_ids) * 0.9)
train_ds = ProteinData(train_emb[perm[:split]], label_matrix[perm[:split]])
val_ds = ProteinData(train_emb[perm[split:]], label_matrix[perm[split:]])

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256)

# Init
model = CAFA_MLP(1280, num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# Loop
for epoch in range(15):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        pred = model(x)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

# Save
torch.save(model.state_dict(), "model_MFO.pth")
print("Training Complete. Model Saved!")


1. Loading & Cleaning Inputs...
   Inputs ready. Shape: (82404, 1280)
2. Building Labels (Target: Function 'F')...


Mapping Labels: 100%|██████████| 112061/112061 [00:04<00:00, 23774.87it/s]


   Labels ready. Shape: (82404, 1500)
3. Starting Training...


Epoch 1: 100%|██████████| 290/290 [00:01<00:00, 239.52it/s]


Epoch 1 Loss: 0.0186


Epoch 2: 100%|██████████| 290/290 [00:01<00:00, 256.42it/s]


Epoch 2 Loss: 0.0045


Epoch 3: 100%|██████████| 290/290 [00:01<00:00, 249.74it/s]


Epoch 3 Loss: 0.0041


Epoch 4: 100%|██████████| 290/290 [00:01<00:00, 226.50it/s]


Epoch 4 Loss: 0.0038


Epoch 5: 100%|██████████| 290/290 [00:01<00:00, 253.49it/s]


Epoch 5 Loss: 0.0035


Epoch 6: 100%|██████████| 290/290 [00:01<00:00, 257.86it/s]


Epoch 6 Loss: 0.0033


Epoch 7: 100%|██████████| 290/290 [00:01<00:00, 232.04it/s]


Epoch 7 Loss: 0.0032


Epoch 8: 100%|██████████| 290/290 [00:01<00:00, 260.98it/s]


Epoch 8 Loss: 0.0031


Epoch 9: 100%|██████████| 290/290 [00:01<00:00, 255.88it/s]


Epoch 9 Loss: 0.0030


Epoch 10: 100%|██████████| 290/290 [00:01<00:00, 249.70it/s]


Epoch 10 Loss: 0.0029


Epoch 11: 100%|██████████| 290/290 [00:01<00:00, 217.26it/s]


Epoch 11 Loss: 0.0028


Epoch 12: 100%|██████████| 290/290 [00:01<00:00, 248.38it/s]


Epoch 12 Loss: 0.0028


Epoch 13: 100%|██████████| 290/290 [00:01<00:00, 248.14it/s]


Epoch 13 Loss: 0.0027


Epoch 14: 100%|██████████| 290/290 [00:01<00:00, 250.09it/s]


Epoch 14 Loss: 0.0027


Epoch 15: 100%|██████████| 290/290 [00:01<00:00, 218.40it/s]

Epoch 15 Loss: 0.0026
Training Complete. Model Saved!





In [12]:
# --- GENERATE SUBMISSION ---
print("Generating Submission...")
model.eval()

preds = []
batch_size = 1024 # Fast inference
test_loader = DataLoader(torch.from_numpy(test_emb), batch_size=batch_size)

with torch.no_grad():
    for x in tqdm(test_loader, desc="Predicting"):
        x = x.to(device)
        # Forward pass
        logits = model(x)
        # Sigmoid to get probabilities (0 to 1)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds.append(probs)

# Combine all batches
all_probs = np.vstack(preds)
print(f"Predictions Shape: {all_probs.shape}") # (Test_Size, 1500)

# Write to TSV file
print("Writing submission.tsv...")
with open("submission.tsv", "w") as f:
    # Header is not strictly needed for CAFA but good practice
    # Format: ProteinID <tab> GO_Term <tab> Score
    
    for i, pid in enumerate(tqdm(test_ids)):
        # Get top 50 predictions per protein to save space
        # (Most proteins only have a few functions)
        row_probs = all_probs[i]
        # Get indices of top 50 scores
        top_indices = np.argpartition(row_probs, -50)[-50:]
        
        for idx in top_indices:
            score = row_probs[idx]
            # Only keep scores > 0.01 (filtering low confidence)
            if score > 0.01:
                term = top_terms[idx]
                # Format: <ProteinID> <GO_Term> <Score>
                f.write(f"{pid}\t{term}\t{score:.3f}\n")

print("✅ submission.tsv created!")


Generating Submission...


Predicting: 100%|██████████| 220/220 [00:02<00:00, 102.01it/s]


Predictions Shape: (224309, 1500)
Writing submission.tsv...


100%|██████████| 224309/224309 [00:29<00:00, 7553.02it/s]

✅ submission.tsv created!





In [13]:
# --- MASTER SCRIPT: TRAIN ALL 3 MODELS & SUBMIT ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import gc
import os

# CONFIGURATION
CONFIG = {
    "batch_size": 256,
    "lr": 1e-3,
    "epochs": 10,  # 10 Epochs per model is enough
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "paths": {
        "train_emb": "/kaggle/input/emb-models-ttt/train_embeds.npy",
        "train_ids": "/kaggle/input/emb-models-ttt/train_ids.npy",
        "test_emb": "/kaggle/input/emb-models-ttt/test_embeds.npy",
        "test_ids": "/kaggle/input/emb-models-ttt/test_ids.npy",
        "train_terms": "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv"
    }
}

# --- 1. LOAD & CLEAN INPUTS (Do this ONCE) ---
print("=== 1. Loading & Cleaning Data ===")
train_emb = np.load(CONFIG["paths"]["train_emb"]).astype(np.float32)
train_ids = np.load(CONFIG["paths"]["train_ids"])
test_emb = np.load(CONFIG["paths"]["test_emb"]).astype(np.float32)
test_ids = np.load(CONFIG["paths"]["test_ids"])

# Standard Scale
mean = train_emb.mean(axis=0)
std = train_emb.std(axis=0) + 1e-6
train_emb = (train_emb - mean) / std
test_emb = (test_emb - mean) / std

print(f"Data Ready: {train_emb.shape}")

# Load Terms File
print("Loading Terms File...")
terms_df = pd.read_csv(CONFIG["paths"]["train_terms"], sep="\t", header=None, names=["id", "term", "aspect"])

# --- MODEL DEFINITION ---
class CAFA_MLP(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, n_classes)
        )
    def forward(self, x): return self.net(x)

class ProteinData(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

# --- TRAINING FUNCTION ---
def train_aspect(aspect_char, aspect_name):
    print(f"\n=== TRAINING MODEL FOR: {aspect_name} ({aspect_char}) ===")
    
    # 1. Filter Terms
    aspect_terms = terms_df[terms_df['aspect'] == aspect_char]
    # Take Top 1500 terms
    top_terms = aspect_terms['term'].value_counts().index[:1500].tolist()
    term_map = {t: i for i, t in enumerate(top_terms)}
    num_classes = len(top_terms)
    print(f"Selected {num_classes} most common terms for {aspect_name}")
    
    # 2. Build Labels
    print("Building Label Matrix...")
    label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
    id_map = {pid: i for i, pid in enumerate(train_ids)}
    relevant = aspect_terms[aspect_terms['term'].isin(top_terms)]
    
    for _, row in tqdm(relevant.iterrows(), total=len(relevant), desc="Mapping"):
        if row['id'] in id_map:
            label_matrix[id_map[row['id']], term_map[row['term']]] = 1.0
            
    # 3. Dataloaders
    perm = np.random.permutation(len(train_ids))
    split = int(len(train_ids) * 0.9)
    train_ds = ProteinData(train_emb[perm[:split]], label_matrix[perm[:split]])
    train_loader = DataLoader(train_ds, batch_size=CONFIG["batch_size"], shuffle=True)
    
    # 4. Train
    model = CAFA_MLP(1280, num_classes).to(CONFIG["device"])
    opt = torch.optim.Adam(model.parameters(), lr=CONFIG["lr"])
    loss_fn = nn.BCEWithLogitsLoss()
    
    for epoch in range(CONFIG["epochs"]):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            x, y = x.to(CONFIG["device"]), y.to(CONFIG["device"])
            opt.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")
        
    # Save Model
    save_name = f"model_{aspect_char}.pth"
    torch.save(model.state_dict(), save_name)
    print(f"Saved {save_name}")
    
    return top_terms, save_name

# --- RUN TRAINING FOR ALL 3 ASPECTS ---
# Train MFO (Function)
top_terms_F, model_path_F = train_aspect('F', 'Molecular Function')
# Train BPO (Process)
top_terms_P, model_path_P = train_aspect('P', 'Biological Process')
# Train CCO (Component)
top_terms_C, model_path_C = train_aspect('C', 'Cellular Component')

# --- GENERATE SUBMISSION ---
print("\n=== GENERATING FINAL SUBMISSION ===")

models_info = [
    ('F', model_path_F, top_terms_F),
    ('P', model_path_P, top_terms_P),
    ('C', model_path_C, top_terms_C)
]

with open("submission_combined.tsv", "w") as f:
    # Loop through each trained model
    for aspect, path, terms in models_info:
        print(f"Predicting {aspect}...")
        
        # Load Model
        num_classes = len(terms)
        model = CAFA_MLP(1280, num_classes).to(CONFIG["device"])
        model.load_state_dict(torch.load(path))
        model.eval()
        
        # Predict
        test_loader = DataLoader(torch.from_numpy(test_emb), batch_size=1024)
        preds = []
        with torch.no_grad():
            for x in tqdm(test_loader):
                x = x.to(CONFIG["device"])
                probs = torch.sigmoid(model(x)).cpu().numpy()
                preds.append(probs)
        
        all_probs = np.vstack(preds)
        
        # Write High Confidence Predictions
        print(f"Writing predictions for {aspect}...")
        for i, pid in enumerate(tqdm(test_ids)):
            row_probs = all_probs[i]
            top_indices = np.argpartition(row_probs, -50)[-50:]
            
            for idx in top_indices:
                score = row_probs[idx]
                if score > 0.01:
                    f.write(f"{pid}\t{terms[idx]}\t{score:.3f}\n")
        
        # Clean RAM
        del model, all_probs, preds
        torch.cuda.empty_cache()
        gc.collect()

print("\n✅ DONE! Download 'submission_combined.tsv' and submit!")


=== 1. Loading & Cleaning Data ===
Data Ready: (82404, 1280)
Loading Terms File...

=== TRAINING MODEL FOR: Molecular Function (F) ===
Selected 1500 most common terms for Molecular Function
Building Label Matrix...


Mapping: 100%|██████████| 112061/112061 [00:04<00:00, 23801.83it/s]
                                                           

Epoch 1 Loss: 0.0186


                                                           

Epoch 2 Loss: 0.0045


                                                           

Epoch 3 Loss: 0.0041


                                                           

Epoch 4 Loss: 0.0038


                                                           

Epoch 5 Loss: 0.0035


                                                           

Epoch 6 Loss: 0.0034


                                                           

Epoch 7 Loss: 0.0032


                                                           

Epoch 8 Loss: 0.0031


                                                           

Epoch 9 Loss: 0.0030


                                                            

Epoch 10 Loss: 0.0029
Saved model_F.pth

=== TRAINING MODEL FOR: Biological Process (P) ===
Selected 1500 most common terms for Biological Process
Building Label Matrix...


Mapping: 100%|██████████| 143554/143554 [00:05<00:00, 24191.62it/s]
                                                           

Epoch 1 Loss: 0.0226


                                                           

Epoch 2 Loss: 0.0078


                                                           

Epoch 3 Loss: 0.0073


                                                           

Epoch 4 Loss: 0.0070


                                                           

Epoch 5 Loss: 0.0068


                                                           

Epoch 6 Loss: 0.0066


                                                           

Epoch 7 Loss: 0.0064


                                                           

Epoch 8 Loss: 0.0063


                                                           

Epoch 9 Loss: 0.0062


                                                            

Epoch 10 Loss: 0.0060
Saved model_P.pth

=== TRAINING MODEL FOR: Cellular Component (C) ===
Selected 1500 most common terms for Cellular Component
Building Label Matrix...


Mapping: 100%|██████████| 154977/154977 [00:06<00:00, 24530.34it/s]
                                                           

Epoch 1 Loss: 0.0201


                                                           

Epoch 2 Loss: 0.0057


                                                           

Epoch 3 Loss: 0.0053


                                                           

Epoch 4 Loss: 0.0051


                                                           

Epoch 5 Loss: 0.0049


                                                           

Epoch 6 Loss: 0.0047


                                                           

Epoch 7 Loss: 0.0046


                                                           

Epoch 8 Loss: 0.0045


                                                           

Epoch 9 Loss: 0.0044


                                                            

Epoch 10 Loss: 0.0043
Saved model_C.pth

=== GENERATING FINAL SUBMISSION ===
Predicting F...


100%|██████████| 220/220 [00:01<00:00, 115.15it/s]


Writing predictions for F...


100%|██████████| 224309/224309 [00:27<00:00, 8235.51it/s]


Predicting P...


100%|██████████| 220/220 [00:01<00:00, 157.53it/s]


Writing predictions for P...


100%|██████████| 224309/224309 [00:30<00:00, 7254.55it/s]


Predicting C...


100%|██████████| 220/220 [00:01<00:00, 152.25it/s]


Writing predictions for C...


100%|██████████| 224309/224309 [00:30<00:00, 7344.37it/s]



✅ DONE! Download 'submission_combined.tsv' and submit!


In [14]:
!pip install obonet networkx


Collecting obonet
  Downloading obonet-1.1.1-py3-none-any.whl.metadata (6.7 kB)
Downloading obonet-1.1.1-py3-none-any.whl (9.2 kB)
Installing collected packages: obonet
Successfully installed obonet-1.1.1


In [15]:
# --- ADVANCED POST-PROCESSING: GRAPH PROPAGATION ---
import networkx
import obonet
import pandas as pd
from tqdm import tqdm

print("Loading Gene Ontology Graph (go-basic.obo)...")
# Load the graph
graph = obonet.read_obo("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")

# Create a mapping of Child -> Parents
# We only care about 'is_a' relationships for propagation
print("Building Parent Map...")
parent_map = {}
for node in graph.nodes():
    parents = [p for p in graph.successors(node) if p in graph.nodes()]
    if parents:
        parent_map[node] = parents

print(f"Graph loaded. {len(parent_map)} terms have parents.")

# Load your current submission
print("Loading your submission...")
# We read it into a Dictionary for fast lookup: {ProteinID: {Term: Score}}
submission_dict = {}

# Read line by line to save RAM
with open("submission_combined.tsv", "r") as f:
    for line in tqdm(f):
        parts = line.strip().split("\t")
        if len(parts) == 3:
            pid, term, score = parts
            score = float(score)
            
            if pid not in submission_dict:
                submission_dict[pid] = {}
            submission_dict[pid][term] = score

print(f"Loaded predictions for {len(submission_dict)} proteins.")

# PROPAGATION LOGIC
print("Propagating Scores (This is the Magic Step)...")
# Open output file
with open("submission_propagated.tsv", "w") as f_out:
    
    for pid, preds in tqdm(submission_dict.items()):
        # preds is {Term: Score}
        # We need to propagate scores UP the tree
        # If Child has score 0.9, Parent gets max(Parent_Score, 0.9)
        
        # We iterate multiple times to propagate up multiple levels
        # (A -> B -> C). A naive single pass might miss C if processed in wrong order.
        # A "Set" based approach is faster:
        
        # 1. Get all terms currently predicted
        active_terms = set(preds.keys())
        
        # 2. Add all ancestors of these terms
        final_scores = preds.copy() # Start with current scores
        
        queue = list(active_terms)
        visited = set(queue)
        
        while queue:
            term = queue.pop(0)
            current_score = final_scores.get(term, 0.0)
            
            # Get parents
            if term in parent_map:
                for parent in parent_map[term]:
                    # Parent score is at least Child score
                    old_parent_score = final_scores.get(parent, 0.0)
                    new_parent_score = max(old_parent_score, current_score)
                    
                    if new_parent_score > old_parent_score:
                        final_scores[parent] = new_parent_score
                        if parent not in visited:
                            queue.append(parent)
                            visited.add(parent)
        
        # 3. Write to file (Top 70 to include propagated parents)
        # Sort by score
        sorted_terms = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:70]
        
        for term, score in sorted_terms:
            # Only confident ones
            if score > 0.001: 
                f_out.write(f"{pid}\t{term}\t{score:.3f}\n")

print("✅ submission_propagated.tsv created! This is your 'Senior Scientist' submission.")


Loading Gene Ontology Graph (go-basic.obo)...
Building Parent Map...
Graph loaded. 40119 terms have parents.
Loading your submission...


14941910it [00:16, 904825.00it/s]


Loaded predictions for 224309 proteins.
Propagating Scores (This is the Magic Step)...


100%|██████████| 224309/224309 [01:33<00:00, 2395.81it/s]

✅ submission_propagated.tsv created! This is your 'Senior Scientist' submission.



