In [11]:
# --- MASTER TRAINING CELL (MFO - Molecular Function) ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd

# 1. LOAD & CLEAN INPUTS
print("1. Loading & Cleaning Inputs...")
train_emb = np.load("/kaggle/input/emb-models-ttt/train_embeds.npy").astype(np.float32)
train_ids = np.load("/kaggle/input/emb-models-ttt/train_ids.npy")
test_emb = np.load("/kaggle/input/emb-models-ttt/test_embeds.npy").astype(np.float32)
test_ids = np.load("/kaggle/input/emb-models-ttt/test_ids.npy")

# Standard Scale Inputs (Crucial for convergence)
mean = train_emb.mean(axis=0)
std = train_emb.std(axis=0) + 1e-6
train_emb = (train_emb - mean) / std
test_emb = (test_emb - mean) / std
print(f"   Inputs ready. Shape: {train_emb.shape}")

# 2. LOAD & BUILD LABELS
print("2. Building Labels (Target: Function 'F')...")
terms_df = pd.read_csv(CONFIG["paths"]["train_terms"], sep="\t", header=None, names=["id", "term", "aspect"])

# Filter for Molecular Function (F)
# Note: In this file, aspect is 'F', not 'MFO'
TARGET_ASPECT = 'F' 
aspect_terms = terms_df[terms_df['aspect'] == TARGET_ASPECT]

# Get Top 1500 Terms
top_terms = aspect_terms['term'].value_counts().index[:1500].tolist()
term_map = {t: i for i, t in enumerate(top_terms)}
num_classes = len(top_terms)

# Build Matrix
label_matrix = np.zeros((len(train_ids), num_classes), dtype=np.float32)
id_map = {pid: i for i, pid in enumerate(train_ids)}
relevant_rows = aspect_terms[aspect_terms['term'].isin(top_terms)]

for _, row in tqdm(relevant_rows.iterrows(), total=len(relevant_rows), desc="Mapping Labels"):
    if row['id'] in id_map:
        label_matrix[id_map[row['id']], term_map[row['term']]] = 1.0

print(f"   Labels ready. Shape: {label_matrix.shape}")

# 3. DEFINE MODEL (Simple & Robust)
class CAFA_MLP(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, n_classes)
        )
    def forward(self, x):
        return self.net(x)

# 4. TRAIN
print("3. Starting Training...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset
class ProteinData(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

# Split 90/10
perm = np.random.permutation(len(train_ids))
split = int(len(train_ids) * 0.9)
train_ds = ProteinData(train_emb[perm[:split]], label_matrix[perm[:split]])
val_ds = ProteinData(train_emb[perm[split:]], label_matrix[perm[split:]])

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256)

# Init
model = CAFA_MLP(1280, num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# Loop
for epoch in range(15):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        pred = model(x)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

# Save
torch.save(model.state_dict(), "model_MFO.pth")
print("Training Complete. Model Saved!")


1. Loading & Cleaning Inputs...
   Inputs ready. Shape: (82404, 1280)
2. Building Labels (Target: Function 'F')...


Mapping Labels: 100%|██████████| 112061/112061 [00:04<00:00, 23774.87it/s]


   Labels ready. Shape: (82404, 1500)
3. Starting Training...


Epoch 1: 100%|██████████| 290/290 [00:01<00:00, 239.52it/s]


Epoch 1 Loss: 0.0186


Epoch 2: 100%|██████████| 290/290 [00:01<00:00, 256.42it/s]


Epoch 2 Loss: 0.0045


Epoch 3: 100%|██████████| 290/290 [00:01<00:00, 249.74it/s]


Epoch 3 Loss: 0.0041


Epoch 4: 100%|██████████| 290/290 [00:01<00:00, 226.50it/s]


Epoch 4 Loss: 0.0038


Epoch 5: 100%|██████████| 290/290 [00:01<00:00, 253.49it/s]


Epoch 5 Loss: 0.0035


Epoch 6: 100%|██████████| 290/290 [00:01<00:00, 257.86it/s]


Epoch 6 Loss: 0.0033


Epoch 7: 100%|██████████| 290/290 [00:01<00:00, 232.04it/s]


Epoch 7 Loss: 0.0032


Epoch 8: 100%|██████████| 290/290 [00:01<00:00, 260.98it/s]


Epoch 8 Loss: 0.0031


Epoch 9: 100%|██████████| 290/290 [00:01<00:00, 255.88it/s]


Epoch 9 Loss: 0.0030


Epoch 10: 100%|██████████| 290/290 [00:01<00:00, 249.70it/s]


Epoch 10 Loss: 0.0029


Epoch 11: 100%|██████████| 290/290 [00:01<00:00, 217.26it/s]


Epoch 11 Loss: 0.0028


Epoch 12: 100%|██████████| 290/290 [00:01<00:00, 248.38it/s]


Epoch 12 Loss: 0.0028


Epoch 13: 100%|██████████| 290/290 [00:01<00:00, 248.14it/s]


Epoch 13 Loss: 0.0027


Epoch 14: 100%|██████████| 290/290 [00:01<00:00, 250.09it/s]


Epoch 14 Loss: 0.0027


Epoch 15: 100%|██████████| 290/290 [00:01<00:00, 218.40it/s]

Epoch 15 Loss: 0.0026
Training Complete. Model Saved!





In [12]:
# --- GENERATE SUBMISSION ---
print("Generating Submission...")
model.eval()

preds = []
batch_size = 1024 # Fast inference
test_loader = DataLoader(torch.from_numpy(test_emb), batch_size=batch_size)

with torch.no_grad():
    for x in tqdm(test_loader, desc="Predicting"):
        x = x.to(device)
        # Forward pass
        logits = model(x)
        # Sigmoid to get probabilities (0 to 1)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds.append(probs)

# Combine all batches
all_probs = np.vstack(preds)
print(f"Predictions Shape: {all_probs.shape}") # (Test_Size, 1500)

# Write to TSV file
print("Writing submission.tsv...")
with open("submission.tsv", "w") as f:
    # Header is not strictly needed for CAFA but good practice
    # Format: ProteinID <tab> GO_Term <tab> Score
    
    for i, pid in enumerate(tqdm(test_ids)):
        # Get top 50 predictions per protein to save space
        # (Most proteins only have a few functions)
        row_probs = all_probs[i]
        # Get indices of top 50 scores
        top_indices = np.argpartition(row_probs, -50)[-50:]
        
        for idx in top_indices:
            score = row_probs[idx]
            # Only keep scores > 0.01 (filtering low confidence)
            if score > 0.01:
                term = top_terms[idx]
                # Format: <ProteinID> <GO_Term> <Score>
                f.write(f"{pid}\t{term}\t{score:.3f}\n")

print("✅ submission.tsv created!")


Generating Submission...


Predicting: 100%|██████████| 220/220 [00:02<00:00, 102.01it/s]


Predictions Shape: (224309, 1500)
Writing submission.tsv...


100%|██████████| 224309/224309 [00:29<00:00, 7553.02it/s]

✅ submission.tsv created!



