In [None]:
# ==========================================================
# 1. SETUP: Load Features and Teacher Model
# ==========================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Preprocessed Data
# map_location remaps tensors that were serialized from a CUDA process, so the
# file can still be opened when this notebook runs on a CPU-only machine.
data = torch.load("preprocessed_features.pth", map_location=device)
X_train = data["X_train"].to(device)
Y_silver = data["y_all"].to(device) # Original silver labels
E_label_768 = data["E_label_768"].to(device)
ancestors = data["ancestor_list"]

# Load Teacher (Stage 1 Model)
# NOTE(review): ProposedClassifier is not defined in the visible cells; it must
# already exist in the kernel (earlier cell or import) for this cell to run.
teacher = ProposedClassifier(768, 531, 256, E_label_768, ancestors).to(device)
teacher.load_state_dict(torch.load("stage1_teacher_model.pth", map_location=device))
# The teacher is only used for inference from here on.
teacher.eval()

In [None]:
# ==========================================================
# 2. GENERATE SOFT TARGETS (Distillation)
# ==========================================================
# Stage 2 distillation hyper-parameters
TEMP = 2.5    # Divides the teacher logits before the sigmoid (softer probabilities)
LAMBDA = 0.7  # Mixing weight given to the teacher; silver labels receive 1 - LAMBDA

@torch.no_grad()
def generate_q_soft(model, X, Y_hard, temp=2.5, lam=0.7, batch_size=None):
    """
    Build soft training targets by blending teacher predictions with hard labels.

    Computes ``(1 - lam) * Y_hard + lam * sigmoid(model(X) / temp)`` with
    gradient tracking disabled.

    Args:
        model: Callable mapping a batch of inputs to logits.
        X: Input tensor; rows are sliced along dimension 0 when batching.
        Y_hard: Label tensor that broadcasts against the model output.
        temp: Positive temperature that divides the logits before the sigmoid.
        lam: Weight in [0, 1] given to the teacher probabilities.
        batch_size: Optional number of rows per forward pass. ``None`` (the
            default) runs a single forward pass over all of ``X``.

    Returns:
        Tensor of blended targets.

    Raises:
        ValueError: If ``temp`` is not positive, ``lam`` is outside [0, 1],
            or ``batch_size`` is given but not positive.
    """
    # A zero/negative temperature yields inf/NaN or inverted probabilities.
    if temp <= 0:
        raise ValueError(f"temp must be positive, got {temp}")
    # Outside [0, 1] the blend is no longer a convex combination, so targets
    # could leave the [0, 1] range expected by a BCE loss.
    if not 0.0 <= lam <= 1.0:
        raise ValueError(f"lam must be in [0, 1], got {lam}")
    if batch_size is not None and batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if batch_size is None or len(X) <= batch_size:
        logits = model(X)
    else:
        # Chunked forward passes bound peak memory on large inputs.
        # NOTE(review): matches the single-pass result only if the model
        # treats rows independently (e.g. it is in eval mode) - confirm.
        logits = torch.cat(
            [model(X[start:start + batch_size]) for start in range(0, len(X), batch_size)],
            dim=0,
        )

    # Predict probabilities with temperature scaling
    P_teacher = torch.sigmoid(logits / temp)

    # Mix teacher predictions with the original hard labels
    return (1.0 - lam) * Y_hard + lam * P_teacher

# Blend the teacher's temperature-scaled predictions with the silver labels.
Q_soft = generate_q_soft(
    teacher,
    X_train,
    Y_silver,
    temp=TEMP,
    lam=LAMBDA,
)


In [None]:
# ==========================================================
# 3. CONFIDENCE-AWARE WEIGHTING LOGIC
# ==========================================================
def get_confidence_weight(q, gamma=2.5):
    """
    Convert soft targets into per-element loss weights.

    Evaluates ``(2 * |q - 0.5|) ** gamma``: the weight is 0 when q equals 0.5
    and 1 when q equals 0 or 1. Larger ``gamma`` shrinks the weight of values
    in between.
    """
    distance_from_middle = torch.abs(q - 0.5)
    confidence = distance_from_middle * 2.0
    return torch.pow(confidence, gamma)

In [None]:
# ==========================================================
# 4. STAGE 2 TRAINING LOOP
# ==========================================================
# Initialize a fresh STUDENT model
# NOTE(review): ProposedClassifier is not defined in the visible cells; it must
# already exist in the kernel for this cell to run.
student = ProposedClassifier(768, 531, 256, E_label_768, ancestors).to(device)
optimizer = torch.optim.AdamW(student.parameters(), lr=3e-4) # Lower LR for refinement

TRAIN_BATCH_SIZE = 256  # rows per optimisation step
# Assumes X_train and Q_soft share the same first dimension (one row per sample).
n_train = X_train.shape[0]

for epoch in range(1, 11):
    student.train()
    # Draw a new random visiting order every epoch.
    perm = torch.randperm(n_train, device=X_train.device)
    epoch_loss = 0.0

    for start in range(0, n_train, TRAIN_BATCH_SIZE):
        batch_idx = perm[start : start + TRAIN_BATCH_SIZE]
        batch_X = X_train[batch_idx]
        batch_Q = Q_soft[batch_idx]

        # Clear gradients from the previous step; without this they accumulate
        # across steps and every update uses stale gradient sums.
        optimizer.zero_grad()
        logits_student = student(batch_X)

        # Apply Confidence-Aware Loss
        w = get_confidence_weight(batch_Q)
        # BCE computed from logits is the same loss as sigmoid + BCE but avoids
        # log(0) when the sigmoid saturates.
        loss_elem = F.binary_cross_entropy_with_logits(logits_student, batch_Q, reduction='none')
        loss = (loss_elem * w).mean()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(student.parameters(), 1.0)
        optimizer.step()

        # Weight by batch size so the epoch figure is a per-sample average.
        epoch_loss += loss.item() * batch_idx.numel()

    print(f"Epoch {epoch:02d} | weighted BCE: {epoch_loss / max(n_train, 1):.4f}")


In [None]:
# ==========================================================
# FINAL KAGGLE SUBMISSION: Stage 2 Student Inference
# ==========================================================
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# 1. Switch the student to evaluation mode before predicting
student.eval()
test_probs = []

# 2. Batch Inference on Test Set
# The forward pass is done in fixed-size slices to limit GPU memory use
BATCH_SIZE = 256
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    n_test = len(X_test_tensor)

    for start in tqdm(range(0, n_test, BATCH_SIZE), desc="Final Inference"):
        stop = start + BATCH_SIZE
        # Student logits for this slice, squashed to per-label probabilities
        batch_scores = student(X_test_tensor[start:stop])
        test_probs.append(torch.sigmoid(batch_scores).cpu().numpy())

# Stack the per-batch arrays row-wise into a single probability matrix
all_test_probs = np.vstack(test_probs)

# 3. Adaptive Label Selection (2-3 Label Constraint)
def get_final_labels(prob_vector, thr=0.35, min_l=2, max_l=3):
    """
    Pick label indices for one sample and return them as a comma-separated string.

    Labels whose probability is at least ``thr`` are kept, looking only at the
    50 highest-probability labels. If fewer than ``min_l`` pass, the top
    ``min_l`` labels are used instead. At most ``max_l`` labels are returned,
    written in ascending index order.
    """
    # Label indices ordered from highest to lowest probability
    ranked = np.argsort(prob_vector)[::-1]

    # Collect the labels that clear the threshold, best first
    picked = []
    for label in ranked[:50]:
        if prob_vector[label] >= thr:
            picked.append(label)

    # Too few confident labels: fall back to the top ``min_l``
    if len(picked) < min_l:
        picked = ranked[:min_l].tolist()

    # Keep only the best ``max_l`` and emit them in ascending index order
    kept = picked[:max_l]
    kept.sort()

    return ",".join(str(label) for label in kept)

# 4. Turn every probability row into its comma-separated label string
print("Applying adaptive thresholding...")
final_pred_strs = []
for row_probs in all_test_probs:
    final_pred_strs.append(get_final_labels(row_probs, thr=0.35))

# 5. Assemble the submission table
# 'test_ids' must be the list of IDs produced by the test-corpus loading step
submission = pd.DataFrame({"id": test_ids, "label": final_pred_strs})

# 6. Write the CSV without the DataFrame index column
OUTPUT_FILE = "2023320344_Final.csv"
submission.to_csv(OUTPUT_FILE, index=False)

print(f"SUCCESS: Submission saved to {OUTPUT_FILE}")
print(submission.head())