In [95]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [96]:
from sklearn.metrics import f1_score
import numpy as np

# Also ensure these are present for the training loop
import torch.nn.functional as F

In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# ==========================================================
# 1. SETUP & MODEL DEFINITION
# ==========================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
NUM_CLASSES = 531


In [98]:
# ==========================================================
# 2. ARCHITECTURE: Multi-Head GATv2 + Path Attention
# ==========================================================
class MultiHeadPathLabelAttn(nn.Module):
    def __init__(self, dim, num_heads=8, dropout=0.2): # Increased to 8 heads for Strategy B
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        
        self.w_q = nn.Linear(dim, dim, bias=False)
        self.w_k = nn.Linear(dim, dim, bias=False)
        self.v_attn = nn.Linear(self.head_dim, 1, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        
        self.ln = nn.LayerNorm(dim)
        self.dropout = nn.Dropout(dropout)
        # Learnable gate to balance self-information vs. hierarchy information
        self.gate = nn.Parameter(torch.ones(1) * 0.5)

    def forward(self, E, ancestors):
        L, d = E.shape
        Q = self.w_q(E).view(L, self.num_heads, self.head_dim)
        K = self.w_k(E).view(L, self.num_heads, self.head_dim)
        V = self.v_proj(E).view(L, self.num_heads, self.head_dim)
        
        out = torch.zeros_like(E)
        for c in range(L):
            anc = ancestors[c]
            if not anc:
                out[c] = E[c]
                continue
            
            # GATv2 Dynamic Attention
            q_c = Q[c:c+1] 
            k_a = K[anc]
            
            # Compute scores: LeakyReLU(Q + K) is more expressive than standard GAT
            scores = self.v_attn(F.leaky_relu(q_c + k_a, 0.2)).squeeze(-1)
            attn = F.softmax(scores, dim=0) 
            
            # Aggregate hierarchy message
            msg = (attn.unsqueeze(-1) * V[anc]).sum(dim=0).view(d)
            
            # Apply Gating
            out[c] = (self.gate * E[c]) + ((1.0 - self.gate) * msg)
            
        # Strategy B: Added Residual Connection (out + E) before LayerNorm
        # This ensures specific label meanings aren't "washed out" by broad parent info
        return self.ln(self.dropout(out) + E)

class ProposedClassifier(nn.Module):
    def __init__(self, input_dim, num_labels, emb_dim, E_label_768, ancestors):
        super().__init__()
        self.doc_proj = nn.Sequential(
            nn.Linear(input_dim, emb_dim), 
            nn.ReLU(), 
            nn.Dropout(0.3)
        )
        self.label_proj = nn.Linear(768, emb_dim, bias=False)
        self.register_buffer("E_text", E_label_768.float())
        
        # Consistent num_heads with the Attention class
        self.label_attn = MultiHeadPathLabelAttn(emb_dim, num_heads=8)
        self.ancestors = ancestors

    def forward(self, X):
        # 1. Project product BERT features
        h = self.doc_proj(X)
        
        # 2. Refine label embeddings using hierarchy attention
        # Label proj maps BERT label space (768) to joint emb_dim (256)
        E = self.label_attn(self.label_proj(self.E_text), self.ancestors)
        
        # 3. Compute compatibility (Dot Product)
        return h @ E.t()

In [99]:
# ==========================================================
# 2. LOAD DATA & TEACHER MODEL (.pth)
# ==========================================================
data = torch.load("preprocessed_features.pth", weights_only=False)
X_train, Y_silver = data["X_train"], data["y_all"]
E_label_768, ancestors = data["E_label_768"], data["ancestors"]
X_test, test_ids = data["X_test"], data["test_ids"]

if isinstance(E_label_768, np.ndarray): E_label_768 = torch.from_numpy(E_label_768)

# Load Teacher
teacher = ProposedClassifier(768, NUM_CLASSES, 256, E_label_768, ancestors).to(device)
teacher.load_state_dict(torch.load("stage1_teacher_model1.pth"))
teacher.eval()

ProposedClassifier(
  (doc_proj): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (label_proj): Linear(in_features=768, out_features=256, bias=False)
  (label_attn): MultiHeadPathLabelAttn(
    (w_q): Linear(in_features=256, out_features=256, bias=False)
    (w_k): Linear(in_features=256, out_features=256, bias=False)
    (v_attn): Linear(in_features=32, out_features=1, bias=False)
    (v_proj): Linear(in_features=256, out_features=256, bias=False)
    (ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [100]:
# ==========================================================
# 2. LOAD DATA & TEACHER MODEL (.pth)
# ==========================================================
import os

# Load with weights_only=False to allow list/numpy recovery
data = torch.load("preprocessed_features.pth", weights_only=False)

X_train = data["X_train"]
Y_silver = data["y_all"]
E_label_768 = data["E_label_768"]
ancestors = data["ancestors"]
X_test = data["X_test"]

# --- SMART ID RECOVERY ---
try:
    test_ids = data["test_ids"]
    print(" test_ids loaded from .pth file.")
except KeyError:
    print("test_ids not found in .pth. Recovering from raw text file...")
    test_ids = []
    # Adjust this path to where your test_corpus.txt is located
    with open("./Amazon_products/test/test_corpus.txt", "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                test_ids.append(parts[0])
    print(f" Recovered {len(test_ids)} IDs from corpus.")

if isinstance(E_label_768, np.ndarray): 
    E_label_768 = torch.from_numpy(E_label_768)

# --- LOAD TEACHER ---
teacher = ProposedClassifier(768, NUM_CLASSES, 256, E_label_768, ancestors).to(device)
teacher.load_state_dict(torch.load("stage1_teacher_model1.pth"))
teacher.eval()
print("Teacher model loaded and set to eval mode.")


 test_ids loaded from .pth file.
Teacher model loaded and set to eval mode.


In [101]:
# ==========================================================
# GENERATE SOFT TARGETS (Run this before the training loop)
# ==========================================================
LAMBDA, TEMP = 0.7, 2.5 

@torch.no_grad()
def generate_q_soft(model, X, Y_hard, temp=2.5, lam=0.7):
    model.eval()
    Q_list = []
    # Process in batches to prevent GPU Out-of-Memory (OOM)
    for i in tqdm(range(0, len(X), 256), desc="Distilling Knowledge"):
        bx = X[i:i+256].to(device)
        # Apply temperature scaling to soften the distribution
        P_teacher = torch.sigmoid(model(bx) / temp)
        by = Y_hard[i:i+256].to(device)
        # Mix teacher opinion with silver labels
        Q_list.append(((1.0 - lam) * by + lam * P_teacher).cpu())
    return torch.cat(Q_list, dim=0)

print("Generating Q_soft targets using Teacher model...")
Q_soft = generate_q_soft(teacher, X_train, Y_silver, temp=TEMP, lam=LAMBDA)

Generating Q_soft targets using Teacher model...


Distilling Knowledge: 100%|██████████| 116/116 [00:15<00:00,  7.40it/s]


In [102]:
from sklearn.model_selection import train_test_split

# 1. Recreate the indices split (MUST use same random_state=42 as Stage 1)
all_indices = np.arange(len(X_train))
train_idx, val_idx = train_test_split(all_indices, test_size=0.15, random_state=42)

# 2. Define the Dataset Class (if not already in memory)
class MultiLabelDataset(Dataset):
    def __init__(self, X, y):
        
        self.X = X
        self.y = y
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return {"X": self.X[i], "y": self.y[i]}

# 3. Create the val_loader
# Note: We use the original Silver Y_all here to check against "Ground Truth"
val_ds = MultiLabelDataset(X_train[val_idx], Y_silver[val_idx])
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

# 4. Define train_indices for your loop
train_indices = train_idx 

print(f" val_loader defined. Ready to evaluate on {len(val_idx)} samples.")

 val_loader defined. Ready to evaluate on 4424 samples.


In [103]:
def evaluate_f1(model, loader, thr=0.35):
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in loader:
            Xb, yb = batch["X"].to(device), batch["y"].to(device)
            probs = torch.sigmoid(model(Xb)).cpu().numpy()
            for p in probs:
                idx = np.argsort(p)[::-1]
                chosen = [i for i in idx[:50] if p[i] >= thr]
                if len(chosen) < 2: chosen = idx[:2].tolist()
                vec = np.zeros(NUM_CLASSES)
                vec[chosen[:3]] = 1
                all_preds.append(vec)
            all_targets.append(yb.cpu().numpy())
    return f1_score(np.vstack(all_targets), np.vstack(all_preds), average='samples')

In [None]:
# ==========================================================
# 4. CONFIDENCE-AWARE STUDENT TRAINING (EARLY STOPPING)
# ==========================================================
best_student_f1 = 0.0
patience = 7
epochs_no_improve = 0
n_epochs = 20

for epoch in range(1, n_epochs + 1):
    student.train()
    total_train_loss = 0
    
    pbar = tqdm(range(0, len(train_indices), 64), desc=f"Student Epoch {epoch}")
    for i in pbar:
        idx = train_indices[i:i+64]
        bx, bq = X_train[idx].to(device), Q_soft[idx].to(device)
        
        preds = torch.sigmoid(student(bx))
        w = get_conf_weight(bq) 
        
        loss = (F.binary_cross_entropy(preds, bq, reduction='none') * w).mean()
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(student.parameters(), 1.0)
        optimizer.step()
        
        total_train_loss += loss.item()
        pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    # --- EVALUATION STEP ---
    val_f1 = evaluate_f1(student, val_loader) 
    avg_train_loss = total_train_loss / (max(1, len(train_indices) // 64))
    
    print(f"\nEpoch {epoch} Results:")
    print(f"   Train Loss: {avg_train_loss:.4f} | Val Sample-F1: {val_f1:.4f}")

    # Early Stopping & Model Saving Logic
    if val_f1 > best_student_f1:
        best_student_f1 = val_f1
        epochs_no_improve = 0  # Reset counter
        torch.save(student.state_dict(), "final_student_model.pth")
        print(f"   ⭐ New Best Student! Saved with F1: {val_f1:.4f}")
    else:
        epochs_no_improve += 1
        print(f"   No improvement for {epochs_no_improve} epochs.")

    print("-" * 30)

    # Trigger Early Stopping
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered at epoch {epoch}! Best Val F1: {best_student_f1:.4f}")
        break

Student Epoch 1: 100%|██████████| 392/392 [03:00<00:00,  2.17it/s, loss=0.0403]



Epoch 1 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1931
   ⭐ New Best Student! Saved with F1: 0.1931
------------------------------


Student Epoch 2: 100%|██████████| 392/392 [03:06<00:00,  2.11it/s, loss=0.0402]



Epoch 2 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1942
   ⭐ New Best Student! Saved with F1: 0.1942
------------------------------


Student Epoch 3: 100%|██████████| 392/392 [03:02<00:00,  2.15it/s, loss=0.0403]



Epoch 3 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1982
   ⭐ New Best Student! Saved with F1: 0.1982
------------------------------


Student Epoch 4: 100%|██████████| 392/392 [03:06<00:00,  2.10it/s, loss=0.0405]



Epoch 4 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1876
   No improvement for 1 epochs.
------------------------------


Student Epoch 5: 100%|██████████| 392/392 [03:14<00:00,  2.02it/s, loss=0.0403]



Epoch 5 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1937
   No improvement for 2 epochs.
------------------------------


Student Epoch 6: 100%|██████████| 392/392 [03:23<00:00,  1.93it/s, loss=0.0405]



Epoch 6 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1921
   No improvement for 3 epochs.
------------------------------


Student Epoch 7: 100%|██████████| 392/392 [03:12<00:00,  2.04it/s, loss=0.0402]



Epoch 7 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1976
   No improvement for 4 epochs.
------------------------------


Student Epoch 8: 100%|██████████| 392/392 [02:57<00:00,  2.21it/s, loss=0.0407]



Epoch 8 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1880
   No improvement for 5 epochs.
------------------------------


Student Epoch 9: 100%|██████████| 392/392 [02:57<00:00,  2.21it/s, loss=0.0402]



Epoch 9 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1928
   No improvement for 6 epochs.
------------------------------


Student Epoch 10: 100%|██████████| 392/392 [06:46<00:00,  1.04s/it, loss=0.0403]



Epoch 10 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1924
   No improvement for 7 epochs.
------------------------------


Student Epoch 11: 100%|██████████| 392/392 [02:58<00:00,  2.20it/s, loss=0.0406]



Epoch 11 Results:
   Train Loss: 0.0397 | Val Sample-F1: 0.1894
   No improvement for 8 epochs.
------------------------------


Student Epoch 12:  53%|█████▎    | 209/392 [01:34<01:26,  2.13it/s, loss=0.0380]

In [65]:
# ==========================================================
# 5. FINAL INFERENCE & KAGGLE SUBMISSION
# ==========================================================
student.eval()
all_probs = []

# Use .detach().clone() to resolve the UserWarning
# We also move it to CPU first if it's large, then batch it to the GPU
Xt_tensor = X_test.detach().clone().to(torch.float32)

print("Starting Final Inference...")
with torch.no_grad():
    for i in tqdm(range(0, len(Xt_tensor), 256), desc="Inference Batches"):
        batch_x = Xt_tensor[i : i+256].to(device)
        logits = student(batch_x)
        probs = torch.sigmoid(logits)
        all_probs.append(probs.cpu().numpy())

# Stack all probabilities [N_test, 531]
probs = np.vstack(all_probs)

def get_labels(p, thr=0.35):
    """
    Selects top labels based on threshold and forces count 
    between 2 and 3 to optimize Sample-F1.
    """
    idx = np.argsort(p)[::-1]
    # Pick labels passing the threshold
    chosen = [i for i in idx[:50] if p[i] >= thr]
    
    # Competition Constraint: Force at least 2
    if len(chosen) < 2:
        chosen = idx[:2].tolist()
    
    # Competition Constraint: Force at most 3
    # Keeping it to 3 prevents 'over-predicting' which hurts F1
    final_selection = sorted(chosen[:3])
    
    return ",".join(map(str, final_selection))

# Generate the submission dataframe
print("Creating submission file...")
submission = pd.DataFrame({
    "id": test_ids, 
    "label": [get_labels(p) for p in probs]
})

# Save with your specific filename
submission.to_csv("2023320344_Final.csv", index=False)
print("Done! File saved as: 2023320344_Final.csv")

# Quick preview
print(submission.head())

Starting Final Inference...


Inference Batches: 100%|██████████| 77/77 [00:10<00:00,  7.57it/s]


Creating submission file...
Done! File saved as: 2023320344_Final.csv
  id   label
0  0  10,206
1  1  10,200
2  2   10,11
3  3   0,206
4  4   10,11


In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_embedding_clusters(features, labels, title="Embedding Space"):
    # 1. PCA: Reduce to 50 dimensions to preserve global structure 
    pca = PCA(n_components=50)
    X_pca = pca.fit_transform(features[:2000].cpu().numpy()) # Sample 2000 for speed
    
    # 2. t-SNE: Project to 2D for human viewing 
    tsne = TSNE(n_components=2, perplexity=30, random_state=42)
    X_tsne = tsne.fit_transform(X_pca)
    
    plt.figure(figsize=(10, 8))
    # We color by the most frequent label in the sample for visual clarity
    colors = np.argmax(labels[:2000].cpu().numpy(), axis=1)
    
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, cmap='jet', alpha=0.6, s=15)
    plt.colorbar(label='Category ID')
    plt.title(f"PCA-initialized t-SNE: {title}")
    plt.savefig(f"{title}_plot.png")
    plt.show()

# Use it like this:
plot_embedding_clusters(X_train, Y_silver, "Raw BERT Features")

In [None]:
# A. Raw State (The BERT embeddings before any training)
plot_embedding_clusters(X_train, Y_silver, "1_Raw_BERT_Features")

# B. Teacher State (Embeddings after Stage 1)
# We need to extract the hidden 'h' from the teacher model
teacher.eval()
with torch.no_grad():
    h_teacher = teacher.doc_proj(X_train[:2000].to(device))
plot_embedding_clusters(h_teacher, Y_silver, "2_Teacher_Refined_Features")

# C. Final Student State (Embeddings after Stage 2 Refinement)
student.eval()
with torch.no_grad():
    h_student = student.doc_proj(X_train[:2000].to(device))
plot_embedding_clusters(h_student, Y_silver, "3_Final_Student_Features")

In [None]:
def plot_full_analysis(features, labels, title, n_samples=5000):
    # Ensure we don't exceed actual data length
    n = min(n_samples, len(features))
    
    # PCA is fast, t-SNE is slow. 
    # PCA helps 'denoise' high-dimensional BERT data before t-SNE tries to map it.
    pca = PCA(n_components=50)
    feat_subset = features[:n].cpu().numpy() if torch.is_tensor(features) else features[:n]
    X_pca = pca.fit_transform(feat_subset)
    
    print(f"Running t-SNE on {n} samples...")
    tsne = TSNE(n_components=2, perplexity=50, random_state=42, init='pca')
    X_tsne = tsne.fit_transform(X_pca)

plot_full_analysis(X_train, Y_silver, "1_Raw_BERT_Features")

In [None]:
import os

# Define the root and data directories
PROJECT_ROOT = "."
DATA_DIR = os.path.join(PROJECT_ROOT, "Amazon_products")

# Define the specific hierarchy path
HIER_PATH = os.path.join(DATA_DIR, "class_hierarchy.txt")

In [None]:
from collections import defaultdict

# 1. Re-define the loading function
def load_hierarchy(path):
    parents = defaultdict(set)  # child -> set of parents
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            p_str, c_str = line.split("\t")
            p, c = int(p_str), int(c_str)
            parents[c].add(p)
    return parents

# 2. Load the file (Ensure HIER_PATH is defined)
HIER_PATH = os.path.join(DATA_DIR, "class_hierarchy.txt")
parents = load_hierarchy(HIER_PATH)

print(f"Hierarchy loaded with {len(parents)} child-parent relations.")

In [None]:
def check_hvr(pred_labels, parents_dict):
    """
    Calculates the Hierarchy Violation Rate.
    A violation occurs if a child label is predicted but NONE of its parents are.
    """
    total_child_preds = 0
    violations = 0

    for labs in pred_labels:
        # labs is a list of predicted label indices for one sample
        S = set(labs)
        for c in labs:
            ps = parents_dict.get(c, set())
            if len(ps) == 0:
                continue  # Root nodes have no parent constraints
            
            total_child_preds += 1
            # Check if at least one parent is in the predicted set
            if len(S.intersection(ps)) == 0:
                violations += 1

    hvr = violations / max(total_child_preds, 1)
    return hvr, violations, total_child_preds

In [None]:
@torch.no_grad()
def evaluate_stage2_with_hvr(model, loader, parents_dict, thr=0.20):
    model.eval()
    all_preds = []
    
    for batch in loader:
        Xb = batch["X"].to(device)
        probs = torch.sigmoid(model(Xb)).cpu().numpy()
        
        # Convert probabilities to discrete labels for HVR check
        for p in probs:
            idx = np.argsort(p)[::-1]
            # Select labels based on threshold (Strategy A)
            chosen = [i for i in idx[:50] if p[i] >= thr]
            # Enforce competition constraints (2nd-3rd best if none pass)
            if len(chosen) < 2: chosen = idx[:2].tolist()
            all_preds.append(chosen[:3]) # Top 3 labels

    # Calculate HRV
    hvr_rate, v_cnt, total_c = check_hvr(all_preds, parents_dict)
    return hvr_rate, v_cnt, total_c

# --- Inside your Stage 2 Training Loop ---
hvr_rate, v_cnt, total_c = evaluate_stage2_with_hvr(student, val_loader, parents)
print(f"   HRV Rate: {hvr_rate:.4f} ({v_cnt} violations)")

In [None]:
# Now 'parents' is defined in your session
hvr_rate, v_cnt, total_c = evaluate_stage2_with_hvr(student, val_loader, parents)
print(f"Stage 2 Hierarchical Health:")
print(f"HRV Rate: {hvr_rate:.4f} ({v_cnt} violations in {total_c} predictions)")