## SECTION 1

In [1]:
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP (ROBUST, PY3.12-FRIENDLY)
# ============================================================================

import sys, subprocess, importlib, os

def pipi(*pkgs):
    # Force reinstall + no cache to avoid stale wheels
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "--force-reinstall", "--no-cache-dir", *pkgs])

print("Installing pinned, compatible versions …")
# Torch: keep your existing CUDA build. If you don't have torch yet, uncomment the torch trio below.
# pipi("torch==2.2.2", "torchaudio==2.2.2", "torchvision==0.17.2")

# Pin NumPy 2.x and libs that are built against it
pipi(
    "numpy==2.1.1",
    "pandas==2.2.3",
    "scikit-learn==1.5.2",
    "matplotlib==3.9.2",
    "transformers==4.44.2",
    "accelerate==0.34.2",
    "datasets==2.21.0",
)

# --- Import order matters; import numpy FIRST to catch ABI issues clearly
import numpy as np
print("NumPy:", np.__version__)

# Now the rest
import torch, transformers, datasets, sklearn, pandas as pd, matplotlib, importlib

print("\n=== VERSION CHECK ===")
print("torch          :", getattr(torch, "__version__", "n/a"))
print("transformers   :", transformers.__version__)
print("accelerate     :", importlib.import_module("accelerate").__version__)
print("datasets       :", datasets.__version__)
print("scikit-learn   :", sklearn.__version__)
print("pandas         :", pd.__version__)
print("numpy          :", np.__version__)
print("matplotlib     :", matplotlib.__version__)

# Sanity for TrainingArguments modern kwargs
from packaging import version
assert version.parse(transformers.__version__) >= version.parse("4.26.0"), \
    "Transformers too old for `evaluation_strategy`."

# If NumPy was previously imported in this session, you may still have stale .so’s in memory.
# Simple guard: if you see an ABI error above, Restart runtime and run this cell again first.
print("\nCUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA Device Count:", torch.cuda.device_count())
    print("Current CUDA Device:", torch.cuda.get_device_name(0))


Installing pinned, compatible versions …
NumPy: 2.1.1

=== VERSION CHECK ===
torch          : 2.9.0+cu128
transformers   : 4.44.2
accelerate     : 0.34.2
datasets       : 2.21.0
scikit-learn   : 1.5.2
pandas         : 2.2.3
numpy          : 2.1.1
matplotlib     : 3.9.2

CUDA Available: True
CUDA Device Count: 1
Current CUDA Device: Tesla T4


### SECTION 1.5


In [2]:
# ============================================================================
# SECTION 1.5: VERSION CHECK + TRAININGARGUMENTS COMPATIBILITY SHIM
# ============================================================================

import inspect, importlib, sys
import transformers as _tf

print("Transformers version loaded in memory:", _tf.__version__)

def _supported_kwargs_of_training_args():
    # Build the set of supported __init__ kwargs for the loaded TrainingArguments
    try:
        from transformers import TrainingArguments
        sig = inspect.signature(TrainingArguments.__init__)
        return set(sig.parameters.keys())
    except Exception as e:
        print("[Compat] Could not inspect TrainingArguments:", e)
        return set()

_SUPPORTED_TA_KEYS = _supported_kwargs_of_training_args()
print("Sample of supported TrainingArguments kwargs:", sorted(list(_SUPPORTED_TA_KEYS))[:12], "...")

def make_training_args_compat(**kwargs):
    """
    Create TrainingArguments while dropping any kwargs unsupported by the loaded transformers version.
    Prints what was ignored so you know if your runtime is old.
    """
    from transformers import TrainingArguments
    filtered = {k: v for k, v in kwargs.items() if k in _SUPPORTED_TA_KEYS}
    ignored = [k for k in kwargs.keys() if k not in _SUPPORTED_TA_KEYS]
    if ignored:
        print("[Compat] Ignored unsupported TrainingArguments keys:", ignored)
    return TrainingArguments(**filtered)

def get_early_stopping_callbacks(patience: int):
    """Return EarlyStoppingCallback if available; otherwise return []."""
    try:
        from transformers import EarlyStoppingCallback
        return [EarlyStoppingCallback(early_stopping_patience=patience)]
    except Exception as e:
        print("[Compat] EarlyStoppingCallback unavailable:", e)
        return []


Transformers version loaded in memory: 4.44.2
Sample of supported TrainingArguments kwargs: ['accelerator_config', 'adafactor', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'auto_find_batch_size', 'batch_eval_metrics', 'bf16', 'bf16_full_eval', 'data_seed', 'dataloader_drop_last', 'dataloader_num_workers'] ...


## SECTION 2

In [3]:

# ============================================================================
# SECTION 2: IMPORTS AND BASIC SETUP
# ============================================================================

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import time
from datetime import timedelta

# ============================================================================
# TIMING UTILITY - Track execution time for each section
# ============================================================================
class SectionTimer:
    def __init__(self):
        self.section_times = {}
        self.start_time = None
        self.total_start = time.time()

    def start_section(self, section_name):
        """Start timing a section"""
        self.start_time = time.time()
        print(f"\n🚀 Starting {section_name}...")

    def end_section(self, section_name):
        """End timing and display results"""
        if self.start_time is None:
            self.start_time = time.time()

        elapsed = time.time() - self.start_time
        self.section_times[section_name] = elapsed

        # Format time nicely
        if elapsed < 60:
            time_str = f"{elapsed:.1f}s"
        elif elapsed < 3600:
            time_str = f"{elapsed/60:.1f}m {elapsed%60:.0f}s"
        else:
            time_str = f"{elapsed/3600:.1f}h {(elapsed%3600)/60:.0f}m"

        total_elapsed = time.time() - self.total_start
        if total_elapsed < 60:
            total_str = f"{total_elapsed:.1f}s"
        elif total_elapsed < 3600:
            total_str = f"{total_elapsed/60:.1f}m {total_elapsed%60:.0f}s"
        else:
            total_str = f"{total_elapsed/3600:.1f}h {(total_elapsed%3600)/60:.0f}m"

        print(f"✅ {section_name} completed in {time_str}")
        print(f"🕒 Total runtime so far: {total_str}")
        print("-" * 60)

    def get_summary(self):
        """Get timing summary"""
        total = time.time() - self.total_start
        print("\n" + "="*60)
        print("⏱️  EXECUTION TIME SUMMARY")
        print("="*60)
        for section, elapsed in self.section_times.items():
            if elapsed < 60:
                time_str = f"{elapsed:.1f}s"
            elif elapsed < 3600:
                time_str = f"{elapsed/60:.1f}m {elapsed%60:.0f}s"
            else:
                time_str = f"{elapsed/3600:.1f}h {(elapsed%3600)/60:.0f}m"
            print(f"{section:<40} : {time_str}")

        if total < 60:
            total_str = f"{total:.1f}s"
        elif total < 3600:
            total_str = f"{total/60:.1f}m {total%60:.0f}s"
        else:
            total_str = f"{total/3600:.1f}h {(total%3600)/60:.0f}m"

        print(f"{'='*40} : {'='*10}")
        print(f"{'TOTAL EXECUTION TIME':<40} : {total_str}")
        print("="*60)

# Initialize global timer
timer = SectionTimer()
timer.start_section("SECTION 2: Environment & Imports")
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# End timing for section 2
timer.end_section("SECTION 2: Environment & Imports")
timer.start_section("SECTION 3: Configuration Setup")



🚀 Starting SECTION 2: Environment & Imports...
✅ SECTION 2: Environment & Imports completed in 9.2s
🕒 Total runtime so far: 9.2s
------------------------------------------------------------

🚀 Starting SECTION 3: Configuration Setup...


In [4]:
import os, random, json, math
from dataclasses import dataclass
from typing import Dict, Tuple, Optional, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer,
    DataCollatorWithPadding, EarlyStoppingCallback
)

def seed_all(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_all(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

## SECTION 3

In [5]:
# 🤖 TRAINING ONLY: mBERT (bert-base-multilingual-cased)\n
# Expected: ~35-40 min, 60-65% macro-F1\n

# ===== Section 3 — Config (pooling + R-Drop + LLRD) =====

data_path = '/content/adjudications_2025-10-22.csv'
CSV_PATH = '/content/adjudications_2025-10-22.csv'


TITLE_COL = "Title"
TEXT_COL  = "Comment"
SENT_COL  = "Final Sentiment"
POL_COL   = "Final Polarization"

MODEL_CONFIGS = {
    "mbert": {"name": "bert-base-multilingual-cased", "desc": "mBERT (104 langs)"},
}
MODELS_TO_RUN = ["mbert"]  # ← TRAINING ONLY mBERT
OUT_DIR = "./runs_mbert_optimized"  # ← Separate output directory

# ============================================================================
# CORE TRAINING - RUN #7 TASK-SPECIFIC GRADIENTS + ANTI-OVERFITTING
# Run #6 Result: 61.59% Macro-F1 (Partial recovery +3.05% from R5, but -0.47% vs R4)
# Problem: Gradient flow trade-off - sentiment improved (+2.9%), polarization regressed (-3.9%)
# Run #7 Strategy: Task-specific gradient norms + anti-overfitting enhancements
# Run #7 Target: 63-65% Macro-F1 (Combine R6 sentiment gains + R4 polarization stability!)
# Expected training time: ~75 minutes (similar to R4/R6)
# ============================================================================
MAX_LENGTH = 224
EPOCHS = 20                 # ✅ KEEP from R4 (proven optimal)
BATCH_SIZE = 16            # ✅ KEEP R4
LR = 2.5e-5               # ✅ KEEP R4 (stable and effective)
WEIGHT_DECAY = 0.05       # ⬆️ ANTI-OVERFITTING: UP from 0.03 (stronger L2 regularization!)
WARMUP_RATIO = 0.20       # ✅ KEEP R4
EARLY_STOP_PATIENCE = 7   # ⬇️ ANTI-OVERFITTING: DOWN from 8 (stop earlier if not improving)
GRAD_ACCUM_STEPS = 3      # Effective batch: 48

# Per-task loss - RUN #7 RESTORE R4 (proven stable)
USE_FOCAL_SENTIMENT = True
USE_FOCAL_POLARITY  = True
FOCAL_GAMMA_SENTIMENT = 2.5   # ✅ RESTORE R4 (worked well)
FOCAL_GAMMA_POLARITY = 3.5    # ✅ RESTORE R4 (working well)
LABEL_SMOOTH_SENTIMENT = 0.10 # ✅ RESTORE R4
LABEL_SMOOTH_POLARITY = 0.08  # ✅ RESTORE R4

# Task weights - RUN #7 RESTORE R4 (proven stable balance)
TASK_LOSS_WEIGHTS = {"sentiment": 1.0, "polarization": 1.4}  # ✅ RESTORE R4

# 🎯 NEW: TASK-SPECIFIC GRADIENT CONTROL (R6 proved tasks need different gradients!)
USE_TASK_SPECIFIC_GRAD_NORM = True  # Enable separate gradient norms per task
SENTIMENT_GRAD_NORM = 1.0    # R6 proved sentiment benefits from stronger gradients
POLARITY_GRAD_NORM = 0.5     # R4 proved polarization needs tighter control
MAX_GRAD_NORM = 0.5          # Global fallback (if task-specific disabled)
USE_GRADIENT_CHECKPOINTING = True  # Memory efficiency

# ============================================================================
# AGGRESSIVE CLASS WEIGHTS - RUN #6 RESTORE R4 (proven stable)
# Run #5 showed: 1.30 negative weight WORSENED recall (47.5%→40.3%)!
# negative: Keep R4's 1.10 (1.30 backfired spectacularly)
# neutral: Keep R4's 1.80 (working with oversampling)
# objective: Keep R4's 2.50 (working well)
# ============================================================================
CLASS_WEIGHT_MULT = {
    "sentiment": {
        "negative": 1.10,    # ✅ RESTORE R4 from 1.30 (1.30 made recall WORSE!)
        "neutral":  1.80,    # ✅ RESTORE R4 (working with oversampling)
        "positive": 1.30     # ✅ RESTORE R4
    },
    "polarization": {
        "non_polarized": 1.20,  # ✅ RESTORE R4
        "objective":     2.50,  # ✅ RESTORE R4 (working well)
        "partisan":      0.95   # ✅ RESTORE R4
    }
}

# Cap maximum class weight to prevent instability
MAX_CLASS_WEIGHT = 10.0  # 🔥 INCREASED (was 6.0) - Allow stronger weighting for weak classes

# ============================================================================
# SELECTIVE OVERSAMPLING - RUN #6 RESTORE R4 (proven optimal balance)
# Run #5 showed: 10x objective + 3x neutral = DISASTER (non-polarized -8.2%!)
# Run #4's 8.5x/3.5x balance was OPTIMAL - don't change it!
# Goal: Restore R4's proven configuration completely
# ============================================================================
USE_OVERSAMPLING = True
USE_JOINT_OVERSAMPLING = True
USE_SMART_OVERSAMPLING = True
JOINT_ALPHA = 0.70              # ✅ RESTORE R4 (was effective)
JOINT_OVERSAMPLING_MAX_MULT = 8.0  # ✅ RESTORE R4
OBJECTIVE_BOOST_MULT = 8.5      # ✅ RESTORE R4 from 10.0 (10x destroyed non-polarized!)
NEUTRAL_BOOST_MULT = 3.5        # ✅ RESTORE R4 from 3.0 (3x removed critical signal!)

# ============================================================================
# ARCHITECTURE - RUN #7 WITH ANTI-OVERFITTING ENHANCEMENTS
# Larger model can learn more complex multi-task representations
# ============================================================================
HEAD_HIDDEN = 768            # ✅ KEEP from R4 - Match BERT hidden size!
HEAD_DROPOUT = 0.30          # ⬆️ ANTI-OVERFITTING: UP from 0.25 (stronger dropout!)
REP_POOLING = "last4_mean"   # ✅ KEEP - best pooling strategy
HEAD_LAYERS = 3              # ✅ KEEP from R4 - Deeper task-specific heads

# ============================================================================
# REGULARIZATION - RUN #7 RESTORE R4 + ENHANCED R-DROP
# ============================================================================
USE_RDROP = True
RDROP_ALPHA = 0.7           # ⬆️ ANTI-OVERFITTING: UP from 0.6 (stronger consistency regularization!)
RDROP_WARMUP_EPOCHS = 2     # ✅ KEEP (gradual warmup)

# LLRD (layer-wise learning-rate decay) - RUN #7 RESTORE R4 (proven stable)
USE_LLRD = True
LLRD_DECAY = 0.90            # ✅ RESTORE R4 from 0.92 (R4's tighter control worked better for polarity!)
HEAD_LR_MULT = 3.0           # ✅ RESTORE R4 (working well)

os.makedirs(OUT_DIR, exist_ok=True)

# ============================================================================
# CONFIGURATION SUMMARY
# ============================================================================
print("="*80)
print("🎯 mBERT TRAINING - RUN #7 TASK-SPECIFIC GRADIENTS + ANTI-OVERFITTING")
print("="*80)
print(f"📊 Run History: R1: 58.5% → R2: 60.97% → R3: 60.55% → R4: 62.06% 🏆 → R5: 58.54% 💥 → R6: 61.59%")
print(f"   Run #6 Result: Partial recovery (+3.05% from R5) but task trade-off (sent +2.9%, pol -3.9%)")
print(f"   Run #7 Strategy: Task-specific gradient norms + anti-overfitting enhancements")
print(f"   Run #7 Target: 63-65% Macro-F1 (Combine R6 sentiment gains + R4 polarity stability!)")
print()
print(f"⏱️  Training Configuration (R4 BASE + ANTI-OVERFITTING):")
print(f"   ├─ Epochs: {EPOCHS} ✅ KEEP from R4")
print(f"   ├─ Batch Size: {BATCH_SIZE} (effective: {BATCH_SIZE * GRAD_ACCUM_STEPS})")
print(f"   ├─ Learning Rate: {LR} ✅ KEEP from R4")
print(f"   ├─ Weight Decay: {WEIGHT_DECAY} ⬆️ UP from 0.03 (anti-overfitting!)")
print(f"   ├─ Early Stop Patience: {EARLY_STOP_PATIENCE} ⬇️ DOWN from 8 (stop earlier!)")
print(f"   └─ Estimated Time: ~75 minutes")
print()
print(f"🏗️  Architecture (R4 BASE + ANTI-OVERFITTING):")
print(f"   ├─ Head Hidden: {HEAD_HIDDEN} ✅ KEEP from R4")
print(f"   ├─ Head Layers: {HEAD_LAYERS} ✅ KEEP from R4")
print(f"   └─ Head Dropout: {HEAD_DROPOUT} ⬆️ UP from 0.25 (anti-overfitting!)")
print()
print(f"⚖️  Critical Class Weights (RUN #7 RESTORED R4):")
print(f"   ├─ Negative:  {CLASS_WEIGHT_MULT['sentiment']['negative']}x ✅ RESTORED R4")
print(f"   ├─ Neutral:   {CLASS_WEIGHT_MULT['sentiment']['neutral']}x ✅ RESTORED R4")
print(f"   ├─ Objective: {CLASS_WEIGHT_MULT['polarization']['objective']}x ✅ RESTORED R4")
print(f"   └─ Max Weight Cap: {MAX_CLASS_WEIGHT}")
print()
print(f"📊 Oversampling Strategy (Run #7 - RESTORED R4 OPTIMAL BALANCE):")
print(f"   ├─ Joint Alpha: {JOINT_ALPHA}")
print(f"   ├─ Max Multiplier: {JOINT_OVERSAMPLING_MAX_MULT}x")
print(f"   ├─ Objective Boost: {OBJECTIVE_BOOST_MULT}x ✅ RESTORED R4 (proven balance)")
print(f"   └─ Neutral Boost: {NEUTRAL_BOOST_MULT}x ✅ RESTORED R4 (proven balance)")
print()
print(f"🎯 NEW: TASK-SPECIFIC GRADIENT CONTROL (KEY INNOVATION!):")
print(f"   ├─ Feature Enabled: {USE_TASK_SPECIFIC_GRAD_NORM}")
print(f"   ├─ Sentiment Grad Norm: {SENTIMENT_GRAD_NORM} (R6 proved sentiment needs strong gradients!)")
print(f"   ├─ Polarity Grad Norm: {POLARITY_GRAD_NORM} (R4 proved polarity needs tight control!)")
print(f"   └─ Strategy: Clip sentiment head & polarity head gradients SEPARATELY!")
print()
print(f"🔥 Advanced Techniques (Run #7 - R4 BASE + ENHANCED REGULARIZATION):")
print(f"   ├─ Focal Loss Gamma (Sent/Pol): {FOCAL_GAMMA_SENTIMENT}/{FOCAL_GAMMA_POLARITY} ✅ RESTORED R4")
print(f"   ├─ Label Smoothing (Sent/Pol): {LABEL_SMOOTH_SENTIMENT}/{LABEL_SMOOTH_POLARITY} ✅ RESTORED R4")
print(f"   ├─ Task Weights (Sent/Pol): {TASK_LOSS_WEIGHTS['sentiment']}/{TASK_LOSS_WEIGHTS['polarization']} ✅ RESTORED R4")
print(f"   ├─ R-Drop Alpha: {RDROP_ALPHA} ⬆️ UP from 0.6 (anti-overfitting!)")
print(f"   └─ LLRD Decay: {LLRD_DECAY} ✅ RESTORED R4 (polarity needs tighter layer control)")
print()
print(f"🎯 Run #7 Expected Results (BREAK THROUGH R4 PLATEAU!):")
print(f"   ├─ Overall Macro-F1: 63-65% (beat R4's 62.06%, R6's 61.59%) → Target: 75%")
print(f"   ├─ Sentiment F1: 64-65% (keep R6's gains: negative 66.8%, positive 72.2%)")
print(f"   ├─ Polarization F1: 62-63% (recover R4's stability: partisan 81%, objective 42%)")
print(f"   ├─ Key Innovation: Task-specific gradients should eliminate the trade-off!")
print(f"   └─ Anti-overfitting: Stronger regularization for more robust performance")
print()
print(f"📁 Output: {OUT_DIR}")
print("="*80)

# End timing for section 3
timer.end_section("SECTION 3: Configuration Setup")
timer.start_section("SECTION 4: Data Loading & Preprocessing")


🎯 mBERT TRAINING - RUN #7 TASK-SPECIFIC GRADIENTS + ANTI-OVERFITTING
📊 Run History: R1: 58.5% → R2: 60.97% → R3: 60.55% → R4: 62.06% 🏆 → R5: 58.54% 💥 → R6: 61.59%
   Run #6 Result: Partial recovery (+3.05% from R5) but task trade-off (sent +2.9%, pol -3.9%)
   Run #7 Strategy: Task-specific gradient norms + anti-overfitting enhancements
   Run #7 Target: 63-65% Macro-F1 (Combine R6 sentiment gains + R4 polarity stability!)

⏱️  Training Configuration (R4 BASE + ANTI-OVERFITTING):
   ├─ Epochs: 20 ✅ KEEP from R4
   ├─ Batch Size: 16 (effective: 48)
   ├─ Learning Rate: 2.5e-05 ✅ KEEP from R4
   ├─ Weight Decay: 0.05 ⬆️ UP from 0.03 (anti-overfitting!)
   ├─ Early Stop Patience: 7 ⬇️ DOWN from 8 (stop earlier!)
   └─ Estimated Time: ~75 minutes

🏗️  Architecture (R4 BASE + ANTI-OVERFITTING):
   ├─ Head Hidden: 768 ✅ KEEP from R4
   ├─ Head Layers: 3 ✅ KEEP from R4
   └─ Head Dropout: 0.3 ⬆️ UP from 0.25 (anti-overfitting!)

⚖️  Critical Class Weights (RUN #7 RESTORED R4):
   ├─ Negative:

## SECTION 4

In [6]:
# ===== Section 4 — Load & Prepare Data (updated for multipliers) =====
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()

required = [TITLE_COL, TEXT_COL, SENT_COL, POL_COL]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}. Found: {list(df.columns)}")

df = df.dropna(subset=[TITLE_COL, TEXT_COL, SENT_COL, POL_COL]).reset_index(drop=True)

# Encode labels
from sklearn.preprocessing import LabelEncoder
sent_le = LabelEncoder().fit(df[SENT_COL])
pol_le  = LabelEncoder().fit(df[POL_COL])

df["sent_y"] = sent_le.transform(df[SENT_COL])
df["pol_y"]  = pol_le.transform(df[POL_COL])

num_sent_classes = len(sent_le.classes_)
num_pol_classes  = len(pol_le.classes_)

print("Sentiment classes:", dict(enumerate(sent_le.classes_)))
print("Polarization classes:", dict(enumerate(pol_le.classes_)))

# Splits (stratify by sentiment)
from sklearn.model_selection import train_test_split
X = df[[TITLE_COL, TEXT_COL]].copy()
y_sent = df["sent_y"].values
y_pol  = df["pol_y"].values

X_train, X_tmp, ysent_train, ysent_tmp, ypol_train, ypol_tmp = train_test_split(
    X, y_sent, y_pol, test_size=0.3, random_state=42, stratify=y_sent
)
X_val, X_test, ysent_val, ysent_test, ypol_val, ypol_test = train_test_split(
    X_tmp, ysent_tmp, ypol_tmp, test_size=0.5, random_state=42, stratify=ysent_tmp
)

print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))

# Balanced class weights from TRAIN only
from sklearn.utils.class_weight import compute_class_weight
import numpy as np, json, os

def safe_class_weights(y, n_classes):
    classes = np.arange(n_classes)
    counts = np.bincount(y, minlength=n_classes)
    if np.any(counts == 0):
        return np.ones(n_classes, dtype=np.float32)
    return compute_class_weight("balanced", classes=classes, y=y).astype(np.float32)

sent_weights_np = safe_class_weights(ysent_train, num_sent_classes)
pol_weights_np  = safe_class_weights(ypol_train,  num_pol_classes)

# Apply user multipliers by class name
sent_name_to_idx = {name: i for i, name in enumerate(sent_le.classes_)}
pol_name_to_idx  = {name: i for i, name in enumerate(pol_le.classes_)}

for cname, mult in CLASS_WEIGHT_MULT["sentiment"].items():
    if cname in sent_name_to_idx:
        sent_weights_np[sent_name_to_idx[cname]] *= float(mult)

for cname, mult in CLASS_WEIGHT_MULT["polarization"].items():
    if cname in pol_name_to_idx:
        pol_weights_np[pol_name_to_idx[cname]] *= float(mult)

# Apply class weight caps to prevent training instability
sent_weights_np = np.clip(sent_weights_np, 0.1, MAX_CLASS_WEIGHT)
pol_weights_np = np.clip(pol_weights_np, 0.1, MAX_CLASS_WEIGHT)

print("Final sentiment class weights (capped):", {sent_le.classes_[i]: float(w) for i, w in enumerate(sent_weights_np)})
print("Final polarization class weights (capped):", {pol_le.classes_[i]: float(w) for i, w in enumerate(pol_weights_np)})
print(f"Class weights were capped at maximum: {MAX_CLASS_WEIGHT}")

# Save label maps
with open(os.path.join(OUT_DIR, "label_map_sentiment.json"), "w") as f:
    json.dump({int(k): v for k, v in dict(enumerate(sent_le.classes_)).items()}, f, indent=2)
with open(os.path.join(OUT_DIR, "label_map_polarization.json"), "w") as f:
    json.dump({int(k): v for k, v in dict(enumerate(pol_le.classes_)).items()}, f, indent=2)

# End timing for section 4
timer.end_section("SECTION 4: Data Loading & Preprocessing")
timer.start_section("SECTION 5-9: Model Architecture & Training Setup")


Sentiment classes: {0: 'negative', 1: 'neutral', 2: 'positive'}
Polarization classes: {0: 'non_polarized', 1: 'objective', 2: 'partisan'}
Train size: 6975 Val size: 1495 Test size: 1495
Final sentiment class weights (capped): {'negative': 0.6187999248504639, 'neutral': 2.2331910133361816, 'positive': 3.1224172115325928}
Final polarization class weights (capped): {'non_polarized': 1.436663269996643, 'objective': 10.0, 'partisan': 0.47725799679756165}
Class weights were capped at maximum: 10.0
✅ SECTION 4: Data Loading & Preprocessing completed in 1.2m 11s
🕒 Total runtime so far: 1.5m 33s
------------------------------------------------------------

🚀 Starting SECTION 5-9: Model Architecture & Training Setup...


## SECTION 5

In [7]:
# ===== Section 5 — Dataset & Collator (proper text-pair encoding) =====
from torch.utils.data import Dataset

class TaglishDataset(Dataset):
    def __init__(self, titles, texts, y_sent, y_pol, tokenizer, max_length=224):
        self.titles = list(titles)
        self.texts  = list(texts)
        self.y_sent = np.array(y_sent)
        self.y_pol  = np.array(y_pol)
        self.tok = tokenizer
        self.max_length = max_length
        # mBERT has token_type_ids; XLM-R/RemBERT don't, and that's fine.
        self.use_token_type = "token_type_ids" in tokenizer.model_input_names

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pass title as text, comment as text_pair so the tokenizer inserts the correct separators.
        # We also bias truncation to the comment since titles are short.
        enc = self.tok(
            text=str(self.titles[idx]),
            text_pair=str(self.texts[idx]),
            truncation="only_second",     # keep the title intact; trim the comment if needed
            max_length=self.max_length,
            return_token_type_ids=self.use_token_type,
        )
        item = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "sentiment_labels": torch.tensor(self.y_sent[idx], dtype=torch.long),
            "polarization_labels": torch.tensor(self.y_pol[idx], dtype=torch.long),
        }
        if self.use_token_type and "token_type_ids" in enc:
            item["token_type_ids"] = enc["token_type_ids"]
        return item


## SECTION 6

In [8]:
# ===== Section 6 — Multi-Task Model (pooling + MLP heads) =====
import torch
import torch.nn as nn
from transformers import AutoModel

def mean_pooling(token_embeddings, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    denom = mask.sum(dim=1).clamp(min=1e-9)
    return summed / denom

class MultiTaskModel(nn.Module):
    def __init__(self, base_model_name: str, num_sent: int, num_pol: int, dropout: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        self.hidden = self.encoder.config.hidden_size

        # Enhanced trunk with better architecture
        self.trunk = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.hidden, HEAD_HIDDEN),
            nn.GELU(),
            nn.LayerNorm(HEAD_HIDDEN),
            nn.Dropout(HEAD_DROPOUT),
        )

        # Enhanced multi-layer heads for better task-specific learning
        if HEAD_LAYERS == 2:
            self.head_sent = nn.Sequential(
                nn.Linear(HEAD_HIDDEN, HEAD_HIDDEN // 2),
                nn.GELU(),
                nn.LayerNorm(HEAD_HIDDEN // 2),
                nn.Dropout(HEAD_DROPOUT * 0.8),
                nn.Linear(HEAD_HIDDEN // 2, num_sent)
            )
            self.head_pol = nn.Sequential(
                nn.Linear(HEAD_HIDDEN, HEAD_HIDDEN // 2),
                nn.GELU(),
                nn.LayerNorm(HEAD_HIDDEN // 2),
                nn.Dropout(HEAD_DROPOUT * 0.8),
                nn.Linear(HEAD_HIDDEN // 2, num_pol)
            )
        else:
            self.head_sent = nn.Linear(HEAD_HIDDEN, num_sent)
            self.head_pol  = nn.Linear(HEAD_HIDDEN, num_pol)

        # Enable gradient checkpointing if configured
        if USE_GRADIENT_CHECKPOINTING:
            self.encoder.gradient_checkpointing_enable()

    def _pool(self, outputs, attention_mask):
        # Flexible representation pooling
        if REP_POOLING == "pooler" and hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            return outputs.pooler_output
        if REP_POOLING == "cls":
            return outputs.last_hidden_state[:, 0]
        # default: last4_mean
        hs = outputs.hidden_states  # tuple of [layer0..last]
        last4 = torch.stack(hs[-4:]).mean(dim=0)       # [B, T, H]
        return mean_pooling(last4, attention_mask)     # [B, H]

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                sentiment_labels=None,
                polarization_labels=None):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids if token_type_ids is not None else None,
            output_hidden_states=(REP_POOLING != "pooler")  # needed for last4_mean/cls
        )
        pooled = self._pool(outputs, attention_mask)
        z = self.trunk(pooled)
        return {"logits": (self.head_sent(z), self.head_pol(z))}


## SECTION 7

In [9]:
# SECTION 7

def compute_metrics_multi(eval_pred):
    (sent_logits, pol_logits) = eval_pred.predictions
    (y_sent, y_pol) = eval_pred.label_ids

    ps = np.argmax(sent_logits, axis=1)
    pp = np.argmax(pol_logits, axis=1)

    # Macro metrics
    sent_report = classification_report(y_sent, ps, output_dict=True, zero_division=0)
    pol_report  = classification_report(y_pol,  pp, output_dict=True, zero_division=0)

    sent_f1 = sent_report["macro avg"]["f1-score"]
    pol_f1  = pol_report["macro avg"]["f1-score"]
    macro_f1_avg = (sent_f1 + pol_f1) / 2.0

    return {
        "sent_acc": sent_report["accuracy"],
        "sent_prec": sent_report["macro avg"]["precision"],
        "sent_rec": sent_report["macro avg"]["recall"],
        "sent_f1": sent_f1,

        "pol_acc": pol_report["accuracy"],
        "pol_prec": pol_report["macro avg"]["precision"],
        "pol_rec": pol_report["macro avg"]["recall"],
        "pol_f1": pol_f1,

        "macro_f1_avg": macro_f1_avg
    }


## SECTION 8

In [10]:
# ===== Section 8 — Custom Trainer (R-Drop + LLRD + safe prediction_step) =====
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer
from torch.utils.data import DataLoader
from torch.optim import AdamW

class FocalLoss(nn.Module):
    def __init__(self, weight=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction
    def forward(self, logits, target):
        logp = F.log_softmax(logits, dim=1)
        p = torch.exp(logp)
        loss = F.nll_loss((1 - p) ** self.gamma * logp, target, weight=self.weight, reduction="none")
        return loss.mean() if self.reduction == "mean" else loss.sum()

def _sym_kl_with_logits(logits1, logits2):
    p = F.log_softmax(logits1, dim=-1);  q = F.log_softmax(logits2, dim=-1)
    p_exp, q_exp = p.exp(), q.exp()
    return 0.5 * (F.kl_div(p, q_exp, reduction="batchmean") + F.kl_div(q, p_exp, reduction="batchmean"))

class MultiTaskTrainer(Trainer):
    def __init__(self, *args, class_weights=None, task_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights or {}
        self.task_weights  = task_weights or {"sentiment": 1.0, "polarization": 1.0}
        self._custom_train_sampler = None

    # ----- LLRD optimizer -----
    def create_optimizer(self):
        if self.optimizer is not None:
            return self.optimizer
        if not USE_LLRD:
            self.optimizer = AdamW(self.get_decay_parameter_groups(self.model), lr=LR, weight_decay=WEIGHT_DECAY)
            return self.optimizer

        no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
        encoder = self.model.encoder
        n_layers = getattr(encoder.config, "num_hidden_layers", 12)
        # Try to access sequential layers
        layers = getattr(getattr(encoder, "encoder", encoder), "layer", None)
        if layers is None:
            # Fallback: no LLRD if we can't find layers
            self.optimizer = AdamW(self.get_decay_parameter_groups(self.model), lr=LR, weight_decay=WEIGHT_DECAY)
            return self.optimizer

        param_groups = []

        # Embeddings (lowest lr)
        emb = getattr(encoder, "embeddings", None)
        if emb is not None:
            lr_emb = LR * (LLRD_DECAY ** n_layers)
            decay, nodecay = [], []
            for n, p in emb.named_parameters():
                (nodecay if any(nd in n for nd in no_decay) else decay).append(p)
            if decay:   param_groups.append({"params": decay,   "lr": lr_emb, "weight_decay": WEIGHT_DECAY})
            if nodecay: param_groups.append({"params": nodecay, "lr": lr_emb, "weight_decay": 0.0})

        # Encoder blocks (increasing LR toward the top)
        for i in range(n_layers):
            block = layers[i]
            lr_i = LR * (LLRD_DECAY ** (n_layers - 1 - i))
            decay, nodecay = [], []
            for n, p in block.named_parameters():
                (nodecay if any(nd in n for nd in no_decay) else decay).append(p)
            if decay:   param_groups.append({"params": decay,   "lr": lr_i, "weight_decay": WEIGHT_DECAY})
            if nodecay: param_groups.append({"params": nodecay, "lr": lr_i, "weight_decay": 0.0})

        # Pooler (if any)
        pooler = getattr(encoder, "pooler", None)
        if pooler is not None:
            decay, nodecay = [], []
            for n, p in pooler.named_parameters():
                (nodecay if any(nd in n for nd in no_decay) else decay).append(p)
            if decay:   param_groups.append({"params": decay,   "lr": LR, "weight_decay": WEIGHT_DECAY})
            if nodecay: param_groups.append({"params": nodecay, "lr": LR, "weight_decay": 0.0})

        # Heads/trunk (highest LR)
        head_lr = LR * HEAD_LR_MULT
        head_modules = [self.model.trunk, self.model.head_sent, self.model.head_pol]
        decay, nodecay = [], []
        for m in head_modules:
            for n, p in m.named_parameters():
                (nodecay if any(nd in n for nd in no_decay) else decay).append(p)
        if decay:   param_groups.append({"params": decay,   "lr": head_lr, "weight_decay": WEIGHT_DECAY})
        if nodecay: param_groups.append({"params": nodecay, "lr": head_lr, "weight_decay": 0.0})

        self.optimizer = AdamW(param_groups, lr=LR)  # lr here is ignored per-group
        return self.optimizer

    def set_train_sampler(self, sampler):
        self._custom_train_sampler = sampler

    def get_train_dataloader(self):
        if self.train_dataset is None:
            return None
        if self._custom_train_sampler is not None:
            return DataLoader(
                self.train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=self._custom_train_sampler,
                collate_fn=self.data_collator,
                drop_last=self.args.dataloader_drop_last,
                num_workers=self.args.dataloader_num_workers,
                pin_memory=self.args.dataloader_pin_memory,
            )
        return super().get_train_dataloader()

    def _sent_loss_fn(self, weight, logits, target):
        if USE_FOCAL_SENTIMENT:
            return FocalLoss(weight=weight, gamma=FOCAL_GAMMA_SENTIMENT)(logits, target)
        return nn.CrossEntropyLoss(weight=weight, label_smoothing=float(LABEL_SMOOTH_SENTIMENT))(logits, target)

    def _pol_loss_fn(self, weight, logits, target):
        if USE_FOCAL_POLARITY:
            return FocalLoss(weight=weight, gamma=FOCAL_GAMMA_POLARITY)(logits, target)
        return nn.CrossEntropyLoss(weight=weight, label_smoothing=float(LABEL_SMOOTH_POLARITY))(logits, target)

    def compute_loss(self, model, inputs, return_outputs=False):
        y_sent = inputs.pop("sentiment_labels")
        y_pol  = inputs.pop("polarization_labels")

        # R-Drop with warmup: two forward passes with dropout
        current_epoch = getattr(self.state, 'epoch', 0) if hasattr(self, 'state') else 0
        use_rdrop_now = USE_RDROP and model.training and current_epoch >= RDROP_WARMUP_EPOCHS

        if use_rdrop_now:
            outputs1 = model(**inputs)
            outputs2 = model(**inputs)
            s1, p1 = outputs1["logits"]
            s2, p2 = outputs2["logits"]

            ws = self.class_weights.get("sentiment", None); ws = ws.to(s1.device) if ws is not None else None
            wp = self.class_weights.get("polarization", None); wp = wp.to(p1.device) if wp is not None else None

            ce_s = 0.5 * (self._sent_loss_fn(ws, s1, y_sent) + self._sent_loss_fn(ws, s2, y_sent))
            ce_p = 0.5 * (self._pol_loss_fn(wp,  p1, y_pol)  + self._pol_loss_fn(wp,  p2, y_pol))
            kl_s = _sym_kl_with_logits(s1, s2)
            kl_p = _sym_kl_with_logits(p1, p2)

            w_s = float(self.task_weights.get("sentiment", 1.0))
            w_p = float(self.task_weights.get("polarization", 1.0))

            # Gradual R-Drop alpha rampup for stability
            rdrop_factor = min(1.0, (current_epoch - RDROP_WARMUP_EPOCHS + 1) / 2.0)
            loss = w_s * ce_s + w_p * ce_p + (RDROP_ALPHA * rdrop_factor) * (kl_s + kl_p)
            if return_outputs:
                return loss, {"logits": (s1, p1)}
            return loss

        # Standard single forward
        outputs = model(**inputs)
        s, p = outputs["logits"]

        ws = self.class_weights.get("sentiment", None); ws = ws.to(s.device) if ws is not None else None
        wp = self.class_weights.get("polarization", None); wp = wp.to(p.device) if wp is not None else None

        loss_s = self._sent_loss_fn(ws, s, y_sent)
        loss_p = self._pol_loss_fn(wp, p, y_pol)

        w_s = float(self.task_weights.get("sentiment", 1.0))
        w_p = float(self.task_weights.get("polarization", 1.0))
        loss = w_s * loss_s + w_p * loss_p

        if return_outputs:
            outputs = dict(outputs); outputs["labels"] = (y_sent, y_pol)
            return loss, outputs
        return loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # Safe for inference (no labels provided)
        y_sent = inputs.get("sentiment_labels", None)
        y_pol  = inputs.get("polarization_labels", None)

        model_inputs = {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]}
        if "token_type_ids" in inputs:
            model_inputs["token_type_ids"] = inputs["token_type_ids"]

        model.eval()
        with torch.no_grad():
            outputs = model(**model_inputs)
            s, p = outputs["logits"]

        loss = None
        logits = (s.detach(), p.detach())
        labels = (y_sent, y_pol) if isinstance(y_sent, torch.Tensor) and isinstance(y_pol, torch.Tensor) else None
        return (loss, logits, labels)

    # 🎯 NEW: Override training_step for task-specific gradient control
    def training_step(self, model, inputs):
        """
        Perform a training step with optional task-specific gradient clipping.

        RUN #7 INNOVATION: Clip sentiment head and polarity head gradients separately
        based on Run #6 findings that tasks respond differently to gradient magnitudes.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Forward pass and compute loss
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        # Backward pass
        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)

        # 🎯 TASK-SPECIFIC GRADIENT CLIPPING (RUN #7)
        if USE_TASK_SPECIFIC_GRAD_NORM:
            # Clip sentiment head gradients with SENTIMENT_GRAD_NORM
            if hasattr(model, 'head_sent'):
                torch.nn.utils.clip_grad_norm_(model.head_sent.parameters(), SENTIMENT_GRAD_NORM)
            elif hasattr(model, 'module') and hasattr(model.module, 'head_sent'):
                torch.nn.utils.clip_grad_norm_(model.module.head_sent.parameters(), SENTIMENT_GRAD_NORM)

            # Clip polarity head gradients with POLARITY_GRAD_NORM
            if hasattr(model, 'head_pol'):
                torch.nn.utils.clip_grad_norm_(model.head_pol.parameters(), POLARITY_GRAD_NORM)
            elif hasattr(model, 'module') and hasattr(model.module, 'head_pol'):
                torch.nn.utils.clip_grad_norm_(model.module.head_pol.parameters(), POLARITY_GRAD_NORM)

            # Clip shared encoder and trunk with the tighter norm (polarity norm)
            shared_params = []
            if hasattr(model, 'encoder'):
                shared_params.extend(model.encoder.parameters())
            elif hasattr(model, 'module') and hasattr(model.module, 'encoder'):
                shared_params.extend(model.module.encoder.parameters())

            if hasattr(model, 'trunk'):
                shared_params.extend(model.trunk.parameters())
            elif hasattr(model, 'module') and hasattr(model.module, 'trunk'):
                shared_params.extend(model.module.trunk.parameters())

            if shared_params:
                torch.nn.utils.clip_grad_norm_(shared_params, POLARITY_GRAD_NORM)
        else:
            # Standard gradient clipping with MAX_GRAD_NORM
            torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

        return loss.detach() / self.args.gradient_accumulation_steps


## SECTION 9

In [11]:
# ===== Section 9 — Train/Evaluate One Model (with grad accumulation) =====
from transformers import AutoTokenizer, DataCollatorWithPadding
import math, json, numpy as np, pandas as pd, os
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import torch
from torch.utils.data import WeightedRandomSampler
from collections import Counter

def train_eval_one_model(model_key: str,
                         X_tr: pd.DataFrame, X_v: pd.DataFrame, X_te: pd.DataFrame,
                         ysent_tr: np.ndarray, ysent_v: np.ndarray, ysent_te: np.ndarray,
                         ypol_tr: np.ndarray,  ypol_v: np.ndarray,  ypol_te: np.ndarray,
                         sent_w_np: np.ndarray, pol_w_np: np.ndarray):
    base_name = MODEL_CONFIGS[model_key]["name"]
    run_dir = os.path.join(OUT_DIR, f"{model_key}")
    os.makedirs(run_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(base_name)
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    tr_titles, tr_texts = X_tr[TITLE_COL].values, X_tr[TEXT_COL].values
    v_titles,  v_texts  = X_v[TITLE_COL].values, X_v[TEXT_COL].values
    te_titles, te_texts = X_te[TITLE_COL].values, X_te[TEXT_COL].values

    train_ds = TaglishDataset(tr_titles, tr_texts, ysent_tr, ypol_tr, tokenizer, max_length=MAX_LENGTH)
    val_ds   = TaglishDataset(v_titles,  v_texts,  ysent_v,  ypol_v,  tokenizer, max_length=MAX_LENGTH)
    test_ds  = TaglishDataset(te_titles, te_texts, ysent_te, ypol_te, tokenizer, max_length=MAX_LENGTH)

    model = MultiTaskModel(base_name, num_sent_classes, num_pol_classes).to(device)

    sent_w = torch.tensor(sent_w_np, dtype=torch.float32)
    pol_w  = torch.tensor(pol_w_np,  dtype=torch.float32)

    args = make_training_args_compat(
        output_dir=run_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1_avg",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        logging_dir=os.path.join(run_dir, "logs"),
        logging_steps=25,                    # More frequent logging
        logging_first_step=True,             # Log first step for debugging
        save_steps=500,                      # Save checkpoints more often
        eval_steps=None,                     # Eval at end of each epoch
        report_to="none",
        seed=42,
        remove_unused_columns=False,
        eval_accumulation_steps=1,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        dataloader_pin_memory=True,          # Performance optimization
        max_grad_norm=MAX_GRAD_NORM,         # Used only if task-specific disabled
        label_smoothing_factor=0.0,          # We handle this in loss functions
        save_total_limit=5,                  # ✅ Keep 5 best checkpoints
        prediction_loss_only=False           # Log all metrics
    )

    callbacks = get_early_stopping_callbacks(EARLY_STOP_PATIENCE)

    trainer = MultiTaskTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_multi,
        callbacks=callbacks,
        class_weights={"sentiment": sent_w, "polarization": pol_w},
        task_weights=TASK_LOSS_WEIGHTS
    )

    # ----- ENHANCED JOINT oversampling with objective boost -----
    if USE_OVERSAMPLING and USE_JOINT_OVERSAMPLING:
        pair_counts = Counter(zip(ysent_tr.tolist(), ypol_tr.tolist()))
        counts = np.array(list(pair_counts.values()), dtype=np.float32)
        med = float(np.median(counts)) if len(counts) else 1.0

        # Find objective class index (polarization)
        obj_idx = np.where(pol_le.classes_ == "objective")[0][0] if "objective" in pol_le.classes_ else 1

        # 🔥 NEW: Find neutral class index (sentiment)
        neutral_idx = np.where(sent_le.classes_ == "neutral")[0][0] if "neutral" in sent_le.classes_ else 1

        def inv_mult(c):
            if c <= 0: return JOINT_OVERSAMPLING_MAX_MULT
            return float(np.clip(med / float(c), 1.0, JOINT_OVERSAMPLING_MAX_MULT))

        inv_by_pair = {k: inv_mult(v) for k, v in pair_counts.items()}
        sample_weights = []

        for ys, yp in zip(ysent_tr, ypol_tr):
            inv = inv_by_pair.get((int(ys), int(yp)), 1.0)
            w = (1.0 - JOINT_ALPHA) * 1.0 + JOINT_ALPHA * inv

            # Smart oversampling: extra boost for objective class (polarization)
            if USE_SMART_OVERSAMPLING and int(yp) == obj_idx:
                w *= OBJECTIVE_BOOST_MULT

            # 🔥 NEW: Also boost neutral class (sentiment) - Fixes 49% F1!
            if USE_SMART_OVERSAMPLING and int(ys) == neutral_idx:
                w *= NEUTRAL_BOOST_MULT

            sample_weights.append(w)

        obj_boost_count = sum(1 for i, yp in enumerate(ypol_tr) if int(yp) == obj_idx and sample_weights[i] > 2.0)
        neutral_boost_count = sum(1 for i, ys in enumerate(ysent_tr) if int(ys) == neutral_idx and sample_weights[i] > 2.0)
        print(f"🔥 Enhanced Oversampling: min={min(sample_weights):.2f}, max={max(sample_weights):.2f}")
        print(f"   ├─ Objective boosted samples: {obj_boost_count} (target: weak class at 40% F1)")
        print(f"   └─ Neutral boosted samples: {neutral_boost_count} (target: weak class at 49% F1)")
        trainer.set_train_sampler(WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True))

    trainer.train()

    # Test
    test_out = trainer.predict(test_ds)
    metrics = {f"test_{k}": float(v) for k, v in test_out.metrics.items()}
    trainer.save_model(run_dir)
    tokenizer.save_pretrained(run_dir)
    with open(os.path.join(run_dir, "metrics_test.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    sent_logits, pol_logits = test_out.predictions
    ysent_pred = np.argmax(sent_logits, axis=1)
    ypol_pred  = np.argmax(pol_logits,  axis=1)

    cm_sent = confusion_matrix(ysent_te, ysent_pred, labels=list(range(num_sent_classes)))
    cm_pol  = confusion_matrix(ypol_te,  ypol_pred,  labels=list(range(num_pol_classes)))
    np.save(os.path.join(run_dir, "cm_sent.npy"), cm_sent)
    np.save(os.path.join(run_dir, "cm_pol.npy"),  cm_pol)

    def plot_cm(cm, labels, title, path_png):
        fig, ax = plt.subplots(figsize=(4.5, 4))
        im = ax.imshow(cm, interpolation="nearest")
        ax.set_title(title); ax.set_xlabel("Predicted"); ax.set_ylabel("True")
        ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, cm[i, j], ha="center", va="center")
        fig.colorbar(im, ax=ax, fraction=0.046); plt.tight_layout(); plt.savefig(path_png, dpi=160); plt.close(fig)

    plot_cm(cm_sent, sent_le.classes_, "Sentiment Confusion", os.path.join(run_dir, "cm_sent.png"))
    plot_cm(cm_pol,  pol_le.classes_,  "Polarization Confusion", os.path.join(run_dir, "cm_pol.png"))

    rep_sent = classification_report(ysent_te, ysent_pred, target_names=sent_le.classes_, digits=4, zero_division=0)
    rep_pol  = classification_report(ypol_te,  ypol_pred,  target_names=pol_le.classes_,  digits=4, zero_division=0)
    with open(os.path.join(run_dir, "report_sentiment.txt"), "w") as f: f.write(rep_sent)
    with open(os.path.join(run_dir, "report_polarization.txt"), "w") as f: f.write(rep_pol)

    return {"model_key": model_key, "base_name": base_name, **metrics}, (ysent_pred, ypol_pred)

# End timing for architecture setup
timer.end_section("SECTION 5-9: Model Architecture & Training Setup")


✅ SECTION 5-9: Model Architecture & Training Setup completed in 33.7s
🕒 Total runtime so far: 2.1m 7s
------------------------------------------------------------


## SECTION 10


In [12]:
# SECTION 10

timer.start_section("SECTION 10: Model Training Execution")

results = []
pred_cache = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Running {key} -> {MODEL_CONFIGS[key]['name']} ===")
    row, preds = train_eval_one_model(
        key,
        X_train, X_val, X_test,
        ysent_train, ysent_val, ysent_test,
        ypol_train,  ypol_val,  ypol_test,
        sent_weights_np, pol_weights_np
    )
    results.append(row)
    pred_cache[key] = preds

results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(OUT_DIR, "summary_results.csv"), index=False)

# End timing for training execution
timer.end_section("SECTION 10: Model Training Execution")
timer.start_section("SECTION 11+: Evaluation & Calibration")

results_df



🚀 Starting SECTION 10: Model Training Execution...

=== Running mbert -> bert-base-multilingual-cased ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

🔥 Enhanced Oversampling: min=1.00, max=68.28
   ├─ Objective boosted samples: 405 (target: weak class at 40% F1)
   └─ Neutral boosted samples: 1874 (target: weak class at 49% F1)


Epoch,Training Loss,Validation Loss,Sent Acc,Sent Prec,Sent Rec,Sent F1,Pol Acc,Pol Prec,Pol Rec,Pol F1,Macro F1 Avg
0,1.1434,No log,0.268896,0.089632,0.333333,0.141276,0.062207,0.020736,0.333333,0.039043,0.090159
1,0.948,No log,0.269565,0.423025,0.334944,0.144555,0.062207,0.020736,0.333333,0.039043,0.091799
3,0.6775,No log,0.346488,0.585167,0.494278,0.357877,0.172575,0.468766,0.42297,0.193428,0.275653
4,0.5284,No log,0.513712,0.644447,0.524632,0.509758,0.486288,0.50702,0.575548,0.419738,0.464748
6,0.3564,No log,0.401338,0.680054,0.512302,0.427373,0.52709,0.504729,0.623186,0.478751,0.453062
7,0.3881,No log,0.408696,0.658292,0.504303,0.427709,0.544482,0.502889,0.629015,0.489064,0.458386
9,0.2761,No log,0.442809,0.632422,0.56384,0.47696,0.418729,0.490286,0.589005,0.399033,0.437997
10,0.2823,No log,0.538462,0.565254,0.613036,0.542352,0.609365,0.527025,0.655939,0.534619,0.538486
12,0.21,No log,0.429431,0.667329,0.533571,0.461921,0.511706,0.510306,0.622634,0.474,0.46796
13,0.2245,No log,0.440134,0.661168,0.556868,0.478901,0.529097,0.525066,0.628009,0.497606,0.488253


✅ SECTION 10: Model Training Execution completed in 57.8m 46s
🕒 Total runtime so far: 60.0m 59s
------------------------------------------------------------

🚀 Starting SECTION 11+: Evaluation & Calibration...


Unnamed: 0,model_key,base_name,test_test_sent_acc,test_test_sent_prec,test_test_sent_rec,test_test_sent_f1,test_test_pol_acc,test_test_pol_prec,test_test_pol_rec,test_test_pol_f1,test_test_macro_f1_avg,test_test_runtime,test_test_samples_per_second,test_test_steps_per_second
0,mbert,bert-base-multilingual-cased,0.53913,0.555329,0.605001,0.535048,0.612709,0.536327,0.654056,0.53849,0.536769,4.886,305.979,19.239


### SECTION 10A


In [13]:
# ============================================================================
# SECTION 10A — VERIFY ARTIFACTS & RESOLVE TOKENIZER + WEIGHTS (v2)
# Builds maps for: tokenizer_dir (usually run root) and weights_dir (checkpoint or run root).
# Run AFTER Section 10 (training) and BEFORE 11B/11C.
# ============================================================================

import os, re, json
from typing import Optional, Dict

def _has_weights(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "pytorch_model.bin")) or os.path.isfile(os.path.join(path, "model.safetensors"))

def _has_tokenizer(path: str) -> bool:
    # Minimal tokenizer files
    return (
        os.path.isfile(os.path.join(path, "tokenizer.json")) or
        os.path.isfile(os.path.join(path, "vocab.txt")) or
        os.path.isfile(os.path.join(path, "spiece.model"))
    )

def _list_checkpoints(run_dir: str):
    if not os.path.isdir(run_dir): return []
    chks = []
    for name in os.listdir(run_dir):
        p = os.path.join(run_dir, name)
        if os.path.isdir(p) and re.match(r"^checkpoint-\d+$", name):
            chks.append(p)
    # sort by gl


## SECTION 11


In [14]:
# ===== Section 11 — Detailed Breakdown Reports (per-class + cross-slices) =====
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os
import json

def per_class_breakdown(y_true, y_pred, class_names):
    rep = classification_report(
        y_true, y_pred,
        target_names=list(class_names),
        output_dict=True, zero_division=0
    )
    # Keep only the class rows in the given order
    rows = []
    for cname in class_names:
        if cname in rep:
            rows.append({
                "class": cname,
                "precision": rep[cname]["precision"],
                "recall":    rep[cname]["recall"],
                "f1":        rep[cname]["f1-score"],
                "support":   int(rep[cname]["support"]),
            })
        else:
            rows.append({"class": cname, "precision": 0.0, "recall": 0.0, "f1": 0.0, "support": 0})
    return pd.DataFrame(rows)

def cross_slice_breakdown(
    slice_true,  # array of ints for the slicing label (e.g., true sentiment indices)
    slice_names, # names of the slicing label classes (e.g., sentiment class names)
    task_true,   # array of ints for the task we evaluate (e.g., true polarity indices)
    task_pred,   # array of ints for the task predictions (e.g., predicted polarity indices)
    task_names,  # names of the task classes (e.g., polarity class names)
    slice_label  # string for the slice axis name, e.g., "sentiment" or "polarity"
):
    """
    For each class s in slice_true, evaluate the task predictions on the subset where slice_true == s.
    Returns one row per slice value, including macro-F1, accuracy, and per-class F1 for the task.
    """
    rows = []
    for idx, sname in enumerate(slice_names):
        mask = (slice_true == idx)
        n = int(mask.sum())
        if n == 0:
            # No samples for this slice in test set
            row = {"slice": sname, "support": 0, "accuracy": np.nan, "macro_f1": np.nan}
            for tname in task_names:
                row[f"f1_{tname}"] = np.nan
            rows.append(row)
            continue

        rep = classification_report(
            task_true[mask], task_pred[mask],
            target_names=list(task_names),
            output_dict=True, zero_division=0
        )
        row = {
            "slice": sname,
            "support": n,
            "accuracy": rep["accuracy"],
            "macro_f1": rep["macro avg"]["f1-score"],
        }
        for tname in task_names:
            row[f"f1_{tname}"] = rep[tname]["f1-score"]
        rows.append(row)

    df = pd.DataFrame(rows)
    # Sort slices by support (desc) for readability
    df = df.sort_values(by="support", ascending=False).reset_index(drop=True)
    return df

# Where to save things
DETAILS_DIR = os.path.join(OUT_DIR, "details")
os.makedirs(DETAILS_DIR, exist_ok=True)

all_breakdowns = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Detailed breakdowns for {key} ===")
    ysent_pred, ypol_pred = pred_cache[key]

    # ---- Per-class reports on the full test set
    sent_per_class = per_class_breakdown(ysent_test, ysent_pred, sent_le.classes_)
    pol_per_class  = per_class_breakdown(ypol_test,  ypol_pred,  pol_le.classes_)

    # Save + show
    sent_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_per_class.csv")
    pol_csv  = os.path.join(DETAILS_DIR, f"{key}_polarization_per_class.csv")
    sent_per_class.to_csv(sent_csv, index=False)
    pol_per_class.to_csv(pol_csv, index=False)

    print("\nSentiment — per class (precision/recall/F1/support):")
    display(sent_per_class)

    print("\nPolarization — per class (precision/recall/F1/support):")
    display(pol_per_class)

    # ---- Cross-slice reports
    # Polarity performance within each (true) sentiment slice
    pol_given_sent = cross_slice_breakdown(
        slice_true=ysent_test, slice_names=sent_le.classes_,
        task_true=ypol_test,   task_pred=ypol_pred, task_names=pol_le.classes_,
        slice_label="sentiment"
    )
    pol_given_sent_csv = os.path.join(DETAILS_DIR, f"{key}_polarity_given_sentiment.csv")
    pol_given_sent.to_csv(pol_given_sent_csv, index=False)

    print("\nPolarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):")
    display(pol_given_sent)

    # Sentiment performance within each (true) polarity slice
    sent_given_pol = cross_slice_breakdown(
        slice_true=ypol_test,  slice_names=pol_le.classes_,
        task_true=ysent_test,  task_pred=ysent_pred, task_names=sent_le.classes_,
        slice_label="polarity"
    )
    sent_given_pol_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_given_polarity.csv")
    sent_given_pol.to_csv(sent_given_pol_csv, index=False)

    print("\nSentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):")
    display(sent_given_pol)

    # Keep for a single JSON bundle if you like
    all_breakdowns[key] = {
        "sentiment_per_class_csv": sent_csv,
        "polarization_per_class_csv": pol_csv,
        "polarity_given_sentiment_csv": pol_given_sent_csv,
        "sentiment_given_polarity_csv": sent_given_pol_csv
    }

# Optional: write an index JSON pointing to all CSVs
with open(os.path.join(DETAILS_DIR, "index.json"), "w") as f:
    json.dump(all_breakdowns, f, indent=2)
print("\nSaved detailed breakdowns to:", DETAILS_DIR)



=== Detailed breakdowns for mbert ===

Sentiment — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,negative,0.850325,0.442438,0.582034,886
1,neutral,0.385838,0.665835,0.488564,401
2,positive,0.429825,0.706731,0.534545,208



Polarization — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,non_polarized,0.527675,0.657471,0.585466,435
1,objective,0.217391,0.722222,0.33419,90
2,partisan,0.863914,0.582474,0.695813,970



Polarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_non_polarized,f1_objective,f1_partisan
0,negative,886,0.648984,0.487851,0.525253,0.176796,0.761506
1,neutral,401,0.581047,0.555678,0.655172,0.509804,0.502058
2,positive,208,0.519231,0.483506,0.56,0.363636,0.526882



Sentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_negative,f1_neutral,f1_positive
0,partisan,970,0.549485,0.508176,0.655797,0.318271,0.550459
1,non_polarized,435,0.491954,0.441958,0.199005,0.604255,0.522613
2,objective,90,0.655556,0.554929,0.47619,0.77193,0.416667



Saved detailed breakdowns to: ./runs_mbert_optimized/details



### SECTION 11A
Changed from 11C

In [15]:
# ============================================================================
# SECTION 11C — MULTICLASS POLARITY CALIBRATION (v2)
# ============================================================================

from sklearn.metrics import classification_report
import numpy as np, json, os
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, TrainingArguments, DataCollatorWithPadding

# ============================================================================
# Helper Functions for Calibration
# ============================================================================

class _PlainPairDS(Dataset):
    """Simple dataset for inference-only (no labels needed)"""
    def __init__(self, titles, texts, tokenizer, max_length=224):
        self.titles, self.texts = list(titles), list(texts)
        self.tok = tokenizer
        self.max_length = max_length
        self.use_tt = "token_type_ids" in tokenizer.model_input_names

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.tok(
            text=str(self.titles[idx]),
            text_pair=str(self.texts[idx]),
            truncation="only_second",
            max_length=self.max_length,
            return_token_type_ids=self.use_tt
        )

def _get_pol_logits(model_key, titles, texts):
    """Get polarization logits from trained model"""
    # Load tokenizer and model
    run_dir = os.path.join(OUT_DIR, model_key)
    model_name = MODEL_CONFIGS[model_key]["name"]

    print(f"   Loading model from: {run_dir}")
    tokenizer = AutoTokenizer.from_pretrained(run_dir if os.path.exists(os.path.join(run_dir, "tokenizer.json")) else model_name)

    # Rebuild model and load weights
    model = MultiTaskModel(model_name, num_sent_classes, num_pol_classes)

    # Load weights
    model_file = os.path.join(run_dir, "pytorch_model.bin")
    if os.path.exists(model_file):
        model.load_state_dict(torch.load(model_file, map_location=device), strict=False)
    else:
        print(f"   Warning: No trained weights found at {model_file}, using untrained model")

    model.to(device)
    model.eval()

    # Create dataset and trainer
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
    args = TrainingArguments(
        output_dir=os.path.join(run_dir, "calib_tmp"),
        per_device_eval_batch_size=64,
        report_to="none"
    )

    dummy_trainer = MultiTaskTrainer(
        model=model,
        args=args,
        data_collator=collator,
        class_weights=None,
        task_weights=None
    )

    ds = _PlainPairDS(titles, texts, tokenizer, MAX_LENGTH)
    out = dummy_trainer.predict(ds)
    _, pol_logits = out.predictions

    return pol_logits

# ============================================================================
# Calibration Functions
# ============================================================================

def coord_search_biases(pol_logits_val, y_val, class_names, passes=2, grid=(-0.8, 0.8, 0.1)):
    lo, hi, step = grid
    C = pol_logits_val.shape[1]
    b = np.zeros(C, dtype=np.float32)

    def macro_f1_with(bias_vec):
        y_pred = np.argmax(pol_logits_val + bias_vec.reshape(1, -1), axis=1)
        rep = classification_report(y_val, y_pred, target_names=class_names, output_dict=True, zero_division=0)
        return rep["macro avg"]["f1-score"]

    best = macro_f1_with(b)
    for _ in range(passes):
        improved = False
        for c in range(C):
            best_b_c, best_score_c = b[c], best
            for val in np.arange(lo, hi + 1e-9, step):
                b_try = b.copy()
                b_try[c] = val
                score = macro_f1_with(b_try)
                if score > best_score_c + 1e-6:
                    best_score_c, best_b_c = score, val
            if best_b_c != b[c]:
                b[c] = best_b_c
                best = best_score_c
                improved = True
        if not improved:
            break
    return b, float(best)

CALIB_DIR2 = os.path.join(OUT_DIR, "calibration_vector")
os.makedirs(CALIB_DIR2, exist_ok=True)

print("🎯 MULTICLASS CALIBRATION - Optimize prediction biases for better performance")
print("="*70)

for key in MODELS_TO_RUN:
    print(f"\n🔧 Calibrating {key} ({MODEL_CONFIGS[key]['name']})...")

    print(f"📊 Step 1: Extracting polarization logits from trained model...")
    pol_val_logits = _get_pol_logits(key, X_val[TITLE_COL].values,  X_val[TEXT_COL].values)
    pol_tst_logits = _get_pol_logits(key, X_test[TITLE_COL].values, X_test[TEXT_COL].values)
    print(f"   ✓ Validation logits shape: {pol_val_logits.shape}")
    print(f"   ✓ Test logits shape: {pol_tst_logits.shape}")

    y_val = ypol_val
    y_tst = ypol_test
    class_names = list(pol_le.classes_)

    print(f"🔍 Step 2: Searching for optimal bias vector (coordinate search)...")
    b_vec, val_macro = coord_search_biases(pol_val_logits, y_val, class_names, passes=3, grid=(-0.8, 0.8, 0.1))
    print(f"   ✓ Optimal bias vector found (VAL macro-F1={val_macro:.3f}):")
    for cname, bias_val in zip(class_names, b_vec):
        print(f"      • {cname:>13}: {bias_val:+.2f}")

    # Test before/after
    print(f"📈 Step 3: Evaluating calibration impact on test set...")
    y_before = np.argmax(pol_tst_logits, axis=1)
    rep_before = classification_report(y_tst, y_before, target_names=class_names, output_dict=True, zero_division=0)

    y_after = np.argmax(pol_tst_logits + b_vec.reshape(1, -1), axis=1)
    rep_after  = classification_report(y_tst, y_after, target_names=class_names, output_dict=True, zero_division=0)

    improvement = rep_after['macro avg']['f1-score'] - rep_before['macro avg']['f1-score']
    print(f"\n   📊 TEST MACRO-F1: {rep_before['macro avg']['f1-score']:.3f} → {rep_after['macro avg']['f1-score']:.3f} ({improvement:+.3f})\n")
    print("   Per-class breakdown:")
    for cname in class_names:
        b = rep_before[cname]; a = rep_after[cname]
        f1_change = a['f1-score'] - b['f1-score']
        emoji = "📈" if f1_change > 0 else "📉" if f1_change < 0 else "➡️"
        print(f"   {emoji} {cname:>13}: P={b['precision']:.3f} R={b['recall']:.3f} F1={b['f1-score']:.3f} (n={int(b['support'])})"
              f"  →  P={a['precision']:.3f} R={a['recall']:.3f} F1={a['f1-score']:.3f} ({f1_change:+.3f})")

    # Save calibration results
    calib_file = os.path.join(CALIB_DIR2, f"{key}_bias_vector.json")
    with open(calib_file, "w") as f:
        json.dump({
            "bias_vector": {class_names[i]: float(b_vec[i]) for i in range(len(class_names))},
            "val_macro_f1": val_macro,
            "test_macro_f1_before": float(rep_before["macro avg"]["f1-score"]),
            "test_macro_f1_after":  float(rep_after["macro avg"]["f1-score"])
        }, f, indent=2)

    print(f"\n✅ Calibration complete! Bias vector saved to:")
    print(f"   {calib_file}")

print(f"\n{'='*70}")
print(f"🎉 CALIBRATION FINISHED - All models optimized!")


🎯 MULTICLASS CALIBRATION - Optimize prediction biases for better performance

🔧 Calibrating mbert (bert-base-multilingual-cased)...
📊 Step 1: Extracting polarization logits from trained model...
   Loading model from: ./runs_mbert_optimized/mbert


   Loading model from: ./runs_mbert_optimized/mbert


   ✓ Validation logits shape: (1495, 3)
   ✓ Test logits shape: (1495, 3)
🔍 Step 2: Searching for optimal bias vector (coordinate search)...
   ✓ Optimal bias vector found (VAL macro-F1=0.358):
      • non_polarized: -0.30
      •     objective: +0.20
      •      partisan: +0.00
📈 Step 3: Evaluating calibration impact on test set...

   📊 TEST MACRO-F1: 0.292 → 0.285 (-0.007)

   Per-class breakdown:
   📉 non_polarized: P=0.273 R=0.007 F1=0.013 (n=435)  →  P=0.000 R=0.000 F1=0.000 (-0.013)
   📈     objective: P=0.095 R=0.133 F1=0.111 (n=90)  →  P=0.106 R=0.511 F1=0.176 (+0.065)
   📉      partisan: P=0.644 R=0.901 F1=0.751 (n=970)  →  P=0.649 R=0.710 F1=0.678 (-0.073)

✅ Calibration complete! Bias vector saved to:
   ./runs_mbert_optimized/calibration_vector/mbert_bias_vector.json

🎉 CALIBRATION FINISHED - All models optimized!


## SECTION 12

In [16]:
# ===== Section 12 — Length Diagnostics (clean) =====
import warnings

def token_lengths_summary(texts, titles, tokenizer, n=5000):
    # Random sample (or full if dataset is small)
    n = min(n, len(texts))
    idx = np.random.choice(len(texts), size=n, replace=False) if len(texts) > n else np.arange(len(texts))

    lengths = []
    # Silence the "sequence > 512" warnings emitted by some tokenizers for inspection
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Token indices sequence length is longer.*")
        for i in idx:
            s = f"{titles[i]} [SEP] {texts[i]}"
            # We want raw length pre-truncation to choose MAX_LENGTH wisely
            ids = tokenizer.encode(s, add_special_tokens=True, truncation=False)
            lengths.append(len(ids))

    arr = np.array(lengths)
    stats = {
        "mean": float(arr.mean()),
        "p50":  float(np.percentile(arr, 50)),
        "p90":  float(np.percentile(arr, 90)),
        "p95":  float(np.percentile(arr, 95)),
        "p99":  float(np.percentile(arr, 99)),
        "max":  int(arr.max())
    }
    print("Token length stats:", stats)
    return stats

for key in MODELS_TO_RUN:
    name = MODEL_CONFIGS[key]["name"]
    tok = AutoTokenizer.from_pretrained(name)
    print(f"\n[{key}] {name}")
    token_lengths_summary(
        texts=X_train[TEXT_COL].values,
        titles=X_train[TITLE_COL].values,
        tokenizer=tok,
        n=5000
    )

# Tip:
# If p95 is comfortably < 192, you're fine. If you see p95 > 192, consider MAX_LENGTH=224
# (Update in Section 3 if you decide to bump it.)

# Final timing summary
timer.end_section("SECTION 11+: Evaluation & Calibration")
timer.get_summary()



[mbert] bert-base-multilingual-cased


Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors


Token length stats: {'mean': 109.174, 'p50': 97.0, 'p90': 179.0, 'p95': 194.0, 'p99': 226.02000000000044, 'max': 916}
✅ SECTION 11+: Evaluation & Calibration completed in 16.5s
🕒 Total runtime so far: 1.0h 0m
------------------------------------------------------------

⏱️  EXECUTION TIME SUMMARY
SECTION 2: Environment & Imports         : 9.2s
SECTION 3: Configuration Setup           : 13.1s
SECTION 4: Data Loading & Preprocessing  : 1.2m 11s
SECTION 5-9: Model Architecture & Training Setup : 33.7s
SECTION 10: Model Training Execution     : 57.8m 46s
SECTION 11+: Evaluation & Calibration    : 16.5s
TOTAL EXECUTION TIME                     : 1.0h 0m
