## SECTION 1: LOCAL ENVIRONMENT SETUP

In [1]:
# ============================================================================
# CRITICAL: Set environment variables BEFORE importing PyTorch
# ============================================================================
import os

# Allocator tuning against GPU memory fragmentation. Both options are packed into
# ONE comma-separated value; assigning the variable twice would keep only the
# second setting.
os.environ.update(
    PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True,max_split_size_mb:128'
)

print(
    "✅ GPU memory management environment variables set",
    "   PYTORCH_CUDA_ALLOC_CONF = " + str(os.environ.get('PYTORCH_CUDA_ALLOC_CONF')),
    sep="\n",
)


✅ GPU memory management environment variables set
   PYTORCH_CUDA_ALLOC_CONF = expandable_segments:True,max_split_size_mb:128


In [2]:
# ============================================================================
# SECTION 1: LOCAL ENVIRONMENT SETUP (Windows + RTX 3060)
# ============================================================================

import sys, os
import numpy as np
import pandas as pd
import torch
import transformers
from packaging import version

# Report the interpreter and library versions this kernel actually loaded.
print(
    "=== LOCAL ENVIRONMENT CHECK ===",
    f"Python version: {sys.version.split()[0]}",
    f"PyTorch version: {torch.__version__}",
    f"Transformers version: {transformers.__version__}",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

# GPU details are only queried when CUDA is usable; device index 0 is reported.
print(f"\nCUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(
        f"CUDA Version: {torch.version.cuda}",
        f"GPU Count: {torch.cuda.device_count()}",
        f"GPU Name: {torch.cuda.get_device_name(0)}",
        f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB",
        sep="\n",
    )

# Fail early when the installed transformers predates the minimum this notebook expects.
assert version.parse("4.26.0") <= version.parse(transformers.__version__), \
    "Transformers too old for modern TrainingArguments."

print("\n✅ Environment check complete!")

  from .autonotebook import tqdm as notebook_tqdm


=== LOCAL ENVIRONMENT CHECK ===
Python version: 3.13.5
PyTorch version: 2.6.0+cu124
Transformers version: 4.57.1
NumPy version: 2.1.1
Pandas version: 2.2.3

CUDA Available: True
CUDA Version: 12.4
GPU Count: 1
GPU Name: NVIDIA GeForce RTX 3060
GPU Memory: 12.00 GB

✅ Environment check complete!


### SECTION 1.5: Compatibility Shim

In [3]:
# ============================================================================
# SECTION 1.5: TRAININGARGUMENTS COMPATIBILITY SHIM
# ============================================================================

import inspect
import transformers as _tf

print("Transformers version loaded in memory:", _tf.__version__)

def _supported_kwargs_of_training_args():
    """Return the parameter names of ``TrainingArguments.__init__`` as a set.

    The names come from ``inspect.signature``. If the import or the inspection
    raises, the error is printed and an empty set is returned instead.
    """
    try:
        from transformers import TrainingArguments
        init_params = inspect.signature(TrainingArguments.__init__).parameters
    except Exception as e:
        print("[Compat] Could not inspect TrainingArguments:", e)
        return set()
    return set(init_params)

# Computed once at cell execution; make_training_args_compat filters against it.
_SUPPORTED_TA_KEYS = _supported_kwargs_of_training_args()
print("Sample of supported TrainingArguments kwargs:", sorted(_SUPPORTED_TA_KEYS)[:12], "...")

def resolve_training_args_kwargs(kwargs, supported):
    """Split ``kwargs`` into those ``TrainingArguments`` accepts and those it does not.

    Args:
        kwargs: Mapping of requested TrainingArguments keyword arguments.
        supported: Collection of keyword names the installed version accepts.
            If empty (signature inspection failed), nothing is filtered.

    Returns:
        ``(filtered, ignored)`` — a dict of kwargs to pass on and a list of the
        dropped key names, in the order they appeared.
    """
    resolved = dict(kwargs)
    if not supported:
        # Without a known signature, dropping every kwarg would silently build a
        # default TrainingArguments; pass everything through instead.
        return resolved, []

    old_key, new_key = "evaluation_strategy", "eval_strategy"
    # Rename toward the spelling THIS version understands. Renaming unconditionally
    # to "eval_strategy" would get the value filtered out on versions that only
    # know "evaluation_strategy", silently disabling evaluation.
    if new_key in supported:
        preferred, other = new_key, old_key
    elif old_key in supported:
        preferred, other = old_key, new_key
    else:
        preferred = other = None
    if preferred is not None and other in resolved and preferred not in resolved:
        resolved[preferred] = resolved.pop(other)

    filtered = {k: v for k, v in resolved.items() if k in supported}
    ignored = [k for k in resolved if k not in supported]
    return filtered, ignored


def make_training_args_compat(**kwargs):
    """Create TrainingArguments while dropping any unsupported kwargs.

    The evaluation-strategy argument may be given under either spelling
    (``evaluation_strategy`` or ``eval_strategy``); it is forwarded under the
    name the installed version supports. Dropped keys are printed.
    """
    from transformers import TrainingArguments

    filtered, ignored = resolve_training_args_kwargs(kwargs, _SUPPORTED_TA_KEYS)
    if ignored:
        print("[Compat] Ignored unsupported TrainingArguments keys:", ignored)
    return TrainingArguments(**filtered)

def get_early_stopping_callbacks(patience: int):
    """Build the early-stopping callback list for a Trainer.

    Args:
        patience: Value forwarded as ``early_stopping_patience``.

    Returns:
        A one-element list holding an ``EarlyStoppingCallback``; an empty list
        (after printing the error) if importing or constructing it fails.
    """
    callbacks = []
    try:
        from transformers import EarlyStoppingCallback
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=patience))
    except Exception as e:
        print("[Compat] EarlyStoppingCallback unavailable:", e)
        return []
    return callbacks

Transformers version loaded in memory: 4.57.1
Sample of supported TrainingArguments kwargs: ['accelerator_config', 'adafactor', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'auto_find_batch_size', 'average_tokens_across_devices', 'batch_eval_metrics', 'bf16', 'bf16_full_eval', 'data_seed', 'dataloader_drop_last'] ...


## SECTION 2: IMPORTS AND TIMING UTILITY

In [4]:
# ============================================================================
# SECTION 2: IMPORTS AND BASIC SETUP
# ============================================================================

import time
from datetime import timedelta

# ============================================================================
# TIMING UTILITY - Track execution time for each section
# ============================================================================
class SectionTimer:
    """Wall-clock timer that records how long each notebook section takes.

    Attributes:
        section_times: dict mapping section name -> elapsed seconds, filled by
            ``end_section``.
        start_time: ``time.time()`` of the most recent ``start_section`` call,
            or None before the first one.
        total_start: ``time.time()`` when the timer was created.
    """

    def __init__(self):
        self.section_times = {}
        self.start_time = None
        self.total_start = time.time()

    @staticmethod
    def format_duration(seconds):
        """Render a duration in seconds as ``"12.3s"``, ``"4m 5s"`` or ``"1h 30m"``.

        Whole units are used for the larger component, so 90 seconds is
        ``"1m 30s"`` rather than the previous, misleading ``"1.5m 30s"``.
        """
        # 59.95 and above would print as "60.0s"; roll those over to minutes.
        if seconds < 59.95:
            return f"{seconds:.1f}s"
        minutes, secs = divmod(int(round(seconds)), 60)
        if minutes < 60:
            return f"{minutes}m {secs}s"
        hours, minutes = divmod(minutes, 60)
        return f"{hours}h {minutes}m"

    def start_section(self, section_name):
        """Start timing a section"""
        self.start_time = time.time()
        print(f"\n🚀 Starting {section_name}...")

    def end_section(self, section_name):
        """End timing and display results"""
        # If no section was ever started, measure from now (elapsed is ~0).
        if self.start_time is None:
            self.start_time = time.time()

        elapsed = time.time() - self.start_time
        self.section_times[section_name] = elapsed

        total_elapsed = time.time() - self.total_start

        print(f"✅ {section_name} completed in {self.format_duration(elapsed)}")
        print(f"🕒 Total runtime so far: {self.format_duration(total_elapsed)}")
        print("-" * 60)

    def get_summary(self):
        """Get timing summary"""
        total = time.time() - self.total_start
        print("\n" + "="*60)
        print("⏱️  EXECUTION TIME SUMMARY")
        print("="*60)
        for section, elapsed in self.section_times.items():
            print(f"{section:<40} : {self.format_duration(elapsed)}")

        print(f"{'='*40} : {'='*10}")
        print(f"{'TOTAL EXECUTION TIME':<40} : {self.format_duration(total)}")
        print("="*60)

# Initialize global timer
timer = SectionTimer()
timer.start_section("SECTION 2: Environment & Imports")

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
import warnings

# Blanket filter: every warning category is suppressed from here on.
warnings.filterwarnings(action='ignore')

# Fix the seed of each RNG in use so reruns draw the same random numbers.
random.seed(a=42)
np.random.seed(seed=42)
torch.manual_seed(seed=42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed=42)

# Close out the import section's timing and open the next one.
timer.end_section("SECTION 2: Environment & Imports")
timer.start_section("SECTION 3: Configuration Setup")


🚀 Starting SECTION 2: Environment & Imports...
✅ SECTION 2: Environment & Imports completed in 4.0s
🕒 Total runtime so far: 4.0s
------------------------------------------------------------

🚀 Starting SECTION 3: Configuration Setup...
✅ SECTION 2: Environment & Imports completed in 4.0s
🕒 Total runtime so far: 4.0s
------------------------------------------------------------

🚀 Starting SECTION 3: Configuration Setup...


In [5]:
import os, random, json, math
from dataclasses import dataclass
from typing import Dict, Tuple, Optional, List

import torch.nn as nn

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer,
    DataCollatorWithPadding, EarlyStoppingCallback
)

def seed_all(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also switches cuDNN to deterministic mode and turns its benchmark
    autotuner off.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

seed_all(42)
# Use the GPU when one is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

## SECTION 3: CONFIGURATION

In [6]:
# ============================================================================
# REMBERT LOCAL TRAINING - RUN #1 BASELINE
# Expected: 50-75 min training time, 65-70% macro-F1 target
# Hardware: RTX 3060 (12GB VRAM), Python 3.13.5, PyTorch 2.6.0+cu124
# ============================================================================

# Data paths. The defaults are the original local (Windows) locations; set the
# REMBERT_CSV_PATH / REMBERT_OUT_DIR environment variables to run on another
# machine without editing the notebook.
CSV_PATH = os.environ.get(
    "REMBERT_CSV_PATH",
    'd:/School/NotebookRuns/augmented_adjudications_2025-10-22.csv',
)

# Column names expected in the CSV (validated when the file is loaded).
TITLE_COL = "Title"
TEXT_COL  = "Comment"
SENT_COL  = "Final Sentiment"
POL_COL   = "Final Polarization"

# Model configuration - RemBERT
# Each entry maps a short key to the Hugging Face model id ("name") and a
# human-readable description ("desc").
MODEL_CONFIGS = {
    "rembert": {"name": "google/rembert", "desc": "RemBERT (110 langs, decoupled embeddings)"},
}
MODELS_TO_RUN = ["rembert"]
OUT_DIR = os.environ.get("REMBERT_OUT_DIR", "./runs_rembert")

# ============================================================================
# TRAINING HYPERPARAMETERS - sized for an RTX 3060 (12GB VRAM)
# Values are positioned relative to the earlier mBERT (63%) and XLM-R (68%)
# runs; the per-device batch size was cut to prevent a system memory crash.
# ============================================================================
MAX_LENGTH = 224             # token budget per example (down from 256 to save memory)
EPOCHS = 18                  # carried over from XLM-R Run #14
BATCH_SIZE = 8               # per device; lowered from 14 to prevent a crash
GRAD_ACCUM_STEPS = 5         # 8 x 5 = effective batch of 40 (was 42)
LR = 2.5e-5                  # mBERT run used 2.5e-5, XLM-R run used 3.0e-5
WEIGHT_DECAY = 0.035         # midpoint of mBERT (0.03) and XLM-R (0.04)
WARMUP_RATIO = 0.22          # between mBERT (0.20) and XLM-R (0.25)
EARLY_STOP_PATIENCE = 7      # midpoint of mBERT (8) and XLM-R (6)

# ---- Loss selection, per task ----------------------------------------------
# The LABEL_SMOOTH_* values are only read by the cross-entropy branch of the
# trainer's loss functions, i.e. when the matching USE_FOCAL_* flag is False.
USE_FOCAL_SENTIMENT, USE_FOCAL_POLARITY = True, True
FOCAL_GAMMA_SENTIMENT, FOCAL_GAMMA_POLARITY = 2.5, 2.8
LABEL_SMOOTH_SENTIMENT, LABEL_SMOOTH_POLARITY = 0.10, 0.08

# Relative contribution of each task's loss to the total loss.
TASK_LOSS_WEIGHTS = {"sentiment": 1.0, "polarization": 1.4}

# ---- Stability ---------------------------------------------------------------
MAX_GRAD_NORM = 1.0
USE_GRADIENT_CHECKPOINTING = True

# ---- Learning-rate schedule ----------------------------------------------------
LR_SCHEDULER_TYPE = "cosine"
NUM_CYCLES = 0.5

# ============================================================================
# CLASS WEIGHT MULTIPLIERS (taken from XLM-R Run #14)
# Applied by class name on top of the balanced weights; the result is capped
# at MAX_CLASS_WEIGHT.
# ============================================================================
CLASS_WEIGHT_MULT = {
    "sentiment":    {"negative": 1.15, "neutral": 1.20, "positive": 1.40},
    "polarization": {"non_polarized": 1.20, "objective": 2.05, "partisan": 1.05},
}
MAX_CLASS_WEIGHT = 8.0

# ============================================================================
# OVERSAMPLING (taken from XLM-R Run #14)
# ============================================================================
USE_OVERSAMPLING = True
USE_JOINT_OVERSAMPLING = True
USE_SMART_OVERSAMPLING = True
JOINT_ALPHA = 0.65
JOINT_OVERSAMPLING_MAX_MULT = 4.6
OBJECTIVE_BOOST_MULT = 1.75
NEUTRAL_BOOST_MULT = 0.90

# ============================================================================
# CLASSIFICATION HEAD ARCHITECTURE
# ============================================================================
HEAD_HIDDEN = 768            # width of the shared trunk output
HEAD_DROPOUT = 0.24
REP_POOLING = "last4_mean"   # the model also recognises "pooler" and "cls"
HEAD_LAYERS = 3

# ============================================================================
# REGULARIZATION
# ============================================================================
# R-Drop: consistency term between two dropout-perturbed forward passes.
USE_RDROP = True
RDROP_ALPHA = 0.6
RDROP_WARMUP_EPOCHS = 2

# LLRD (layer-wise learning-rate decay)
USE_LLRD = True
LLRD_DECAY = 0.88
HEAD_LR_MULT = 3.5

# Create the run directory up front; later cells write label maps into it.
os.makedirs(OUT_DIR, exist_ok=True)

# ============================================================================
# CONFIGURATION SUMMARY
# One print call, one argument per output line; "" yields a blank line.
# ============================================================================
print(
    "=" * 70,
    "🤖 RemBERT LOCAL TRAINING - RUN #1 BASELINE",
    "=" * 70,
    "📊 Hardware:",
    "   GPU: RTX 3060 (12GB VRAM)",
    f"   Python: {sys.version.split()[0]}",
    f"   PyTorch: {torch.__version__}",
    f"   Transformers: {transformers.__version__}",
    "",
    "🎯 Model: google/rembert (110 languages, decoupled embeddings)",
    "   Expected Performance: 65-70% macro-F1",
    "   Expected Time: 50-75 minutes",
    "   vs mBERT (63.06%): +2-7% target",
    "   vs XLM-R (67.80%): competitive",
    "",
    "📏 Training Configuration:",
    f"   MAX_LENGTH: {MAX_LENGTH} (balanced)",
    f"   Epochs: {EPOCHS}",
    f"   Batch Size: {BATCH_SIZE} (effective: {BATCH_SIZE * GRAD_ACCUM_STEPS})",
    f"   Learning Rate: {LR}",
    f"   Weight Decay: {WEIGHT_DECAY}",
    f"   Warmup Ratio: {WARMUP_RATIO}",
    f"   Early Stop Patience: {EARLY_STOP_PATIENCE}",
    "",
    "🏗️  Architecture:",
    f"   Head Hidden: {HEAD_HIDDEN}",
    f"   Head Layers: {HEAD_LAYERS}",
    f"   Head Dropout: {HEAD_DROPOUT}",
    f"   Pooling: {REP_POOLING}",
    "",
    "⚖️  Class Weighting:",
    f"   Sentiment: {CLASS_WEIGHT_MULT['sentiment']}",
    f"   Polarization: {CLASS_WEIGHT_MULT['polarization']}",
    "",
    "📊 Oversampling:",
    f"   Joint Alpha: {JOINT_ALPHA}",
    f"   Max Multiplier: {JOINT_OVERSAMPLING_MAX_MULT}x",
    f"   Objective Boost: {OBJECTIVE_BOOST_MULT}x",
    f"   Neutral Boost: {NEUTRAL_BOOST_MULT}x",
    "",
    "🔥 Advanced Techniques:",
    f"   Focal Loss: γ_sent={FOCAL_GAMMA_SENTIMENT}, γ_pol={FOCAL_GAMMA_POLARITY}",
    f"   R-Drop: α={RDROP_ALPHA}, warmup={RDROP_WARMUP_EPOCHS} epochs",
    f"   LLRD: decay={LLRD_DECAY}, head_mult={HEAD_LR_MULT}x",
    "",
    f"💾 Output: {OUT_DIR}",
    "=" * 70,
    sep="\n",
)

timer.end_section("SECTION 3: Configuration Setup")
timer.start_section("SECTION 4: Data Loading & Preprocessing")

🤖 RemBERT LOCAL TRAINING - RUN #1 BASELINE
📊 Hardware:
   GPU: RTX 3060 (12GB VRAM)
   Python: 3.13.5
   PyTorch: 2.6.0+cu124
   Transformers: 4.57.1

🎯 Model: google/rembert (110 languages, decoupled embeddings)
   Expected Performance: 65-70% macro-F1
   Expected Time: 50-75 minutes
   vs mBERT (63.06%): +2-7% target
   vs XLM-R (67.80%): competitive

📏 Training Configuration:
   MAX_LENGTH: 224 (balanced)
   Epochs: 18
   Batch Size: 8 (effective: 40)
   Learning Rate: 2.5e-05
   Weight Decay: 0.035
   Warmup Ratio: 0.22
   Early Stop Patience: 7

🏗️  Architecture:
   Head Hidden: 768
   Head Layers: 3
   Head Dropout: 0.24
   Pooling: last4_mean

⚖️  Class Weighting:
   Sentiment: {'negative': 1.15, 'neutral': 1.2, 'positive': 1.4}
   Polarization: {'non_polarized': 1.2, 'objective': 2.05, 'partisan': 1.05}

📊 Oversampling:
   Joint Alpha: 0.65
   Max Multiplier: 4.6x
   Objective Boost: 1.75x
   Neutral Boost: 0.9x

🔥 Advanced Techniques:
   Focal Loss: γ_sent=2.5, γ_pol=2.8
   R-

## SECTION 4: DATA LOADING

In [7]:
# ===== Section 4 — Load & Prepare Data =====
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()   # tolerate stray whitespace in header names

# Stop immediately if any column the pipeline relies on is absent.
required = [TITLE_COL, TEXT_COL, SENT_COL, POL_COL]
missing = list(filter(lambda col: col not in df.columns, required))
if missing:
    raise ValueError(f"Missing expected columns: {missing}. Found: {list(df.columns)}")

# Keep only rows where every required field is present.
df = df.dropna(subset=required).reset_index(drop=True)

# Encode labels: one encoder per task, fitted on the cleaned frame.
sent_le = LabelEncoder().fit(df[SENT_COL])
pol_le  = LabelEncoder().fit(df[POL_COL])
df = df.assign(
    sent_y=sent_le.transform(df[SENT_COL]),
    pol_y=pol_le.transform(df[POL_COL]),
)

num_sent_classes, num_pol_classes = len(sent_le.classes_), len(pol_le.classes_)

print("Sentiment classes:", {i: c for i, c in enumerate(sent_le.classes_)})
print("Polarization classes:", {i: c for i, c in enumerate(pol_le.classes_)})

# Splits: 70% train, then the remaining 30% halved into validation and test.
# Both splits are stratified on the sentiment label only.
X = df[[TITLE_COL, TEXT_COL]].copy()
y_sent, y_pol = df["sent_y"].values, df["pol_y"].values

X_train, X_tmp, ysent_train, ysent_tmp, ypol_train, ypol_tmp = train_test_split(
    X, y_sent, y_pol,
    test_size=0.3, random_state=42, stratify=y_sent,
)
X_val, X_test, ysent_val, ysent_test, ypol_val, ypol_test = train_test_split(
    X_tmp, ysent_tmp, ypol_tmp,
    test_size=0.5, random_state=42, stratify=ysent_tmp,
)

print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))

# Balanced class weights (called below with TRAIN labels only)
def safe_class_weights(y, n_classes):
    """Return float32 "balanced" class weights for integer labels ``y``.

    Args:
        y: Array of integer class labels.
        n_classes: Number of classes; the result has this length.

    Returns:
        The output of ``compute_class_weight("balanced", ...)`` cast to float32,
        or an all-ones vector when at least one class has no samples in ``y``.
    """
    per_class_counts = np.bincount(y, minlength=n_classes)
    if (per_class_counts == 0).any():
        # A missing class cannot be weighted; fall back to uniform weights.
        return np.ones(n_classes, dtype=np.float32)
    balanced = compute_class_weight("balanced", classes=np.arange(n_classes), y=y)
    return balanced.astype(np.float32)

sent_weights_np = safe_class_weights(ysent_train, num_sent_classes)
pol_weights_np  = safe_class_weights(ypol_train,  num_pol_classes)

# Lookup from class name to its encoded index, one per task.
sent_name_to_idx = {name: i for i, name in enumerate(sent_le.classes_)}
pol_name_to_idx  = {name: i for i, name in enumerate(pol_le.classes_)}

# Scale the balanced weights in place by the user multipliers. Names in
# CLASS_WEIGHT_MULT that do not occur among the encoded classes are skipped.
for task_mults, name_to_idx, weights in (
    (CLASS_WEIGHT_MULT["sentiment"],    sent_name_to_idx, sent_weights_np),
    (CLASS_WEIGHT_MULT["polarization"], pol_name_to_idx,  pol_weights_np),
):
    for cname, mult in task_mults.items():
        idx = name_to_idx.get(cname)
        if idx is not None:
            weights[idx] *= float(mult)

# Bound every weight to the range [0.1, MAX_CLASS_WEIGHT].
sent_weights_np = sent_weights_np.clip(0.1, MAX_CLASS_WEIGHT)
pol_weights_np = pol_weights_np.clip(0.1, MAX_CLASS_WEIGHT)

print("Final sentiment class weights:", dict(zip(sent_le.classes_, map(float, sent_weights_np))))
print("Final polarization class weights:", dict(zip(pol_le.classes_, map(float, pol_weights_np))))

# Persist the index -> label mapping of each task next to the run outputs.
for fname, encoder in (
    ("label_map_sentiment.json", sent_le),
    ("label_map_polarization.json", pol_le),
):
    with open(os.path.join(OUT_DIR, fname), "w") as f:
        json.dump({int(i): label for i, label in enumerate(encoder.classes_)}, f, indent=2)

timer.end_section("SECTION 4: Data Loading & Preprocessing")
timer.start_section("SECTION 5-9: Model Architecture & Training Setup")

Sentiment classes: {0: 'negative', 1: 'neutral', 2: 'positive'}
Polarization classes: {0: 'non_polarized', 1: 'objective', 2: 'partisan'}
Train size: 9144 Val size: 1959 Test size: 1960
Final sentiment class weights: {'negative': 0.8481006622314453, 'neutral': 0.9046748280525208, 'positive': 4.40826416015625}
Final polarization class weights: {'non_polarized': 1.2815698385238647, 'objective': 6.1743083000183105, 'partisan': 0.606365978717804}
✅ SECTION 4: Data Loading & Preprocessing completed in 0.1s
🕒 Total runtime so far: 4.1s
------------------------------------------------------------

🚀 Starting SECTION 5-9: Model Architecture & Training Setup...


## SECTION 5: DATASET

In [8]:
# ===== Section 5 — Dataset & Collator =====
from torch.utils.data import Dataset

class TaglishDataset(Dataset):
    """Map-style dataset that tokenizes one (title, text) pair per access.

    Each item is a dict with ``input_ids``, ``attention_mask``, the two label
    tensors (``sentiment_labels``, ``polarization_labels``) and, when the
    tokenizer provides them, ``token_type_ids``. Sequences are not padded here.
    """

    def __init__(self, titles, texts, y_sent, y_pol, tokenizer, max_length=256):
        self.titles, self.texts = list(titles), list(texts)
        self.y_sent, self.y_pol = np.array(y_sent), np.array(y_pol)
        self.tok = tokenizer
        self.max_length = max_length
        # Only request segment ids from tokenizers that list them as a model input.
        self.use_token_type = "token_type_ids" in tokenizer.model_input_names

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Title is the first segment, comment text the second; truncation is
        # restricted to the second segment.
        encoded = self.tok(
            text=str(self.titles[idx]),
            text_pair=str(self.texts[idx]),
            truncation="only_second",
            max_length=self.max_length,
            return_token_type_ids=self.use_token_type,
        )
        sample = {key: encoded[key] for key in ("input_ids", "attention_mask")}
        sample["sentiment_labels"] = torch.tensor(self.y_sent[idx], dtype=torch.long)
        sample["polarization_labels"] = torch.tensor(self.y_pol[idx], dtype=torch.long)
        if self.use_token_type and "token_type_ids" in encoded:
            sample["token_type_ids"] = encoded["token_type_ids"]
        return sample

## SECTION 6: MODEL ARCHITECTURE

In [9]:
# ===== Section 6 — Multi-Task Model =====
import torch.nn.functional as F

def mean_pooling(token_embeddings, attention_mask):
    """Average ``token_embeddings`` over dim 1, counting only unmasked positions.

    ``attention_mask`` is broadcast over the last dimension and cast to float;
    the divisor is clamped to at least 1e-9 so a fully masked row gives zeros
    instead of dividing by zero.
    # assumes token_embeddings is (batch, seq, hidden) and attention_mask is (batch, seq) — TODO confirm
    """
    weights = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    token_sum = torch.sum(token_embeddings * weights, dim=1)
    token_count = torch.clamp(weights.sum(dim=1), min=1e-9)
    return token_sum / token_count

class MultiTaskModel(nn.Module):
    """Shared encoder with a common trunk and two classification heads.

    The encoder output is pooled according to ``REP_POOLING``, passed through
    the trunk (dropout -> linear -> GELU -> LayerNorm -> dropout) and then fed
    to a sentiment head and a polarization head.

    Args:
        base_model_name: Model id or path given to ``AutoModel.from_pretrained``.
        num_sent: Number of sentiment classes (size of the sentiment logits).
        num_pol: Number of polarization classes (size of the polarization logits).
        dropout: Dropout probability applied to the pooled vector before the trunk.
    """

    def __init__(self, base_model_name: str, num_sent: int, num_pol: int, dropout: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        self.hidden = self.encoder.config.hidden_size

        # Enhanced trunk
        self.trunk = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.hidden, HEAD_HIDDEN),
            nn.GELU(),
            nn.LayerNorm(HEAD_HIDDEN),
            nn.Dropout(HEAD_DROPOUT),
        )

        # Both heads share one architecture and differ only in output size.
        # Sentiment is built first so parameter initialisation order is unchanged.
        self.head_sent = self._build_head(num_sent)
        self.head_pol = self._build_head(num_pol)

        # Enable gradient checkpointing
        if USE_GRADIENT_CHECKPOINTING:
            self.encoder.gradient_checkpointing_enable()

    @staticmethod
    def _build_head(num_classes: int) -> nn.Module:
        """Build one head: a plain linear layer, or a 2- or 3-stage MLP per ``HEAD_LAYERS``.

        Layer order inside the returned ``nn.Sequential`` matches the previous
        hand-written heads, so state-dict keys are unchanged.
        """
        if HEAD_LAYERS != 2 and not HEAD_LAYERS >= 3:
            return nn.Linear(HEAD_HIDDEN, num_classes)

        half = HEAD_HIDDEN // 2
        layers = [
            nn.Linear(HEAD_HIDDEN, half),
            nn.GELU(),
            nn.LayerNorm(half),
            nn.Dropout(HEAD_DROPOUT * 0.8),
        ]
        if HEAD_LAYERS == 2:
            layers.append(nn.Linear(half, num_classes))
        else:
            quarter = HEAD_HIDDEN // 4
            layers += [
                nn.Linear(half, quarter),
                nn.GELU(),
                nn.LayerNorm(quarter),
                nn.Dropout(HEAD_DROPOUT * 0.7),
                nn.Linear(quarter, num_classes),
            ]
        return nn.Sequential(*layers)

    def _pool(self, outputs, attention_mask):
        """Reduce encoder outputs to one vector per example according to ``REP_POOLING``.

        - "pooler": the encoder's ``pooler_output`` when it exists.
        - "cls": the first position of ``last_hidden_state``.
        - otherwise: masked mean over the average of the last four hidden states.

        If hidden states were not returned (e.g. "pooler" was requested but the
        encoder has no pooler output), the masked mean of ``last_hidden_state``
        is used instead of failing on ``None``.
        """
        if REP_POOLING == "pooler":
            pooled = getattr(outputs, "pooler_output", None)
            if pooled is not None:
                return pooled
        if REP_POOLING == "cls":
            return outputs.last_hidden_state[:, 0]
        # default: last4_mean
        hidden_states = getattr(outputs, "hidden_states", None)
        if hidden_states is None:
            return mean_pooling(outputs.last_hidden_state, attention_mask)
        last4 = torch.stack(hidden_states[-4:]).mean(dim=0)
        return mean_pooling(last4, attention_mask)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                sentiment_labels=None,
                polarization_labels=None):
        """Run the encoder and both heads.

        ``sentiment_labels`` and ``polarization_labels`` are accepted but not
        used here; no loss is computed in this method.

        Returns:
            ``{"logits": (sentiment_logits, polarization_logits)}``.
        """
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            # All hidden states are only needed for the last4_mean pooling.
            output_hidden_states=(REP_POOLING not in ("pooler", "cls"))
        )
        pooled = self._pool(outputs, attention_mask)
        z = self.trunk(pooled)
        return {"logits": (self.head_sent(z), self.head_pol(z))}

## SECTION 7: METRICS

In [10]:
# ===== Section 7 — Metrics =====

def compute_metrics_multi(eval_pred):
    """Compute accuracy and macro precision/recall/F1 for both tasks.

    Args:
        eval_pred: Object whose ``predictions`` is a
            ``(sentiment_logits, polarization_logits)`` pair and whose
            ``label_ids`` is the matching ``(sentiment, polarization)`` label pair.

    Returns:
        Dict with ``sent_*`` and ``pol_*`` entries (``acc``, ``prec``, ``rec``,
        ``f1``) plus ``macro_f1_avg``, the mean of the two macro-F1 scores.
    """
    sent_logits, pol_logits = eval_pred.predictions
    y_sent, y_pol = eval_pred.label_ids

    metrics = {}
    for prefix, logits, y_true in (("sent", sent_logits, y_sent), ("pol", pol_logits, y_pol)):
        # The predicted class is the arg-max over the logits of each example.
        report = classification_report(
            y_true, np.argmax(logits, axis=1), output_dict=True, zero_division=0
        )
        macro = report["macro avg"]
        metrics[f"{prefix}_acc"] = report["accuracy"]
        metrics[f"{prefix}_prec"] = macro["precision"]
        metrics[f"{prefix}_rec"] = macro["recall"]
        metrics[f"{prefix}_f1"] = macro["f1-score"]

    metrics["macro_f1_avg"] = (metrics["sent_f1"] + metrics["pol_f1"]) / 2.0
    return metrics

## SECTION 8: CUSTOM TRAINER

In [11]:
# ===== Section 8 — Custom Trainer (R-Drop + LLRD) =====
from torch.utils.data import DataLoader
from torch.optim import AdamW

class FocalLoss(nn.Module):
    """Focal loss on logits: per-sample ``-(1 - p_t) ** gamma * log(p_t)``.

    Args:
        weight: Optional per-class weight tensor forwarded to ``F.nll_loss``.
        gamma: Focusing exponent; ``gamma=0`` reduces to (weighted) cross-entropy.
        reduction: ``"mean"`` (plain mean over the batch), ``"sum"`` or
            ``"none"`` (per-sample losses).

    Raises:
        ValueError: If ``reduction`` is not one of the three supported values.
            Previously anything other than ``"mean"`` silently meant ``"sum"``.
    """

    def __init__(self, weight=None, gamma=2.0, reduction="mean"):
        super().__init__()
        if reduction not in ("mean", "sum", "none"):
            raise ValueError(f"Unsupported reduction: {reduction!r} (expected 'mean', 'sum' or 'none')")
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, target):
        logp = F.log_softmax(logits, dim=1)
        p = torch.exp(logp)
        # nll_loss selects the target column and negates it, so the modulating
        # factor is applied to the log-probabilities before the selection.
        loss = F.nll_loss((1 - p) ** self.gamma * logp, target, weight=self.weight, reduction="none")
        if self.reduction == "mean":
            # Plain batch mean; class weights are NOT renormalised here.
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

def _sym_kl_with_logits(logits1, logits2):
    """Symmetric KL divergence between the softmax distributions of two logit tensors.

    Returns the average of both KL directions, each reduced with "batchmean".
    """
    log_p = F.log_softmax(logits1, dim=-1)
    log_q = F.log_softmax(logits2, dim=-1)
    # F.kl_div takes log-probabilities as input and probabilities as target.
    forward_term = F.kl_div(log_p, log_q.exp(), reduction="batchmean")
    reverse_term = F.kl_div(log_q, log_p.exp(), reduction="batchmean")
    return 0.5 * (forward_term + reverse_term)

class MultiTaskTrainer(Trainer):
    """Trainer for the two-head model: custom loss, optimizer grouping and prediction.

    Args:
        class_weights: Optional dict with "sentiment" / "polarization" weight
            tensors passed to the per-task loss functions.
        task_weights: Optional dict with the scalar weight of each task's loss;
            defaults to 1.0 for both.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    # Substrings of a parameter name that exempt it from weight decay.
    _NO_DECAY_MARKERS = ("bias", "LayerNorm.weight", "LayerNorm.bias")

    def __init__(self, *args, class_weights=None, task_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights or {}
        self.task_weights  = task_weights or {"sentiment": 1.0, "polarization": 1.0}
        self._custom_train_sampler = None

    def create_optimizer(self):
        """Build (once) an AdamW optimizer, with layer-wise LR decay when ``USE_LLRD`` is set.

        With LLRD the learning rate is ``LR * LLRD_DECAY ** depth_from_top`` per
        encoder block, the embeddings get one more decay step, the pooler uses
        ``LR`` and trunk/heads use ``LR * HEAD_LR_MULT``. Without LLRD (or when
        the encoder exposes no ``layer`` list) all parameters use ``LR``.

        Every parameter of the model ends up in exactly one group; biases and
        LayerNorm parameters are exempt from weight decay.

        NOTE(review): the earlier fallback called ``self.get_decay_parameter_groups``,
        which does not appear in the documented Trainer API — confirm. It is no
        longer used. Parameter-group layout differs from before, so optimizer
        states saved by the old code will not load into this optimizer.
        """
        if self.optimizer is not None:
            return self.optimizer

        model = self.model
        # Detect LayerNorm parameters by module type as well as by name: inside
        # the nn.Sequential trunk/heads they are named like "3.weight", which
        # the name markers alone do not match.
        norm_param_ids = {
            id(p)
            for module in model.modules() if isinstance(module, nn.LayerNorm)
            for p in module.parameters()
        }
        assigned = set()      # ids of parameters already placed in a group
        param_groups = []

        def add_groups(named_params, lr):
            """Append decay / no-decay groups at ``lr`` for not-yet-assigned parameters."""
            decay, nodecay = [], []
            for name, param in named_params:
                if id(param) in assigned:
                    continue
                assigned.add(id(param))
                exempt = id(param) in norm_param_ids or any(
                    marker in name for marker in self._NO_DECAY_MARKERS
                )
                (nodecay if exempt else decay).append(param)
            if decay:   param_groups.append({"params": decay,   "lr": lr, "weight_decay": WEIGHT_DECAY})
            if nodecay: param_groups.append({"params": nodecay, "lr": lr, "weight_decay": 0.0})

        encoder = model.encoder
        layers = getattr(getattr(encoder, "encoder", encoder), "layer", None)
        if not USE_LLRD or layers is None:
            add_groups(model.named_parameters(), LR)
            self.optimizer = AdamW(param_groups, lr=LR, weight_decay=WEIGHT_DECAY)
            return self.optimizer

        # Use the real block count so the indexing below can never run past the list.
        n_layers = len(layers)
        lr_emb = LR * (LLRD_DECAY ** n_layers)

        # Embeddings
        emb = getattr(encoder, "embeddings", None)
        if emb is not None:
            add_groups(emb.named_parameters(), lr_emb)

        # Encoder blocks: the top block trains at LR, each lower block one decay step less.
        for i, block in enumerate(layers):
            add_groups(block.named_parameters(), LR * (LLRD_DECAY ** (n_layers - 1 - i)))

        # Pooler
        pooler = getattr(encoder, "pooler", None)
        if pooler is not None:
            add_groups(pooler.named_parameters(), LR)

        # Remaining encoder parameters that belong to none of the parts above.
        # Previously these were left out of the optimizer and never updated.
        # NOTE(review): they are given the embedding-level LR as a conservative
        # choice — presumably such parameters sit at the input side of the
        # stack for this model; verify against the encoder's module list.
        add_groups(encoder.named_parameters(), lr_emb)

        # Heads/trunk (highest LR)
        head_lr = LR * HEAD_LR_MULT
        head_modules = (model.trunk, model.head_sent, model.head_pol)
        add_groups(
            ((n, p) for m in head_modules for n, p in m.named_parameters()),
            head_lr,
        )

        # Safety net: anything on the model not covered above trains at LR.
        add_groups(model.named_parameters(), LR)

        self.optimizer = AdamW(param_groups, lr=LR)
        return self.optimizer

    def set_train_sampler(self, sampler):
        """Register a sampler that ``get_train_dataloader`` will use instead of the default."""
        self._custom_train_sampler = sampler

    def get_train_dataloader(self):
        """Return the training DataLoader, honouring a sampler set via ``set_train_sampler``.

        Without a custom sampler the parent implementation is used unchanged.
        """
        if self.train_dataset is None:
            return None
        if self._custom_train_sampler is not None:
            return DataLoader(
                self.train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=self._custom_train_sampler,
                collate_fn=self.data_collator,
                drop_last=self.args.dataloader_drop_last,
                num_workers=self.args.dataloader_num_workers,
                pin_memory=self.args.dataloader_pin_memory,
            )
        return super().get_train_dataloader()

    def _class_weight(self, task, device):
        """Return the class-weight tensor for ``task`` moved to ``device``, or None."""
        weight = self.class_weights.get(task, None)
        return weight.to(device) if weight is not None else None

    def _sent_loss_fn(self, weight, logits, target):
        """Sentiment loss: focal loss if enabled, else label-smoothed cross-entropy.

        Label smoothing is only applied in the cross-entropy branch.
        """
        if USE_FOCAL_SENTIMENT:
            return FocalLoss(weight=weight, gamma=FOCAL_GAMMA_SENTIMENT)(logits, target)
        return nn.CrossEntropyLoss(weight=weight, label_smoothing=float(LABEL_SMOOTH_SENTIMENT))(logits, target)

    def _pol_loss_fn(self, weight, logits, target):
        """Polarization loss: focal loss if enabled, else label-smoothed cross-entropy.

        Label smoothing is only applied in the cross-entropy branch.
        """
        if USE_FOCAL_POLARITY:
            return FocalLoss(weight=weight, gamma=FOCAL_GAMMA_POLARITY)(logits, target)
        return nn.CrossEntropyLoss(weight=weight, label_smoothing=float(LABEL_SMOOTH_POLARITY))(logits, target)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the weighted two-task loss, adding an R-Drop term after warm-up.

        Args:
            model: The model to run; must return ``{"logits": (sent, pol)}``.
            inputs: Batch dict; the two label entries are popped from it.
            return_outputs: If True, also return a dict containing the logits.
            num_items_in_batch: Accepted for compatibility with newer
                transformers versions; not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        y_sent = inputs.pop("sentiment_labels")
        y_pol  = inputs.pop("polarization_labels")

        # state.epoch can be missing or None before training starts; treat as 0
        # so the comparison below cannot raise.
        current_epoch = getattr(getattr(self, "state", None), "epoch", None) or 0
        use_rdrop_now = USE_RDROP and model.training and current_epoch >= RDROP_WARMUP_EPOCHS

        w_s = float(self.task_weights.get("sentiment", 1.0))
        w_p = float(self.task_weights.get("polarization", 1.0))

        if use_rdrop_now:
            # Two forward passes of the same batch; they differ through dropout.
            outputs1 = model(**inputs)
            outputs2 = model(**inputs)
            s1, p1 = outputs1["logits"]
            s2, p2 = outputs2["logits"]

            ws = self._class_weight("sentiment", s1.device)
            wp = self._class_weight("polarization", p1.device)

            ce_s = 0.5 * (self._sent_loss_fn(ws, s1, y_sent) + self._sent_loss_fn(ws, s2, y_sent))
            ce_p = 0.5 * (self._pol_loss_fn(wp,  p1, y_pol)  + self._pol_loss_fn(wp,  p2, y_pol))
            kl_s = _sym_kl_with_logits(s1, s2)
            kl_p = _sym_kl_with_logits(p1, p2)

            # Ramp the consistency term from 0.5 up to 1.0 over the epochs after warm-up.
            rdrop_factor = min(1.0, (current_epoch - RDROP_WARMUP_EPOCHS + 1) / 2.0)
            loss = w_s * ce_s + w_p * ce_p + (RDROP_ALPHA * rdrop_factor) * (kl_s + kl_p)
            if return_outputs:
                return loss, {"logits": (s1, p1)}
            return loss

        # Standard single forward
        outputs = model(**inputs)
        s, p = outputs["logits"]

        ws = self._class_weight("sentiment", s.device)
        wp = self._class_weight("polarization", p.device)

        loss = w_s * self._sent_loss_fn(ws, s, y_sent) + w_p * self._pol_loss_fn(wp, p, y_pol)

        if return_outputs:
            outputs = dict(outputs); outputs["labels"] = (y_sent, y_pol)
            return loss, outputs
        return loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Run one evaluation forward pass.

        Returns:
            ``(None, (sent_logits, pol_logits), labels)``. The loss is always
            None, so no evaluation loss is produced by this step. ``labels`` is
            the ``(sentiment, polarization)`` pair when both are tensors in
            ``inputs``, otherwise None. ``prediction_loss_only`` and
            ``ignore_keys`` are accepted but not used.
        """
        y_sent = inputs.get("sentiment_labels", None)
        y_pol  = inputs.get("polarization_labels", None)

        # Pass only the encoder inputs; labels stay out of the forward call.
        model_inputs = {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]}
        if "token_type_ids" in inputs:
            model_inputs["token_type_ids"] = inputs["token_type_ids"]

        model.eval()
        with torch.no_grad():
            outputs = model(**model_inputs)
            s, p = outputs["logits"]

        loss = None
        logits = (s.detach(), p.detach())
        labels = (y_sent, y_pol) if isinstance(y_sent, torch.Tensor) and isinstance(y_pol, torch.Tensor) else None
        return (loss, logits, labels)


## SECTION 9: TRAINING FUNCTION

In [12]:
# ===== Section 9 — Train/Evaluate One Model =====
from torch.utils.data import WeightedRandomSampler

def train_eval_one_model(model_key: str,
                         X_tr: pd.DataFrame, X_v: pd.DataFrame, X_te: pd.DataFrame,
                         ysent_tr: np.ndarray, ysent_v: np.ndarray, ysent_te: np.ndarray,
                         ypol_tr: np.ndarray,  ypol_v: np.ndarray,  ypol_te: np.ndarray,
                         sent_w_np: np.ndarray, pol_w_np: np.ndarray):
    """Fine-tune one multi-task model, evaluate it on the test split, and save artifacts.

    Everything is written under ``OUT_DIR/<model_key>``: model weights,
    tokenizer files, ``metrics_test.json``, confusion matrices (``.npy`` and
    ``.png``) and text classification reports for both tasks.

    Args:
        model_key: Key into ``MODEL_CONFIGS``; also used as the run directory name.
        X_tr, X_v, X_te: Train/validation/test frames containing ``TITLE_COL``
            and ``TEXT_COL``.
        ysent_tr, ysent_v, ysent_te: Sentiment labels per split. They are used
            as class indices (compared against ``range(num_sent_classes)``).
        ypol_tr, ypol_v, ypol_te: Polarization labels per split, same convention.
        sent_w_np, pol_w_np: Per-class loss weights for each task.

    Returns:
        ``(row, (ysent_pred, ypol_pred))`` where ``row`` is a dict holding
        ``model_key``, ``base_name`` and the test metrics keyed ``test_<name>``,
        and the second element holds the arg-max test predictions per task.
    """
    base_name = MODEL_CONFIGS[model_key]["name"]
    run_dir = os.path.join(OUT_DIR, f"{model_key}")
    os.makedirs(run_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(base_name)
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    tr_titles, tr_texts = X_tr[TITLE_COL].values, X_tr[TEXT_COL].values
    v_titles,  v_texts  = X_v[TITLE_COL].values, X_v[TEXT_COL].values
    te_titles, te_texts = X_te[TITLE_COL].values, X_te[TEXT_COL].values

    train_ds = TaglishDataset(tr_titles, tr_texts, ysent_tr, ypol_tr, tokenizer, max_length=MAX_LENGTH)
    val_ds   = TaglishDataset(v_titles,  v_texts,  ysent_v,  ypol_v,  tokenizer, max_length=MAX_LENGTH)
    test_ds  = TaglishDataset(te_titles, te_texts, ysent_te, ypol_te, tokenizer, max_length=MAX_LENGTH)

    model = MultiTaskModel(base_name, num_sent_classes, num_pol_classes).to(device)

    sent_w = torch.tensor(sent_w_np, dtype=torch.float32)
    pol_w  = torch.tensor(pol_w_np,  dtype=torch.float32)

    # NOTE(review): make_training_args_compat is defined elsewhere in the
    # notebook; presumably it adapts these kwargs to the installed
    # transformers version -- confirm before renaming any argument here.
    args = make_training_args_compat(
        output_dir=run_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        lr_scheduler_kwargs={"num_cycles": NUM_CYCLES},
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1_avg",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        logging_dir=os.path.join(run_dir, "logs"),
        logging_steps=25,
        logging_first_step=True,
        save_steps=500,
        eval_steps=None,
        report_to="none",
        seed=42,
        remove_unused_columns=False,
        eval_accumulation_steps=1,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        dataloader_pin_memory=True,
        dataloader_num_workers=0,  # Windows compatibility
        max_grad_norm=MAX_GRAD_NORM,
        label_smoothing_factor=0.0,
        save_total_limit=3,
        prediction_loss_only=False
    )

    callbacks = get_early_stopping_callbacks(EARLY_STOP_PATIENCE)

    trainer = MultiTaskTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_multi,
        callbacks=callbacks,
        class_weights={"sentiment": sent_w, "polarization": pol_w},
        task_weights=TASK_LOSS_WEIGHTS
    )

    # Enhanced joint oversampling with objective + neutral boost
    if USE_OVERSAMPLING and USE_JOINT_OVERSAMPLING:
        # Frequency of every (sentiment, polarization) label pair in the training split.
        pair_counts = Counter(zip(ysent_tr.tolist(), ypol_tr.tolist()))
        counts = np.array(list(pair_counts.values()), dtype=np.float32)
        med = float(np.median(counts)) if len(counts) else 1.0

        # Falls back to class index 1 when the named class is not in the encoder.
        obj_idx = np.where(pol_le.classes_ == "objective")[0][0] if "objective" in pol_le.classes_ else 1
        neutral_idx = np.where(sent_le.classes_ == "neutral")[0][0] if "neutral" in sent_le.classes_ else 1

        def inv_mult(c):
            # Inverse-frequency multiplier relative to the median pair count,
            # clipped to [1, JOINT_OVERSAMPLING_MAX_MULT].
            if c <= 0: return JOINT_OVERSAMPLING_MAX_MULT
            return float(np.clip(med / float(c), 1.0, JOINT_OVERSAMPLING_MAX_MULT))

        inv_by_pair = {k: inv_mult(v) for k, v in pair_counts.items()}
        sample_weights = []

        for ys, yp in zip(ysent_tr, ypol_tr):
            inv = inv_by_pair.get((int(ys), int(yp)), 1.0)
            # Blend uniform sampling (weight 1) with the inverse-frequency weight.
            w = (1.0 - JOINT_ALPHA) * 1.0 + JOINT_ALPHA * inv

            if USE_SMART_OVERSAMPLING and int(yp) == obj_idx:
                w *= OBJECTIVE_BOOST_MULT
            if USE_SMART_OVERSAMPLING and int(ys) == neutral_idx:
                w *= NEUTRAL_BOOST_MULT

            sample_weights.append(w)

        # Diagnostics only: how many samples of each boosted class ended up with weight > 2.
        obj_boost_count = sum(1 for i, yp in enumerate(ypol_tr) if int(yp) == obj_idx and sample_weights[i] > 2.0)
        neutral_boost_count = sum(1 for i, ys in enumerate(ysent_tr) if int(ys) == neutral_idx and sample_weights[i] > 2.0)
        print(f"🔥 Enhanced Oversampling: min={min(sample_weights):.2f}, max={max(sample_weights):.2f}")
        print(f"   ├─ Objective boosted samples: {obj_boost_count}")
        print(f"   └─ Neutral boosted samples: {neutral_boost_count}")
        trainer.set_train_sampler(WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True))

    trainer.train()

    # Test
    test_out = trainer.predict(test_ds)
    # Trainer.predict already prefixes metric names with "test_"; add the prefix
    # only when it is missing so keys are "test_<name>", never "test_test_<name>".
    metrics = {
        (k if k.startswith("test_") else f"test_{k}"): float(v)
        for k, v in test_out.metrics.items()
    }
    trainer.save_model()

    # Ensure weights exist
    model_path = os.path.join(run_dir, "pytorch_model.bin")
    if not os.path.exists(model_path):
        torch.save(trainer.model.state_dict(), model_path)
    tokenizer.save_pretrained(run_dir)

    with open(os.path.join(run_dir, "metrics_test.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    sent_logits, pol_logits = test_out.predictions
    ysent_pred = np.argmax(sent_logits, axis=1)
    ypol_pred  = np.argmax(pol_logits,  axis=1)

    # Fixed label lists keep matrices and reports aligned with the encoders even
    # when a class never appears in the test labels or predictions.
    sent_label_ids = list(range(num_sent_classes))
    pol_label_ids  = list(range(num_pol_classes))

    cm_sent = confusion_matrix(ysent_te, ysent_pred, labels=sent_label_ids)
    cm_pol  = confusion_matrix(ypol_te,  ypol_pred,  labels=pol_label_ids)
    np.save(os.path.join(run_dir, "cm_sent.npy"), cm_sent)
    np.save(os.path.join(run_dir, "cm_pol.npy"),  cm_pol)

    def plot_cm(cm, labels, title, path_png):
        # Render an annotated confusion-matrix heat map and save it as a PNG.
        fig, ax = plt.subplots(figsize=(4.5, 4))
        im = ax.imshow(cm, interpolation="nearest")
        ax.set_title(title); ax.set_xlabel("Predicted"); ax.set_ylabel("True")
        ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, cm[i, j], ha="center", va="center")
        fig.colorbar(im, ax=ax, fraction=0.046); plt.tight_layout(); plt.savefig(path_png, dpi=160); plt.close(fig)

    plot_cm(cm_sent, sent_le.classes_, "Sentiment Confusion", os.path.join(run_dir, "cm_sent.png"))
    plot_cm(cm_pol,  pol_le.classes_,  "Polarization Confusion", os.path.join(run_dir, "cm_pol.png"))

    # Without `labels`, classification_report raises when the number of classes
    # present differs from len(target_names) -- which would abort the run after
    # training has already finished.
    rep_sent = classification_report(ysent_te, ysent_pred, labels=sent_label_ids,
                                     target_names=sent_le.classes_, digits=4, zero_division=0)
    rep_pol  = classification_report(ypol_te,  ypol_pred,  labels=pol_label_ids,
                                     target_names=pol_le.classes_,  digits=4, zero_division=0)
    with open(os.path.join(run_dir, "report_sentiment.txt"), "w", encoding="utf-8") as f: f.write(rep_sent)
    with open(os.path.join(run_dir, "report_polarization.txt"), "w", encoding="utf-8") as f: f.write(rep_pol)

    return {"model_key": model_key, "base_name": base_name, **metrics}, (ysent_pred, ypol_pred)

# Close the timed section labelled "SECTION 5-9" on the notebook-wide `timer`.
timer.end_section("SECTION 5-9: Model Architecture & Training Setup")

✅ SECTION 5-9: Model Architecture & Training Setup completed in 0.1s
🕒 Total runtime so far: 4.2s
------------------------------------------------------------


## SECTION 10: RUN TRAINING

In [13]:
# ===== GPU Memory Management =====
import torch
import gc

# DO NOT SET environment variable here - it was already set in Cell 1!
# Environment variable must be set BEFORE PyTorch is imported.

# Clear any existing GPU memory.
# Collect garbage first: tensors held only by unreachable Python objects are
# handed back to PyTorch's caching allocator, and only then can empty_cache()
# release those cached blocks (and the figures below reflect the real state).
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Verify GPU status
if torch.cuda.is_available():
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    reserved_bytes = torch.cuda.memory_reserved(0)
    print(f"🔧 GPU Memory Management:")
    print(f"   Total: {total_bytes / 1024**3:.2f} GB")
    print(f"   Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"   Reserved: {reserved_bytes / 1024**3:.2f} GB")
    print(f"   Free: {(total_bytes - reserved_bytes) / 1024**3:.2f} GB")
    print(f"✅ Memory cleared and ready for training")
else:
    print("⚠️ CUDA not available!")


🔧 GPU Memory Management:
   Total: 12.00 GB
   Allocated: 0.00 GB
   Reserved: 0.00 GB
   Free: 12.00 GB
✅ Memory cleared and ready for training


In [14]:
# ===== Section 10 — Run Training =====

timer.start_section("SECTION 10: Model Training Execution")

results = []      # one summary row (dict) per trained model
pred_cache = {}   # model key -> (sentiment predictions, polarization predictions)

for key in MODELS_TO_RUN:
    print(f"\n=== Running {key} -> {MODEL_CONFIGS[key]['name']} ===")
    try:
        row, preds = train_eval_one_model(
            key,
            X_train, X_val, X_test,
            ysent_train, ysent_val, ysent_test,
            ypol_train,  ypol_val,  ypol_test,
            sent_weights_np, pol_weights_np
        )
    finally:
        # Release this run's GPU memory (also after a failure) so the next
        # model -- or a re-run of this cell -- does not start with the
        # previous model's tensors still cached.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    results.append(row)
    pred_cache[key] = preds

results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(OUT_DIR, "summary_results.csv"), index=False)

timer.end_section("SECTION 10: Model Training Execution")

print("\n" + "="*70)
print("🎉 TRAINING COMPLETE!")
print("="*70)
display(results_df)


🚀 Starting SECTION 10: Model Training Execution...

=== Running rembert -> google/rembert ===
🔥 Enhanced Oversampling: min=0.90, max=5.84
   ├─ Objective boosted samples: 168
   └─ Neutral boosted samples: 0
🔥 Enhanced Oversampling: min=0.90, max=5.84
   ├─ Objective boosted samples: 168
   └─ Neutral boosted samples: 0


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 22.00 MiB. GPU 0 has a total capacity of 12.00 GiB of which 2.01 GiB is free. Of the allocated memory 7.64 GiB is allocated by PyTorch, and 1.07 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## SECTION 11: DETAILED ANALYSIS

In [None]:
# ===== Section 11 — Detailed Breakdown Reports =====

# Directory for the per-class CSV breakdowns and their index file; created if missing.
DETAILS_DIR = os.path.join(OUT_DIR, "details")
os.makedirs(DETAILS_DIR, exist_ok=True)

def per_class_breakdown(y_true, y_pred, class_names):
    """Build a per-class precision / recall / F1 / support table.

    Args:
        y_true: Ground-truth labels, encoded as class indices
            ``0 .. len(class_names) - 1``.
        y_pred: Predicted labels, same encoding.
        class_names: Display name of each class, ordered by class index.

    Returns:
        ``pd.DataFrame`` with one row per entry of ``class_names`` (in order)
        and columns ``class``, ``precision``, ``recall``, ``f1``, ``support``.
        A class missing from the report gets an all-zero row.
    """
    class_names = list(class_names)
    # Pass `labels` explicitly: without it, classification_report raises when a
    # class appears in neither y_true nor y_pred (its class count no longer
    # matches len(target_names)), so the zero-row fallback below never ran.
    rep = classification_report(
        y_true, y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
        output_dict=True, zero_division=0
    )
    rows = []
    for cname in class_names:
        stats = rep.get(cname)
        if stats is None:
            rows.append({"class": cname, "precision": 0.0, "recall": 0.0, "f1": 0.0, "support": 0})
            continue
        rows.append({
            "class": cname,
            "precision": stats["precision"],
            "recall":    stats["recall"],
            "f1":        stats["f1-score"],
            "support":   int(stats["support"]),
        })
    return pd.DataFrame(rows)

# Maps each model key to the paths of the per-class CSV files written for it.
all_breakdowns = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Detailed breakdowns for {key} ===")
    # Test-set predictions cached by the training cell for this model.
    ysent_pred, ypol_pred = pred_cache[key]

    sent_per_class = per_class_breakdown(ysent_test, ysent_pred, sent_le.classes_)
    pol_per_class  = per_class_breakdown(ypol_test,  ypol_pred,  pol_le.classes_)

    # Persist one CSV per task, named after the model key.
    sent_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_per_class.csv")
    pol_csv  = os.path.join(DETAILS_DIR, f"{key}_polarization_per_class.csv")
    sent_per_class.to_csv(sent_csv, index=False)
    pol_per_class.to_csv(pol_csv, index=False)

    print("\nSentiment — per class:")
    display(sent_per_class)

    print("\nPolarization — per class:")
    display(pol_per_class)

    all_breakdowns[key] = {
        "sentiment_per_class_csv": sent_csv,
        "polarization_per_class_csv": pol_csv,
    }

# Index file listing the CSV paths for every model.
with open(os.path.join(DETAILS_DIR, "index.json"), "w") as f:
    json.dump(all_breakdowns, f, indent=2)

print("\nSaved detailed breakdowns to:", DETAILS_DIR)

## SECTION 12: FINAL SUMMARY

In [None]:
# ===== Section 12 — Final Summary =====

timer.get_summary()


def _test_metric(row, name):
    """Return test metric ``name`` from a results row, or 0 when it is absent.

    Accepts both ``test_<name>`` and ``test_test_<name>``: Trainer.predict
    already prefixes its metrics with ``test_``, so rows built by adding a
    further ``test_`` prefix carry the doubled form.
    """
    for column in (f"test_{name}", f"test_test_{name}"):
        if column in row.index:
            return row[column]
    return 0


# Report the models that were actually trained instead of a hard-coded name.
model_names = ", ".join(str(name) for name in results_df.get("base_name", []))

print("\n" + "="*70)
print("📊 REMBERT RUN #1 - FINAL RESULTS")
print("="*70)
print(f"Model: {model_names}")
print(f"Hardware: RTX 3060 (12GB VRAM)")
print(f"Dataset: {len(df)} samples (augmented)")
print(f"Train/Val/Test: {len(X_train)}/{len(X_val)}/{len(X_test)}")
print()
print("Results:")
# With several models the three metric lines would be ambiguous; label each group.
label_rows = len(results_df) > 1
for _, row in results_df.iterrows():
    if label_rows:
        print(f"  [{row.get('model_key', '?')}]")
    print(f"  Overall Macro-F1: {_test_metric(row, 'macro_f1_avg')*100:.2f}%")
    print(f"  Sentiment F1: {_test_metric(row, 'sent_f1')*100:.2f}%")
    print(f"  Polarization F1: {_test_metric(row, 'pol_f1')*100:.2f}%")
print()
print(f"Output directory: {OUT_DIR}")
print("="*70)