## SECTION 1

In [2]:
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP AND INSTALLATIONS
# ============================================================================

# Check GPU availability
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Device Count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current CUDA Device: {torch.cuda.get_device_name(0)}")

# Install required packages
!pip install transformers datasets torch torchvision torchaudio
!pip install accelerate
!pip install scikit-learn
!pip install pandas numpy matplotlib seaborn
!pip install wandb  # For experiment tracking (optional)


CUDA Available: True
CUDA Device Count: 1
Current CUDA Device: Tesla T4


## SECTION 2

In [3]:

# ============================================================================
# SECTION 2: IMPORTS AND BASIC SETUP
# ============================================================================

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)


In [4]:
import os, random, json, math
from dataclasses import dataclass
from typing import Dict, Tuple, Optional, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer,
    DataCollatorWithPadding, EarlyStoppingCallback
)

def seed_all(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_all(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

## SECTION 3

In [48]:
# SECTION 3

data_path = '/content/adjudications_2025-10-20 (1).csv'
# ===== Section 3 — Config (updated) =====
# === EDIT PATH ONLY ===
CSV_PATH = '/content/adjudications_2025-10-20 (1).csv'

# Your exact column names
TITLE_COL = "Title"
TEXT_COL  = "Comment"
SENT_COL  = "Final Sentiment"
POL_COL   = "Final Polarization"

MODEL_CONFIGS = {
    "mbert":       {"name": "bert-base-multilingual-cased", "desc": "mBERT (104 langs)"},
    "xlm_roberta": {"name": "xlm-roberta-base",             "desc": "XLM-R base"},
    "rembert":     {"name": "google/rembert",               "desc": "RemBERT"},
}
MODELS_TO_RUN = ["mbert", "xlm_roberta"]  # add "rembert" if you like

# Training hyperparams
MAX_LENGTH = 224
EPOCHS = 4
BATCH_SIZE = 16
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
EARLY_STOP_PATIENCE = 2

# --- Per-task loss config ---
# Use CE (+ smoothing) for sentiment to recover NEG recall;
# Keep focal for polarity to keep OBJ recall gains.
USE_FOCAL_SENTIMENT = False
USE_FOCAL_POLARITY  = True
FOCAL_GAMMA_POLARITY = 1.5
LABEL_SMOOTH_SENTIMENT = 0.05  # small smoothing stabilizes precision/recall

# Task loss weighting: polarity is harder
TASK_LOSS_WEIGHTS = {"sentiment": 1.0, "polarization": 1.3}

# Per-class multipliers (on top of balanced weights from train split)
CLASS_WEIGHT_MULT = {
    "sentiment": {
        "negative": 1.10,  # nudge up to lift NEG recall
        "neutral":  1.05,
        "positive": 1.15
    },
    "polarization": {
        "non_polarized": 1.00,
        "objective":     2.00,  # hardest & tiny
        "partisan":      1.00
    }
}

# Oversampling
USE_OVERSAMPLING = True
USE_JOINT_OVERSAMPLING = True   # NEW: oversample by (sentiment, polarity) rarity
JOINT_OVERSAMPLING_MAX_MULT = 5.0  # cap to avoid extreme repeats

# Output dir
OUT_DIR = "./runs_multitask"
os.makedirs(OUT_DIR, exist_ok=True)


## SECTION 4

In [49]:
# ===== Section 4 — Load & Prepare Data (updated for multipliers) =====
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()

required = [TITLE_COL, TEXT_COL, SENT_COL, POL_COL]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}. Found: {list(df.columns)}")

df = df.dropna(subset=[TITLE_COL, TEXT_COL, SENT_COL, POL_COL]).reset_index(drop=True)

# Encode labels
from sklearn.preprocessing import LabelEncoder
sent_le = LabelEncoder().fit(df[SENT_COL])
pol_le  = LabelEncoder().fit(df[POL_COL])

df["sent_y"] = sent_le.transform(df[SENT_COL])
df["pol_y"]  = pol_le.transform(df[POL_COL])

num_sent_classes = len(sent_le.classes_)
num_pol_classes  = len(pol_le.classes_)

print("Sentiment classes:", dict(enumerate(sent_le.classes_)))
print("Polarization classes:", dict(enumerate(pol_le.classes_)))

# Splits (stratify by sentiment)
from sklearn.model_selection import train_test_split
X = df[[TITLE_COL, TEXT_COL]].copy()
y_sent = df["sent_y"].values
y_pol  = df["pol_y"].values

X_train, X_tmp, ysent_train, ysent_tmp, ypol_train, ypol_tmp = train_test_split(
    X, y_sent, y_pol, test_size=0.3, random_state=42, stratify=y_sent
)
X_val, X_test, ysent_val, ysent_test, ypol_val, ypol_test = train_test_split(
    X_tmp, ysent_tmp, ypol_tmp, test_size=0.5, random_state=42, stratify=ysent_tmp
)

print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))

# Balanced class weights from TRAIN only
from sklearn.utils.class_weight import compute_class_weight
import numpy as np, json, os

def safe_class_weights(y, n_classes):
    classes = np.arange(n_classes)
    counts = np.bincount(y, minlength=n_classes)
    if np.any(counts == 0):
        return np.ones(n_classes, dtype=np.float32)
    return compute_class_weight("balanced", classes=classes, y=y).astype(np.float32)

sent_weights_np = safe_class_weights(ysent_train, num_sent_classes)
pol_weights_np  = safe_class_weights(ypol_train,  num_pol_classes)

# Apply user multipliers by class name
sent_name_to_idx = {name: i for i, name in enumerate(sent_le.classes_)}
pol_name_to_idx  = {name: i for i, name in enumerate(pol_le.classes_)}

for cname, mult in CLASS_WEIGHT_MULT["sentiment"].items():
    if cname in sent_name_to_idx:
        sent_weights_np[sent_name_to_idx[cname]] *= float(mult)

for cname, mult in CLASS_WEIGHT_MULT["polarization"].items():
    if cname in pol_name_to_idx:
        pol_weights_np[pol_name_to_idx[cname]] *= float(mult)

print("Final sentiment class weights:", {sent_le.classes_[i]: float(w) for i, w in enumerate(sent_weights_np)})
print("Final polarization class weights:", {pol_le.classes_[i]: float(w) for i, w in enumerate(pol_weights_np)})

# Save label maps
with open(os.path.join(OUT_DIR, "label_map_sentiment.json"), "w") as f:
    json.dump({int(k): v for k, v in dict(enumerate(sent_le.classes_)).items()}, f, indent=2)
with open(os.path.join(OUT_DIR, "label_map_polarization.json"), "w") as f:
    json.dump({int(k): v for k, v in dict(enumerate(pol_le.classes_)).items()}, f, indent=2)


Sentiment classes: {0: 'negative', 1: 'neutral', 2: 'positive'}
Polarization classes: {0: 'non_polarized', 1: 'objective', 2: 'partisan'}
Train size: 3934 Val size: 843 Test size: 844
Final sentiment class weights: {'negative': 0.6448220014572144, 'neutral': 1.206748366355896, 'positive': 2.712290048599243}
Final polarization class weights: {'non_polarized': 1.2945047616958618, 'objective': 11.207977294921875, 'partisan': 0.48802879452705383}


## SECTION 5

In [50]:
# ===== Section 5 — Dataset & Collator (proper text-pair encoding) =====
from torch.utils.data import Dataset

class TaglishDataset(Dataset):
    def __init__(self, titles, texts, y_sent, y_pol, tokenizer, max_length=224):
        self.titles = list(titles)
        self.texts  = list(texts)
        self.y_sent = np.array(y_sent)
        self.y_pol  = np.array(y_pol)
        self.tok = tokenizer
        self.max_length = max_length
        # mBERT has token_type_ids; XLM-R/RemBERT don't, and that's fine.
        self.use_token_type = "token_type_ids" in tokenizer.model_input_names

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pass title as text, comment as text_pair so the tokenizer inserts the correct separators.
        # We also bias truncation to the comment since titles are short.
        enc = self.tok(
            text=str(self.titles[idx]),
            text_pair=str(self.texts[idx]),
            truncation="only_second",     # keep the title intact; trim the comment if needed
            max_length=self.max_length,
            return_token_type_ids=self.use_token_type,
        )
        item = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "sentiment_labels": torch.tensor(self.y_sent[idx], dtype=torch.long),
            "polarization_labels": torch.tensor(self.y_pol[idx], dtype=torch.long),
        }
        if self.use_token_type and "token_type_ids" in enc:
            item["token_type_ids"] = enc["token_type_ids"]
        return item


## SECTION 6

In [51]:
# SECTION 6

class MultiTaskModel(nn.Module):
    def __init__(self, base_model_name: str, num_sent: int, num_pol: int, dropout: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        hidden = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.head_sent = nn.Linear(hidden, num_sent)
        self.head_pol  = nn.Linear(hidden, num_pol)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        sentiment_labels=None,
        polarization_labels=None
    ):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids if token_type_ids is not None else None
        )
        # Try pooler; fallback to CLS
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = outputs.last_hidden_state[:, 0]
        x = self.dropout(pooled)
        sent_logits = self.head_sent(x)
        pol_logits  = self.head_pol(x)
        # Return tuple 'logits' to work with custom Trainer prediction_step
        return {"logits": (sent_logits, pol_logits)}


## SECTION 7

In [52]:
# SECTION 7

def compute_metrics_multi(eval_pred):
    (sent_logits, pol_logits) = eval_pred.predictions
    (y_sent, y_pol) = eval_pred.label_ids

    ps = np.argmax(sent_logits, axis=1)
    pp = np.argmax(pol_logits, axis=1)

    # Macro metrics
    sent_report = classification_report(y_sent, ps, output_dict=True, zero_division=0)
    pol_report  = classification_report(y_pol,  pp, output_dict=True, zero_division=0)

    sent_f1 = sent_report["macro avg"]["f1-score"]
    pol_f1  = pol_report["macro avg"]["f1-score"]
    macro_f1_avg = (sent_f1 + pol_f1) / 2.0

    return {
        "sent_acc": sent_report["accuracy"],
        "sent_prec": sent_report["macro avg"]["precision"],
        "sent_rec": sent_report["macro avg"]["recall"],
        "sent_f1": sent_f1,

        "pol_acc": pol_report["accuracy"],
        "pol_prec": pol_report["macro avg"]["precision"],
        "pol_rec": pol_report["macro avg"]["recall"],
        "pol_f1": pol_f1,

        "macro_f1_avg": macro_f1_avg
    }


## SECTION 8

In [53]:
# ===== Section 8 — Custom Trainer (per-task losses) =====
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer
from torch.utils.data import DataLoader

class FocalLoss(nn.Module):
    def __init__(self, weight=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction
    def forward(self, logits, target):
        logp = F.log_softmax(logits, dim=1)
        p = torch.exp(logp)
        focal = (1 - p) ** self.gamma
        loss = F.nll_loss(focal * logp, target, weight=self.weight, reduction="none")
        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        return loss

class MultiTaskTrainer(Trainer):
    def __init__(self, *args, class_weights=None, task_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights or {}
        self.task_weights  = task_weights or {"sentiment": 1.0, "polarization": 1.0}
        self._custom_train_sampler = None

    def set_train_sampler(self, sampler):
        self._custom_train_sampler = sampler

    def get_train_dataloader(self):
        if self.train_dataset is None:
            return None
        if self._custom_train_sampler is not None:
            return DataLoader(
                self.train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=self._custom_train_sampler,
                collate_fn=self.data_collator,
                drop_last=self.args.dataloader_drop_last,
                num_workers=self.args.dataloader_num_workers,
                pin_memory=self.args.dataloader_pin_memory,
            )
        return super().get_train_dataloader()

    def _sent_loss_fn(self, weight, logits, target):
        # Sentiment: CE with small label smoothing
        return nn.CrossEntropyLoss(weight=weight, label_smoothing=float(LABEL_SMOOTH_SENTIMENT))(logits, target)

    def _pol_loss_fn(self, weight, logits, target):
        # Polarity: focal loss to fight majority dominance
        return FocalLoss(weight=weight, gamma=FOCAL_GAMMA_POLARITY)(logits, target)

    def compute_loss(self, model, inputs, return_outputs=False):
        y_sent = inputs.pop("sentiment_labels")
        y_pol  = inputs.pop("polarization_labels")

        outputs = model(**inputs)
        sent_logits, pol_logits = outputs["logits"]

        # prepare weights on device
        ws = self.class_weights.get("sentiment", None)
        wp = self.class_weights.get("polarization", None)
        ws = ws.to(sent_logits.device) if ws is not None else None
        wp = wp.to(pol_logits.device)  if wp is not None else None

        # per-task losses
        loss_sent = self._sent_loss_fn(ws, sent_logits, y_sent) if not USE_FOCAL_SENTIMENT \
                    else FocalLoss(weight=ws, gamma=1.5)(sent_logits, y_sent)
        loss_pol  = self._pol_loss_fn(wp,  pol_logits,  y_pol)  if USE_FOCAL_POLARITY \
                    else nn.CrossEntropyLoss(weight=wp)(pol_logits, y_pol)

        # task-weighted sum
        w_s = float(self.task_weights.get("sentiment", 1.0))
        w_p = float(self.task_weights.get("polarization", 1.0))
        loss = w_s * loss_sent + w_p * loss_pol

        if return_outputs:
            outputs = dict(outputs)
            outputs["labels"] = (y_sent, y_pol)
            return loss, outputs
        return loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        y_sent = inputs.get("sentiment_labels")
        y_pol  = inputs.get("polarization_labels")

        model_inputs = {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]}
        if "token_type_ids" in inputs:
            model_inputs["token_type_ids"] = inputs["token_type_ids"]

        model.eval()
        with torch.no_grad():
            outputs = model(**model_inputs)
            sent_logits, pol_logits = outputs["logits"]

        loss = None
        logits = (sent_logits.detach(), pol_logits.detach())
        labels = (y_sent, y_pol)
        return (loss, logits, labels)


## SECTION 9

In [54]:
# ===== Section 9 — Train/Evaluate One Model (joint oversampling) =====
from transformers import AutoTokenizer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
import math, json, numpy as np, pandas as pd, os
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import torch
from torch.utils.data import WeightedRandomSampler
from collections import Counter

def train_eval_one_model(model_key: str,
                         X_tr: pd.DataFrame, X_v: pd.DataFrame, X_te: pd.DataFrame,
                         ysent_tr: np.ndarray, ysent_v: np.ndarray, ysent_te: np.ndarray,
                         ypol_tr: np.ndarray,  ypol_v: np.ndarray,  ypol_te: np.ndarray,
                         sent_w_np: np.ndarray, pol_w_np: np.ndarray):
    base_name = MODEL_CONFIGS[model_key]["name"]
    run_dir = os.path.join(OUT_DIR, f"{model_key}")
    os.makedirs(run_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(base_name)
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    tr_titles, tr_texts = X_tr[TITLE_COL].values, X_tr[TEXT_COL].values
    v_titles,  v_texts  = X_v[TITLE_COL].values,  X_v[TEXT_COL].values
    te_titles, te_texts = X_te[TITLE_COL].values, X_te[TEXT_COL].values

    train_ds = TaglishDataset(tr_titles, tr_texts, ysent_tr, ypol_tr, tokenizer, max_length=MAX_LENGTH)
    val_ds   = TaglishDataset(v_titles,  v_texts,  ysent_v,  ypol_v,  tokenizer, max_length=MAX_LENGTH)
    test_ds  = TaglishDataset(te_titles, te_texts, ysent_te, ypol_te, tokenizer, max_length=MAX_LENGTH)

    # Model
    model = MultiTaskModel(base_name, num_sent_classes, num_pol_classes).to(device)

    # Class weights (torch tensors)
    sent_w = torch.tensor(sent_w_np, dtype=torch.float32)
    pol_w  = torch.tensor(pol_w_np,  dtype=torch.float32)

    args = TrainingArguments(
        output_dir=run_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1_avg",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        logging_dir=os.path.join(run_dir, "logs"),
        logging_steps=50,
        report_to="none",
        seed=42,
        remove_unused_columns=False,
        eval_accumulation_steps=1
    )

    trainer = MultiTaskTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_multi,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOP_PATIENCE)],
        class_weights={"sentiment": sent_w, "polarization": pol_w},
        task_weights=TASK_LOSS_WEIGHTS
    )

    # ----- JOINT OVERSAMPLING -----
    if USE_OVERSAMPLING and USE_JOINT_OVERSAMPLING:
        pair_counts = Counter(zip(ysent_tr.tolist(), ypol_tr.tolist()))
        # Inverse frequency, clipped to avoid extremes
        min_freq = min(pair_counts.values())
        sample_weights = []
        for ys, yp in zip(ysent_tr, ypol_tr):
            freq = pair_counts[(int(ys), int(yp))]
            inv = (min_freq / freq) if freq > 0 else JOINT_OVERSAMPLING_MAX_MULT
            inv = min(inv, JOINT_OVERSAMPLING_MAX_MULT)
            sample_weights.append(inv)
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        trainer.set_train_sampler(sampler)
    elif USE_OVERSAMPLING:
        # Fallback: per-task max weight (previous behavior)
        sample_weights = []
        for ys, yp in zip(ysent_tr, ypol_tr):
            ws = float(sent_w_np[ys]); wp = float(pol_w_np[yp])
            sample_weights.append(max(ws, wp))
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        trainer.set_train_sampler(sampler)

    trainer.train()

    # Eval on test
    test_out = trainer.predict(test_ds)
    metrics = {f"test_{k}": float(v) for k, v in test_out.metrics.items()}

    # Save artifacts
    trainer.save_model(run_dir)
    tokenizer.save_pretrained(run_dir)
    with open(os.path.join(run_dir, "metrics_test.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # Confusion matrices
    sent_logits, pol_logits = test_out.predictions
    ysent_pred = np.argmax(sent_logits, axis=1)
    ypol_pred  = np.argmax(pol_logits,  axis=1)

    cm_sent = confusion_matrix(ysent_te, ysent_pred, labels=list(range(num_sent_classes)))
    cm_pol  = confusion_matrix(ypol_te,  ypol_pred,  labels=list(range(num_pol_classes)))

    np.save(os.path.join(run_dir, "cm_sent.npy"), cm_sent)
    np.save(os.path.join(run_dir, "cm_pol.npy"),  cm_pol)

    # Quick plots
    def plot_cm(cm, labels, title, path_png):
        fig, ax = plt.subplots(figsize=(4.5, 4))
        im = ax.imshow(cm, interpolation="nearest")
        ax.set_title(title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, cm[i, j], ha="center", va="center")
        fig.colorbar(im, ax=ax, fraction=0.046)
        plt.tight_layout()
        plt.savefig(path_png, dpi=160)
        plt.close(fig)

    plot_cm(cm_sent, sent_le.classes_, "Sentiment Confusion", os.path.join(run_dir, "cm_sent.png"))
    plot_cm(cm_pol,  pol_le.classes_,  "Polarization Confusion", os.path.join(run_dir, "cm_pol.png"))

    # Text reports
    rep_sent = classification_report(ysent_te, ysent_pred, target_names=sent_le.classes_, digits=4, zero_division=0)
    rep_pol  = classification_report(ypol_te,  ypol_pred,  target_names=pol_le.classes_,  digits=4, zero_division=0)
    with open(os.path.join(run_dir, "report_sentiment.txt"), "w") as f:
        f.write(rep_sent)
    with open(os.path.join(run_dir, "report_polarization.txt"), "w") as f:
        f.write(rep_pol)

    summary_row = {
        "model_key": model_key,
        "base_name": base_name,
        **metrics
    }
    return summary_row, (ysent_pred, ypol_pred)


## SECTION 10


In [55]:
# SECTION 10

results = []
pred_cache = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Running {key} -> {MODEL_CONFIGS[key]['name']} ===")
    row, preds = train_eval_one_model(
        key,
        X_train, X_val, X_test,
        ysent_train, ysent_val, ysent_test,
        ypol_train,  ypol_val,  ypol_test,
        sent_weights_np, pol_weights_np
    )
    results.append(row)
    pred_cache[key] = preds

results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(OUT_DIR, "summary_results.csv"), index=False)
results_df



=== Running mbert -> bert-base-multilingual-cased ===


Epoch,Training Loss,Validation Loss,Sent Acc,Sent Prec,Sent Rec,Sent F1,Pol Acc,Pol Prec,Pol Rec,Pol F1,Macro F1 Avg
1,1.7327,No log,0.330961,0.556954,0.520516,0.323991,0.329775,0.433472,0.51594,0.297323,0.310657
2,1.0413,No log,0.379597,0.497519,0.564819,0.384368,0.498221,0.484157,0.584527,0.436165,0.410267
3,0.7057,No log,0.474496,0.588468,0.620063,0.507096,0.534994,0.520156,0.574943,0.477287,0.492191
4,0.5638,No log,0.493476,0.569426,0.629401,0.515982,0.576512,0.518482,0.598734,0.498779,0.507381



=== Running xlm_roberta -> xlm-roberta-base ===


Epoch,Training Loss,Validation Loss,Sent Acc,Sent Prec,Sent Rec,Sent F1,Pol Acc,Pol Prec,Pol Rec,Pol F1,Macro F1 Avg
1,2.159,No log,0.142349,0.04745,0.333333,0.083074,0.137604,0.113475,0.436942,0.146279,0.114676
2,1.6559,No log,0.30605,0.207784,0.499271,0.292967,0.418743,0.443889,0.544425,0.369278,0.331123
3,1.2942,No log,0.316726,0.498628,0.50331,0.30658,0.440095,0.463142,0.538633,0.387086,0.346833
4,1.0017,No log,0.338078,0.570635,0.530652,0.332562,0.396204,0.469198,0.519757,0.357038,0.3448


Unnamed: 0,model_key,base_name,test_test_sent_acc,test_test_sent_prec,test_test_sent_rec,test_test_sent_f1,test_test_pol_acc,test_test_pol_prec,test_test_pol_rec,test_test_pol_f1,test_test_macro_f1_avg,test_test_runtime,test_test_samples_per_second,test_test_steps_per_second
0,mbert,bert-base-multilingual-cased,0.484597,0.562475,0.621471,0.501628,0.579384,0.523074,0.595621,0.50737,0.504499,2.7037,312.166,19.603
1,xlm_roberta,xlm-roberta-base,0.305687,0.53987,0.4882,0.297664,0.441943,0.460146,0.534562,0.398321,0.347993,3.266,258.419,16.228


## SECTION 11


In [57]:
# ===== Section 11 — Detailed Breakdown Reports (per-class + cross-slices) =====
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os
import json

def per_class_breakdown(y_true, y_pred, class_names):
    rep = classification_report(
        y_true, y_pred,
        target_names=list(class_names),
        output_dict=True, zero_division=0
    )
    # Keep only the class rows in the given order
    rows = []
    for cname in class_names:
        if cname in rep:
            rows.append({
                "class": cname,
                "precision": rep[cname]["precision"],
                "recall":    rep[cname]["recall"],
                "f1":        rep[cname]["f1-score"],
                "support":   int(rep[cname]["support"]),
            })
        else:
            rows.append({"class": cname, "precision": 0.0, "recall": 0.0, "f1": 0.0, "support": 0})
    return pd.DataFrame(rows)

def cross_slice_breakdown(
    slice_true,  # array of ints for the slicing label (e.g., true sentiment indices)
    slice_names, # names of the slicing label classes (e.g., sentiment class names)
    task_true,   # array of ints for the task we evaluate (e.g., true polarity indices)
    task_pred,   # array of ints for the task predictions (e.g., predicted polarity indices)
    task_names,  # names of the task classes (e.g., polarity class names)
    slice_label  # string for the slice axis name, e.g., "sentiment" or "polarity"
):
    """
    For each class s in slice_true, evaluate the task predictions on the subset where slice_true == s.
    Returns one row per slice value, including macro-F1, accuracy, and per-class F1 for the task.
    """
    rows = []
    for idx, sname in enumerate(slice_names):
        mask = (slice_true == idx)
        n = int(mask.sum())
        if n == 0:
            # No samples for this slice in test set
            row = {"slice": sname, "support": 0, "accuracy": np.nan, "macro_f1": np.nan}
            for tname in task_names:
                row[f"f1_{tname}"] = np.nan
            rows.append(row)
            continue

        rep = classification_report(
            task_true[mask], task_pred[mask],
            target_names=list(task_names),
            output_dict=True, zero_division=0
        )
        row = {
            "slice": sname,
            "support": n,
            "accuracy": rep["accuracy"],
            "macro_f1": rep["macro avg"]["f1-score"],
        }
        for tname in task_names:
            row[f"f1_{tname}"] = rep[tname]["f1-score"]
        rows.append(row)

    df = pd.DataFrame(rows)
    # Sort slices by support (desc) for readability
    df = df.sort_values(by="support", ascending=False).reset_index(drop=True)
    return df

# Where to save things
DETAILS_DIR = os.path.join(OUT_DIR, "details")
os.makedirs(DETAILS_DIR, exist_ok=True)

all_breakdowns = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Detailed breakdowns for {key} ===")
    ysent_pred, ypol_pred = pred_cache[key]

    # ---- Per-class reports on the full test set
    sent_per_class = per_class_breakdown(ysent_test, ysent_pred, sent_le.classes_)
    pol_per_class  = per_class_breakdown(ypol_test,  ypol_pred,  pol_le.classes_)

    # Save + show
    sent_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_per_class.csv")
    pol_csv  = os.path.join(DETAILS_DIR, f"{key}_polarization_per_class.csv")
    sent_per_class.to_csv(sent_csv, index=False)
    pol_per_class.to_csv(pol_csv, index=False)

    print("\nSentiment — per class (precision/recall/F1/support):")
    display(sent_per_class)

    print("\nPolarization — per class (precision/recall/F1/support):")
    display(pol_per_class)

    # ---- Cross-slice reports
    # Polarity performance within each (true) sentiment slice
    pol_given_sent = cross_slice_breakdown(
        slice_true=ysent_test, slice_names=sent_le.classes_,
        task_true=ypol_test,   task_pred=ypol_pred, task_names=pol_le.classes_,
        slice_label="sentiment"
    )
    pol_given_sent_csv = os.path.join(DETAILS_DIR, f"{key}_polarity_given_sentiment.csv")
    pol_given_sent.to_csv(pol_given_sent_csv, index=False)

    print("\nPolarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):")
    display(pol_given_sent)

    # Sentiment performance within each (true) polarity slice
    sent_given_pol = cross_slice_breakdown(
        slice_true=ypol_test,  slice_names=pol_le.classes_,
        task_true=ysent_test,  task_pred=ysent_pred, task_names=sent_le.classes_,
        slice_label="polarity"
    )
    sent_given_pol_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_given_polarity.csv")
    sent_given_pol.to_csv(sent_given_pol_csv, index=False)

    print("\nSentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):")
    display(sent_given_pol)

    # Keep for a single JSON bundle if you like
    all_breakdowns[key] = {
        "sentiment_per_class_csv": sent_csv,
        "polarization_per_class_csv": pol_csv,
        "polarity_given_sentiment_csv": pol_given_sent_csv,
        "sentiment_given_polarity_csv": sent_given_pol_csv
    }

# Optional: write an index JSON pointing to all CSVs
with open(os.path.join(DETAILS_DIR, "index.json"), "w") as f:
    json.dump(all_breakdowns, f, indent=2)
print("\nSaved detailed breakdowns to:", DETAILS_DIR)



=== Detailed breakdowns for mbert ===

Sentiment — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,negative,0.836735,0.25625,0.392344,480
1,neutral,0.384937,0.75102,0.50899,245
2,positive,0.465753,0.857143,0.60355,119



Polarization — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,non_polarized,0.382353,0.78972,0.515244,214
1,objective,0.277778,0.487805,0.353982,41
2,partisan,0.909091,0.509338,0.652884,589



Polarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_non_polarized,f1_objective,f1_partisan
0,negative,480,0.554167,0.400468,0.296029,0.222222,0.683153
1,neutral,245,0.604082,0.536054,0.695341,0.4,0.512821
2,positive,119,0.630252,0.58232,0.62,0.454545,0.672414



Sentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_negative,f1_neutral,f1_positive
0,partisan,589,0.427844,0.452423,0.422794,0.352657,0.581818
1,non_polarized,214,0.626168,0.527208,0.205882,0.728682,0.647059
2,objective,41,0.560976,0.475,0.133333,0.666667,0.625



=== Detailed breakdowns for xlm_roberta ===

Sentiment — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,negative,1.0,0.0125,0.024691,480
1,neutral,0.28051,0.628571,0.387909,245
2,positive,0.3391,0.823529,0.480392,119



Polarization — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,non_polarized,0.318618,0.775701,0.451701,214
1,objective,0.192661,0.512195,0.28,41
2,partisan,0.869159,0.315789,0.463263,589



Polarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_non_polarized,f1_objective,f1_partisan
0,negative,480,0.441667,0.333688,0.28754,0.15873,0.554795
1,neutral,245,0.477551,0.387057,0.618557,0.271186,0.271429
2,positive,119,0.369748,0.390431,0.473282,0.571429,0.126582



Sentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_negative,f1_neutral,f1_positive
0,partisan,589,0.220713,0.259574,0.028169,0.254826,0.495726
1,non_polarized,214,0.518692,0.37764,0.0,0.646809,0.486111
2,objective,41,0.414634,0.306233,0.0,0.585366,0.333333



Saved detailed breakdowns to: ./runs_multitask/details


## SECTION 12

In [46]:
# ===== Section 12 — Length Diagnostics (clean) =====
import warnings

def token_lengths_summary(texts, titles, tokenizer, n=5000):
    # Random sample (or full if dataset is small)
    n = min(n, len(texts))
    idx = np.random.choice(len(texts), size=n, replace=False) if len(texts) > n else np.arange(len(texts))

    lengths = []
    # Silence the "sequence > 512" warnings emitted by some tokenizers for inspection
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Token indices sequence length is longer.*")
        for i in idx:
            s = f"{titles[i]} [SEP] {texts[i]}"
            # We want raw length pre-truncation to choose MAX_LENGTH wisely
            ids = tokenizer.encode(s, add_special_tokens=True, truncation=False)
            lengths.append(len(ids))

    arr = np.array(lengths)
    stats = {
        "mean": float(arr.mean()),
        "p50":  float(np.percentile(arr, 50)),
        "p90":  float(np.percentile(arr, 90)),
        "p95":  float(np.percentile(arr, 95)),
        "p99":  float(np.percentile(arr, 99)),
        "max":  int(arr.max())
    }
    print("Token length stats:", stats)
    return stats

for key in MODELS_TO_RUN:
    name = MODEL_CONFIGS[key]["name"]
    tok = AutoTokenizer.from_pretrained(name)
    print(f"\n[{key}] {name}")
    token_lengths_summary(
        texts=X_train[TEXT_COL].values,
        titles=X_train[TITLE_COL].values,
        tokenizer=tok,
        n=5000
    )

# Tip:
# If p95 is comfortably < 192, you're fine. If you see p95 > 192, consider MAX_LENGTH=224
# (Update in Section 3 if you decide to bump it.)


Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors



[mbert] bert-base-multilingual-cased
Token length stats: {'mean': 107.43823080833756, 'p50': 90.0, 'p90': 184.0, 'p95': 197.0, 'p99': 232.67000000000007, 'max': 916}


Token indices sequence length is longer than the specified maximum sequence length for this model (950 > 512). Running this sequence through the model will result in indexing errors



[xlm_roberta] xlm-roberta-base
Token length stats: {'mean': 106.15734621250635, 'p50': 93.0, 'p90': 170.0, 'p95': 181.0, 'p99': 218.34000000000015, 'max': 950}
