## SECTION 1

In [2]:
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP AND INSTALLATIONS
# ============================================================================

# Check GPU availability
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Device Count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current CUDA Device: {torch.cuda.get_device_name(0)}")

# Install required packages
!pip install transformers datasets torch torchvision torchaudio
!pip install accelerate
!pip install scikit-learn
!pip install pandas numpy matplotlib seaborn
!pip install wandb  # For experiment tracking (optional)


CUDA Available: True
CUDA Device Count: 1
Current CUDA Device: Tesla T4


## SECTION 2

In [3]:

# ============================================================================
# SECTION 2: IMPORTS AND BASIC SETUP
# ============================================================================

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)


In [4]:
import os, random, json, math
from dataclasses import dataclass
from typing import Dict, Tuple, Optional, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer,
    DataCollatorWithPadding, EarlyStoppingCallback
)

def seed_all(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_all(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

## SECTION 3

In [15]:
# SECTION 3

data_path = '/content/adjudications_2025-10-20 (1).csv'
# === EDIT PATH ONLY ===
CSV_PATH = '/content/adjudications_2025-10-20 (1).csv'

# Your exact column names
TITLE_COL = "Title"
TEXT_COL  = "Comment"
SENT_COL  = "Final Sentiment"
POL_COL   = "Final Polarization"

MODEL_CONFIGS = {
    "mbert":       {"name": "bert-base-multilingual-cased", "desc": "mBERT (104 langs)"},
    "xlm_roberta": {"name": "xlm-roberta-base",             "desc": "XLM-R base"},
    "rembert":     {"name": "google/rembert",               "desc": "RemBERT"},
}
MODELS_TO_RUN = ["xlm_roberta", "mbert"]  # add "rembert" if you like

MAX_LENGTH = 192
EPOCHS = 4
BATCH_SIZE = 16
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
EARLY_STOP_PATIENCE = 2

OUT_DIR = "./runs_multitask"
os.makedirs(OUT_DIR, exist_ok=True)


## SECTION 4

In [16]:
# SECTION 4

df = pd.read_csv(CSV_PATH)

# Guard against stray spaces/BOMs in headers
df.columns = df.columns.str.strip()

# Quick sanity: will raise a clear error if any are missing
required = [TITLE_COL, TEXT_COL, SENT_COL, POL_COL]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}. Found: {list(df.columns)}")

# Drop NA
df = df.dropna(subset=[TITLE_COL, TEXT_COL, SENT_COL, POL_COL]).reset_index(drop=True)

# Encode labels
sent_le = LabelEncoder().fit(df[SENT_COL])
pol_le  = LabelEncoder().fit(df[POL_COL])

df["sent_y"] = sent_le.transform(df[SENT_COL])
df["pol_y"]  = pol_le.transform(df[POL_COL])

num_sent_classes = len(sent_le.classes_)
num_pol_classes  = len(pol_le.classes_)

print("Sentiment classes:", dict(enumerate(sent_le.classes_)))
print("Polarization classes:", dict(enumerate(pol_le.classes_)))

# Splits (stratify by sentiment)
X = df[[TITLE_COL, TEXT_COL]].copy()
y_sent = df["sent_y"].values
y_pol  = df["pol_y"].values

X_train, X_tmp, ysent_train, ysent_tmp, ypol_train, ypol_tmp = train_test_split(
    X, y_sent, y_pol, test_size=0.3, random_state=42, stratify=y_sent
)
X_val, X_test, ysent_val, ysent_test, ypol_val, ypol_test = train_test_split(
    X_tmp, ysent_tmp, ypol_tmp, test_size=0.5, random_state=42, stratify=ysent_tmp
)

print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))

# Per-task class weights (from TRAIN only)
def safe_class_weights(y, n_classes):
    classes = np.arange(n_classes)
    counts = np.bincount(y, minlength=n_classes)
    if np.any(counts == 0):
        return np.ones(n_classes, dtype=np.float32)
    return compute_class_weight("balanced", classes=classes, y=y).astype(np.float32)

sent_weights_np = safe_class_weights(ysent_train, num_sent_classes)
pol_weights_np  = safe_class_weights(ypol_train,  num_pol_classes)
print("Sentiment class weights:", dict(enumerate(np.round(sent_weights_np, 3))))
print("Polarization class weights:", dict(enumerate(np.round(pol_weights_np, 3))))

# Save label maps
with open(os.path.join(OUT_DIR, "label_map_sentiment.json"), "w") as f:
    json.dump({int(k): v for k, v in dict(enumerate(sent_le.classes_)).items()}, f, indent=2)
with open(os.path.join(OUT_DIR, "label_map_polarization.json"), "w") as f:
    json.dump({int(k): v for k, v in dict(enumerate(pol_le.classes_)).items()}, f, indent=2)


Sentiment classes: {0: 'negative', 1: 'neutral', 2: 'positive'}
Polarization classes: {0: 'non_polarized', 1: 'objective', 2: 'partisan'}
Train size: 3934 Val size: 843 Test size: 844
Sentiment class weights: {0: np.float32(0.586), 1: np.float32(1.149), 2: np.float32(2.359)}
Polarization class weights: {0: np.float32(1.295), 1: np.float32(5.604), 2: np.float32(0.488)}


## SECTION 5

In [17]:
# SECTION 5

from torch.utils.data import Dataset

class TaglishDataset(Dataset):
    def __init__(self, titles, texts, y_sent, y_pol, tokenizer, max_length=192):
        self.titles = list(titles)
        self.texts  = list(texts)
        self.y_sent = np.array(y_sent)
        self.y_pol  = np.array(y_pol)
        self.tok = tokenizer
        self.max_length = max_length
        self.use_token_type = "token_type_ids" in tokenizer.model_input_names

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        combined = f"{self.titles[idx]} [SEP] {self.texts[idx]}"
        enc = self.tok(
            combined,
            truncation=True,
            max_length=self.max_length,
            return_token_type_ids=self.use_token_type
        )
        item = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "sentiment_labels": torch.tensor(self.y_sent[idx], dtype=torch.long),
            "polarization_labels": torch.tensor(self.y_pol[idx], dtype=torch.long),
        }
        if self.use_token_type and "token_type_ids" in enc:
            item["token_type_ids"] = enc["token_type_ids"]
        return item

# Simple accessor helpers
def get_arrays(split_df, ysent, ypol):
    return split_df[TITLE_COL].values, split_df[TEXT_COL].values, ysent, ypol


## SECTION 6

In [18]:
# SECTION 6

class MultiTaskModel(nn.Module):
    def __init__(self, base_model_name: str, num_sent: int, num_pol: int, dropout: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        hidden = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.head_sent = nn.Linear(hidden, num_sent)
        self.head_pol  = nn.Linear(hidden, num_pol)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        sentiment_labels=None,
        polarization_labels=None
    ):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids if token_type_ids is not None else None
        )
        # Try pooler; fallback to CLS
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = outputs.last_hidden_state[:, 0]
        x = self.dropout(pooled)
        sent_logits = self.head_sent(x)
        pol_logits  = self.head_pol(x)
        # Return tuple 'logits' to work with custom Trainer prediction_step
        return {"logits": (sent_logits, pol_logits)}


## SECTION 7

In [19]:
# SECTION 7

def compute_metrics_multi(eval_pred):
    (sent_logits, pol_logits) = eval_pred.predictions
    (y_sent, y_pol) = eval_pred.label_ids

    ps = np.argmax(sent_logits, axis=1)
    pp = np.argmax(pol_logits, axis=1)

    # Macro metrics
    sent_report = classification_report(y_sent, ps, output_dict=True, zero_division=0)
    pol_report  = classification_report(y_pol,  pp, output_dict=True, zero_division=0)

    sent_f1 = sent_report["macro avg"]["f1-score"]
    pol_f1  = pol_report["macro avg"]["f1-score"]
    macro_f1_avg = (sent_f1 + pol_f1) / 2.0

    return {
        "sent_acc": sent_report["accuracy"],
        "sent_prec": sent_report["macro avg"]["precision"],
        "sent_rec": sent_report["macro avg"]["recall"],
        "sent_f1": sent_f1,

        "pol_acc": pol_report["accuracy"],
        "pol_prec": pol_report["macro avg"]["precision"],
        "pol_rec": pol_report["macro avg"]["recall"],
        "pol_f1": pol_f1,

        "macro_f1_avg": macro_f1_avg
    }


## SECTION 8

In [24]:
# SECTION 8

# ===== Section 8 — Custom Trainer (weighted losses + tuple predictions) =====
class MultiTaskTrainer(Trainer):
    def __init__(self, *args, class_weights: Dict[str, torch.Tensor] = None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights or {}

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels (leave as tensors)
        y_sent = inputs.pop("sentiment_labels")
        y_pol  = inputs.pop("polarization_labels")

        outputs = model(**inputs)
        sent_logits, pol_logits = outputs["logits"]

        # Weighted CE (move weights to the right device lazily)
        ce_sent = nn.CrossEntropyLoss(
            weight=self.class_weights.get("sentiment", None).to(sent_logits.device)
            if self.class_weights.get("sentiment", None) is not None else None
        )
        ce_pol  = nn.CrossEntropyLoss(
            weight=self.class_weights.get("polarization", None).to(pol_logits.device)
            if self.class_weights.get("polarization", None) is not None else None
        )

        loss_sent = ce_sent(sent_logits, y_sent)
        loss_pol  = ce_pol(pol_logits,  y_pol)
        loss = loss_sent + loss_pol

        if return_outputs:
            # Put the labels back only if caller needs outputs (not required for training loop)
            outputs = dict(outputs)
            outputs["labels"] = (y_sent, y_pol)
            return loss, outputs
        return loss

    # IMPORTANT: return **torch tensors** (not numpy) so Accelerate can pad/gather
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # Labels kept as tensors
        y_sent = inputs.get("sentiment_labels")
        y_pol  = inputs.get("polarization_labels")

        # Build model inputs explicitly (Trainer has already moved tensors to device)
        model_inputs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
        }
        if "token_type_ids" in inputs:
            model_inputs["token_type_ids"] = inputs["token_type_ids"]

        model.eval()
        with torch.no_grad():
            outputs = model(**model_inputs)
            sent_logits, pol_logits = outputs["logits"]

        # Return tensors; Trainer will handle device/concat and convert to numpy for compute_metrics
        loss = None
        logits = (sent_logits.detach(), pol_logits.detach())
        labels = (y_sent, y_pol)
        return (loss, logits, labels)


## SECTION 9

In [25]:
# ===== Section 9 — Train/Evaluate One Model =====
def train_eval_one_model(model_key: str,
                         X_tr: pd.DataFrame, X_v: pd.DataFrame, X_te: pd.DataFrame,
                         ysent_tr: np.ndarray, ysent_v: np.ndarray, ysent_te: np.ndarray,
                         ypol_tr: np.ndarray,  ypol_v: np.ndarray,  ypol_te: np.ndarray,
                         sent_w_np: np.ndarray, pol_w_np: np.ndarray):
    base_name = MODEL_CONFIGS[model_key]["name"]
    run_dir = os.path.join(OUT_DIR, f"{model_key}")
    os.makedirs(run_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(base_name)
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    # Build datasets
    tr_titles, tr_texts = X_tr[TITLE_COL].values, X_tr[TEXT_COL].values
    v_titles,  v_texts  = X_v[TITLE_COL].values,  X_v[TEXT_COL].values
    te_titles, te_texts = X_te[TITLE_COL].values, X_te[TEXT_COL].values

    train_ds = TaglishDataset(tr_titles, tr_texts, ysent_tr, ypol_tr, tokenizer, max_length=MAX_LENGTH)
    val_ds   = TaglishDataset(v_titles,  v_texts,  ysent_v,  ypol_v,  tokenizer, max_length=MAX_LENGTH)
    test_ds  = TaglishDataset(te_titles, te_texts, ysent_te, ypol_te, tokenizer, max_length=MAX_LENGTH)

    # Model
    model = MultiTaskModel(base_name, num_sent_classes, num_pol_classes).to(device)

    # Class weights (torch tensors)
    sent_w = torch.tensor(sent_w_np, dtype=torch.float32)
    pol_w  = torch.tensor(pol_w_np,  dtype=torch.float32)

    args = TrainingArguments(
        output_dir=run_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1_avg",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        logging_dir=os.path.join(run_dir, "logs"),
        logging_steps=50,
        report_to="none",
        seed=42,
        # Safety tweaks so our custom inputs aren't dropped and eval is smooth
        remove_unused_columns=False,
        eval_accumulation_steps=1
    )

    trainer = MultiTaskTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_multi,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOP_PATIENCE)],
        class_weights={"sentiment": sent_w, "polarization": pol_w}
    )

    trainer.train()

    # Eval on test
    test_out = trainer.predict(test_ds)
    metrics = {f"test_{k}": float(v) for k, v in test_out.metrics.items()}

    # Save artifacts
    trainer.save_model(run_dir)
    tokenizer.save_pretrained(run_dir)
    with open(os.path.join(run_dir, "metrics_test.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # Confusion matrices
    sent_logits, pol_logits = test_out.predictions
    ysent_pred = np.argmax(sent_logits, axis=1)
    ypol_pred  = np.argmax(pol_logits,  axis=1)

    cm_sent = confusion_matrix(ysent_te, ysent_pred, labels=list(range(num_sent_classes)))
    cm_pol  = confusion_matrix(ypol_te,  ypol_pred,  labels=list(range(num_pol_classes)))

    np.save(os.path.join(run_dir, "cm_sent.npy"), cm_sent)
    np.save(os.path.join(run_dir, "cm_pol.npy"),  cm_pol)

    # Quick plots
    def plot_cm(cm, labels, title, path_png):
        fig, ax = plt.subplots(figsize=(4.5, 4))
        im = ax.imshow(cm, interpolation="nearest")
        ax.set_title(title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, cm[i, j], ha="center", va="center")
        fig.colorbar(im, ax=ax, fraction=0.046)
        plt.tight_layout()
        plt.savefig(path_png, dpi=160)
        plt.close(fig)

    plot_cm(cm_sent, sent_le.classes_, "Sentiment Confusion", os.path.join(run_dir, "cm_sent.png"))
    plot_cm(cm_pol,  pol_le.classes_,  "Polarization Confusion", os.path.join(run_dir, "cm_pol.png"))

    # Text reports
    rep_sent = classification_report(ysent_te, ysent_pred, target_names=sent_le.classes_, digits=4, zero_division=0)
    rep_pol  = classification_report(ypol_te,  ypol_pred,  target_names=pol_le.classes_,  digits=4, zero_division=0)
    with open(os.path.join(run_dir, "report_sentiment.txt"), "w") as f:
        f.write(rep_sent)
    with open(os.path.join(run_dir, "report_polarization.txt"), "w") as f:
        f.write(rep_pol)

    summary_row = {
        "model_key": model_key,
        "base_name": base_name,
        **metrics
    }
    return summary_row, (ysent_pred, ypol_pred)


## SECTION 10


In [26]:
# SECTION 10

results = []
pred_cache = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Running {key} -> {MODEL_CONFIGS[key]['name']} ===")
    row, preds = train_eval_one_model(
        key,
        X_train, X_val, X_test,
        ysent_train, ysent_val, ysent_test,
        ypol_train,  ypol_val,  ypol_test,
        sent_weights_np, pol_weights_np
    )
    results.append(row)
    pred_cache[key] = preds

results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(OUT_DIR, "summary_results.csv"), index=False)
results_df



=== Running xlm_roberta -> xlm-roberta-base ===


Epoch,Training Loss,Validation Loss,Sent Acc,Sent Prec,Sent Rec,Sent F1,Pol Acc,Pol Prec,Pol Rec,Pol F1,Macro F1 Avg
1,1.9547,No log,0.628707,0.599832,0.63337,0.609307,0.578885,0.478575,0.509999,0.461664,0.535485
2,1.763,No log,0.672598,0.62524,0.663039,0.639528,0.650059,0.485977,0.528664,0.49529,0.567409
3,1.6551,No log,0.66548,0.613008,0.67298,0.628617,0.664294,0.518205,0.607128,0.537641,0.583129
4,1.5108,No log,0.677343,0.63598,0.700858,0.655016,0.650059,0.509979,0.602428,0.525821,0.590419



=== Running mbert -> bert-base-multilingual-cased ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 47e8cf21-1ff6-4064-b6c3-306347935433)')' thrown while requesting GET https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json
Retrying in 1s [Retry 1/5].


tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Sent Acc,Sent Prec,Sent Rec,Sent F1,Pol Acc,Pol Prec,Pol Rec,Pol F1,Macro F1 Avg
1,1.8988,No log,0.558719,0.566175,0.61567,0.556444,0.582444,0.490396,0.47872,0.447067,0.501755
2,1.7151,No log,0.655991,0.62195,0.640449,0.629032,0.640569,0.48978,0.580857,0.500081,0.564556
3,1.5594,No log,0.667853,0.623004,0.656904,0.636647,0.708185,0.538238,0.622987,0.559943,0.598295
4,1.3797,No log,0.669039,0.628295,0.669136,0.644036,0.702254,0.542355,0.637287,0.566428,0.605232


Unnamed: 0,model_key,base_name,test_test_sent_acc,test_test_sent_prec,test_test_sent_rec,test_test_sent_f1,test_test_pol_acc,test_test_pol_prec,test_test_pol_rec,test_test_pol_f1,test_test_macro_f1_avg,test_test_runtime,test_test_samples_per_second,test_test_steps_per_second
0,xlm_roberta,xlm-roberta-base,0.646919,0.600964,0.654803,0.617347,0.670616,0.518963,0.592957,0.535185,0.576266,3.5607,237.031,14.885
1,mbert,bert-base-multilingual-cased,0.659953,0.625592,0.659235,0.638644,0.697867,0.541913,0.632263,0.55997,0.599307,2.5322,333.304,20.93


## SECTION 11


In [32]:
# ===== Section 11 — Detailed Breakdown Reports (per-class + cross-slices) =====
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os
import json

def per_class_breakdown(y_true, y_pred, class_names):
    rep = classification_report(
        y_true, y_pred,
        target_names=list(class_names),
        output_dict=True, zero_division=0
    )
    # Keep only the class rows in the given order
    rows = []
    for cname in class_names:
        if cname in rep:
            rows.append({
                "class": cname,
                "precision": rep[cname]["precision"],
                "recall":    rep[cname]["recall"],
                "f1":        rep[cname]["f1-score"],
                "support":   int(rep[cname]["support"]),
            })
        else:
            rows.append({"class": cname, "precision": 0.0, "recall": 0.0, "f1": 0.0, "support": 0})
    return pd.DataFrame(rows)

def cross_slice_breakdown(
    slice_true,  # array of ints for the slicing label (e.g., true sentiment indices)
    slice_names, # names of the slicing label classes (e.g., sentiment class names)
    task_true,   # array of ints for the task we evaluate (e.g., true polarity indices)
    task_pred,   # array of ints for the task predictions (e.g., predicted polarity indices)
    task_names,  # names of the task classes (e.g., polarity class names)
    slice_label  # string for the slice axis name, e.g., "sentiment" or "polarity"
):
    """
    For each class s in slice_true, evaluate the task predictions on the subset where slice_true == s.
    Returns one row per slice value, including macro-F1, accuracy, and per-class F1 for the task.
    """
    rows = []
    for idx, sname in enumerate(slice_names):
        mask = (slice_true == idx)
        n = int(mask.sum())
        if n == 0:
            # No samples for this slice in test set
            row = {"slice": sname, "support": 0, "accuracy": np.nan, "macro_f1": np.nan}
            for tname in task_names:
                row[f"f1_{tname}"] = np.nan
            rows.append(row)
            continue

        rep = classification_report(
            task_true[mask], task_pred[mask],
            target_names=list(task_names),
            output_dict=True, zero_division=0
        )
        row = {
            "slice": sname,
            "support": n,
            "accuracy": rep["accuracy"],
            "macro_f1": rep["macro avg"]["f1-score"],
        }
        for tname in task_names:
            row[f"f1_{tname}"] = rep[tname]["f1-score"]
        rows.append(row)

    df = pd.DataFrame(rows)
    # Sort slices by support (desc) for readability
    df = df.sort_values(by="support", ascending=False).reset_index(drop=True)
    return df

# Where to save things
DETAILS_DIR = os.path.join(OUT_DIR, "details")
os.makedirs(DETAILS_DIR, exist_ok=True)

all_breakdowns = {}

for key in MODELS_TO_RUN:
    print(f"\n=== Detailed breakdowns for {key} ===")
    ysent_pred, ypol_pred = pred_cache[key]

    # ---- Per-class reports on the full test set
    sent_per_class = per_class_breakdown(ysent_test, ysent_pred, sent_le.classes_)
    pol_per_class  = per_class_breakdown(ypol_test,  ypol_pred,  pol_le.classes_)

    # Save + show
    sent_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_per_class.csv")
    pol_csv  = os.path.join(DETAILS_DIR, f"{key}_polarization_per_class.csv")
    sent_per_class.to_csv(sent_csv, index=False)
    pol_per_class.to_csv(pol_csv, index=False)

    print("\nSentiment — per class (precision/recall/F1/support):")
    display(sent_per_class)

    print("\nPolarization — per class (precision/recall/F1/support):")
    display(pol_per_class)

    # ---- Cross-slice reports
    # Polarity performance within each (true) sentiment slice
    pol_given_sent = cross_slice_breakdown(
        slice_true=ysent_test, slice_names=sent_le.classes_,
        task_true=ypol_test,   task_pred=ypol_pred, task_names=pol_le.classes_,
        slice_label="sentiment"
    )
    pol_given_sent_csv = os.path.join(DETAILS_DIR, f"{key}_polarity_given_sentiment.csv")
    pol_given_sent.to_csv(pol_given_sent_csv, index=False)

    print("\nPolarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):")
    display(pol_given_sent)

    # Sentiment performance within each (true) polarity slice
    sent_given_pol = cross_slice_breakdown(
        slice_true=ypol_test,  slice_names=pol_le.classes_,
        task_true=ysent_test,  task_pred=ysent_pred, task_names=sent_le.classes_,
        slice_label="polarity"
    )
    sent_given_pol_csv = os.path.join(DETAILS_DIR, f"{key}_sentiment_given_polarity.csv")
    sent_given_pol.to_csv(sent_given_pol_csv, index=False)

    print("\nSentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):")
    display(sent_given_pol)

    # Keep for a single JSON bundle if you like
    all_breakdowns[key] = {
        "sentiment_per_class_csv": sent_csv,
        "polarization_per_class_csv": pol_csv,
        "polarity_given_sentiment_csv": pol_given_sent_csv,
        "sentiment_given_polarity_csv": sent_given_pol_csv
    }

# Optional: write an index JSON pointing to all CSVs
with open(os.path.join(DETAILS_DIR, "index.json"), "w") as f:
    json.dump(all_breakdowns, f, indent=2)
print("\nSaved detailed breakdowns to:", DETAILS_DIR)



=== Detailed breakdowns for xlm_roberta ===

Sentiment — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,negative,0.792771,0.685417,0.735196,480
1,neutral,0.510121,0.514286,0.512195,245
2,positive,0.5,0.764706,0.604651,119



Polarization — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,non_polarized,0.464286,0.607477,0.526316,214
1,objective,0.218391,0.463415,0.296875,41
2,partisan,0.874214,0.70798,0.782364,589



Polarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_non_polarized,f1_objective,f1_partisan
0,negative,480,0.8,0.525586,0.472222,0.216216,0.888318
1,neutral,245,0.518367,0.462211,0.566524,0.28125,0.53886
2,positive,119,0.462185,0.45384,0.512821,0.444444,0.404255



Sentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_negative,f1_neutral,f1_positive
0,partisan,589,0.70798,0.621709,0.808237,0.421053,0.635838
1,non_polarized,214,0.514019,0.473717,0.222222,0.606335,0.592593
2,objective,41,0.463415,0.444227,0.421053,0.511628,0.4



=== Detailed breakdowns for mbert ===

Sentiment — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,negative,0.787736,0.695833,0.738938,480
1,neutral,0.5,0.559184,0.527938,245
2,positive,0.589041,0.722689,0.649057,119



Polarization — per class (precision/recall/F1/support):


Unnamed: 0,class,precision,recall,f1,support
0,non_polarized,0.522822,0.588785,0.553846,214
1,objective,0.221154,0.560976,0.317241,41
2,partisan,0.881764,0.747029,0.808824,589



Polarity performance within each Sentiment slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_non_polarized,f1_objective,f1_partisan
0,negative,480,0.789583,0.536371,0.462687,0.264151,0.882277
1,neutral,245,0.559184,0.494601,0.596491,0.28125,0.606061
2,positive,119,0.613445,0.582437,0.580645,0.5,0.666667



Sentiment performance within each Polarity slice (accuracy / macro-F1 / per-class F1):


Unnamed: 0,slice,support,accuracy,macro_f1,f1_negative,f1_neutral,f1_positive
0,partisan,589,0.701188,0.624545,0.801027,0.401639,0.670968
1,non_polarized,214,0.593458,0.567697,0.365385,0.663793,0.673913
2,objective,41,0.414634,0.376892,0.285714,0.511628,0.333333



Saved detailed breakdowns to: ./runs_multitask/details


## SECTION 12

In [33]:
# ===== Section 12 — Length Diagnostics (clean) =====
import warnings

def token_lengths_summary(texts, titles, tokenizer, n=5000):
    # Random sample (or full if dataset is small)
    n = min(n, len(texts))
    idx = np.random.choice(len(texts), size=n, replace=False) if len(texts) > n else np.arange(len(texts))

    lengths = []
    # Silence the "sequence > 512" warnings emitted by some tokenizers for inspection
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Token indices sequence length is longer.*")
        for i in idx:
            s = f"{titles[i]} [SEP] {texts[i]}"
            # We want raw length pre-truncation to choose MAX_LENGTH wisely
            ids = tokenizer.encode(s, add_special_tokens=True, truncation=False)
            lengths.append(len(ids))

    arr = np.array(lengths)
    stats = {
        "mean": float(arr.mean()),
        "p50":  float(np.percentile(arr, 50)),
        "p90":  float(np.percentile(arr, 90)),
        "p95":  float(np.percentile(arr, 95)),
        "p99":  float(np.percentile(arr, 99)),
        "max":  int(arr.max())
    }
    print("Token length stats:", stats)
    return stats

for key in MODELS_TO_RUN:
    name = MODEL_CONFIGS[key]["name"]
    tok = AutoTokenizer.from_pretrained(name)
    print(f"\n[{key}] {name}")
    token_lengths_summary(
        texts=X_train[TEXT_COL].values,
        titles=X_train[TITLE_COL].values,
        tokenizer=tok,
        n=5000
    )

# Tip:
# If p95 is comfortably < 192, you're fine. If you see p95 > 192, consider MAX_LENGTH=224
# (Update in Section 3 if you decide to bump it.)



[xlm_roberta] xlm-roberta-base


Token indices sequence length is longer than the specified maximum sequence length for this model (950 > 512). Running this sequence through the model will result in indexing errors


Token length stats: {'mean': 106.15734621250635, 'p50': 93.0, 'p90': 170.0, 'p95': 181.0, 'p99': 218.34000000000015, 'max': 950}


Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors



[mbert] bert-base-multilingual-cased
Token length stats: {'mean': 107.43823080833756, 'p50': 90.0, 'p90': 184.0, 'p95': 197.0, 'p99': 232.67000000000007, 'max': 916}
