## Setup: single switch for CPU/GPU, deterministic settings, seeds, paths

In [1]:
# 01 - Setup: device, imports, seeds, paths, hyperparams

import os
from pathlib import Path
import random
import math
import numpy as np
import pandas as pd
from typing import Tuple, Optional

# ---- Choose device in ONE place: "cpu" or "cuda"
SELECT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Environment toggles
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if SELECT_DEVICE == "cpu":
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import torch
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
device = torch.device(SELECT_DEVICE)
if device.type == "cpu":
    torch.set_num_threads(max(1, os.cpu_count() // 2))

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

# ---- Paths
DATA_DIR = Path("SemEval_2022_Task2-idiomaticity/SubTaskA")
TRAIN_ONE_SHOT = DATA_DIR / "Data" / "train_one_shot.csv"
TRAIN_ZERO_SHOT = DATA_DIR / "Data" / "train_zero_shot.csv"
DEV = DATA_DIR / "Data" / "dev.csv"
DEV_GOLD = DATA_DIR / "Data" / "dev_gold.csv"
EVAL = DATA_DIR / "Data" / "eval.csv"

OUT_DIR = Path("outputs_pt_xlmr")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Model & training hyperparams
MODEL_NAME = "xlm-roberta-base"   # can switch to xlm-roberta-large if you have headroom
MAX_LEN = 256

# zero-shot (full FT)
ZS_EPOCHS = 2
ZS_LR = 2e-5
ZS_WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01

# one-shot (head-only FT + focal)
OS_EPOCHS = 12
OS_LR = 1e-4
OS_WARMUP_RATIO = 0.1

# batch sizes
BATCH_SIZE = 8                     # eval/diagnostics
BATCH_SIZE_MICRO = 8               # training micro-batch
GRAD_ACCUM_STEPS = 2               # effective batch = micro * accum

# pin_memory toggle for CUDA
PIN_MEMORY = (device.type == "cuda")

# ----- Colab VRAM helpers (no-ops on CPU)
def cuda_mem():
    if device.type == "cuda":
        alloc = torch.cuda.memory_allocated() / (1024**3)
        reserv = torch.cuda.memory_reserved() / (1024**3)
        print(f"CUDA allocated: {alloc:.2f} GiB | reserved: {reserv:.2f} GiB")

def free_vram():
    if device.type == "cuda":
        torch.cuda.empty_cache()
        import gc; gc.collect()
        print("VRAM cleanup done.")


Using GPU: Tesla T4


## IO helpers and data preparation for Subtask A (PT only)

In [2]:
# 02 - IO helpers and data preparation for Subtask A (PT only)

def load_any_csv(path: Path) -> pd.DataFrame:
    return pd.read_csv(path, sep=None, engine="python", dtype=str)

def ensure_label_int(df: pd.DataFrame, col="Label") -> pd.DataFrame:
    if col in df.columns:
        df[col] = df[col].astype(int)
    return df

def mark_first_case_insensitive(text: str, needle: str, ltag: str="<mwe>", rtag: str="</mwe>") -> str:
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    low_t = text.lower()
    low_n = needle.lower()
    idx = low_t.find(low_n)
    if idx == -1:
        return text
    return text[:idx] + ltag + text[idx:idx+len(needle)] + rtag + text[idx+len(needle):]

def build_input(prev: str, target: str, nxt: str, mwe: str, sep_token: str) -> str:
    target_marked = mark_first_case_insensitive(target, mwe, "<mwe>", "</mwe>")
    prev = "" if pd.isna(prev) else prev
    nxt = "" if pd.isna(nxt) else nxt
    return f"{prev} {sep_token} {target_marked} {sep_token} {nxt}".strip()

def prepare_supervised_frame(train_path: Path, dev_path: Path, dev_gold_path: Path, language="PT") -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_df = load_any_csv(train_path)
    dev_df = load_any_csv(dev_path)
    gold_df = load_any_csv(dev_gold_path)

    train_df.columns = [c.strip() for c in train_df.columns]
    dev_df.columns = [c.strip() for c in dev_df.columns]
    gold_df.columns = [c.strip() for c in gold_df.columns]

    train_df = train_df[train_df["Language"] == language].copy()
    dev_df = dev_df[dev_df["Language"] == language].copy()

    dev_gold = gold_df[gold_df["Language"] == language][["ID", "Label"]].copy()
    dev_gold["ID"] = dev_gold["ID"].astype(str)
    dev_df["ID"] = dev_df["ID"].astype(str)
    dev_labeled = dev_df.merge(dev_gold, on="ID", how="left")
    dev_labeled = ensure_label_int(dev_labeled, "Label")

    train_df = ensure_label_int(train_df, "Label")
    return train_df, dev_labeled

def prepare_eval_frame(eval_path: Path, language="PT") -> pd.DataFrame:
    eval_df = load_any_csv(eval_path)
    eval_df.columns = [c.strip() for c in eval_df.columns]
    eval_df = eval_df[eval_df["Language"] == language].copy()
    return eval_df


## Dataset & collate for XLM-R

In [3]:
# 03 - Dataset & collate for XLM-R

class IdiomDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_len: int, is_infer: bool=False):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_infer = is_infer
        self.sep = tokenizer.sep_token if tokenizer.sep_token is not None else "</s>"
        self.texts = []
        self.labels = []

        for _, row in self.df.iterrows():
            prev = row.get("Previous", "")
            target = row.get("Target", "")
            nxt = row.get("Next", "")
            mwe = row.get("MWE", "")
            text = build_input(prev, target, nxt, mwe, self.sep)
            self.texts.append(text)
            if not is_infer:
                self.labels.append(int(row["Label"]))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {"text": self.texts[idx]}
        if not self.is_infer:
            item["label"] = self.labels[idx]
        return item

def collate_fn(batch, tokenizer, max_len: int, is_infer: bool=False):
    texts = [b["text"] for b in batch]
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    if not is_infer:
        labels = torch.tensor([b["label"] for b in batch], dtype=torch.long)
        return enc, labels
    return enc


## Model, training, evaluation (fp32 only; gradient accumulation on GPU to keep identical updates)

In [4]:
# 04 - Model, training, evaluation (fp32; accumulation; + probs for thresholding)

def ensure_mwe_tokens(tokenizer, model=None):
    add_list = []
    for t in ["<mwe>", "</mwe>"]:
        if t not in tokenizer.get_vocab():
            add_list.append(t)
    if add_list:
        tokenizer.add_special_tokens({"additional_special_tokens": add_list})
        if model is not None:
            model.resize_token_embeddings(len(tokenizer))

def build_model_and_tokenizer(model_name: str, move_to_device: bool = False):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # DO NOT add tokens here; call ensure_mwe_tokens later once per run

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32
    )
    if move_to_device:
        model.to(device)
    return model, tokenizer

def _to_device_batch(batch):
    enc, labels = batch
    enc = {k: v.to(device, non_blocking=(device.type=="cuda")) for k, v in enc.items()}
    labels = labels.to(device, non_blocking=(device.type=="cuda"))
    return enc, labels

def run_epoch(model, loader, tokenizer, optimizer, scheduler, train_mode: bool, accum_steps: int = 1):
    model.train() if train_mode else model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    if train_mode:
        optimizer.zero_grad(set_to_none=True)

    step_count = 0
    for batch in loader:
        enc, labels = _to_device_batch(batch)
        out = model(**enc, labels=labels)
        loss, logits = out.loss, out.logits

        if train_mode:
            (loss / accum_steps).backward()
            step_count += 1
            if step_count % accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad(set_to_none=True)

        total_loss += loss.item() * labels.size(0)
        preds = torch.argmax(logits, dim=-1).detach().cpu().tolist()
        labs = labels.detach().cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labs)

    if train_mode and (step_count % accum_steps != 0):
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad(set_to_none=True)

    avg_loss = total_loss / max(1, len(all_labels))
    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    return avg_loss, macro_f1, all_labels, all_preds

import torch.nn.functional as F

def run_epoch_weighted(model, loader, tokenizer, optimizer, scheduler, train_mode: bool,
                       class_weights: Optional[torch.Tensor]=None, accum_steps: int = 1):
    model.train() if train_mode else model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    if train_mode:
        optimizer.zero_grad(set_to_none=True)

    step_count = 0
    for batch in loader:
        enc, labels = _to_device_batch(batch)
        logits = model(**enc).logits
        loss = F.cross_entropy(logits, labels, weight=class_weights)

        if train_mode:
            (loss / accum_steps).backward()
            step_count += 1
            if step_count % accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad(set_to_none=True)

        total_loss += loss.item() * labels.size(0)
        preds = torch.argmax(logits, dim=-1).detach().cpu().tolist()
        labs = labels.detach().cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labs)

    if train_mode and (step_count % accum_steps != 0):
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad(set_to_none=True)

    avg_loss = total_loss / max(1, len(all_labels))
    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    return avg_loss, macro_f1, all_labels, all_preds

@torch.no_grad()
def predict(model, loader, tokenizer):
    model.eval()
    all_preds = []
    for batch in loader:
        enc = batch
        enc = {k: v.to(device, non_blocking=(device.type=="cuda")) for k, v in enc.items()}
        logits = model(**enc).logits
        preds = torch.argmax(logits, dim=-1).cpu().tolist()
        all_preds.extend(preds)
    return all_preds

@torch.no_grad()
def predict_probs(model, loader, tokenizer) -> np.ndarray:
    model.eval()
    probs = []
    for batch in loader:
        enc = batch
        enc = {k: v.to(device, non_blocking=(device.type=="cuda")) for k, v in enc.items()}
        logits = model(**enc).logits
        p = torch.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy()
        probs.append(p)
    return np.concatenate(probs, axis=0)


## Data loaders factory

In [5]:
# 05 - Data loaders factory

def make_loaders(train_df: pd.DataFrame, dev_df: pd.DataFrame, tokenizer, max_len: int, batch_size_micro: int):
    train_ds = IdiomDataset(train_df, tokenizer, max_len, is_infer=False)
    dev_ds = IdiomDataset(dev_df, tokenizer, max_len, is_infer=False)

    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size_micro,
        shuffle=True,
        pin_memory=(device.type == "cuda"),
        collate_fn=lambda b: collate_fn(b, tokenizer, max_len, is_infer=False)
    )
    dev_loader = DataLoader(
        dev_ds,
        batch_size=batch_size_micro,
        shuffle=False,
        pin_memory=(device.type == "cuda"),
        collate_fn=lambda b: collate_fn(b, tokenizer, max_len, is_infer=False)
    )
    return train_loader, dev_loader

def make_infer_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size_micro: int):
    ds = IdiomDataset(df, tokenizer, max_len, is_infer=True)
    loader = DataLoader(
        ds,
        batch_size=batch_size_micro,
        shuffle=False,
        pin_memory=(device.type == "cuda"),
        collate_fn=lambda b: collate_fn(b, tokenizer, max_len, is_infer=True)
    )
    return loader


## Training runner (fp32, accumulation to match CPU effective batch & update schedule)

In [6]:
# 06 - Training runner (fp32, accumulation; supports class-weighted loss)

import math

def _num_update_steps_per_epoch(n_examples: int, effective_batch: int) -> int:
    return math.ceil(n_examples / effective_batch)

def train_and_eval(
    train_df: pd.DataFrame,
    dev_df: pd.DataFrame,
    run_name: str,
    model_name: str = MODEL_NAME,
    max_len: int = MAX_LEN,
    batch_size_micro: int = BATCH_SIZE_MICRO,
    epochs: int = EPOCHS,
    lr: float = LR,
    weight_decay: float = WEIGHT_DECAY,
    warmup_ratio: float = WARMUP_RATIO,
    accum_steps: int = GRAD_ACCUM_STEPS,
    class_weights: Optional[torch.Tensor] = None
):
    ckpt_dir = OUT_DIR / f"ckpt_{run_name}"
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    model, tokenizer = build_model_and_tokenizer(model_name, move_to_device=False)
    model.to(device)

    train_loader, dev_loader = make_loaders(train_df, dev_df, tokenizer, max_len, batch_size_micro)

    n_train = len(train_df)
    updates_per_epoch = _num_update_steps_per_epoch(n_train, batch_size_micro * accum_steps)
    total_update_steps = max(1, epochs * updates_per_epoch)
    warmup_steps = max(1, int(warmup_ratio * total_update_steps))

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay, foreach=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_update_steps
    )

    best_f1 = -1.0
    best_path = ckpt_dir / "best.pt"

    for epoch in range(1, epochs + 1):
        if class_weights is None:
            tr_loss, tr_f1, _, _ = run_epoch(
                model, train_loader, tokenizer, optimizer, scheduler, train_mode=True, accum_steps=accum_steps
            )
        else:
            tr_loss, tr_f1, _, _ = run_epoch_weighted(
                model, train_loader, tokenizer, optimizer, scheduler, train_mode=True,
                class_weights=class_weights, accum_steps=accum_steps
            )

        dv_loss, dv_f1, y_true, y_pred = run_epoch(
            model, dev_loader, tokenizer, optimizer=None, scheduler=None, train_mode=False, accum_steps=1
        )
        print(f"[{run_name}] Epoch {epoch}/{epochs} | Train loss {tr_loss:.4f} F1 {tr_f1:.4f} | Dev loss {dv_loss:.4f} F1 {dv_f1:.4f}")

        if dv_f1 > best_f1:
            best_f1 = dv_f1
            torch.save({"model_state": model.state_dict(), "tokenizer": tokenizer.get_vocab(), "f1": best_f1}, best_path)

        if device.type == "cuda":
            torch.cuda.empty_cache()

    print(f"[{run_name}] Best dev macro-F1: {best_f1:.4f}")
    return best_path, tokenizer


In [7]:
# GPU/VRAM cleanup helper â€” run this before (re)building models on GPU
def free_vram():
    import gc, torch
    # delete any torch models/tensors we kept around in globals()
    for k, v in list(globals().items()):
        try:
            import torch.nn as nn
            if isinstance(v, nn.Module):
                del globals()[k]
        except Exception:
            pass
        if isinstance(v, torch.Tensor):
            del globals()[k]
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    print("VRAM cleanup done.")

# Optional: print quick CUDA memory stats
def cuda_mem():
    import torch
    if torch.cuda.is_available():
        a = torch.cuda.memory_allocated() / (1024**3)
        r = torch.cuda.memory_reserved() / (1024**3)
        print(f"CUDA allocated: {a:.2f} GiB | reserved: {r:.2f} GiB")
    else:
        print("CUDA not available.")


## One-shot (PT) with class-weighted loss (fp32), eval submission

In [8]:
# 07 - One-shot (PT) with class-weighted loss + dev threshold tuning, eval submission
from collections import Counter
import math

if device.type == "cuda":
    cuda_mem(); free_vram(); cuda_mem()

train_1s_df, dev_1s_df = prepare_supervised_frame(TRAIN_ONE_SHOT, DEV, DEV_GOLD, language="PT")

# Sanity: label balance
cnt_train_1s = Counter(train_1s_df["Label"].astype(int).tolist())
cnt_dev_1s   = Counter(dev_1s_df["Label"].astype(int).tolist())
print("[PT one-shot] Train label counts:", cnt_train_1s, "| Dev label counts:", cnt_dev_1s)

# Class weights
num0, num1 = cnt_train_1s.get(0, 1), cnt_train_1s.get(1, 1)
total = num0 + num1
w0 = total / (2.0 * num0)
w1 = total / (2.0 * num1)
class_weights_1s = torch.tensor([w0, w1], dtype=torch.float32, device=device)

# Build fresh model/tokenizer; add <mwe> tokens exactly once
model_1s, tok_1s = build_model_and_tokenizer(MODEL_NAME, move_to_device=False)
ensure_mwe_tokens(tok_1s, model_1s)
model_1s.to(device)

train_loader_1s, dev_loader_1s = make_loaders(train_1s_df, dev_1s_df, tok_1s, MAX_LEN, BATCH_SIZE_MICRO)

EPOCHS_1S = 6
LR_1S = 1.5e-5
WARMUP_RATIO_1S = 0.1
ACCUM_1S = GRAD_ACCUM_STEPS

n_train_1s = len(train_1s_df)
updates_per_epoch_1s = math.ceil(n_train_1s / (BATCH_SIZE_MICRO * ACCUM_1S))
total_update_steps_1s = max(1, EPOCHS_1S * updates_per_epoch_1s)
warmup_steps_1s = max(1, int(WARMUP_RATIO_1S * total_update_steps_1s))

optimizer_1s = torch.optim.AdamW(model_1s.parameters(), lr=LR_1S, weight_decay=WEIGHT_DECAY, foreach=False)
scheduler_1s = get_linear_schedule_with_warmup(optimizer_1s, num_warmup_steps=warmup_steps_1s, num_training_steps=total_update_steps_1s)

best_f1 = -1.0
best_dir_1s = OUT_DIR / "ckpt_oneshot_pt_xlmr_weighted"
best_dir_1s.mkdir(parents=True, exist_ok=True)
best_file_1s = best_dir_1s / "best.pt"

for epoch in range(1, EPOCHS_1S + 1):
    tr_loss, tr_f1, _, _ = run_epoch_weighted(
        model_1s, train_loader_1s, tok_1s, optimizer_1s, scheduler_1s,
        train_mode=True, class_weights=class_weights_1s, accum_steps=ACCUM_1S
    )
    dv_loss, dv_f1, y_true, y_pred = run_epoch(
        model_1s, dev_loader_1s, tok_1s, optimizer_1s, scheduler_1s,
        train_mode=False, accum_steps=1
    )
    print(f"[oneshot_pt_weighted] Epoch {epoch}/{EPOCHS_1S} | Train loss {tr_loss:.4f} F1 {tr_f1:.4f} | Dev loss {dv_loss:.4f} F1 {dv_f1:.4f}")
    if dv_f1 > best_f1:
        best_f1 = dv_f1
        torch.save({"model_state": model_1s.state_dict(), "f1": best_f1, "vocab_size": len(tok_1s)}, best_file_1s)
    if device.type == "cuda":
        torch.cuda.empty_cache()

print(f"[oneshot_pt_weighted] Best dev macro-F1: {best_f1:.4f}")

# Reload best for threshold tuning / inference (ensure identical vocab)
model_1s_eval = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
ensure_mwe_tokens(tok_1s, model_1s_eval)
state = torch.load(best_file_1s, map_location=str(device))
# If vocab expanded during training, match it before loading weights
if "vocab_size" in state:
    vs = state["vocab_size"]
    if model_1s_eval.get_input_embeddings().weight.shape[0] != vs:
        model_1s_eval.resize_token_embeddings(vs)
model_1s_eval.load_state_dict(state["model_state"])
model_1s_eval.to(device)
model_1s_eval.eval()

# ---- Dev threshold tuning with a wider grid
dev_loader_1s_eval = make_infer_loader(dev_1s_df, tok_1s, MAX_LEN, BATCH_SIZE_MICRO)

@torch.no_grad()
def predict_probs_inferloader(model, loader) -> np.ndarray:
    model.eval()
    probs = []
    for enc in loader:
        enc = {k: v.to(device, non_blocking=(device.type=="cuda")) for k, v in enc.items()}
        p = torch.softmax(model(**enc).logits, dim=-1)[:, 1].detach().cpu().numpy()
        probs.append(p)
    return np.concatenate(probs, axis=0)

dev_probs = predict_probs_inferloader(model_1s_eval, dev_loader_1s_eval)
dev_true = dev_1s_df["Label"].astype(int).to_numpy()
print(f"[oneshot_pt_weighted] Dev positive rate (probs>=0.5): {(dev_probs>=0.5).mean():.3f}")

def _tune_threshold(probs, ytrue):
    grid = np.linspace(0.05, 0.95, 37)  # wider sweep
    best_th, best_f1 = 0.5, -1.0
    for th in grid:
        yhat = (probs >= th).astype(int)
        f1 = f1_score(ytrue, yhat, average="macro")
        if f1 > best_f1:
            best_f1, best_th = f1, th
    return best_th, best_f1

best_th_1s, best_th_f1_1s = _tune_threshold(dev_probs, dev_true)
print(f"[oneshot_pt_weighted] Tuned threshold on dev: th={best_th_1s:.3f}, macro-F1={best_th_f1_1s:.4f}")

# ---- Eval predictions
eval_pt_df = prepare_eval_frame(EVAL, language="PT")
eval_loader_1s = make_infer_loader(eval_pt_df, tok_1s, MAX_LEN, BATCH_SIZE_MICRO)

@torch.no_grad()
def predict_with_threshold_inferloader(model, loader, th: float) -> list:
    outs = []
    for enc in loader:
        enc = {k: v.to(device, non_blocking=(device.type=="cuda")) for k, v in enc.items()}
        p1 = torch.softmax(model(**enc).logits, dim=-1)[:, 1]
        outs.extend((p1 >= th).int().cpu().tolist())
    return outs

eval_preds_1s = predict_with_threshold_inferloader(model_1s_eval, eval_loader_1s, best_th_1s)

sub_1s = pd.DataFrame({
    "ID": eval_pt_df["ID"].astype(str),
    "Language": eval_pt_df["Language"],
    "Setting": ["zero_shot"] * len(eval_pt_df),
    "Label": eval_preds_1s
})
sub_path_1s = OUT_DIR / "eval_submission_pt_oneshot_weighted.csv"
sub_1s.to_csv(sub_path_1s, index=False)
print(f"Wrote {sub_path_1s}")


CUDA allocated: 0.00 GiB | reserved: 0.00 GiB
VRAM cleanup done.
CUDA allocated: 0.00 GiB | reserved: 0.00 GiB
[PT one-shot] Train label counts: Counter({0: 28, 1: 25}) | Dev label counts: Counter({0: 154, 1: 119})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.sta

[oneshot_pt_weighted] Epoch 1/6 | Train loss 0.7015 F1 0.3457 | Dev loss 0.6853 F1 0.3607
[oneshot_pt_weighted] Epoch 2/6 | Train loss 0.7009 F1 0.3457 | Dev loss 0.6850 F1 0.3607
[oneshot_pt_weighted] Epoch 3/6 | Train loss 0.6883 F1 0.3788 | Dev loss 0.6852 F1 0.3607
[oneshot_pt_weighted] Epoch 4/6 | Train loss 0.6887 F1 0.4421 | Dev loss 0.6856 F1 0.3607
[oneshot_pt_weighted] Epoch 5/6 | Train loss 0.6881 F1 0.4940 | Dev loss 0.6857 F1 0.3607
[oneshot_pt_weighted] Epoch 6/6 | Train loss 0.6864 F1 0.4500 | Dev loss 0.6858 F1 0.3607
[oneshot_pt_weighted] Best dev macro-F1: 0.3607


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[oneshot_pt_weighted] Dev positive rate (probs>=0.5): 0.000
[oneshot_pt_weighted] Tuned threshold on dev: th=0.425, macro-F1=0.4619
Wrote outputs_pt_xlmr/eval_submission_pt_oneshot_weighted.csv


## Zero-shot (PT): load, train, eval, and write eval predictions

In [9]:
# 08 - Zero-shot (PT): weighted training + dev threshold tuning, eval submission

if device.type == "cuda":
    cuda_mem(); free_vram(); cuda_mem()

train_0s_df, dev_0s_df = prepare_supervised_frame(TRAIN_ZERO_SHOT, DEV, DEV_GOLD, language="PT")

from collections import Counter
cnt0s = Counter(train_0s_df["Label"].astype(int).tolist())
n0, n1 = cnt0s.get(0, 1), cnt0s.get(1, 1)
tot = n0 + n1
w0_0s = tot / (2.0 * n0)
w1_0s = tot / (2.0 * n1)
class_weights_0s = torch.tensor([w0_0s, w1_0s], dtype=torch.float32, device=device)

best_zeroshot_path, zeroshot_tokenizer = train_and_eval(
    train_df=train_0s_df,
    dev_df=dev_0s_df,
    run_name="zeroshot_pt_xlmr",
    batch_size_micro=BATCH_SIZE_MICRO,
    epochs=EPOCHS,
    lr=LR,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    accum_steps=GRAD_ACCUM_STEPS,
    class_weights=class_weights_0s
)

# After training, when reloading best for tuning/eval:
model_zeroshot = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
ensure_mwe_tokens(zeroshot_tokenizer, model_zeroshot)
state = torch.load(OUT_DIR / "ckpt_zeroshot_pt_xlmr" / "best.pt", map_location=str(device))
if "vocab_size" in state:
    vs = state["vocab_size"]
    if model_zeroshot.get_input_embeddings().weight.shape[0] != vs:
        model_zeroshot.resize_token_embeddings(vs)
model_zeroshot.load_state_dict(state["model_state"])
model_zeroshot.to(device)
model_zeroshot.eval()

dev_loader_0s = DataLoader(
    IdiomDataset(dev_0s_df, zeroshot_tokenizer, MAX_LEN, is_infer=False),
    batch_size=BATCH_SIZE_MICRO,
    shuffle=False,
    pin_memory=(device.type == "cuda"),
    collate_fn=lambda b: collate_fn(b, zeroshot_tokenizer, MAX_LEN, is_infer=False)
)
dev_probs_0s = predict_probs(model_zeroshot, dev_loader_0s, zeroshot_tokenizer)
dev_true_0s = dev_0s_df["Label"].astype(int).to_numpy()

best_th_0s, best_th_f1_0s = _tune_threshold(dev_probs_0s, dev_true_0s)
print(f"[zeroshot_pt_xlmr] Tuned threshold on dev: th={best_th_0s:.3f}, macro-F1={best_th_f1_0s:.4f}")

# Eval with tuned threshold
eval_pt_df = prepare_eval_frame(EVAL, language="PT")
eval_loader_0s = make_infer_loader(eval_pt_df, zeroshot_tokenizer, MAX_LEN, BATCH_SIZE_MICRO)

eval_preds_0s = predict_with_threshold(model_zeroshot, eval_loader_0s, zeroshot_tokenizer, best_th_0s)

sub_0s = pd.DataFrame({
    "ID": eval_pt_df["ID"].astype(str),
    "Language": eval_pt_df["Language"],
    "Setting": ["zero_shot"] * len(eval_pt_df),
    "Label": eval_preds_0s
})
sub_path_0s = OUT_DIR / "eval_submission_pt_zeroshot.csv"
sub_0s.to_csv(sub_path_0s, index=False)
print(f"Wrote {sub_path_0s}")


CUDA allocated: 5.19 GiB | reserved: 5.25 GiB
VRAM cleanup done.
CUDA allocated: 4.16 GiB | reserved: 4.19 GiB


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[zeroshot_pt_xlmr] Epoch 1/2 | Train loss 0.6804 F1 0.5463 | Dev loss 0.7009 F1 0.3607
[zeroshot_pt_xlmr] Epoch 2/2 | Train loss 0.6607 F1 0.4674 | Dev loss 0.7000 F1 0.3607
[zeroshot_pt_xlmr] Best dev macro-F1: 0.3607


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Error(s) in loading state_dict for XLMRobertaForSequenceClassification:
	size mismatch for roberta.embeddings.word_embeddings.weight: copying a param with shape torch.Size([250002, 768]) from checkpoint, the shape in current model is torch.Size([250004, 768]).

## Dev-set diagnostics: confusion matrix & report for best runs

In [None]:
# 09 - Dev-set diagnostics: confusion matrix & report for best runs (robust to vocab size)

from sklearn.metrics import classification_report, confusion_matrix

def _resize_model_to_ckpt_vocab(model, state_dict):
    # Works for XLM-R; key name uses "roberta.embeddings.word_embeddings.weight"
    emb_key = "roberta.embeddings.word_embeddings.weight"
    if emb_key not in state_dict:
        # fallback for other arch names if needed
        for k in state_dict.keys():
            if k.endswith("embeddings.word_embeddings.weight"):
                emb_key = k
                break
    expected_vocab = state_dict[emb_key].shape[0]
    current_vocab = model.get_input_embeddings().weight.shape[0]
    if expected_vocab != current_vocab:
        model.resize_token_embeddings(expected_vocab)

def eval_on_dev(best_ckpt_path: Path, tokenizer, dev_df: pd.DataFrame, tag: str):
    # Fresh model (fp32)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    # Load checkpoint state
    state = torch.load(best_ckpt_path, map_location=str(device))
    sd = state["model_state"]

    # Ensure model embedding size matches checkpoint BEFORE load_state_dict
    _resize_model_to_ckpt_vocab(model, sd)

    # Optional: make sure tokenizer has the markers
    vocab = tokenizer.get_vocab()
    if ("<mwe>" not in vocab) or ("</mwe>" not in vocab):
        tokenizer.add_special_tokens({"additional_special_tokens": ["<mwe>", "</mwe>"]})
        # If we just changed tokenizer, we won't change model size again here,
        # because the checkpoint dictates the embedding size we've already set.

    # Load weights and move to device
    model.load_state_dict(sd, strict=True)
    model.to(device)
    model.eval()

    dev_loader = DataLoader(
        IdiomDataset(dev_df, tokenizer, MAX_LEN, is_infer=False),
        batch_size=BATCH_SIZE_MICRO,
        shuffle=False,
        pin_memory=(device.type == "cuda"),
        collate_fn=lambda b: collate_fn(b, tokenizer, MAX_LEN, is_infer=False)
    )

    # accum_steps=1 for eval
    _, f1, y_true, y_pred = run_epoch(
        model, dev_loader, tokenizer, optimizer=None, scheduler=None, train_mode=False, accum_steps=1
    )
    print(f"[{tag}] Dev macro-F1: {f1:.4f}")
    print(classification_report(y_true, y_pred, digits=4))
    print(confusion_matrix(y_true, y_pred))


# Optionally free VRAM between evals on GPU
if device.type == "cuda":
    cuda_mem(); free_vram(); cuda_mem()

eval_on_dev(OUT_DIR / "ckpt_oneshot_pt_xlmr_weighted" / "best.pt", tok_1s, dev_1s_df, "One-shot PT XLM-R (weighted)")

if device.type == "cuda":
    cuda_mem(); free_vram(); cuda_mem()

eval_on_dev(OUT_DIR / "ckpt_zeroshot_pt_xlmr" / "best.pt", zeroshot_tokenizer, dev_0s_df, "Zero-shot PT XLM-R")


## Save final best checkpoints to disk

In [None]:
# 10 - Save final best checkpoints to disk

final_oneshot_dir = OUT_DIR / "final_oneshot_pt_xlmr"
final_zeroshot_dir = OUT_DIR / "final_zeroshot_pt_xlmr"
final_oneshot_dir.mkdir(parents=True, exist_ok=True)
final_zeroshot_dir.mkdir(parents=True, exist_ok=True)

torch.save(torch.load(OUT_DIR / "ckpt_oneshot_pt_xlmr_weighted" / "best.pt", map_location=str(device)), final_oneshot_dir / "model.pt")
torch.save(torch.load(OUT_DIR / "ckpt_zeroshot_pt_xlmr" / "best.pt", map_location=str(device)), final_zeroshot_dir / "model.pt")

print("Saved final checkpoints.")
