## Setup: device, imports, seeds, paths, hyperparams

In [3]:
# 01 - Setup: device, imports, seeds, paths, hyperparams

import os
from pathlib import Path
import random
import math
import numpy as np
import pandas as pd
from typing import Tuple, Optional

# ---- Single toggle: choose "cpu" or "gpu"
RUN_DEVICE = "gpu"   # change to "cpu" to force CPU

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch

# Resolve requested device against availability
if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    SELECT_DEVICE = "cuda"
else:
    SELECT_DEVICE = "cpu"
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # hard-disable CUDA if CPU is selected or not available

device = torch.device(SELECT_DEVICE)

# Determinism-ish settings
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
if device.type == "cpu":
    torch.set_num_threads(max(1, os.cpu_count() // 2))

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

# ---- Paths
DATA_DIR = Path("SemEval_2022_Task2-idiomaticity/SubTaskA")
TRAIN_ONE_SHOT = DATA_DIR / "Data" / "train_one_shot.csv"
TRAIN_ZERO_SHOT = DATA_DIR / "Data" / "train_zero_shot.csv"
DEV = DATA_DIR / "Data" / "dev.csv"
DEV_GOLD = DATA_DIR / "Data" / "dev_gold.csv"
EVAL = DATA_DIR / "Data" / "eval.csv"

OUT_DIR = Path("outputs_pt_xlmr")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Model & training hyperparams
MODEL_NAME = "xlm-roberta-base"   # you can try xlm-roberta-large if you have headroom
MAX_LEN = 256

# zero-shot (full FT)
ZS_EPOCHS = 2
ZS_LR = 2e-5
ZS_WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01

# one-shot (head-only FT + focal)
OS_EPOCHS = 12
OS_LR = 1e-4
OS_WARMUP_RATIO = 0.1

# batch sizes
BATCH_SIZE = 8                     # eval/diagnostics
BATCH_SIZE_MICRO = 8               # training micro-batch
GRAD_ACCUM_STEPS = 2               # effective batch = micro * accum

# pin_memory toggle for CUDA
PIN_MEMORY = (device.type == "cuda")

# ----- Colab VRAM helpers (no-ops on CPU)
def cuda_mem():
    if device.type == "cuda":
        alloc = torch.cuda.memory_allocated() / (1024**3)
        reserv = torch.cuda.memory_reserved() / (1024**3)
        print(f"CUDA allocated: {alloc:.2f} GiB | reserved: {reserv:.2f} GiB")

def free_vram():
    if device.type == "cuda":
        torch.cuda.empty_cache()
        import gc; gc.collect()
        print("VRAM cleanup done.")

print(f"Using device: {device.type.upper()}")


Using device: CUDA


## IO helpers and data preparation for Subtask A (PT only)

In [4]:
# 02 - IO helpers and data prep (PT)

def load_any_csv(path: Path) -> pd.DataFrame:
    return pd.read_csv(path, sep=None, engine="python", dtype=str)

def ensure_label_int(df: pd.DataFrame, col="Label") -> pd.DataFrame:
    if col in df.columns:
        df[col] = df[col].astype(int)
    return df

def prepare_supervised_frame(train_path: Path, dev_path: Path, dev_gold_path: Path, language="PT") -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_df = load_any_csv(train_path)
    dev_df = load_any_csv(dev_path)
    gold_df = load_any_csv(dev_gold_path)

    train_df.columns = [c.strip() for c in train_df.columns]
    dev_df.columns = [c.strip() for c in dev_df.columns]
    gold_df.columns = [c.strip() for c in gold_df.columns]

    train_df = train_df[train_df["Language"] == language].copy()
    dev_df = dev_df[dev_df["Language"] == language].copy()

    gold = gold_df[gold_df["Language"] == language][["ID", "Label"]].copy()
    gold["ID"] = gold["ID"].astype(str); dev_df["ID"] = dev_df["ID"].astype(str)
    dev_lab = dev_df.merge(gold, on="ID", how="left")
    dev_lab = ensure_label_int(dev_lab, "Label")
    train_df = ensure_label_int(train_df, "Label")
    return train_df, dev_lab

def prepare_eval_frame(eval_path: Path, language="PT") -> pd.DataFrame:
    eval_df = load_any_csv(eval_path)
    eval_df.columns = [c.strip() for c in eval_df.columns]
    return eval_df[eval_df["Language"] == language].copy()

def mark_first_case_insensitive(text: str, needle: str, ltag: str="<mwe>", rtag: str="</mwe>") -> str:
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    lt, ln = text.lower(), needle.lower()
    i = lt.find(ln)
    if i == -1:
        return text
    return text[:i] + ltag + text[i:i+len(needle)] + rtag + text[i+len(needle):]

def build_input(prev: str, target: str, nxt: str, mwe: str, sep_token: str) -> str:
    prev = "" if pd.isna(prev) else prev
    nxt = "" if pd.isna(nxt) else nxt
    target_marked = mark_first_case_insensitive(target, mwe, "<mwe>", "</mwe>")
    return f"{prev} {sep_token} {target_marked} {sep_token} {nxt}".strip()


## Dataset & collate for XLM-R

In [5]:
# 03 - Dataset & collate

class IdiomDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_len: int, is_infer: bool=False):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_infer = is_infer
        self.sep = tokenizer.sep_token if tokenizer.sep_token is not None else "</s>"

        self.texts = []
        self.labels = []
        for _, r in self.df.iterrows():
            prev = r.get("Previous", "")
            target = r.get("Target", "")
            nxt = r.get("Next", "")
            mwe = r.get("MWE", "")
            self.texts.append(build_input(prev, target, nxt, mwe, self.sep))
            if not self.is_infer:
                self.labels.append(int(r["Label"]))

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        item = {"text": self.texts[idx]}
        if not self.is_infer: item["label"] = self.labels[idx]
        return item

def collate_fn(batch, tokenizer, max_len: int, is_infer: bool=False):
    texts = [b["text"] for b in batch]
    enc = tokenizer(texts, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    if not is_infer:
        labels = torch.tensor([b["label"] for b in batch], dtype=torch.long)
        return enc, labels
    return enc


## Model, training, evaluation (fp32 only; gradient accumulation on GPU to keep identical updates)

In [6]:
# 04 - Model helpers, special tokens, training/eval utils

def ensure_mwe_tokens(tokenizer, model=None):
    add_list = []
    for t in ["<mwe>", "</mwe>"]:
        if t not in tokenizer.get_vocab():
            add_list.append(t)
    if add_list:
        tokenizer.add_special_tokens({"additional_special_tokens": add_list})
        if model is not None:
            model.resize_token_embeddings(len(tokenizer))

def build_model_and_tokenizer(model_name: str, move_to_device: bool = False):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32
    )
    if move_to_device:
        model.to(device)
    return model, tokenizer

def freeze_backbone_roberta(model):
    for p in model.roberta.embeddings.parameters():
        p.requires_grad = False
    for p in model.roberta.encoder.parameters():
        p.requires_grad = False
    for p in model.classifier.parameters():
        p.requires_grad = True
    return model

import torch.nn.functional as F

def focal_loss(logits, targets, alpha=None, gamma: float = 2.0):
    logp = F.log_softmax(logits, dim=-1)
    p = torch.exp(logp)
    pt = p[range(p.size(0)), targets]
    loss = -(1 - pt) ** gamma * logp[range(logp.size(0)), targets]
    if alpha is not None:
        at = alpha[targets]
        loss = at * loss
    return loss.mean()

def run_epoch(model, loader, tokenizer, optimizer=None, scheduler=None, train_mode: bool=True, accum_steps: int=1):
    model.train() if train_mode else model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0
    step = 0

    for batch in loader:
        (enc, labels) = batch
        enc = {k: v.to(device, non_blocking=PIN_MEMORY) for k, v in enc.items()}
        labels = labels.to(device, non_blocking=PIN_MEMORY)

        with torch.set_grad_enabled(train_mode):
            out = model(**enc)
            logits = out.logits
            loss = F.cross_entropy(logits, labels) / accum_steps

            if train_mode:
                optimizer.zero_grad(set_to_none=True) if (step % accum_steps == 0) else None
                loss.backward()
                if (step + 1) % accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    if scheduler is not None: scheduler.step()

        step += 1
        total_loss += loss.item() * accum_steps * labels.size(0)
        all_preds.extend(torch.argmax(logits, dim=-1).detach().cpu().tolist())
        all_labels.extend(labels.detach().cpu().tolist())

    avg_loss = total_loss / max(1, len(all_labels))
    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    return avg_loss, macro_f1, all_labels, all_preds

@torch.no_grad()
def predict(model, loader, tokenizer):
    model.eval()
    outs = []
    for enc in loader:
        enc = {k: v.to(device, non_blocking=PIN_MEMORY) for k, v in enc.items()}
        logits = model(**enc).logits
        outs.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    return outs

@torch.no_grad()
def predict_probs_inferloader(model, loader) -> np.ndarray:
    model.eval()
    probs = []
    for enc in loader:
        enc = {k: v.to(device, non_blocking=PIN_MEMORY) for k, v in enc.items()}
        p = torch.softmax(model(**enc).logits, dim=-1)[:, 1].detach().cpu().numpy()
        probs.append(p)
    return np.concatenate(probs, axis=0)


## Data loaders factory

In [7]:
# 05 - DataLoader factories

def make_loaders(train_df: pd.DataFrame, dev_df: pd.DataFrame, tokenizer, max_len: int, batch_size_micro: int):
    train_ds = IdiomDataset(train_df, tokenizer, max_len, is_infer=False)
    dev_ds = IdiomDataset(dev_df, tokenizer, max_len, is_infer=False)

    train_loader = DataLoader(
        train_ds, batch_size=batch_size_micro, shuffle=True, pin_memory=PIN_MEMORY,
        collate_fn=lambda b: collate_fn(b, tokenizer, max_len, is_infer=False)
    )
    dev_loader = DataLoader(
        dev_ds, batch_size=batch_size_micro, shuffle=False, pin_memory=PIN_MEMORY,
        collate_fn=lambda b: collate_fn(b, tokenizer, max_len, is_infer=False)
    )
    return train_loader, dev_loader

def make_infer_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int):
    ds = IdiomDataset(df, tokenizer, max_len, is_infer=True)
    loader = DataLoader(
        ds, batch_size=batch_size, shuffle=False, pin_memory=PIN_MEMORY,
        collate_fn=lambda b: collate_fn(b, tokenizer, max_len, is_infer=True)
    )
    return loader


## Zero-shot (PT): train full model, tune threshold, write eval

In [9]:
# 06 - Zero-shot (PT): train full model, tune threshold, write eval

if device.type == "cuda":
    cuda_mem(); free_vram(); cuda_mem()

train_0s_df, dev_0s_df = prepare_supervised_frame(TRAIN_ZERO_SHOT, DEV, DEV_GOLD, language="PT")

model_zs, tok_zs = build_model_and_tokenizer(MODEL_NAME, move_to_device=False)
ensure_mwe_tokens(tok_zs, model_zs)
model_zs.to(device)

train_loader_0s, dev_loader_0s = make_loaders(train_0s_df, dev_0s_df, tok_zs, MAX_LEN, BATCH_SIZE_MICRO)

n_train_0s = len(train_0s_df)
updates_per_epoch_0s = math.ceil(n_train_0s / (BATCH_SIZE_MICRO * GRAD_ACCUM_STEPS))
total_update_steps_0s = max(1, ZS_EPOCHS * updates_per_epoch_0s)
warmup_steps_0s = max(1, int(ZS_WARMUP_RATIO * total_update_steps_0s))

optimizer_0s = torch.optim.AdamW(model_zs.parameters(), lr=ZS_LR, weight_decay=WEIGHT_DECAY, foreach=False)
scheduler_0s = get_linear_schedule_with_warmup(optimizer_0s, num_warmup_steps=warmup_steps_0s, num_training_steps=total_update_steps_0s)

best_f1_zs = -1.0
best_dir_zs = OUT_DIR / "ckpt_zeroshot_pt_xlmr"
best_dir_zs.mkdir(parents=True, exist_ok=True)
best_file_zs = best_dir_zs / "best.pt"

for epoch in range(1, ZS_EPOCHS + 1):
    tr_loss, tr_f1, _, _ = run_epoch(model_zs, train_loader_0s, tok_zs, optimizer_0s, scheduler_0s, train_mode=True, accum_steps=GRAD_ACCUM_STEPS)
    dv_loss, dv_f1, y_true, y_pred = run_epoch(model_zs, dev_loader_0s, tok_zs, optimizer=None, scheduler=None, train_mode=False, accum_steps=1)
    print(f"[zeroshot_pt_xlmr] Epoch {epoch}/{ZS_EPOCHS} | Train loss {tr_loss:.4f} F1 {tr_f1:.4f} | Dev loss {dv_loss:.4f} F1 {dv_f1:.4f}")
    if dv_f1 > best_f1_zs:
        best_f1_zs = dv_f1
        torch.save({"model_state": model_zs.state_dict(), "vocab_size": len(tok_zs)}, best_file_zs)
    if device.type == "cuda": torch.cuda.empty_cache()

print(f"[zeroshot_pt_xlmr] Best dev macro-F1: {best_f1_zs:.4f}")

# ---- Reload best & tune threshold on dev ----
model_zeroshot = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
ensure_mwe_tokens(tok_zs, model_zeroshot)
state = torch.load(best_file_zs, map_location=str(device))
if "vocab_size" in state and model_zeroshot.get_input_embeddings().weight.shape[0] != state["vocab_size"]:
    model_zeroshot.resize_token_embeddings(state["vocab_size"])
model_zeroshot.load_state_dict(state["model_state"])
model_zeroshot.to(device); model_zeroshot.eval()

dev_loader_0s_eval = make_infer_loader(dev_0s_df, tok_zs, MAX_LEN, BATCH_SIZE)
dev_probs_0s = predict_probs_inferloader(model_zeroshot, dev_loader_0s_eval)
dev_true_0s = dev_0s_df["Label"].astype(int).to_numpy()

def _tune_threshold(probs, ytrue):
    grid = np.linspace(0.05, 0.95, 37)
    best_th, best_f1 = 0.5, -1.0
    for th in grid:
        yhat = (probs >= th).astype(int)
        f1 = f1_score(ytrue, yhat, average="macro")
        if f1 > best_f1:
            best_f1, best_th = f1, th
    return best_th, best_f1

best_th_0s, best_f1_tuned_0s = _tune_threshold(dev_probs_0s, dev_true_0s)
print(f"[zeroshot_pt_xlmr] Tuned threshold on dev: th={best_th_0s:.3f}, macro-F1={best_f1_tuned_0s:.4f}")

# ---- Eval predictions (zero-shot)
eval_pt_df = prepare_eval_frame(EVAL, language="PT")
eval_loader_0s = make_infer_loader(eval_pt_df, tok_zs, MAX_LEN, BATCH_SIZE)

@torch.no_grad()
def predict_with_threshold_inferloader(model, loader, th: float) -> list:
    outs = []
    for enc in loader:
        enc = {k: v.to(device, non_blocking=PIN_MEMORY) for k, v in enc.items()}
        p1 = torch.softmax(model(**enc).logits, dim=-1)[:, 1]
        outs.extend((p1 >= th).int().cpu().tolist())
    return outs

eval_preds_0s = predict_with_threshold_inferloader(model_zeroshot, eval_loader_0s, best_th_0s)
sub_0s = pd.DataFrame({
    "ID": eval_pt_df["ID"].astype(str),
    "Language": eval_pt_df["Language"],
    "Setting": ["zero_shot"] * len(eval_pt_df),
    "Label": eval_preds_0s
})
sub_path_0s = OUT_DIR / "eval_submission_pt_zeroshot.csv"
sub_0s.to_csv(sub_path_0s, index=False)
print(f"Wrote {sub_path_0s}")


CUDA allocated: 0.00 GiB | reserved: 0.00 GiB
VRAM cleanup done.
CUDA allocated: 0.00 GiB | reserved: 0.00 GiB


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


[zeroshot_pt_xlmr] Epoch 1/2 | Train loss 0.6457 F1 0.3991 | Dev loss 0.7019 F1 0.3607
[zeroshot_pt_xlmr] Epoch 2/2 | Train loss 0.6329 F1 0.3991 | Dev loss 0.7379 F1 0.3607
[zeroshot_pt_xlmr] Best dev macro-F1: 0.3607


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[zeroshot_pt_xlmr] Tuned threshold on dev: th=0.350, macro-F1=0.4821
Wrote outputs_pt_xlmr/eval_submission_pt_zeroshot.csv


In [10]:
# GPU/VRAM cleanup helper — run this before (re)building models on GPU
def free_vram():
    import gc, torch
    # delete any torch models/tensors we kept around in globals()
    for k, v in list(globals().items()):
        try:
            import torch.nn as nn
            if isinstance(v, nn.Module):
                del globals()[k]
        except Exception:
            pass
        if isinstance(v, torch.Tensor):
            del globals()[k]
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    print("VRAM cleanup done.")

# Optional: print quick CUDA memory stats
def cuda_mem():
    import torch
    if torch.cuda.is_available():
        a = torch.cuda.memory_allocated() / (1024**3)
        r = torch.cuda.memory_reserved() / (1024**3)
        print(f"CUDA allocated: {a:.2f} GiB | reserved: {r:.2f} GiB")
    else:
        print("CUDA not available.")


## One-shot (PT) with class-weighted loss (fp32), eval submission

In [11]:
# 07 - One-shot (PT): frozen backbone + focal loss + threshold tuning

if device.type == "cuda":
    cuda_mem(); free_vram(); cuda_mem()

train_1s_df, dev_1s_df = prepare_supervised_frame(TRAIN_ONE_SHOT, DEV, DEV_GOLD, language="PT")
from collections import Counter
cnt_train_1s = Counter(train_1s_df["Label"].astype(int).tolist())
cnt_dev_1s = Counter(dev_1s_df["Label"].astype(int).tolist())
print("[PT one-shot] Train label counts:", cnt_train_1s, "| Dev label counts:", cnt_dev_1s)

# alpha vector for focal loss from class weights
n0, n1 = cnt_train_1s.get(0, 1), cnt_train_1s.get(1, 1); tot = n0 + n1
w0, w1 = tot / (2.0 * n0), tot / (2.0 * n1)
alpha_vec = torch.tensor([w0, w1], dtype=torch.float32, device=device)

# Build model/tokenizer, add tokens once, freeze encoder
model_1s, tok_1s = build_model_and_tokenizer(MODEL_NAME, move_to_device=False)
ensure_mwe_tokens(tok_1s, model_1s)
freeze_backbone_roberta(model_1s)
model_1s.to(device)

train_loader_1s, dev_loader_1s = make_loaders(train_1s_df, dev_1s_df, tok_1s, MAX_LEN, BATCH_SIZE_MICRO)

n_train_1s = len(train_1s_df)
updates_per_epoch_1s = math.ceil(n_train_1s / (BATCH_SIZE_MICRO * GRAD_ACCUM_STEPS))
total_update_steps_1s = max(1, OS_EPOCHS * updates_per_epoch_1s)
warmup_steps_1s = max(1, int(OS_WARMUP_RATIO * total_update_steps_1s))

optim_params = [p for p in model_1s.parameters() if p.requires_grad]
optimizer_1s = torch.optim.AdamW(optim_params, lr=OS_LR, weight_decay=WEIGHT_DECAY, foreach=False)
scheduler_1s = get_linear_schedule_with_warmup(optimizer_1s, num_warmup_steps=warmup_steps_1s, num_training_steps=total_update_steps_1s)

best_f1_1s = -1.0
best_dir_1s = OUT_DIR / "ckpt_oneshot_pt_xlmr_weighted"
best_dir_1s.mkdir(parents=True, exist_ok=True)
best_file_1s = best_dir_1s / "best.pt"

# custom train loop with focal loss
step = 0
for epoch in range(1, OS_EPOCHS + 1):
    model_1s.train()
    tr_loss_acc, tr_preds, tr_labels = 0.0, [], []

    for batch in train_loader_1s:
        enc, labels = batch
        enc = {k: v.to(device, non_blocking=PIN_MEMORY) for k, v in enc.items()}
        labels = labels.to(device, non_blocking=PIN_MEMORY)

        logits = model_1s(**enc).logits
        loss = focal_loss(logits, labels, alpha=alpha_vec, gamma=2.0) / GRAD_ACCUM_STEPS
        loss.backward()
        step += 1

        tr_loss_acc += loss.item() * GRAD_ACCUM_STEPS * labels.size(0)
        tr_preds.extend(torch.argmax(logits, dim=-1).detach().cpu().tolist())
        tr_labels.extend(labels.detach().cpu().tolist())

        if step % GRAD_ACCUM_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(optim_params, 1.0)
            optimizer_1s.step()
            scheduler_1s.step()
            optimizer_1s.zero_grad(set_to_none=True)

    # dev eval (argmax)
    _, dv_f1, y_true, y_pred = run_epoch(model_1s, dev_loader_1s, tok_1s, optimizer=None, scheduler=None, train_mode=False, accum_steps=1)
    tr_f1 = f1_score(tr_labels, tr_preds, average="macro")
    tr_loss = tr_loss_acc / max(1, len(tr_labels))
    print(f"[oneshot_pt_frozen_focal] Epoch {epoch}/{OS_EPOCHS} | Train loss {tr_loss:.4f} F1 {tr_f1:.4f} | Dev F1 {dv_f1:.4f}")
    if dv_f1 > best_f1_1s:
        best_f1_1s = dv_f1
        torch.save({"model_state": model_1s.state_dict(), "vocab_size": len(tok_1s)}, best_file_1s)
    if device.type == "cuda": torch.cuda.empty_cache()

print(f"[oneshot_pt_frozen_focal] Best dev macro-F1 (argmax): {best_f1_1s:.4f}")

# reload & threshold tuning on dev
model_1s_eval = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
ensure_mwe_tokens(tok_1s, model_1s_eval)
state = torch.load(best_file_1s, map_location=str(device))
if "vocab_size" in state and model_1s_eval.get_input_embeddings().weight.shape[0] != state["vocab_size"]:
    model_1s_eval.resize_token_embeddings(state["vocab_size"])
model_1s_eval.load_state_dict(state["model_state"])
model_1s_eval.to(device); model_1s_eval.eval()

dev_loader_1s_eval = make_infer_loader(dev_1s_df, tok_1s, MAX_LEN, BATCH_SIZE)
dev_probs_1s = predict_probs_inferloader(model_1s_eval, dev_loader_1s_eval)
dev_true_1s = dev_1s_df["Label"].astype(int).to_numpy()
print(f"[oneshot_pt_frozen_focal] Dev positive rate (>=0.5): {(dev_probs_1s>=0.5).mean():.3f}")

def _tune_threshold(probs, ytrue):
    grid = np.linspace(0.05, 0.95, 37)
    best_th, best_f1 = 0.5, -1.0
    for th in grid:
        yhat = (probs >= th).astype(int)
        f1 = f1_score(ytrue, yhat, average="macro")
        if f1 > best_f1:
            best_f1, best_th = f1, th
    return best_th, best_f1

best_th_1s, best_f1_tuned_1s = _tune_threshold(dev_probs_1s, dev_true_1s)
print(f"[oneshot_pt_frozen_focal] Tuned threshold on dev: th={best_th_1s:.3f}, macro-F1={best_f1_tuned_1s:.4f}")

# eval predictions
eval_pt_df = prepare_eval_frame(EVAL, language="PT")
eval_loader_1s = make_infer_loader(eval_pt_df, tok_1s, MAX_LEN, BATCH_SIZE)

@torch.no_grad()
def predict_with_threshold_inferloader(model, loader, th: float) -> list:
    outs = []
    for enc in loader:
        enc = {k: v.to(device, non_blocking=PIN_MEMORY) for k, v in enc.items()}
        p1 = torch.softmax(model(**enc).logits, dim=-1)[:, 1]
        outs.extend((p1 >= th).int().cpu().tolist())
    return outs

eval_preds_1s = predict_with_threshold_inferloader(model_1s_eval, eval_loader_1s, best_th_1s)

sub_1s = pd.DataFrame({
    "ID": eval_pt_df["ID"].astype(str),
    "Language": eval_pt_df["Language"],
    "Setting": ["zero_shot"] * len(eval_pt_df),
    "Label": eval_preds_1s
})
sub_path_1s = OUT_DIR / "eval_submission_pt_oneshot_weighted.csv"
sub_1s.to_csv(sub_path_1s, index=False)
print(f"Wrote {sub_path_1s}")


CUDA allocated: 6.24 GiB | reserved: 6.55 GiB
VRAM cleanup done.
CUDA allocated: 5.21 GiB | reserved: 5.38 GiB
[PT one-shot] Train label counts: Counter({0: 28, 1: 25}) | Dev label counts: Counter({0: 154, 1: 119})


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[oneshot_pt_frozen_focal] Epoch 1/12 | Train loss 0.2356 F1 0.3592 | Dev F1 0.3036
[oneshot_pt_frozen_focal] Epoch 2/12 | Train loss 0.1798 F1 0.5335 | Dev F1 0.4878
[oneshot_pt_frozen_focal] Epoch 3/12 | Train loss 0.2296 F1 0.4289 | Dev F1 0.3607
[oneshot_pt_frozen_focal] Epoch 4/12 | Train loss 0.2037 F1 0.4395 | Dev F1 0.5487
[oneshot_pt_frozen_focal] Epoch 5/12 | Train loss 0.1772 F1 0.5276 | Dev F1 0.3036
[oneshot_pt_frozen_focal] Epoch 6/12 | Train loss 0.1659 F1 0.5488 | Dev F1 0.3036
[oneshot_pt_frozen_focal] Epoch 7/12 | Train loss 0.1836 F1 0.4172 | Dev F1 0.3018
[oneshot_pt_frozen_focal] Epoch 8/12 | Train loss 0.1737 F1 0.6193 | Dev F1 0.4435
[oneshot_pt_frozen_focal] Epoch 9/12 | Train loss 0.1865 F1 0.3394 | Dev F1 0.3607
[oneshot_pt_frozen_focal] Epoch 10/12 | Train loss 0.1718 F1 0.5662 | Dev F1 0.3607
[oneshot_pt_frozen_focal] Epoch 11/12 | Train loss 0.1780 F1 0.3783 | Dev F1 0.3607
[oneshot_pt_frozen_focal] Epoch 12/12 | Train loss 0.1687 F1 0.6032 | Dev F1 0.3607
[

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[oneshot_pt_frozen_focal] Dev positive rate (>=0.5): 0.465
[oneshot_pt_frozen_focal] Tuned threshold on dev: th=0.500, macro-F1=0.5487
Wrote outputs_pt_xlmr/eval_submission_pt_oneshot_weighted.csv


## Dev diagnostics (confusion/report)

In [12]:
# 08 - Dev-set diagnostics: report & confusion matrices

def eval_on_dev(best_ckpt_path: Path, tokenizer, dev_df: pd.DataFrame, tag: str):
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    ensure_mwe_tokens(tokenizer, model)
    state = torch.load(best_ckpt_path, map_location=str(device))
    if "vocab_size" in state and model.get_input_embeddings().weight.shape[0] != state["vocab_size"]:
        model.resize_token_embeddings(state["vocab_size"])
    model.load_state_dict(state["model_state"])
    model.to(device); model.eval()

    dev_loader = DataLoader(
        IdiomDataset(dev_df, tokenizer, MAX_LEN, is_infer=False),
        batch_size=BATCH_SIZE, shuffle=False, pin_memory=PIN_MEMORY,
        collate_fn=lambda b: collate_fn(b, tokenizer, MAX_LEN, is_infer=False)
    )
    _, f1, y_true, y_pred = run_epoch(model, dev_loader, tokenizer, optimizer=None, scheduler=None, train_mode=False)
    print(f"[{tag}] Dev macro-F1: {f1:.4f}")
    print(classification_report(y_true, y_pred, digits=4))
    print(confusion_matrix(y_true, y_pred))

# one-shot diagnostics
eval_on_dev(OUT_DIR / "ckpt_oneshot_pt_xlmr_weighted" / "best.pt", tok_1s, dev_1s_df, "One-shot PT XLM-R (frozen+focal)")

# zero-shot diagnostics
eval_on_dev(OUT_DIR / "ckpt_zeroshot_pt_xlmr" / "best.pt", tok_zs, dev_0s_df, "Zero-shot PT XLM-R")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[One-shot PT XLM-R (frozen+focal)] Dev macro-F1: 0.5487
              precision    recall  f1-score   support

           0     0.6096    0.5779    0.5933       154
           1     0.4882    0.5210    0.5041       119

    accuracy                         0.5531       273
   macro avg     0.5489    0.5495    0.5487       273
weighted avg     0.5567    0.5531    0.5544       273

[[89 65]
 [57 62]]


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Zero-shot PT XLM-R] Dev macro-F1: 0.3607
              precision    recall  f1-score   support

           0     0.5641    1.0000    0.7213       154
           1     0.0000    0.0000    0.0000       119

    accuracy                         0.5641       273
   macro avg     0.2821    0.5000    0.3607       273
weighted avg     0.3182    0.5641    0.4069       273

[[154   0]
 [119   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Save final best checkpoints to disk

In [13]:
# 09 - Save final checkpoints

final_oneshot_dir = OUT_DIR / "final_oneshot_pt_xlmr"
final_zeroshot_dir = OUT_DIR / "final_zeroshot_pt_xlmr"
final_oneshot_dir.mkdir(parents=True, exist_ok=True)
final_zeroshot_dir.mkdir(parents=True, exist_ok=True)

torch.save(torch.load(OUT_DIR / "ckpt_oneshot_pt_xlmr_weighted" / "best.pt", map_location=str(device)),
           final_oneshot_dir / "model.pt")
torch.save(torch.load(OUT_DIR / "ckpt_zeroshot_pt_xlmr" / "best.pt", map_location=str(device)),
           final_zeroshot_dir / "model.pt")
print("Saved final checkpoints.")


Saved final checkpoints.
