## Shared setup: device, imports, paths, utilities

In [1]:
# 01 - Shared setup: device, imports, paths, utilities

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple

import torch

# ---- Device selection ----
RUN_DEVICE = "gpu"  # "gpu" or "cpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    torch.set_num_threads(max(1, os.cpu_count() // 2))
    print("Using CPU")

# ---- Seeds ----
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ---- Metrics ----
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# ---- HF transformers ----
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# ---- Data paths (SemEval SubTaskA) ----
DATA_DIR = Path("SemEval_2022_Task2-idiomaticity/SubTaskA")
TRAIN_ONE_SHOT = DATA_DIR / "Data" / "train_one_shot.csv"
TRAIN_ZERO_SHOT = DATA_DIR / "Data" / "train_zero_shot.csv"
DEV = DATA_DIR / "Data" / "dev.csv"
DEV_GOLD = DATA_DIR / "Data" / "dev_gold.csv"
EVAL = DATA_DIR / "Data" / "eval.csv"

# ---- Outputs ----
OUT_DIR = Path("outputs_pt_xlmr_en")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Model & training config ----
MODEL_NAME = "xlm-roberta-base"
NUM_LABELS = 2

NUM_EPOCHS = 3
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

BATCH_SIZE_GPU = 16
BATCH_SIZE_CPU = 8
BATCH_SIZE = BATCH_SIZE_GPU if device.type == "cuda" else BATCH_SIZE_CPU

print(f"BATCH_SIZE={BATCH_SIZE}, NUM_EPOCHS={NUM_EPOCHS}, LR={LR}")

# =============================================================================
# Data loading & context utilities
# =============================================================================

def load_any_csv(path: Path) -> pd.DataFrame:
    return pd.read_csv(path, sep=None, engine="python", dtype=str)

def ensure_label_int(df: pd.DataFrame, col="Label") -> pd.DataFrame:
    if col in df.columns:
        df[col] = df[col].astype(int)
    return df

def mark_first_case_insensitive(text: str, needle: str, ltag="<mwe>", rtag="</mwe>") -> str:
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    lt = text.lower()
    ln = needle.lower()
    i = lt.find(ln)
    if i == -1:
        return text
    return text[:i] + ltag + text[i:i+len(needle)] + rtag + text[i+len(needle):]

def pack_context(prev: str, target: str, nxt: str, mwe: str) -> str:
    prev = "" if pd.isna(prev) else prev
    nxt = "" if pd.isna(nxt) else nxt
    target = "" if pd.isna(target) else target
    tgt_marked = mark_first_case_insensitive(target, mwe)
    return f"Previous: {prev}\nTarget: {tgt_marked}\nNext: {nxt}"

def prepare_supervised_frame(
    train_path: Path,
    dev_path: Path,
    dev_gold_path: Path,
    language: str,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Prepare train/dev frames for a given language:
    - Filter by Language column
    - Merge dev with gold labels
    - Build 'text' column using pack_context
    """
    train_df = load_any_csv(train_path)
    dev_df = load_any_csv(dev_path)
    gold_df = load_any_csv(dev_gold_path)

    # Strip column names
    train_df.columns = [c.strip() for c in train_df.columns]
    dev_df.columns = [c.strip() for c in dev_df.columns]
    gold_df.columns = [c.strip() for c in gold_df.columns]

    # Filter by language
    train_df = train_df[train_df["Language"] == language].copy()
    dev_df = dev_df[dev_df["Language"] == language].copy()

    # Merge dev with gold labels
    gold = gold_df[gold_df["Language"] == language][["ID", "Label"]].copy()
    gold["ID"] = gold["ID"].astype(str)
    dev_df["ID"] = dev_df["ID"].astype(str)
    dev_lab = dev_df.merge(gold, on="ID", how="left")
    dev_lab = ensure_label_int(dev_lab, "Label")
    train_df = ensure_label_int(train_df, "Label")

    # Build text column
    def _build_text(row):
        return pack_context(
            row.get("Previous", ""),
            row.get("Target", ""),
            row.get("Next", ""),
            row.get("MWE", ""),
        )

    train_df["text"] = train_df.apply(_build_text, axis=1)
    dev_lab["text"] = dev_lab.apply(_build_text, axis=1)

    return train_df, dev_lab

def prepare_eval_frame(eval_path: Path, language: str) -> pd.DataFrame:
    """
    Prepare eval frame (no labels) for a given language.
    """
    df = load_any_csv(eval_path)
    df.columns = [c.strip() for c in df.columns]
    df = df[df["Language"] == language].copy()

    def _build_text(row):
        return pack_context(
            row.get("Previous", ""),
            row.get("Target", ""),
            row.get("Next", ""),
            row.get("MWE", ""),
        )

    df["text"] = df.apply(_build_text, axis=1)
    return df

# =============================================================================
# Dataset, model, dataloaders
# =============================================================================

class IdiomDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 256, with_labels: bool = True):
        self.texts = df["text"].tolist()
        self.with_labels = with_labels
        self.labels = df["Label"].tolist() if with_labels and "Label" in df.columns else None
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        text = self.texts[idx]
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
        }
        if self.with_labels and self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def build_model_and_tokenizer(move_to_device: bool = True):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Add MWE markers if not present
    special_tokens = {"additional_special_tokens": ["<mwe>", "</mwe>"]}
    tokenizer.add_special_tokens(special_tokens)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
    )

    # Resize embeddings for new special tokens
    model.resize_token_embeddings(len(tokenizer))

    if move_to_device:
        model.to(device)

    return model, tokenizer

def make_loader(df: pd.DataFrame, tokenizer, batch_size: int, shuffle: bool, max_length: int = 256, with_labels: bool = True):
    ds = IdiomDataset(df, tokenizer, max_length=max_length, with_labels=with_labels)
    return torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2 if device.type == "cuda" else 0,
    )

# =============================================================================
# Training & evaluation functions
# =============================================================================

def run_epoch(
    model,
    dataloader,
    optimizer,
    scheduler,
    train: bool = True,
    class_weights: torch.Tensor = None,
) -> Tuple[float, float]:
    """
    Run one epoch. Returns (avg_loss, macro_f1).
    If train=False, no optimizer.step / scheduler.step is done.
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    all_y_true = []
    all_y_pred = []

    if class_weights is not None:
        class_weights = class_weights.to(device)

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.set_grad_enabled(train):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            logits = outputs.logits
            if class_weights is not None:
                log_probs = torch.log_softmax(logits, dim=-1)
                loss = torch.nn.functional.nll_loss(
                    log_probs,
                    labels,
                    weight=class_weights,
                )
            else:
                loss = outputs.loss

            if train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

        total_loss += loss.item() * labels.size(0)

        preds = torch.argmax(logits, dim=-1)
        all_y_true.extend(labels.detach().cpu().tolist())
        all_y_pred.extend(preds.detach().cpu().tolist())

    avg_loss = total_loss / max(1, len(all_y_true))
    macro_f1 = f1_score(all_y_true, all_y_pred, average="macro")
    return avg_loss, macro_f1

def predict(model, dataloader) -> Tuple[List[int], List[int]]:
    model.eval()
    all_y_true = []
    all_y_pred = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_y_true.extend(labels.detach().cpu().tolist())
            all_y_pred.extend(preds.detach().cpu().tolist())

    return all_y_true, all_y_pred

def predict_no_labels(model, dataloader) -> List[int]:
    model.eval()
    all_y_pred = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_y_pred.extend(preds.detach().cpu().tolist())

    return all_y_pred


Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb


  from .autonotebook import tqdm as notebook_tqdm


BATCH_SIZE=16, NUM_EPOCHS=3, LR=2e-05


## EN→PT ZERO-SHOT: train on EN train_zero_shot, test on PT

In [2]:
# 02 - EN→PT ZERO-SHOT: train on EN train_zero_shot, test on PT

print("=" * 80)
print("EN→PT XLM-R (ZERO-SHOT SETTING: train_zero_shot.csv for EN)")
print("=" * 80)

# ---- Prepare EN train/dev (zero-shot setting) ----
train_en_0s, dev_en_0s = prepare_supervised_frame(
    train_path=TRAIN_ZERO_SHOT,
    dev_path=DEV,
    dev_gold_path=DEV_GOLD,
    language="EN",
)

print(f"EN (zero-shot) train size: {len(train_en_0s)}, EN dev size: {len(dev_en_0s)}")

# Class weights for EN zero-shot training
label_counts_0s = train_en_0s["Label"].value_counts().to_dict()
print("EN (zero-shot) train label counts:", label_counts_0s)
total_0s = sum(label_counts_0s.values())
class_weights_0s = [total_0s / (2.0 * label_counts_0s.get(i, 1)) for i in range(NUM_LABELS)]
class_weights_0s_tensor = torch.tensor(class_weights_0s, dtype=torch.float)
print("EN (zero-shot) class weights:", class_weights_0s)

# ---- Build model & tokenizer ----
model_0s, tok_0s = build_model_and_tokenizer(move_to_device=True)

# ---- EN dataloaders ----
train_loader_en_0s = make_loader(
    train_en_0s,
    tok_0s,
    batch_size=BATCH_SIZE,
    shuffle=True,
    max_length=256,
    with_labels=True,
)
dev_loader_en_0s = make_loader(
    dev_en_0s,
    tok_0s,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=True,
)

# ---- Optimizer & scheduler ----
num_training_steps_0s = NUM_EPOCHS * len(train_loader_en_0s)
optimizer_0s = torch.optim.AdamW(
    model_0s.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
scheduler_0s = get_linear_schedule_with_warmup(
    optimizer_0s,
    num_warmup_steps=int(WARMUP_RATIO * num_training_steps_0s),
    num_training_steps=num_training_steps_0s,
)

best_en_dev_f1_0s = -1.0
best_path_en_0s = OUT_DIR / "xlmr_en_zero_shot_best.pt"

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{NUM_EPOCHS} (EN zero-shot training) ===")
    train_loss_0s, train_f1_0s = run_epoch(
        model_0s,
        train_loader_en_0s,
        optimizer_0s,
        scheduler_0s,
        train=True,
        class_weights=class_weights_0s_tensor,
    )
    print(f"[EN zero-shot train] loss={train_loss_0s:.4f} macro-F1={train_f1_0s:.4f}")

    dev_loss_0s, dev_f1_0s = run_epoch(
        model_0s,
        dev_loader_en_0s,
        optimizer=None,
        scheduler=None,
        train=False,
        class_weights=None,
    )
    print(f"[EN zero-shot dev]   loss={dev_loss_0s:.4f} macro-F1={dev_f1_0s:.4f}")

    if dev_f1_0s > best_en_dev_f1_0s:
        best_en_dev_f1_0s = dev_f1_0s
        torch.save(model_0s.state_dict(), best_path_en_0s)
        print(f"New best EN dev macro-F1={best_en_dev_f1_0s:.4f} -> saved to {best_path_en_0s}")

print(f"\nBest EN dev macro-F1 (zero-shot): {best_en_dev_f1_0s:.4f}")

# ---- Reload best EN zero-shot checkpoint ----
model_0s_xfer, tok_0s_xfer = build_model_and_tokenizer(move_to_device=True)
state_0s = torch.load(best_path_en_0s, map_location=device)
model_0s_xfer.load_state_dict(state_0s)
model_0s_xfer.to(device)
model_0s_xfer.eval()
print(f"Loaded EN zero-shot checkpoint from: {best_path_en_0s}")

# ---- Prepare PT dev + eval ----
_, dev_pt_0s = prepare_supervised_frame(
    train_path=TRAIN_ZERO_SHOT,  # unused here; function needs it but we ignore train
    dev_path=DEV,
    dev_gold_path=DEV_GOLD,
    language="PT",
)
eval_pt_0s = prepare_eval_frame(EVAL, language="PT")

print(f"PT dev size: {len(dev_pt_0s)}, PT eval size: {len(eval_pt_0s)}")

dev_loader_pt_0s = make_loader(
    dev_pt_0s,
    tok_0s_xfer,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=True,
)
eval_loader_pt_0s = make_loader(
    eval_pt_0s,
    tok_0s_xfer,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=False,
)

# ---- Evaluate EN-trained-zero-shot model on PT dev ----
ytrue_pt_0s, ypred_pt_0s = predict(model_0s_xfer, dev_loader_pt_0s)
pt_dev_f1_0s = f1_score(ytrue_pt_0s, ypred_pt_0s, average="macro")
print(f"\n[EN-trained XLM-R (zero-shot) -> PT dev] macro-F1={pt_dev_f1_0s:.4f}")
print(classification_report(ytrue_pt_0s, ypred_pt_0s, digits=4))
print(confusion_matrix(ytrue_pt_0s, ypred_pt_0s))

# ---- Predict on PT eval ----
eval_preds_pt_0s = predict_no_labels(model_0s_xfer, eval_loader_pt_0s)

sub_pt_0s = pd.DataFrame({
    "ID": eval_pt_0s["ID"].astype(str),
    "Language": eval_pt_0s["Language"],
    "Setting": ["en_trained_zero_shot"] * len(eval_pt_0s),
    "Label": eval_preds_pt_0s,
})
sub_pt_path_0s = OUT_DIR / "eval_submission_pt_xlmr_en_zero_shot.csv"
sub_pt_0s.to_csv(sub_pt_path_0s, index=False)
print(f"\nWrote PT eval submission (EN zero-shot) to: {sub_pt_path_0s}")


EN→PT XLM-R (ZERO-SHOT SETTING: train_zero_shot.csv for EN)
EN (zero-shot) train size: 3327, EN dev size: 466
EN (zero-shot) train label counts: {0: 1762, 1: 1565}
EN (zero-shot) class weights: [0.9440976163450624, 1.0629392971246006]


2025-11-26 20:30:29.552979: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-26 20:30:39.863389: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764217841.258534 4132549 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764217842.128042 4132549 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764217845.533746 4132549 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 


=== Epoch 1/3 (EN zero-shot training) ===
[EN zero-shot train] loss=0.5996 macro-F1=0.6544
[EN zero-shot dev]   loss=0.7342 macro-F1=0.7429
New best EN dev macro-F1=0.7429 -> saved to outputs_pt_xlmr_en/xlmr_en_zero_shot_best.pt

=== Epoch 2/3 (EN zero-shot training) ===
[EN zero-shot train] loss=0.1871 macro-F1=0.9478
[EN zero-shot dev]   loss=0.9633 macro-F1=0.7972
New best EN dev macro-F1=0.7972 -> saved to outputs_pt_xlmr_en/xlmr_en_zero_shot_best.pt

=== Epoch 3/3 (EN zero-shot training) ===
[EN zero-shot train] loss=0.0737 macro-F1=0.9810
[EN zero-shot dev]   loss=1.1441 macro-F1=0.7975
New best EN dev macro-F1=0.7975 -> saved to outputs_pt_xlmr_en/xlmr_en_zero_shot_best.pt

Best EN dev macro-F1 (zero-shot): 0.7975


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded EN zero-shot checkpoint from: outputs_pt_xlmr_en/xlmr_en_zero_shot_best.pt
PT dev size: 273, PT eval size: 279

[EN-trained XLM-R (zero-shot) -> PT dev] macro-F1=0.6470
              precision    recall  f1-score   support

           0     0.7132    0.6299    0.6690       154
           1     0.5839    0.6723    0.6250       119

    accuracy                         0.6484       273
   macro avg     0.6486    0.6511    0.6470       273
weighted avg     0.6569    0.6484    0.6498       273

[[97 57]
 [39 80]]

Wrote PT eval submission (EN zero-shot) to: outputs_pt_xlmr_en/eval_submission_pt_xlmr_en_zero_shot.csv


## EN→PT ONE-SHOT: train on EN train_one_shot, test on PT

In [None]:
# 03 - EN→PT ONE-SHOT: train on EN train_one_shot, test on PT

print("=" * 80)
print("EN→PT XLM-R (ONE-SHOT SETTING: train_one_shot.csv for EN)")
print("=" * 80)

# ---- Prepare EN train/dev (one-shot setting) ----
train_en_1s, dev_en_1s = prepare_supervised_frame(
    train_path=TRAIN_ONE_SHOT,
    dev_path=DEV,
    dev_gold_path=DEV_GOLD,
    language="EN",
)

print(f"EN (one-shot) train size: {len(train_en_1s)}, EN dev size: {len(dev_en_1s)}")

# Class weights for EN one-shot training
label_counts_1s = train_en_1s["Label"].value_counts().to_dict()
print("EN (one-shot) train label counts:", label_counts_1s)
total_1s = sum(label_counts_1s.values())
class_weights_1s = [total_1s / (2.0 * label_counts_1s.get(i, 1)) for i in range(NUM_LABELS)]
class_weights_1s_tensor = torch.tensor(class_weights_1s, dtype=torch.float)
print("EN (one-shot) class weights:", class_weights_1s)

# ---- Build model & tokenizer ----
model_1s, tok_1s = build_model_and_tokenizer(move_to_device=True)

# ---- EN dataloaders ----
train_loader_en_1s = make_loader(
    train_en_1s,
    tok_1s,
    batch_size=BATCH_SIZE,
    shuffle=True,
    max_length=256,
    with_labels=True,
)
dev_loader_en_1s = make_loader(
    dev_en_1s,
    tok_1s,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=True,
)

# ---- Optimizer & scheduler ----
num_training_steps_1s = NUM_EPOCHS * len(train_loader_en_1s)
optimizer_1s = torch.optim.AdamW(
    model_1s.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
scheduler_1s = get_linear_schedule_with_warmup(
    optimizer_1s,
    num_warmup_steps=int(WARMUP_RATIO * num_training_steps_1s),
    num_training_steps=num_training_steps_1s,
)

best_en_dev_f1_1s = -1.0
best_path_en_1s = OUT_DIR / "xlmr_en_one_shot_best.pt"

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{NUM_EPOCHS} (EN one-shot training) ===")
    train_loss_1s, train_f1_1s = run_epoch(
        model_1s,
        train_loader_en_1s,
        optimizer_1s,
        scheduler_1s,
        train=True,
        class_weights=class_weights_1s_tensor,
    )
    print(f"[EN one-shot train] loss={train_loss_1s:.4f} macro-F1={train_f1_1s:.4f}")

    dev_loss_1s, dev_f1_1s = run_epoch(
        model_1s,
        dev_loader_en_1s,
        optimizer=None,
        scheduler=None,
        train=False,
        class_weights=None,
    )
    print(f"[EN one-shot dev]   loss={dev_loss_1s:.4f} macro-F1={dev_f1_1s:.4f}")

    if dev_f1_1s > best_en_dev_f1_1s:
        best_en_dev_f1_1s = dev_f1_1s
        torch.save(model_1s.state_dict(), best_path_en_1s)
        print(f"New best EN dev macro-F1={best_en_dev_f1_1s:.4f} -> saved to {best_path_en_1s}")

print(f"\nBest EN dev macro-F1 (one-shot): {best_en_dev_f1_1s:.4f}")

# ---- Reload best EN one-shot checkpoint ----
model_1s_xfer, tok_1s_xfer = build_model_and_tokenizer(move_to_device=True)
state_1s = torch.load(best_path_en_1s, map_location=device)
model_1s_xfer.load_state_dict(state_1s)
model_1s_xfer.to(device)
model_1s_xfer.eval()
print(f"Loaded EN one-shot checkpoint from: {best_path_en_1s}")

# ---- Prepare PT dev + eval ----
_, dev_pt_1s = prepare_supervised_frame(
    train_path=TRAIN_ONE_SHOT,  # unused; function signature
    dev_path=DEV,
    dev_gold_path=DEV_GOLD,
    language="PT",
)
eval_pt_1s = prepare_eval_frame(EVAL, language="PT")

print(f"PT dev size: {len(dev_pt_1s)}, PT eval size: {len(eval_pt_1s)}")

dev_loader_pt_1s = make_loader(
    dev_pt_1s,
    tok_1s_xfer,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=True,
)
eval_loader_pt_1s = make_loader(
    eval_pt_1s,
    tok_1s_xfer,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=False,
)

# ---- Evaluate EN-trained-one-shot model on PT dev ----
ytrue_pt_1s, ypred_pt_1s = predict(model_1s_xfer, dev_loader_pt_1s)
pt_dev_f1_1s = f1_score(ytrue_pt_1s, ypred_pt_1s, average="macro")
print(f"\n[EN-trained XLM-R (one-shot) -> PT dev] macro-F1={pt_dev_f1_1s:.4f}")
print(classification_report(ytrue_pt_1s, ypred_pt_1s, digits=4))
print(confusion_matrix(ytrue_pt_1s, ypred_pt_1s))

# ---- Predict on PT eval ----
eval_preds_pt_1s = predict_no_labels(model_1s_xfer, eval_loader_pt_1s)

sub_pt_1s = pd.DataFrame({
    "ID": eval_pt_1s["ID"].astype(str),
    "Language": eval_pt_1s["Language"],
    "Setting": ["en_trained_one_shot"] * len(eval_pt_1s),
    "Label": eval_preds_pt_1s,
})
sub_pt_path_1s = OUT_DIR / "eval_submission_pt_xlmr_en_one_shot.csv"
sub_pt_1s.to_csv(sub_pt_path_1s, index=False)
print(f"\nWrote PT eval submission (EN one-shot) to: {sub_pt_path_1s}")


EN→PT XLM-R (ONE-SHOT SETTING: train_one_shot.csv for EN)
EN (one-shot) train size: 87, EN dev size: 466
EN (one-shot) train label counts: {1: 55, 0: 32}
EN (one-shot) class weights: [1.359375, 0.7909090909090909]


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Epoch 1/3 (EN one-shot training) ===
[EN one-shot train] loss=0.7264 macro-F1=0.3830
[EN one-shot dev]   loss=0.6682 macro-F1=0.3787
New best EN dev macro-F1=0.3787 -> saved to outputs_pt_xlmr_en/xlmr_en_one_shot_best.pt

=== Epoch 2/3 (EN one-shot training) ===
[EN one-shot train] loss=0.7150 macro-F1=0.4285
[EN one-shot dev]   loss=0.6686 macro-F1=0.3787

=== Epoch 3/3 (EN one-shot training) ===
[EN one-shot train] loss=0.6974 macro-F1=0.4755
[EN one-shot dev]   loss=0.6685 macro-F1=0.3787

Best EN dev macro-F1 (one-shot): 0.3787


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded EN one-shot checkpoint from: outputs_pt_xlmr_en/xlmr_en_one_shot_best.pt
PT dev size: 273, PT eval size: 279

[EN-trained XLM-R (one-shot) -> PT dev] macro-F1=0.3036
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       154
           1     0.4359    1.0000    0.6071       119

    accuracy                         0.4359       273
   macro avg     0.2179    0.5000    0.3036       273
weighted avg     0.1900    0.4359    0.2647       273

[[  0 154]
 [  0 119]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Wrote PT eval submission (EN one-shot) to: outputs_pt_xlmr_en/eval_submission_pt_xlmr_en_one_shot.csv


: 