## Shared setup: device, imports, paths, utilities

In [2]:
# 01 - Shared setup: device, imports, paths, utilities

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple

import torch

# ---- Device selection ----
# Requested device: "gpu" or "cpu". A "gpu" request falls back to the CPU
# branch below when CUDA is not available.
RUN_DEVICE = "gpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    device = torch.device("cuda")
    # NOTE(review): cuDNN benchmark mode may select different kernels from
    # run to run, which can weaken the reproducibility the seeds below aim
    # for -- confirm this trade-off is intended.
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    # os.cpu_count() returns None when the core count cannot be determined;
    # treat that case as a single core instead of failing on `None // 2`.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
    print("Using CPU")

# ---- Seeds ----
# One fixed seed shared by PyTorch, NumPy and Python's `random` module.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if device.type == "cuda":
    # Seed the CUDA generators as well when training on the GPU.
    torch.cuda.manual_seed_all(SEED)

# ---- Metrics ----
from sklearn.metrics import (
    f1_score,
    recall_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# ---- HF transformers ----
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# ---- Data paths (SemEval SubTaskA for EN training) ----
# Every SemEval CSV lives in the "Data" folder below DATA_DIR.
DATA_DIR = Path("SemEval_2022_Task2-idiomaticity/SubTaskA")
TRAIN_ONE_SHOT = DATA_DIR.joinpath("Data", "train_one_shot.csv")
TRAIN_ZERO_SHOT = DATA_DIR.joinpath("Data", "train_zero_shot.csv")
DEV = DATA_DIR.joinpath("Data", "dev.csv")
DEV_GOLD = DATA_DIR.joinpath("Data", "dev_gold.csv")
EVAL = DATA_DIR.joinpath("Data", "eval.csv")  # not used here, but kept for completeness

# ---- LRL data: single CSV with all low-resource languages ----
LRL_CSV = Path("lrl_idioms.csv")  # columns: ID,Language,MWE,Previous,Target,Next,Label

# ---- Outputs ----
# Created eagerly so later cells can write checkpoints and CSVs into it.
OUT_DIR = Path("outputs_lrl_xlmr_en")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# ---- Model & training config ----
MODEL_NAME = "xlm-roberta-base"
NUM_LABELS = 2

NUM_EPOCHS = 3
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

# The batch size depends on the device selected during setup.
BATCH_SIZE_GPU = 16
BATCH_SIZE_CPU = 8
if device.type == "cuda":
    BATCH_SIZE = BATCH_SIZE_GPU
else:
    BATCH_SIZE = BATCH_SIZE_CPU

print(f"BATCH_SIZE={BATCH_SIZE}, NUM_EPOCHS={NUM_EPOCHS}, LR={LR}")


Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb


  from .autonotebook import tqdm as notebook_tqdm


BATCH_SIZE=16, NUM_EPOCHS=3, LR=2e-05


## Data loading & context utilities (SemEval EN + LRL)

In [3]:
# 02 - Data loading & context utilities (SemEval EN + LRL)

def load_any_csv(path: Path) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame whose columns are all strings.

    ``sep=None`` together with the Python parser engine asks pandas to detect
    the delimiter, and ``dtype=str`` keeps every value as text so nothing is
    converted on load.
    """
    reader_options = {"sep": None, "engine": "python", "dtype": str}
    return pd.read_csv(path, **reader_options)

def ensure_label_int(df: pd.DataFrame, col="Label") -> pd.DataFrame:
    """Cast column ``col`` of ``df`` to integers when that column exists.

    The converted column is written back into ``df`` itself, and the same
    frame object is returned. A frame without ``col`` is returned untouched.
    """
    if col not in df.columns:
        return df
    as_int = df[col].astype(int)
    df[col] = as_int
    return df

def mark_first_case_insensitive(text: str, needle: str, ltag="<mwe>", rtag="</mwe>") -> str:
    """Wrap the first case-insensitive occurrence of ``needle`` in ``text`` with tags.

    Args:
        text: String to search. Non-string values are returned unchanged.
        needle: Substring to look for, ignoring case. If it is not a string
            or is empty, ``text`` is returned unchanged.
        ltag: Marker inserted immediately before the match.
        rtag: Marker inserted immediately after the match.

    Returns:
        ``text`` with ``ltag``/``rtag`` placed around the first match, keeping
        the original casing of the matched span; ``text`` itself when there
        is nothing to mark.
    """
    import re  # function-scope import keeps this definition self-contained

    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    if not needle:
        # An empty needle would "match" at offset 0 and insert an empty tag pair.
        return text

    # Search the original string directly. Lowercasing a copy and reusing its
    # offsets is wrong whenever lowercasing changes the string length
    # (e.g. "İ".lower() is two code points long).
    hit = re.search(re.escape(needle), text, flags=re.IGNORECASE)
    if hit is None:
        return text
    start, end = hit.span()
    return text[:start] + ltag + text[start:end] + rtag + text[end:]

def pack_context(prev: str, target: str, nxt: str, mwe: str) -> str:
    """Build the three-line model input for one example.

    Missing (NaN/None) sentences are replaced by empty strings, the first
    occurrence of ``mwe`` inside the target sentence is tagged via
    ``mark_first_case_insensitive``, and the pieces are joined under the
    ``Previous:`` / ``Target:`` / ``Next:`` labels.
    """
    if pd.isna(prev):
        prev = ""
    if pd.isna(target):
        target = ""
    if pd.isna(nxt):
        nxt = ""

    tgt_marked = mark_first_case_insensitive(target, mwe)
    return f"Previous: {prev}\nTarget: {tgt_marked}\nNext: {nxt}"

def prepare_supervised_frame(
    train_path: Path,
    dev_path: Path,
    dev_gold_path: Path,
    language: str,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Prepare train/dev frames for a given language from SemEval.

    Steps:
    - Load the three CSV files and strip whitespace from their column names
    - Keep only the rows whose 'Language' column equals `language`
    - Attach the gold 'Label' to each dev row by joining on 'ID'
    - Cast 'Label' to int and build the 'text' column using pack_context

    Args:
        train_path: CSV with the labeled training examples.
        dev_path: CSV with the dev examples.
        dev_gold_path: CSV holding 'ID', 'Language' and 'Label' for dev.
        language: Value of the 'Language' column to keep (e.g. "EN").

    Returns:
        (train_df, dev_lab): both frames carry an int 'Label' column and a
        'text' column.

    Raises:
        ValueError: if at least one dev row of `language` has no gold label.
    """
    train_df = load_any_csv(train_path)
    dev_df = load_any_csv(dev_path)
    gold_df = load_any_csv(dev_gold_path)

    # Strip column names
    for frame in (train_df, dev_df, gold_df):
        frame.columns = [c.strip() for c in frame.columns]

    # Filter by language
    train_df = train_df[train_df["Language"] == language].copy()
    dev_df = dev_df[dev_df["Language"] == language].copy()

    # Merge dev with gold labels
    gold = gold_df[gold_df["Language"] == language][["ID", "Label"]].copy()
    gold["ID"] = gold["ID"].astype(str)
    dev_df["ID"] = dev_df["ID"].astype(str)
    # A 'Label' column already present in the dev file would collide with the
    # gold one and leave the merged frame with 'Label_x'/'Label_y' only.
    dev_df = dev_df.drop(columns=["Label"], errors="ignore")
    dev_lab = dev_df.merge(gold, on="ID", how="left")

    # The left join keeps dev rows without a gold match; report them clearly
    # instead of letting the integer cast below fail on NaN.
    unlabeled = dev_lab["Label"].isna()
    if unlabeled.any():
        raise ValueError(
            f"{int(unlabeled.sum())} dev row(s) for language {language!r} "
            f"have no gold label in {dev_gold_path}"
        )

    dev_lab = ensure_label_int(dev_lab, "Label")
    train_df = ensure_label_int(train_df, "Label")

    # Build text column
    def _build_text(row):
        return pack_context(
            row.get("Previous", ""),
            row.get("Target", ""),
            row.get("Next", ""),
            row.get("MWE", ""),
        )

    train_df["text"] = train_df.apply(_build_text, axis=1)
    dev_lab["text"] = dev_lab.apply(_build_text, axis=1)

    return train_df, dev_lab

def load_lrl_frame(lrl_path: Path) -> pd.DataFrame:
    """
    Read the LRL dataset (lrl_idioms.csv) and add the model-input 'text' column.
    Columns expected: ID, Language, MWE, Previous, Target, Next, Label

    Column names are stripped of surrounding whitespace and 'Label' is cast
    to int before the 'text' column is built row by row with pack_context.
    """
    frame = load_any_csv(lrl_path)
    frame.columns = [name.strip() for name in frame.columns]
    frame = ensure_label_int(frame, "Label")

    frame["text"] = frame.apply(
        lambda row: pack_context(
            row.get("Previous", ""),
            row.get("Target", ""),
            row.get("Next", ""),
            row.get("MWE", ""),
        ),
        axis=1,
    )
    return frame

# ---- Load LRL data once, get list of languages ----
# The frame and the language list are reused by the evaluation cells below.
lrl_df = load_lrl_frame(LRL_CSV)
print("Loaded LRL dataset (all languages):")
print(lrl_df.head())
print(f"Total LRL examples: {len(lrl_df)}")

# Distinct, non-missing language codes in alphabetical order.
lrl_languages = sorted(set(lrl_df["Language"].dropna()))
print("\nLRL languages found:", lrl_languages)

if not lrl_languages:
    raise ValueError("No languages found in lrl_idioms.csv (Language column empty?).")


Loaded LRL dataset (all languages):
     ID Language                          MWE  \
0  bn_1       BN                   বৃষ্টির জল   
1  bn_2       BN  সংসার সুখের হয় রমণীর গুণে ।   
2  bn_3       BN                 যাচ্ছা তাই ।   
3  bn_4       BN         উলুবনে মুক্ত ছড়ানো ।   
4  bn_5       BN                     নিজের ঘর   

                                            Previous  \
0                 আকাশে অনেকক্ষণ ধরে কালো মেঘ জমছিল।   
1  ছোট বাসা, অল্প রোজগার—সব মিলিয়ে খুব অভাবের জীব...   
2  পাড়ার ক্লাবের অনুষ্ঠানের দায়িত্ব যাঁর হাতে দেও...   
3  প্রফেসর গভীর মনোযোগ দিয়ে কঠিন তত্ত্বগুলো বুঝিয়...   
4   সারাদিন বাইরে থাকায় সে ভীষণ ক্লান্ত হয়ে গিয়েছিল।   

                                              Target  \
0           হঠাৎ টুপটাপ করে বৃষ্টির জল পড়া শুরু হলো।   
1  তবু হাসিমুখে সব সামলে রাখত তারা; লোকজন বলত, সং...   
2  ফলে মঞ্চ, সাউন্ড, আলো—সব মিলিয়ে শেষ দিন একেবার...   
3  কিন্তু ছাত্ররা মোবাইলে ব্যস্ত, কেউ শুনছেই না—এ...   
4    বাড়ি ফিরে সে সোজা নিজের ঘরে গিয়ে দরজা বন্ধ করল।

## Dataset, model, dataloaders

In [4]:
# 03 - Dataset, model, dataloaders

class IdiomDataset(torch.utils.data.Dataset):
    """Map-style dataset over the 'text' (and optional 'Label') columns of a frame.

    Texts are tokenized on access, one example at a time. The tokenizer is
    asked to pad and truncate to ``max_length`` and to return PyTorch tensors.
    Labels are served only when ``with_labels`` is true and the frame has a
    'Label' column.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 256, with_labels: bool = True):
        self.texts = df["text"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.with_labels = with_labels
        labels_available = with_labels and "Label" in df.columns
        self.labels = df["Label"].tolist() if labels_available else None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the leading batch dimension the tokenizer adds for a single text.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        if self.with_labels and self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def build_model_and_tokenizer(move_to_device: bool = True):
    """Create a fresh MODEL_NAME classifier and its tokenizer.

    The ``<mwe>`` / ``</mwe>`` markers are registered as additional special
    tokens, and the model's token embeddings are resized to the tokenizer's
    new length.

    Args:
        move_to_device: When true, the model is moved to the global ``device``.

    Returns:
        (model, tokenizer)
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.add_special_tokens({"additional_special_tokens": ["<mwe>", "</mwe>"]})

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
    )
    # The embedding table must cover the two marker tokens added above.
    model.resize_token_embeddings(len(tokenizer))

    if not move_to_device:
        return model, tokenizer

    model.to(device)
    return model, tokenizer

def make_loader(df: pd.DataFrame, tokenizer, batch_size: int, shuffle: bool,
                max_length: int = 256, with_labels: bool = True):
    """Wrap ``df`` in an IdiomDataset and return a DataLoader over it.

    Two worker processes are used when the global ``device`` is CUDA;
    otherwise loading happens in the main process (``num_workers=0``).
    """
    dataset = IdiomDataset(df, tokenizer, max_length=max_length, with_labels=with_labels)
    worker_count = 2 if device.type == "cuda" else 0
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=worker_count,
    )


## Training & evaluation helpers

In [5]:
# 04 - Training & evaluation functions

def run_epoch(
    model,
    dataloader,
    optimizer,
    scheduler,
    train: bool = True,
    class_weights: torch.Tensor = None,
) -> Tuple[float, float]:
    """
    Make one full pass over `dataloader`. Returns (avg_loss, macro_f1).

    With train=True the model is put in training mode and, for every batch,
    gradients are computed, clipped to a norm of 1.0 and applied; the
    scheduler (if given) is stepped once per batch. With train=False the
    model is put in eval mode and no optimizer.step / scheduler.step is done.

    If `class_weights` is given, the loss is a class-weighted negative
    log-likelihood over the logits; otherwise the loss returned by the model
    is used. avg_loss weights each batch loss by its batch size.
    """
    if not train:
        model.eval()
    else:
        model.train()

    weights = class_weights.to(device) if class_weights is not None else None

    loss_sum = 0.0
    gold = []
    predicted = []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Gradients are tracked only while training.
        with torch.set_grad_enabled(train):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            logits = outputs.logits

            if weights is None:
                loss = outputs.loss
            else:
                loss = torch.nn.functional.nll_loss(
                    torch.log_softmax(logits, dim=-1),
                    labels,
                    weight=weights,
                )

            if train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size

        gold.extend(labels.detach().cpu().tolist())
        predicted.extend(torch.argmax(logits, dim=-1).detach().cpu().tolist())

    # max(1, ...) avoids a division by zero for an empty dataloader.
    avg_loss = loss_sum / max(1, len(gold))
    macro_f1 = f1_score(gold, predicted, average="macro")
    return avg_loss, macro_f1

def predict(model, dataloader) -> Tuple[List[int], List[int]]:
    """Run the model over a labeled dataloader without gradient tracking.

    Returns (y_true, y_pred): the labels found in the batches and the argmax
    class of the model's logits, both as flat Python lists in batch order.
    """
    gold_labels = []
    predicted_labels = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            ).logits

            gold_labels += labels.detach().cpu().tolist()
            predicted_labels += logits.argmax(dim=-1).detach().cpu().tolist()

    return gold_labels, predicted_labels

def predict_no_labels(model, dataloader) -> List[int]:
    """Run the model over an unlabeled dataloader without gradient tracking.

    Returns the argmax class of the model's logits for every example, as a
    flat Python list in batch order. Batches only need 'input_ids' and
    'attention_mask'.
    """
    predicted_labels = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            ).logits

            predicted_labels += logits.argmax(dim=-1).detach().cpu().tolist()

    return predicted_labels


## EN→LRL ZERO-SHOT: train on EN train_zero_shot, test on LRL

In [6]:
# 05 - EN→LRL ZERO-SHOT: train on EN train_zero_shot, test on LRL

print("=" * 80)
print("EN→LRL XLM-R (ZERO-SHOT SETTING: train_zero_shot.csv for EN)")
print("=" * 80)

# ---- Prepare EN train/dev (zero-shot setting) ----
# Train rows come from train_zero_shot.csv; dev rows get their labels from
# dev_gold.csv. Both frames are restricted to Language == "EN".
train_en_0s, dev_en_0s = prepare_supervised_frame(
    train_path=TRAIN_ZERO_SHOT,
    dev_path=DEV,
    dev_gold_path=DEV_GOLD,
    language="EN",
)

print(f"EN (zero-shot) train size: {len(train_en_0s)}, EN dev size: {len(dev_en_0s)}")

# Class weights for EN zero-shot training
# Balanced weighting: weight_c = n_samples / (n_classes * count_c). The class
# count comes from NUM_LABELS rather than a hard-coded 2 so the formula stays
# right if NUM_LABELS changes. A class absent from the training data is
# treated as having one example to avoid a division by zero.
label_counts_0s = train_en_0s["Label"].value_counts().to_dict()
print("EN (zero-shot) train label counts:", label_counts_0s)
total_0s = sum(label_counts_0s.values())
class_weights_0s = [
    total_0s / (NUM_LABELS * label_counts_0s.get(i, 1))
    for i in range(NUM_LABELS)
]
class_weights_0s_tensor = torch.tensor(class_weights_0s, dtype=torch.float)
print("EN (zero-shot) class weights:", class_weights_0s)

# ---- Build model & tokenizer ----
# Fresh pretrained encoder; the classification head is newly initialized
# (see the transformers message in this cell's output).
model_0s, tok_0s = build_model_and_tokenizer(move_to_device=True)

# ---- EN dataloaders ----
# Only the training loader shuffles; the dev loader keeps the frame order.
train_loader_en_0s = make_loader(
    train_en_0s,
    tok_0s,
    batch_size=BATCH_SIZE,
    shuffle=True,
    max_length=256,
    with_labels=True,
)
dev_loader_en_0s = make_loader(
    dev_en_0s,
    tok_0s,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=True,
)

# ---- Optimizer & scheduler ----
# run_epoch steps the scheduler once per batch, so the schedule length is
# epochs * batches-per-epoch; the first WARMUP_RATIO of those steps is warmup.
num_training_steps_0s = NUM_EPOCHS * len(train_loader_en_0s)
optimizer_0s = torch.optim.AdamW(
    model_0s.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
scheduler_0s = get_linear_schedule_with_warmup(
    optimizer_0s,
    num_warmup_steps=int(WARMUP_RATIO * num_training_steps_0s),
    num_training_steps=num_training_steps_0s,
)

# Model selection: keep the state dict of the epoch with the best EN dev
# macro-F1. Starting at -1.0 guarantees the first epoch is always saved.
best_en_dev_f1_0s = -1.0
best_path_en_0s = OUT_DIR / "xlmr_en_zero_shot_best.pt"

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{NUM_EPOCHS} (EN zero-shot training) ===")
    # Training pass: class-weighted loss, optimizer and scheduler active.
    train_loss_0s, train_f1_0s = run_epoch(
        model_0s,
        train_loader_en_0s,
        optimizer_0s,
        scheduler_0s,
        train=True,
        class_weights=class_weights_0s_tensor,
    )
    print(f"[EN zero-shot train] loss={train_loss_0s:.4f} macro-F1={train_f1_0s:.4f}")

    # Dev pass: no parameter updates. The loss here is the model's own
    # unweighted loss, so it is not directly comparable to the weighted
    # training loss printed above.
    dev_loss_0s, dev_f1_0s = run_epoch(
        model_0s,
        dev_loader_en_0s,
        optimizer=None,
        scheduler=None,
        train=False,
        class_weights=None,
    )
    print(f"[EN zero-shot dev]   loss={dev_loss_0s:.4f} macro-F1={dev_f1_0s:.4f}")

    # Strict '>' means a tie keeps the checkpoint from the earlier epoch.
    if dev_f1_0s > best_en_dev_f1_0s:
        best_en_dev_f1_0s = dev_f1_0s
        torch.save(model_0s.state_dict(), best_path_en_0s)
        print(f"New best EN dev macro-F1={best_en_dev_f1_0s:.4f} -> saved to {best_path_en_0s}")

print(f"\nBest EN dev macro-F1 (zero-shot): {best_en_dev_f1_0s:.4f}")

# ---- Reload best EN zero-shot checkpoint for transfer to LRL ----
# A second model instance is built and filled with the saved best weights;
# model_0s (the last-epoch weights) is left as it is.
model_0s_xfer, tok_0s_xfer = build_model_and_tokenizer(move_to_device=True)
state_0s = torch.load(best_path_en_0s, map_location=device)
model_0s_xfer.load_state_dict(state_0s)
# Already on `device` because of move_to_device=True above.
model_0s_xfer.to(device)
model_0s_xfer.eval()
print(f"Loaded EN zero-shot checkpoint from: {best_path_en_0s}")


EN→LRL XLM-R (ZERO-SHOT SETTING: train_zero_shot.csv for EN)
EN (zero-shot) train size: 3327, EN dev size: 466
EN (zero-shot) train label counts: {0: 1762, 1: 1565}
EN (zero-shot) class weights: [0.9440976163450624, 1.0629392971246006]


2025-12-03 19:30:12.878823: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-03 19:30:12.891142: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764819012.901686 2481545 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764819012.904401 2481545 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764819012.914509 2481545 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 


=== Epoch 1/3 (EN zero-shot training) ===
[EN zero-shot train] loss=0.6002 macro-F1=0.6508
[EN zero-shot dev]   loss=0.6785 macro-F1=0.7242
New best EN dev macro-F1=0.7242 -> saved to outputs_lrl_xlmr_en/xlmr_en_zero_shot_best.pt

=== Epoch 2/3 (EN zero-shot training) ===
[EN zero-shot train] loss=0.1728 macro-F1=0.9523
[EN zero-shot dev]   loss=1.0651 macro-F1=0.7782
New best EN dev macro-F1=0.7782 -> saved to outputs_lrl_xlmr_en/xlmr_en_zero_shot_best.pt

=== Epoch 3/3 (EN zero-shot training) ===
[EN zero-shot train] loss=0.0680 macro-F1=0.9840
[EN zero-shot dev]   loss=1.1448 macro-F1=0.7854
New best EN dev macro-F1=0.7854 -> saved to outputs_lrl_xlmr_en/xlmr_en_zero_shot_best.pt

Best EN dev macro-F1 (zero-shot): 0.7854


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded EN zero-shot checkpoint from: outputs_lrl_xlmr_en/xlmr_en_zero_shot_best.pt


## Evaluate EN zero-shot model on all LRL languages

In [8]:
# 06 - Evaluate EN-trained zero-shot model on all LRL languages

# One summary dict per LRL language; consumed by the summary cell.
zero_results = []

# NOTE(review): the recorded runs score below 0.5 macro-F1 on every LRL
# language; presumably worth checking that the LRL 'Label' values follow the
# same 0/1 convention as the SemEval training labels -- TODO confirm.
for lang in lrl_languages:
    print("\n" + "=" * 80)
    print(f"EN-trained XLM-R (zero-shot) -> LRL language: {lang}")
    print("=" * 80)

    df_lang = lrl_df[lrl_df["Language"] == lang].copy()
    if df_lang.empty:
        print(f"No rows for language {lang}, skipping.")
        continue

    # Build dataloader for this language (with labels)
    # shuffle=False keeps predictions aligned with the rows of df_lang.
    dev_loader_lang = make_loader(
        df_lang,
        tok_0s_xfer,
        batch_size=BATCH_SIZE,
        shuffle=False,
        max_length=256,
        with_labels=True,
    )

    ytrue, ypred = predict(model_0s_xfer, dev_loader_lang)

    # zero_division=0 yields the same values as the default ("warn" also
    # scores an undefined metric as 0) but without the UndefinedMetricWarning
    # noise when a class is never predicted.
    f1_macro = f1_score(ytrue, ypred, average="macro", zero_division=0)
    recall_macro = recall_score(ytrue, ypred, average="macro", zero_division=0)
    acc = accuracy_score(ytrue, ypred)

    print(f"[EN-trained zero-shot -> {lang}] Macro-F1={f1_macro:.4f}")
    print(f"[EN-trained zero-shot -> {lang}] Macro-recall={recall_macro:.4f}")
    print(f"[EN-trained zero-shot -> {lang}] Accuracy={acc:.4f}\n")

    print("Classification report:")
    print(classification_report(ytrue, ypred, digits=4, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(ytrue, ypred))

    # Save per-language predictions
    out_df = df_lang.copy()
    out_df["PredLabel_EN_ZeroShot"] = ypred
    pred_path = OUT_DIR / f"{lang}_xlmr_en_zero_shot_predictions.csv"
    out_df.to_csv(pred_path, index=False, encoding="utf-8")
    print(f"\nSaved zero-shot predictions for {lang} to: {pred_path}")

    zero_results.append({
        "Language": lang,
        "NumExamples": len(df_lang),
        "MacroF1_ZeroShot": f1_macro,
        "MacroRecall_ZeroShot": recall_macro,
        "Accuracy_ZeroShot": acc,
    })



EN-trained XLM-R (zero-shot) -> LRL language: BN
[EN-trained zero-shot -> BN] Macro-F1=0.3661
[EN-trained zero-shot -> BN] Macro-recall=0.3563
[EN-trained zero-shot -> BN] Accuracy=0.4762

Classification report:
              precision    recall  f1-score   support

           0     0.0877    0.1220    0.1020        41
           1     0.6757    0.5906    0.6303       127

    accuracy                         0.4762       168
   macro avg     0.3817    0.3563    0.3661       168
weighted avg     0.5322    0.4762    0.5013       168

Confusion matrix:
[[ 5 36]
 [52 75]]

Saved zero-shot predictions for BN to: outputs_lrl_xlmr_en/BN_xlmr_en_zero_shot_predictions.csv

EN-trained XLM-R (zero-shot) -> LRL language: ML
[EN-trained zero-shot -> ML] Macro-F1=0.2695
[EN-trained zero-shot -> ML] Macro-recall=0.2762
[EN-trained zero-shot -> ML] Accuracy=0.2800

Classification report:
              precision    recall  f1-score   support

           0     0.1379    0.2667    0.1818        15
    

## EN→LRL ONE-SHOT: train on EN train_one_shot, test on LRL

In [10]:
# 07 - EN→LRL ONE-SHOT: train on EN train_one_shot, test on LRL

print("=" * 80)
print("EN→LRL XLM-R (ONE-SHOT SETTING: train_one_shot.csv for EN)")
print("=" * 80)

# ---- Prepare EN train/dev (one-shot setting) ----
# Train rows come from train_one_shot.csv; dev rows get their labels from
# dev_gold.csv. Both frames are restricted to Language == "EN".
train_en_1s, dev_en_1s = prepare_supervised_frame(
    train_path=TRAIN_ONE_SHOT,
    dev_path=DEV,
    dev_gold_path=DEV_GOLD,
    language="EN",
)

print(f"EN (one-shot) train size: {len(train_en_1s)}, EN dev size: {len(dev_en_1s)}")

# Class weights for EN one-shot training
# Balanced weighting: weight_c = n_samples / (n_classes * count_c). The class
# count comes from NUM_LABELS rather than a hard-coded 2 so the formula stays
# right if NUM_LABELS changes. A class absent from the training data is
# treated as having one example to avoid a division by zero.
label_counts_1s = train_en_1s["Label"].value_counts().to_dict()
print("EN (one-shot) train label counts:", label_counts_1s)
total_1s = sum(label_counts_1s.values())
class_weights_1s = [
    total_1s / (NUM_LABELS * label_counts_1s.get(i, 1))
    for i in range(NUM_LABELS)
]
class_weights_1s_tensor = torch.tensor(class_weights_1s, dtype=torch.float)
print("EN (one-shot) class weights:", class_weights_1s)

# ---- Build model & tokenizer ----
# Fresh pretrained encoder; the classification head is newly initialized
# (see the transformers message in this cell's output).
model_1s, tok_1s = build_model_and_tokenizer(move_to_device=True)

# ---- EN dataloaders ----
# Only the training loader shuffles; the dev loader keeps the frame order.
train_loader_en_1s = make_loader(
    train_en_1s,
    tok_1s,
    batch_size=BATCH_SIZE,
    shuffle=True,
    max_length=256,
    with_labels=True,
)
dev_loader_en_1s = make_loader(
    dev_en_1s,
    tok_1s,
    batch_size=BATCH_SIZE,
    shuffle=False,
    max_length=256,
    with_labels=True,
)

# ---- Optimizer & scheduler ----
# run_epoch steps the scheduler once per batch, so the schedule length is
# epochs * batches-per-epoch; the first WARMUP_RATIO of those steps is warmup.
num_training_steps_1s = NUM_EPOCHS * len(train_loader_en_1s)
optimizer_1s = torch.optim.AdamW(
    model_1s.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
scheduler_1s = get_linear_schedule_with_warmup(
    optimizer_1s,
    num_warmup_steps=int(WARMUP_RATIO * num_training_steps_1s),
    num_training_steps=num_training_steps_1s,
)

# Model selection: keep the state dict of the epoch with the best EN dev
# macro-F1. Starting at -1.0 guarantees the first epoch is always saved.
best_en_dev_f1_1s = -1.0
best_path_en_1s = OUT_DIR / "xlmr_en_one_shot_best.pt"

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{NUM_EPOCHS} (EN one-shot training) ===")
    # Training pass: class-weighted loss, optimizer and scheduler active.
    train_loss_1s, train_f1_1s = run_epoch(
        model_1s,
        train_loader_en_1s,
        optimizer_1s,
        scheduler_1s,
        train=True,
        class_weights=class_weights_1s_tensor,
    )
    print(f"[EN one-shot train] loss={train_loss_1s:.4f} macro-F1={train_f1_1s:.4f}")

    # Dev pass: no parameter updates. The loss here is the model's own
    # unweighted loss, so it is not directly comparable to the weighted
    # training loss printed above.
    dev_loss_1s, dev_f1_1s = run_epoch(
        model_1s,
        dev_loader_en_1s,
        optimizer=None,
        scheduler=None,
        train=False,
        class_weights=None,
    )
    print(f"[EN one-shot dev]   loss={dev_loss_1s:.4f} macro-F1={dev_f1_1s:.4f}")

    # Strict '>' means a tie keeps the checkpoint from the earlier epoch.
    if dev_f1_1s > best_en_dev_f1_1s:
        best_en_dev_f1_1s = dev_f1_1s
        torch.save(model_1s.state_dict(), best_path_en_1s)
        print(f"New best EN dev macro-F1={best_en_dev_f1_1s:.4f} -> saved to {best_path_en_1s}")

print(f"\nBest EN dev macro-F1 (one-shot): {best_en_dev_f1_1s:.4f}")

# ---- Reload best EN one-shot checkpoint for transfer to LRL ----
# A second model instance is built and filled with the saved best weights;
# model_1s (the last-epoch weights) is left as it is.
model_1s_xfer, tok_1s_xfer = build_model_and_tokenizer(move_to_device=True)
state_1s = torch.load(best_path_en_1s, map_location=device)
model_1s_xfer.load_state_dict(state_1s)
# Already on `device` because of move_to_device=True above.
model_1s_xfer.to(device)
model_1s_xfer.eval()
print(f"Loaded EN one-shot checkpoint from: {best_path_en_1s}")


EN→LRL XLM-R (ONE-SHOT SETTING: train_one_shot.csv for EN)
EN (one-shot) train size: 87, EN dev size: 466
EN (one-shot) train label counts: {1: 55, 0: 32}
EN (one-shot) class weights: [1.359375, 0.7909090909090909]


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Epoch 1/3 (EN one-shot training) ===
[EN one-shot train] loss=0.7162 macro-F1=0.3562
[EN one-shot dev]   loss=0.6819 macro-F1=0.4420
New best EN dev macro-F1=0.4420 -> saved to outputs_lrl_xlmr_en/xlmr_en_one_shot_best.pt

=== Epoch 2/3 (EN one-shot training) ===
[EN one-shot train] loss=0.7117 macro-F1=0.5239
[EN one-shot dev]   loss=0.6911 macro-F1=0.5857
New best EN dev macro-F1=0.5857 -> saved to outputs_lrl_xlmr_en/xlmr_en_one_shot_best.pt

=== Epoch 3/3 (EN one-shot training) ===
[EN one-shot train] loss=0.7056 macro-F1=0.4386
[EN one-shot dev]   loss=0.6903 macro-F1=0.5965
New best EN dev macro-F1=0.5965 -> saved to outputs_lrl_xlmr_en/xlmr_en_one_shot_best.pt

Best EN dev macro-F1 (one-shot): 0.5965


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded EN one-shot checkpoint from: outputs_lrl_xlmr_en/xlmr_en_one_shot_best.pt


## Evaluate EN one-shot model on all LRL languages

In [11]:
# 08 - Evaluate EN-trained one-shot model on all LRL languages

# One summary dict per LRL language; consumed by the summary cell.
one_results = []

# NOTE(review): the recorded runs score below 0.5 macro-F1 on every LRL
# language; presumably worth checking that the LRL 'Label' values follow the
# same 0/1 convention as the SemEval training labels -- TODO confirm.
for lang in lrl_languages:
    print("\n" + "=" * 80)
    print(f"EN-trained XLM-R (one-shot) -> LRL language: {lang}")
    print("=" * 80)

    df_lang = lrl_df[lrl_df["Language"] == lang].copy()
    if df_lang.empty:
        print(f"No rows for language {lang}, skipping.")
        continue

    # Build dataloader for this language (with labels)
    # shuffle=False keeps predictions aligned with the rows of df_lang.
    dev_loader_lang = make_loader(
        df_lang,
        tok_1s_xfer,
        batch_size=BATCH_SIZE,
        shuffle=False,
        max_length=256,
        with_labels=True,
    )

    ytrue, ypred = predict(model_1s_xfer, dev_loader_lang)

    # zero_division=0 yields the same values as the default ("warn" also
    # scores an undefined metric as 0) but without the UndefinedMetricWarning
    # noise when a class is never predicted.
    f1_macro = f1_score(ytrue, ypred, average="macro", zero_division=0)
    recall_macro = recall_score(ytrue, ypred, average="macro", zero_division=0)
    acc = accuracy_score(ytrue, ypred)

    print(f"[EN-trained one-shot -> {lang}] Macro-F1={f1_macro:.4f}")
    print(f"[EN-trained one-shot -> {lang}] Macro-recall={recall_macro:.4f}")
    print(f"[EN-trained one-shot -> {lang}] Accuracy={acc:.4f}\n")

    print("Classification report:")
    print(classification_report(ytrue, ypred, digits=4, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(ytrue, ypred))

    # Save per-language predictions
    out_df = df_lang.copy()
    out_df["PredLabel_EN_OneShot"] = ypred
    pred_path = OUT_DIR / f"{lang}_xlmr_en_one_shot_predictions.csv"
    out_df.to_csv(pred_path, index=False, encoding="utf-8")
    print(f"\nSaved one-shot predictions for {lang} to: {pred_path}")

    one_results.append({
        "Language": lang,
        "NumExamples": len(df_lang),
        "MacroF1_OneShot": f1_macro,
        "MacroRecall_OneShot": recall_macro,
        "Accuracy_OneShot": acc,
    })



EN-trained XLM-R (one-shot) -> LRL language: BN
[EN-trained one-shot -> BN] Macro-F1=0.1962
[EN-trained one-shot -> BN] Macro-recall=0.5000
[EN-trained one-shot -> BN] Accuracy=0.2440

Classification report:
              precision    recall  f1-score   support

           0     0.2440    1.0000    0.3923        41
           1     0.0000    0.0000    0.0000       127

    accuracy                         0.2440       168
   macro avg     0.1220    0.5000    0.1962       168
weighted avg     0.0596    0.2440    0.0958       168

Confusion matrix:
[[ 41   0]
 [127   0]]

Saved one-shot predictions for BN to: outputs_lrl_xlmr_en/BN_xlmr_en_one_shot_predictions.csv

EN-trained XLM-R (one-shot) -> LRL language: ML


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[EN-trained one-shot -> ML] Macro-F1=0.2857
[EN-trained one-shot -> ML] Macro-recall=0.2857
[EN-trained one-shot -> ML] Accuracy=0.4000

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        15
           1     0.5714    0.5714    0.5714        35

    accuracy                         0.4000        50
   macro avg     0.2857    0.2857    0.2857        50
weighted avg     0.4000    0.4000    0.4000        50

Confusion matrix:
[[ 0 15]
 [15 20]]

Saved one-shot predictions for ML to: outputs_lrl_xlmr_en/ML_xlmr_en_one_shot_predictions.csv

EN-trained XLM-R (one-shot) -> LRL language: PA
[EN-trained one-shot -> PA] Macro-F1=0.3107
[EN-trained one-shot -> PA] Macro-recall=0.4497
[EN-trained one-shot -> PA] Accuracy=0.4000

Classification report:
              precision    recall  f1-score   support

           0     0.4130    0.8636    0.5588        22
           1     0.2500    0.0357    0.0625        28

    accu

## Summaries & metadata

In [12]:
# 09 - Summaries & metadata

# Join zero-shot and one-shot results on Language and NumExamples. The outer
# join keeps a language that was evaluated in only one of the two settings.
summary_df = None
if zero_results and one_results:
    zero_df = pd.DataFrame(zero_results)
    one_df = pd.DataFrame(one_results)
    summary_df = pd.merge(zero_df, one_df, on=["Language", "NumExamples"], how="outer")
elif zero_results:
    summary_df = pd.DataFrame(zero_results)
elif one_results:
    summary_df = pd.DataFrame(one_results)

if summary_df is not None:
    print("\n===== EN-trained XLM-R → LRL Summary =====")
    # `display` is only defined under IPython/Jupyter; fall back to print so
    # this cell also works when the notebook is run as a plain script.
    try:
        _show_table = display
    except NameError:
        _show_table = print
    _show_table(summary_df)

    summary_path = OUT_DIR / "xlmr_en_to_lrl_summary_all_languages.csv"
    summary_df.to_csv(summary_path, index=False, encoding="utf-8")
    print(f"\nSaved summary CSV to: {summary_path}")
else:
    print("No results collected (check data / training).")

# Save simple run metadata
# Plain KEY=VALUE lines, followed by one LANG=... line per summary row.
with open(OUT_DIR / "run_lrl_xlmr_en.txt", "w", encoding="utf-8") as f:
    f.write(f"MODEL_NAME={MODEL_NAME}\n")
    f.write(f"DEVICE={device.type}\n")
    f.write(f"BATCH_SIZE={BATCH_SIZE}\n")
    f.write(f"NUM_EPOCHS={NUM_EPOCHS}\n")
    f.write(f"LR={LR}\n")
    f.write(f"DATA_SEMEVAL={DATA_DIR}\n")
    f.write(f"DATA_LRL={LRL_CSV}\n")
    f.write(f"BEST_EN_DEV_F1_ZERO={best_en_dev_f1_0s:.4f}\n")
    f.write(f"BEST_EN_DEV_F1_ONE={best_en_dev_f1_1s:.4f}\n")
    if summary_df is not None:
        for _, row in summary_df.iterrows():
            lang = row["Language"]
            # row.get falls back to NaN when only one setting was evaluated
            # and the other setting's column is therefore absent.
            f.write(
                f"LANG={lang},N={int(row['NumExamples'])},"
                f"MacroF1_Zero={row.get('MacroF1_ZeroShot', float('nan'))},"
                f"MacroF1_One={row.get('MacroF1_OneShot', float('nan'))}\n"
            )

print("\nDone. EN-trained XLM-R evaluated on all LRL languages in lrl_idioms.csv.")



===== EN-trained XLM-R → LRL Summary =====


Unnamed: 0,Language,NumExamples,MacroF1_ZeroShot,MacroRecall_ZeroShot,Accuracy_ZeroShot,MacroF1_OneShot,MacroRecall_OneShot,Accuracy_OneShot
0,BN,168,0.366146,0.356251,0.47619,0.196172,0.5,0.244048
1,ML,50,0.269481,0.27619,0.28,0.285714,0.285714,0.4
2,PA,50,0.415615,0.553571,0.5,0.310662,0.449675,0.4



Saved summary CSV to: outputs_lrl_xlmr_en/xlmr_en_to_lrl_summary_all_languages.csv

Done. EN-trained XLM-R evaluated on all LRL languages in lrl_idioms.csv.
