## Setup: single switch for CPU/GPU, imports, seeds, paths, model choice

In [1]:
# 01 - Setup: single switch for CPU/GPU, imports, seeds, paths, model choice

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random
import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import torch

# Single device switch for the whole notebook:
#   "gpu" -> use CUDA when it is available (falls back to CPU otherwise)
#   "cpu" -> always run on the CPU
RUN_DEVICE = "gpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    device = torch.device("cuda")
    # Let cuDNN benchmark and pick convolution algorithms for the observed shapes.
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    if RUN_DEVICE.lower() == "gpu":
        # Make the fallback visible instead of silently running on the CPU.
        print("Warning: RUN_DEVICE is 'gpu' but CUDA is not available; falling back to CPU.")
    device = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    # os.cpu_count() returns None when the core count cannot be determined;
    # treat that as a single core instead of raising TypeError on `None // 2`.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
    print("Using CPU")

# Seed every random number generator used by the notebook
# (Python's `random`, NumPy, and PyTorch) with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if device.type == "cuda":
    # Seed the generators of all CUDA devices as well.
    torch.cuda.manual_seed_all(SEED)

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForCausalLM

# Dataset locations: every CSV sits in the "Data" folder under DATA_DIR.
DATA_DIR = Path("SemEval_2022_Task2-idiomaticity/SubTaskA")
TRAIN_ONE_SHOT = DATA_DIR.joinpath("Data", "train_one_shot.csv")
TRAIN_ZERO_SHOT = DATA_DIR.joinpath("Data", "train_zero_shot.csv")
DEV = DATA_DIR.joinpath("Data", "dev.csv")
DEV_GOLD = DATA_DIR.joinpath("Data", "dev_gold.csv")
EVAL = DATA_DIR.joinpath("Data", "eval.csv")
EVAL_SUB_FMT = DATA_DIR.joinpath("Data", "eval_submission_format.csv")

# Output folder for caches, submissions and run metadata; created on first run.
OUT_DIR = Path("outputs_en_llm")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Instruct model to evaluate. Choose one that fits your RAM/VRAM;
# "Qwen/Qwen2.5-3B-Instruct" is a smaller alternative.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Number of prompts per forward pass in the logits classifier.
_on_cuda = device.type == "cuda"
BATCH_GEN_ZERO = 8 if _on_cuda else 4
# The one-shot batch size is the same on CPU and GPU.
BATCH_GEN_ONE = 2


Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb


  from .autonotebook import tqdm as notebook_tqdm


## Data loading & utilities (PT only)

In [2]:
# 02 - Data loading & utilities (PT only)

def load_any_csv(path: Path) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame whose columns are all strings.

    ``sep=None`` together with the Python parsing engine asks pandas to detect
    the delimiter; ``dtype=str`` keeps every column as text.
    """
    reader_options = {"sep": None, "engine": "python", "dtype": str}
    return pd.read_csv(path, **reader_options)

def ensure_label_int(df: pd.DataFrame, col="Label") -> pd.DataFrame:
    """Cast column ``col`` of ``df`` to ``int`` in place, if the column exists.

    The same DataFrame object is returned in both cases; a frame without the
    column is returned untouched.
    """
    if col not in df.columns:
        return df
    df[col] = df[col].astype(int)
    return df

def load_train_dev(language="PT", oneshot=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the training split and the gold-labelled dev split for one language.

    Args:
        language: value of the ``Language`` column to keep.
        oneshot: when truthy read ``TRAIN_ONE_SHOT``, otherwise ``TRAIN_ZERO_SHOT``.

    Returns:
        ``(train_df, dev_lab)`` where ``dev_lab`` is the dev frame left-joined
        with the gold labels on ``ID``; ``Label`` is cast to int in both frames.
    """
    train_path = TRAIN_ONE_SHOT if oneshot else TRAIN_ZERO_SHOT
    train_all = load_any_csv(train_path)
    dev_all = load_any_csv(DEV)
    gold_all = load_any_csv(DEV_GOLD)

    # Header cells may carry stray whitespace; normalise before column access.
    for frame in (train_all, dev_all, gold_all):
        frame.columns = [name.strip() for name in frame.columns]

    train_df = train_all[train_all["Language"] == language].copy()
    dev_df = dev_all[dev_all["Language"] == language].copy()

    gold = gold_all.loc[gold_all["Language"] == language, ["ID", "Label"]].copy()
    gold["ID"] = gold["ID"].astype(str)
    dev_df["ID"] = dev_df["ID"].astype(str)

    # Left join keeps every dev row even if a gold label is missing.
    dev_lab = ensure_label_int(dev_df.merge(gold, on="ID", how="left"), "Label")
    train_df = ensure_label_int(train_df, "Label")
    return train_df, dev_lab

def load_eval(language="PT") -> pd.DataFrame:
    """Load the evaluation split and keep only the rows of ``language``.

    Column names are stripped of surrounding whitespace before filtering;
    a copy of the filtered rows is returned.
    """
    eval_all = load_any_csv(EVAL)
    eval_all.columns = [name.strip() for name in eval_all.columns]
    wanted = eval_all["Language"] == language
    return eval_all.loc[wanted].copy()

def mark_first_case_insensitive(text: str, needle: str, ltag="<mwe>", rtag="</mwe>") -> str:
    """Wrap the first case-insensitive occurrence of ``needle`` in ``text`` with tags.

    Args:
        text: sentence to search. Non-string values (e.g. NaN/None) are
            returned unchanged.
        needle: expression to locate, matched literally and ignoring case.
            Non-string or empty values leave ``text`` unchanged.
        ltag: marker inserted before the match.
        rtag: marker inserted after the match.

    Returns:
        ``text`` with the first match surrounded by ``ltag``/``rtag`` (the
        matched characters keep their original casing), or ``text`` itself
        when there is nothing to mark.
    """
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    if not needle:
        # An empty needle would "match" at offset 0 and insert an empty tag pair.
        return text

    # Search the original string directly. Looking up offsets in text.lower()
    # is unsafe because lowercasing can change the string length
    # (e.g. "İ".lower() is two code points), which shifts every later offset.
    import re
    match = re.search(re.escape(needle), text, flags=re.IGNORECASE)
    if match is None:
        return text
    start, end = match.span()
    return text[:start] + ltag + text[start:end] + rtag + text[end:]

def pack_context(prev: str, target: str, nxt: str, mwe: str) -> str:
    """Format the previous/target/next sentences as a three-line context block.

    Missing (NaN/None) ``prev`` and ``nxt`` become empty strings; the first
    occurrence of ``mwe`` in ``target`` is tagged via
    ``mark_first_case_insensitive``.
    """
    previous_text = prev if not pd.isna(prev) else ""
    next_text = nxt if not pd.isna(nxt) else ""
    marked_target = mark_first_case_insensitive(target, mwe)
    lines = (
        f"Previous: {previous_text}",
        f"Target: {marked_target}",
        f"Next: {next_text}",
    )
    return "\n".join(lines)

def label_to_str(y: int) -> str:
    """Return "1" when ``int(y)`` equals 1 and "0" for every other value."""
    is_positive = int(y) == 1
    if is_positive:
        return "1"
    return "0"


## Build one-shot exemplars per MWE from train_one_shot (PT)

In [3]:
# 03 - Build one-shot exemplars per MWE from train_one_shot (PT)

def build_oneshot_index(train_one_shot_en: pd.DataFrame) -> Dict[str, Dict[int, Dict[str, str]]]:
    """Index the first training exemplar of each label for every MWE.

    Returns a mapping ``mwe -> label -> {"context": <packed context>, "label": <int>}``.
    Rows are visited in frame order and only the first row seen for a given
    (MWE, label) pair is kept.
    """
    index = {}
    for _, row in train_one_shot_en.iterrows():
        expression = row["MWE"]
        label = int(row["Label"])
        context = pack_context(
            row.get("Previous", ""), row.get("Target", ""), row.get("Next", ""), expression
        )
        # setdefault only stores the entry when the (MWE, label) slot is still empty.
        index.setdefault(expression, {}).setdefault(label, {"context": context, "label": label})
    return index

def pick_global_oneshot_fallback(train_one_shot_en: pd.DataFrame) -> Dict[int, Dict[str, str]]:
    """Pick the first training row of each label as a global fallback exemplar.

    Returns ``{0: entry_or_None, 1: entry_or_None}`` where each entry holds the
    packed ``context``, the integer ``label`` and the row's ``mwe``. A slot
    stays ``None`` when the frame contains no row with that label.
    """
    pool = {0: None, 1: None}
    for _, row in train_one_shot_en.iterrows():
        label = int(row["Label"])
        if pool[label] is None:
            expression = row["MWE"]
            context = pack_context(
                row.get("Previous", ""), row.get("Target", ""), row.get("Next", ""), expression
            )
            pool[label] = {"context": context, "label": label, "mwe": expression}
        # Stop scanning as soon as both labels have an exemplar.
        if all(pool[key] is not None for key in (0, 1)):
            break
    return pool


## Load LLM (CPU or GPU) and tokenizer

In [4]:
# 04 - Load LLM (CPU or GPU) and tokenizer

# HF login: use env var if set, otherwise interactively prompt (works in VS Code & Colab)
import os
from getpass import getpass
from huggingface_hub import login as hf_login

# Resolve the Hugging Face token: the HF_TOKEN environment variable wins;
# when it is empty the user is prompted (an empty answer skips the login).
hf_token = os.getenv("HF_TOKEN", "").strip()
if not hf_token:
    try:
        # getpass hides the token in most environments (VS Code, Colab, etc.)
        hf_token = getpass("Enter your Hugging Face token (or press Enter to skip): ").strip()
    except Exception:
        # Fallback in case getpass doesn't work; note that input() echoes the token.
        hf_token = input("Enter your Hugging Face token (or press Enter to skip): ").strip()

# Login is best-effort: a failure is reported as a warning and the notebook continues.
if hf_token:
    try:
        hf_login(token=hf_token)
    except Exception as e:
        print(f"Warning: Hugging Face login failed: {e}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)

# Keyword arguments shared by the CPU and GPU load paths.
# NOTE(review): the captured output of this cell warns that `torch_dtype` is
# deprecated in favour of `dtype`; switch once the minimum supported
# transformers version is confirmed.
_model_kwargs = {"trust_remote_code": True, "low_cpu_mem_usage": True}
if device.type == "cuda":
    # Prefer bfloat16 when the GPU supports it, otherwise fall back to float16.
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    # device_map="auto" lets HF shard the weights / offload to CPU.
    _model_kwargs.update(torch_dtype=dtype, device_map="auto")
else:
    # Full precision with every module pinned to the CPU.
    _model_kwargs.update(torch_dtype=torch.float32, device_map={"": "cpu"})

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_model_kwargs)

# Guarantee a pad token so that batches can be padded.
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is None:
        # No EOS to reuse: register a dedicated pad token and grow the embeddings to fit it.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
    else:
        # Reuse the end-of-sequence token for padding.
        tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Inference only: switch to eval mode (as the cell's last expression this also displays the model).
model.eval()


`torch_dtype` is deprecated! Use `dtype` instead!
2025-11-25 23:41:44.742876: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-25 23:41:44.756671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764142904.766631 3680896 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764142904.769300 3680896 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764142904.778575 3680896 computation_placer.cc:177] computation placer already r

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (ro

## Minimal prompt builders (compact)

In [5]:
# 05 - Minimal prompt builders (compact)

# Instruction shown at the top of every zero-shot prompt.
INSTR_ZS = (
    "Decide if the MWE is used idiomatically (figurative) in the Target sentence.\n"
    "Output only one digit: 1 (idiomatic) or 0 (literal)."
)

def make_zero_shot_prompt(mwe: str, ctx_block: str) -> str:
    """Build the zero-shot prompt: instruction, the MWE, its context block, then "Answer:"."""
    prompt_lines = [
        f"{INSTR_ZS}",
        f"MWE: {mwe}",
        f"{ctx_block}",
        "Answer:",
    ]
    return "\n".join(prompt_lines)

# Instruction shown at the top of every one-shot prompt.
INSTR_1S = (
    "You will see two labeled examples for the SAME MWE, then a new instance.\n"
    "Output only one digit: 1 (idiomatic) or 0 (literal)."
)

def make_one_shot_prompt(mwe: str, pos_example: str, neg_example: str, ctx_block: str) -> str:
    """Build the one-shot prompt.

    Layout: instruction, a positive (Label=1) example, a negative (Label=0)
    example, then the instance to classify, ending with "Answer:". All three
    sections are headed with the same ``mwe``.
    """
    mwe_line = f"MWE: {mwe}"
    prompt_lines = [
        f"{INSTR_1S}",
        "Example+(Label=1)",
        mwe_line,
        f"{pos_example}",
        "Example-(Label=0)",
        mwe_line,
        f"{neg_example}",
        "Now classify the new instance.",
        mwe_line,
        f"{ctx_block}",
        "Answer:",
    ]
    return "\n".join(prompt_lines)


## Batched next-token logits classification (0/1 only), CPU/GPU

In [6]:
# 06 - Batched next-token logits classification (0/1 only), CPU/GPU

def _apply_chat_or_plain_batch(texts: list) -> dict:
    if hasattr(tokenizer, "apply_chat_template"):
        messages_batch = [[
            {"role": "system", "content": "You are a helpful, concise assistant."},
            {"role": "user", "content": t}
        ] for t in texts]
        input_ids = tokenizer.apply_chat_template(
            messages_batch,
            add_generation_prompt=True,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
    else:
        input_ids = tokenizer(
            texts, return_tensors="pt", padding=True, truncation=True
        ).input_ids
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    return {"input_ids": input_ids, "attention_mask": attention_mask}

_id0 = None
_id1 = None

def _candidate_token_id_for_digit(d: str) -> Optional[int]:
    ids = tokenizer.encode(d, add_special_tokens=False)
    if len(ids) == 1: return ids[0]
    ids = tokenizer.encode(" " + d, add_special_tokens=False)
    if len(ids) == 1: return ids[0]
    ids = tokenizer.encode(d + "\n", add_special_tokens=False)
    if len(ids) == 1: return ids[0]
    return None

def _init_digit_ids():
    """Resolve the token ids for "0" and "1" once and cache them in ``_id0``/``_id1``.

    A slot that is already set is left untouched; a digit that has no
    single-token encoding stays ``None``.
    """
    global _id0, _id1
    _id0 = _candidate_token_id_for_digit("0") if _id0 is None else _id0
    _id1 = _candidate_token_id_for_digit("1") if _id1 is None else _id1

# Resolve the ids as soon as the cell runs.
_init_digit_ids()

def classify_prompts_logits(prompts: list) -> list:
    """Classify each prompt as 1 or 0 from the model's next-token prediction.

    Main path (both digit token ids resolved): run one forward pass, take the
    logits at the last *attended* position of every row and return 1 when the
    logit of "1" is >= the logit of "0".

    Fallback path (a digit has no single-token id): greedily generate one
    token per prompt and look for a digit in the decoded text; when neither
    digit appears the prediction defaults to 1.

    Args:
        prompts: list of prompt strings.

    Returns:
        List of ints (0 or 1), one per prompt, in input order.
    """
    if not prompts:
        return []

    enc = _apply_chat_or_plain_batch(prompts)

    # Make sure inputs are on the same device as the model's first parameter
    model_device = next(model.parameters()).device
    enc = {k: v.to(model_device) for k, v in enc.items()}

    if _id0 is not None and _id1 is not None:
        with torch.no_grad():
            logits = model(**enc).logits  # [B, T, V]

        # Index of the last attended token in each row. Reading position -1
        # unconditionally is only correct for left-padded batches; with right
        # padding it would read a pad position for every shorter row.
        mask = enc["attention_mask"].to(logits.device).long()
        last_pos = (mask.size(1) - 1) - mask.flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(logits.size(0), device=logits.device)
        next_logits = logits[rows, last_pos]  # [B, V]

        logit0 = next_logits[:, _id0]
        logit1 = next_logits[:, _id1]
        return (logit1 >= logit0).long().cpu().tolist()

    # Fallback: one greedy token per prompt.
    # NOTE(review): generation continues from the end of each row, so this
    # path relies on the batch being left-padded -- confirm the tokenizer setup.
    with torch.no_grad():
        gen = model.generate(
            **enc,
            max_new_tokens=1,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    prompt_len = enc["input_ids"].shape[-1]
    outs = []
    for i in range(gen.size(0)):
        text = tokenizer.decode(gen[i][prompt_len:], skip_special_tokens=True)
        pos0, pos1 = text.find("0"), text.find("1")
        # Predict 0 only when a "0" is present and comes before any "1";
        # everything else (including no digit at all) is predicted as 1.
        if pos0 != -1 and (pos1 == -1 or pos0 < pos1):
            outs.append(0)
        else:
            outs.append(1)
    return outs


## Predictors for zero-shot and one-shot (helpers if you need non-resume paths)

In [7]:
# 07 - Predictors for zero-shot and one-shot (helpers if you need non-resume paths)

def predict_zero_shot(df: pd.DataFrame) -> List[int]:
    """Predict a label for every row of ``df`` with the zero-shot prompt.

    Rows are classified one at a time (no batching, no caching); predictions
    are returned in row order.
    """
    def _predict_row(row) -> int:
        expression = row["MWE"]
        context = pack_context(
            row.get("Previous", ""), row.get("Target", ""), row.get("Next", ""), expression
        )
        prompt = make_zero_shot_prompt(expression, context)
        return classify_prompts_logits([prompt])[0]

    return [_predict_row(row) for _, row in df.iterrows()]

def predict_one_shot(df: pd.DataFrame, oneshot_index: Dict[str, Dict[int, Dict[str, str]]], global_pool: Dict[int, Dict[str, str]]) -> List[int]:
    """Predict a label for every row of ``df`` with the one-shot prompt.

    For each row the positive/negative exemplar contexts are looked up in
    ``oneshot_index`` by MWE; a missing exemplar is replaced by the matching
    entry of ``global_pool``. Rows are classified one at a time and the
    predictions are returned in row order.
    """
    predictions = []
    for _, row in df.iterrows():
        expression = row["MWE"]
        target_ctx = pack_context(
            row.get("Previous", ""), row.get("Target", ""), row.get("Next", ""), expression
        )

        exemplars = oneshot_index.get(expression, {})
        pos_ctx = exemplars.get(1, {}).get("context")
        neg_ctx = exemplars.get(0, {}).get("context")
        # Fall back to the global exemplar when this MWE has none for a label.
        if pos_ctx is None:
            pos_ctx = global_pool[1]["context"]
        if neg_ctx is None:
            neg_ctx = global_pool[0]["context"]

        prompt = make_one_shot_prompt(expression, pos_ctx, neg_ctx, target_ctx)
        predictions.append(classify_prompts_logits([prompt])[0])
    return predictions


## Caching helpers (shared by zero- and one-shot)

In [8]:
# 07.1 - Caching helpers (shared by zero- and one-shot)

from time import perf_counter
from tqdm.auto import tqdm

def _load_cache(cache_path: Path) -> dict:
    """
    Load a simple ID -> Label cache from CSV, if it exists.
    """
    if cache_path.exists():
        df = pd.read_csv(cache_path, dtype={"ID": str, "Label": int})
        return dict(zip(df["ID"].astype(str), df["Label"].astype(int)))
    return {}

def _append_one(cache_path: Path, rec: tuple):
    """
    Append a single (ID, Label) record to the cache CSV (creating header if needed).
    """
    _id, _lab = rec
    header_needed = not cache_path.exists()
    with open(cache_path, "a") as f:
        if header_needed:
            f.write("ID,Label\n")
        f.write(f"{_id},{int(_lab)}\n")


## Zero-shot (PT): dev eval + eval submission (progress + RESUME, batched logits)

In [None]:
# 08 - Zero-shot (PT): dev eval + eval submission (progress + RESUME, batched logits)

# Flip this to False if you want to skip recomputing zero-shot.
# When False the zero-shot dev/eval run guarded by this flag does not execute.
RUN_ZERO_SHOT = True

def progressive_predict_zero_shot_batched(
    df: pd.DataFrame,
    cache_path: Path,
    desc: str = "Zero-shot PT",
) -> list:
    """Zero-shot predictions for ``df`` with batching and an on-disk resume cache.

    IDs already present in the cache at ``cache_path`` are skipped; the rest
    are classified in batches of ``BATCH_GEN_ZERO`` prompts and every new
    prediction is appended to the cache immediately.

    Args:
        df: rows to classify; must provide ``ID`` and ``MWE``.
        cache_path: CSV file holding the ID -> Label cache.
        desc: label used in the progress bar and the printed summaries.

    Returns:
        Predicted labels in the row order of ``df``.
    """
    frame = df.copy()
    frame["ID"] = frame["ID"].astype(str)

    preds_map = _load_cache(cache_path)
    cached_at_start = len(preds_map)
    pending = [pos for pos, row_id in enumerate(frame["ID"]) if row_id not in preds_map]

    print(f"{desc} | Resuming with {cached_at_start} cached / {len(frame)} total")
    t0 = perf_counter()

    def _prompt_for(row) -> str:
        # Build the zero-shot prompt for one row.
        expression = row["MWE"]
        context = pack_context(
            row.get("Previous", ""),
            row.get("Target", ""),
            row.get("Next", ""),
            expression,
        )
        return make_zero_shot_prompt(expression, context)

    batch_starts = range(0, len(pending), BATCH_GEN_ZERO)
    for start in tqdm(batch_starts, desc=desc, leave=True):
        ids, prompts = [], []
        for pos in pending[start:start + BATCH_GEN_ZERO]:
            row = frame.iloc[pos]
            ids.append(row["ID"])
            prompts.append(_prompt_for(row))

        if len(prompts) == 0:
            continue

        for row_id, label in zip(ids, classify_prompts_logits(prompts)):
            preds_map[row_id] = int(label)
            # Persist right away so an interrupted run can resume from here.
            _append_one(cache_path, (row_id, int(label)))

    elapsed = perf_counter() - t0
    print(
        f"{desc} | Newly computed: {len(pending)} | Cached at start: {cached_at_start} | "
        f"Total: {len(frame)} | Elapsed: {elapsed:.1f}s | "
        f"{(elapsed / max(1, len(pending))):.3f}s/example (new only)"
    )

    return [preds_map[str(row_id)] for row_id in frame["ID"]]


# Zero-shot driver: score the dev split, then write the eval submission.
# Variable and file names carry an "_en" suffix, but every loader call below
# filters on language="PT".
if RUN_ZERO_SHOT:
    # ---- DEV (zero-shot PT) ----
    train_0s_en, dev_0s_en = load_train_dev(language="PT", oneshot=False)

    # Predictions are cached by ID only; IDs already in this file are reused as-is.
    cache_dev_0s = OUT_DIR / "cache_llm_zeroshot_dev_en.csv"
    yhat_dev_0s = progressive_predict_zero_shot_batched(
        dev_0s_en, cache_dev_0s, desc="Zero-shot PT (dev)"
    )
    ytrue_dev_0s = dev_0s_en["Label"].tolist()
    # f1_0s is read again by the run-metadata cell.
    f1_0s = f1_score(ytrue_dev_0s, yhat_dev_0s, average="macro")
    print(f"[LLM Zero-shot PT] Dev macro-F1: {f1_0s:.4f}")
    print(classification_report(ytrue_dev_0s, yhat_dev_0s, digits=4))
    print(confusion_matrix(ytrue_dev_0s, yhat_dev_0s))

    # ---- EVAL (zero-shot PT) ----
    eval_en = load_eval(language="PT")
    cache_eval_0s = OUT_DIR / "cache_llm_zeroshot_eval_en.csv"
    yhat_eval_0s = progressive_predict_zero_shot_batched(
        eval_en, cache_eval_0s, desc="Zero-shot PT (eval)"
    )

    # Submission table: one row per eval instance with the predicted label.
    sub_0s = pd.DataFrame({
        "ID": eval_en["ID"].astype(str),
        "Language": eval_en["Language"],
        "Setting": ["zero_shot"] * len(eval_en),
        "Label": yhat_eval_0s,
    })
    sub_0s_path = OUT_DIR / "eval_submission_en_llm_zeroshot.csv"
    sub_0s.to_csv(sub_0s_path, index=False)
    print(f"Wrote {sub_0s_path}")


Zero-shot EN (dev) | Resuming with 160 cached / 466 total


Zero-shot EN (dev): 100%|██████████| 10/10 [00:12<00:00,  1.23s/it]


Zero-shot EN (dev) | Newly computed: 306 | Cached at start: 160 | Total: 466 | Elapsed: 12.3s | 0.040s/example (new only)
[LLM Zero-shot EN] Dev macro-F1: 0.4134
              precision    recall  f1-score   support

           0     0.3803    0.7418    0.5028       182
           1     0.5766    0.2254    0.3241       284

    accuracy                         0.4270       466
   macro avg     0.4784    0.4836    0.4134       466
weighted avg     0.4999    0.4270    0.3939       466

[[135  47]
 [220  64]]
Zero-shot EN (eval) | Resuming with 0 cached / 483 total


Zero-shot EN (eval): 100%|██████████| 16/16 [00:10<00:00,  1.50it/s]

Zero-shot EN (eval) | Newly computed: 483 | Cached at start: 0 | Total: 483 | Elapsed: 10.6s | 0.022s/example (new only)
Wrote outputs_en_llm/eval_submission_en_llm_zeroshot.csv





## One-shot (PT): build exemplars, dev eval + eval submission (progress + RESUME, batched logits)

In [9]:
# 09 - One-shot (PT): build exemplars, dev eval + eval submission (progress + RESUME, batched logits)

# Flip this to False if you want to skip recomputing one-shot.
# When False the one-shot dev/eval run guarded by this flag does not execute.
RUN_ONE_SHOT = True

# Optional: clear temporary CUDA buffers before starting one-shot
# (torch is already imported by the setup cell; the re-import here is redundant).
import torch
if device.type == "cuda":
    torch.cuda.empty_cache()

def progressive_predict_one_shot_batched(
    df: pd.DataFrame,
    oneshot_index: dict,
    global_pool: dict,
    cache_path: Path,
    desc: str = "One-shot PT",
) -> list:
    """One-shot predictions for ``df`` with batching and an on-disk resume cache.

    IDs already present in the cache at ``cache_path`` are skipped; the rest
    are classified in batches of ``BATCH_GEN_ONE`` prompts and every new
    prediction is appended to the cache immediately.

    Args:
        df: rows to classify; must provide ``ID`` and ``MWE``.
        oneshot_index: ``mwe -> label -> {"context": ...}`` exemplar lookup.
        global_pool: ``label -> {"context": ...}`` exemplars used when the
            MWE has no exemplar of its own for a label.
        cache_path: CSV file holding the ID -> Label cache.
        desc: label used in the progress bar and the printed summaries.

    Returns:
        Predicted labels in the row order of ``df``.
    """
    frame = df.copy()
    frame["ID"] = frame["ID"].astype(str)

    preds_map = _load_cache(cache_path)
    cached_at_start = len(preds_map)
    pending = [pos for pos, row_id in enumerate(frame["ID"]) if row_id not in preds_map]

    print(f"{desc} | Resuming with {cached_at_start} cached / {len(frame)} total")
    t0 = perf_counter()

    def _prompt_for(row) -> str:
        # Build the one-shot prompt for one row.
        expression = row["MWE"]
        context = pack_context(
            row.get("Previous", ""),
            row.get("Target", ""),
            row.get("Next", ""),
            expression,
        )
        # Idiom-specific exemplars if possible, otherwise global fallback
        exemplars = oneshot_index.get(expression, {})
        pos_ctx = exemplars.get(1, {}).get("context")
        neg_ctx = exemplars.get(0, {}).get("context")
        if pos_ctx is None:
            pos_ctx = global_pool[1]["context"]
        if neg_ctx is None:
            neg_ctx = global_pool[0]["context"]
        return make_one_shot_prompt(expression, pos_ctx, neg_ctx, context)

    batch_starts = range(0, len(pending), BATCH_GEN_ONE)
    for start in tqdm(batch_starts, desc=desc, leave=True):
        ids, prompts = [], []
        for pos in pending[start:start + BATCH_GEN_ONE]:
            row = frame.iloc[pos]
            ids.append(row["ID"])
            prompts.append(_prompt_for(row))

        if len(prompts) == 0:
            continue

        for row_id, label in zip(ids, classify_prompts_logits(prompts)):
            preds_map[row_id] = int(label)
            # Persist right away so an interrupted run can resume from here.
            _append_one(cache_path, (row_id, int(label)))

    elapsed = perf_counter() - t0
    print(
        f"{desc} | Newly computed: {len(pending)} | Cached at start: {cached_at_start} | "
        f"Total: {len(frame)} | Elapsed: {elapsed:.1f}s | "
        f"{(elapsed / max(1, len(pending))):.3f}s/example (new only)"
    )

    return [preds_map[str(row_id)] for row_id in frame["ID"]]


# One-shot driver: build exemplars from the training split, score the dev
# split, then write the eval submission.
# Variable and file names carry an "_en" suffix, but every loader call below
# filters on language="PT".
if RUN_ONE_SHOT:
    train_1s_en, dev_1s_en = load_train_dev(language="PT", oneshot=True)
    # Per-MWE exemplars plus one global exemplar per label as a fallback.
    oneshot_index = build_oneshot_index(train_1s_en)
    global_pool = pick_global_oneshot_fallback(train_1s_en)

    # ---- DEV (one-shot PT) ----
    # Predictions are cached by ID only; IDs already in this file are reused as-is.
    cache_dev_1s = OUT_DIR / "cache_llm_oneshot_dev_en.csv"
    yhat_dev_1s = progressive_predict_one_shot_batched(
        dev_1s_en, oneshot_index, global_pool, cache_dev_1s, desc="One-shot PT (dev)"
    )
    ytrue_dev_1s = dev_1s_en["Label"].tolist()
    # f1_1s is read again by the run-metadata cell.
    f1_1s = f1_score(ytrue_dev_1s, yhat_dev_1s, average="macro")
    print(f"[LLM One-shot PT] Dev macro-F1: {f1_1s:.4f}")
    print(classification_report(ytrue_dev_1s, yhat_dev_1s, digits=4))
    print(confusion_matrix(ytrue_dev_1s, yhat_dev_1s))

    # ---- EVAL (one-shot PT) ----
    eval_en = load_eval(language="PT")
    cache_eval_1s = OUT_DIR / "cache_llm_oneshot_eval_en.csv"
    yhat_eval_1s = progressive_predict_one_shot_batched(
        eval_en, oneshot_index, global_pool, cache_eval_1s, desc="One-shot PT (eval)"
    )

    # Submission table: one row per eval instance with the predicted label.
    sub_1s = pd.DataFrame({
        "ID": eval_en["ID"].astype(str),
        "Language": eval_en["Language"],
        "Setting": ["one_shot"] * len(eval_en),  # note: one_shot
        "Label": yhat_eval_1s,
    })
    sub_1s_path = OUT_DIR / "eval_submission_en_llm_oneshot.csv"
    sub_1s.to_csv(sub_1s_path, index=False)
    print(f"Wrote {sub_1s_path}")


One-shot EN (dev) | Resuming with 0 cached / 466 total


One-shot EN (dev):   0%|          | 0/233 [00:00<?, ?it/s]

One-shot EN (dev): 100%|██████████| 233/233 [00:24<00:00,  9.58it/s]


One-shot EN (dev) | Newly computed: 466 | Cached at start: 0 | Total: 466 | Elapsed: 24.3s | 0.052s/example (new only)
[LLM One-shot EN] Dev macro-F1: 0.4759
              precision    recall  f1-score   support

           0     0.3785    0.5220    0.4388       182
           1     0.5953    0.4507    0.5130       284

    accuracy                         0.4785       466
   macro avg     0.4869    0.4863    0.4759       466
weighted avg     0.5107    0.4785    0.4840       466

[[ 95  87]
 [156 128]]
One-shot EN (eval) | Resuming with 0 cached / 483 total


One-shot EN (eval): 100%|██████████| 242/242 [00:24<00:00,  9.85it/s]

One-shot EN (eval) | Newly computed: 483 | Cached at start: 0 | Total: 483 | Elapsed: 24.6s | 0.051s/example (new only)
Wrote outputs_en_llm/eval_submission_en_llm_oneshot.csv





## Save run metadata

In [11]:
# 10 - Save run metadata

# The dev F1 globals only exist when the matching run cell executed;
# report "NA" for a run that was skipped.
zero_f1_str = "NA" if "f1_0s" not in globals() else f"{f1_0s:.4f}"
one_f1_str = "NA" if "f1_1s" not in globals() else f"{f1_1s:.4f}"

# One KEY=VALUE line per setting.
with open(OUT_DIR / "run_en_llm.txt", "w") as f:
    f.writelines([
        f"MODEL_NAME={MODEL_NAME}\n",
        f"DEVICE={device.type}\n",
        f"BATCH_GEN_ZERO={BATCH_GEN_ZERO}\n",
        f"BATCH_GEN_ONE={BATCH_GEN_ONE}\n",
        f"ZERO_SHOT_DEV_F1={zero_f1_str}\n",
        f"ONE_SHOT_DEV_F1={one_f1_str}\n",
    ])

print("Saved run metadata.")

# Remind the user which cache files drive the resume behaviour.
print("Resume cache files (delete to start fresh):")
_cache_names = (
    "cache_llm_zeroshot_dev_en.csv",
    "cache_llm_zeroshot_eval_en.csv",
    "cache_llm_oneshot_dev_en.csv",
    "cache_llm_oneshot_eval_en.csv",
)
for p in (OUT_DIR / name for name in _cache_names):
    print(f"- {p}")


Saved run metadata.
Resume cache files (delete to start fresh):
- outputs_en_llm/cache_llm_zeroshot_dev_en.csv
- outputs_en_llm/cache_llm_zeroshot_eval_en.csv
- outputs_en_llm/cache_llm_oneshot_dev_en.csv
- outputs_en_llm/cache_llm_oneshot_eval_en.csv
