## Setup: device, imports, seeds, paths, model

In [1]:
# 01 - Setup: device, imports, seeds, paths, model

import os
from pathlib import Path
import random
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional

# ---- Single toggle: "cpu" or "gpu"
RUN_DEVICE = "gpu"   # change to "cpu" to force CPU

# Silence the tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
# Use CUDA only when it was requested AND is actually available; otherwise run on CPU
# and hide any GPUs from libraries that read CUDA_VISIBLE_DEVICES later.
if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    SELECT_DEVICE = "cuda"
else:
    SELECT_DEVICE = "cpu"
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

device = torch.device(SELECT_DEVICE)
# Favour reproducible cuDNN behaviour over autotuned speed.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
if device.type == "cpu":
    # os.cpu_count() returns None when the core count cannot be determined;
    # fall back to 1 so the integer division below never sees None.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))

# Seed every RNG used by the notebook.
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)

# Dataset layout: every SubTask A split is a CSV under <DATA_DIR>/Data.
DATA_DIR = Path("SemEval_2022_Task2-idiomaticity/SubTaskA")
TRAIN_ONE_SHOT = DATA_DIR.joinpath("Data", "train_one_shot.csv")
TRAIN_ZERO_SHOT = DATA_DIR.joinpath("Data", "train_zero_shot.csv")
DEV = DATA_DIR.joinpath("Data", "dev.csv")
DEV_GOLD = DATA_DIR.joinpath("Data", "dev_gold.csv")
EVAL = DATA_DIR.joinpath("Data", "eval.csv")

# All caches, submissions and run metadata are written here.
OUT_DIR = Path("outputs_pt_llm")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Instruct checkpoint to load; swap in the commented alternative to compare models.
# MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Settings for the one-token generation fallback (the main path reads logits directly).
MAX_NEW_TOKENS = 1
DO_SAMPLE = False

print(f"Using device: {device.type.upper()} | Model: {MODEL_NAME}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: CUDA | Model: meta-llama/Llama-3.2-3B-Instruct


## Data loading & utilities (PT)

In [2]:
# 02 - Data loading & utilities (PT)

def load_any_csv(path: Path) -> pd.DataFrame:
    """Read a delimited text file, letting pandas sniff the separator.

    Every column is loaded as ``str`` so identifiers keep their exact spelling.
    """
    read_options = {"sep": None, "engine": "python", "dtype": str}
    return pd.read_csv(path, **read_options)

def ensure_label_int(df: pd.DataFrame, col="Label") -> pd.DataFrame:
    """Cast column ``col`` to ``int`` in place when it exists; return the same frame."""
    if col not in df.columns:
        return df
    df[col] = df[col].astype(int)
    return df

def load_train_dev(language="PT", oneshot=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the train split and the gold-labelled dev split for one language.

    Args:
        language: value matched against the ``Language`` column of each file.
        oneshot: read ``TRAIN_ONE_SHOT`` when true, otherwise ``TRAIN_ZERO_SHOT``.

    Returns:
        ``(train_df, dev_df)`` where dev rows carry the gold ``Label`` joined on ``ID``
        and ``Label`` is cast to ``int`` in both frames (when the column is present).
    """
    train_path = TRAIN_ONE_SHOT if oneshot else TRAIN_ZERO_SHOT
    loaded = [load_any_csv(source) for source in (train_path, DEV, DEV_GOLD)]
    # Header cells may carry stray whitespace; normalise before any column lookup.
    for frame in loaded:
        frame.columns = [name.strip() for name in frame.columns]
    train_all, dev_all, gold_all = loaded

    train_lang = train_all.loc[train_all["Language"] == language].copy()
    dev_lang = dev_all.loc[dev_all["Language"] == language].copy()
    gold_lang = gold_all.loc[gold_all["Language"] == language, ["ID", "Label"]].copy()

    # Join keys are compared as strings on both sides.
    gold_lang["ID"] = gold_lang["ID"].astype(str)
    dev_lang["ID"] = dev_lang["ID"].astype(str)
    dev_labelled = dev_lang.merge(gold_lang, on="ID", how="left")

    dev_labelled = ensure_label_int(dev_labelled, "Label")
    train_lang = ensure_label_int(train_lang, "Label")
    return train_lang, dev_labelled

def load_eval(language="PT") -> pd.DataFrame:
    """Return a copy of the evaluation rows whose ``Language`` equals ``language``."""
    eval_all = load_any_csv(EVAL)
    eval_all.columns = [name.strip() for name in eval_all.columns]
    wanted = eval_all["Language"] == language
    return eval_all[wanted].copy()

def mark_first_case_insensitive(text: str, needle: str, ltag="<mwe>", rtag="</mwe>") -> str:
    """Wrap the first case-insensitive occurrence of ``needle`` in ``text`` with tags.

    Args:
        text: sentence to search; returned unchanged when it is not a string.
        needle: literal expression to find (regex metacharacters are escaped).
        ltag: opening tag inserted before the match.
        rtag: closing tag inserted after the match.

    Returns:
        ``text`` with the first match wrapped, or ``text`` unchanged when there is
        no match, ``needle`` is empty, or either argument is not a string.
    """
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    if not needle:
        # Nothing to mark; avoid emitting an empty tag pair at position 0.
        return text
    # Search the original string directly. Offsets taken from text.lower() can be
    # wrong because lowercasing may change the string length (e.g. U+0130 becomes
    # two code points), which shifts every later index.
    hit = re.search(re.escape(needle), text, flags=re.IGNORECASE)
    if hit is None:
        return text
    return text[:hit.start()] + ltag + hit.group(0) + rtag + text[hit.end():]

def pack_context(prev: str, target: str, nxt: str, mwe: str) -> str:
    """Render the three-sentence context block used inside every prompt.

    Missing (NA) previous/next sentences become empty strings, and the first
    occurrence of ``mwe`` in ``target`` is wrapped in ``<mwe>`` tags.
    """
    before = prev if not pd.isna(prev) else ""
    after = nxt if not pd.isna(nxt) else ""
    marked = mark_first_case_insensitive(target, mwe)
    # Field names stay in English for stability; the sentences themselves are PT.
    fields = (f"Previous: {before}", f"Target: {marked}", f"Next: {after}")
    return "\n".join(fields)


## One-shot exemplar builders (PT)

In [3]:
# 03 - One-shot exemplar builders (PT)

def build_oneshot_index(train_one_shot_pt: pd.DataFrame) -> Dict[str, Dict[int, Dict[str, str]]]:
    """Index one exemplar per (MWE, label) pair from the one-shot training rows.

    Returns a mapping ``mwe -> label -> {"context": ..., "label": ...}``; when an
    (MWE, label) pair occurs more than once, the first row in frame order wins.
    """
    exemplars = {}
    for _, row in train_one_shot_pt.iterrows():
        expression = row["MWE"]
        label = int(row["Label"])
        context = pack_context(row.get("Previous",""), row.get("Target",""), row.get("Next",""), expression)
        by_label = exemplars.setdefault(expression, {})
        # setdefault keeps an existing exemplar, so later duplicates are ignored.
        by_label.setdefault(label, {"context": context, "label": label})
    return exemplars

def pick_global_oneshot_fallback(train_one_shot_pt: pd.DataFrame) -> Dict[int, Dict[str, str]]:
    """Pick the first training row seen for label 0 and for label 1.

    Returns ``{0: exemplar, 1: exemplar}`` where each exemplar is a dict with
    ``context``, ``label`` and ``mwe``; a slot stays ``None`` if that label never
    appears in the frame.
    """
    fallback = {0: None, 1: None}
    for _, row in train_one_shot_pt.iterrows():
        label = int(row["Label"])
        if fallback[label] is None:
            expression = row["MWE"]
            context = pack_context(row.get("Previous",""), row.get("Target",""), row.get("Next",""), expression)
            fallback[label] = {"context": context, "label": label, "mwe": expression}
        # Stop scanning as soon as both slots are filled.
        if all(slot is not None for slot in (fallback[0], fallback[1])):
            break
    return fallback


## Load LLM (CPU/GPU) and tokenizer

In [4]:
# 04 - Load LLM (CPU/GPU) and tokenizer with HF login + VRAM-safe setup

import os
from getpass import getpass
from huggingface_hub import login as hf_login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc

# ----- VRAM helpers (safe on CPU too)
def free_vram():
    """Drop every module-level ``nn.Module`` / ``Tensor`` and release cached CUDA memory.

    Scans this module's globals, deletes each name bound to a torch module or
    tensor, runs the garbage collector and, when CUDA is available, empties the
    allocator cache.
    """
    namespace = globals()
    releasable = (torch.nn.Module, torch.Tensor)
    # Iterate over a snapshot because entries are removed during the scan.
    for name, value in list(namespace.items()):
        if isinstance(value, releasable):
            del namespace[name]
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    print("VRAM cleanup done.")

def cuda_mem():
    """Print the CUDA memory currently allocated and reserved by PyTorch, in GiB."""
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return
    gib = 1024**3
    a = torch.cuda.memory_allocated() / gib
    r = torch.cuda.memory_reserved() / gib
    print(f"CUDA allocated: {a:.2f} GiB | reserved: {r:.2f} GiB")

# Prefer segmented allocator on CUDA to reduce fragmentation
if device.type == "cuda":
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# ----- Optional Hugging Face login (supports gated repos)
# Token lookup order: HF_TOKEN env var, hidden prompt, visible prompt, then no token.
hf_token = os.getenv("HF_TOKEN", "").strip()
if not hf_token:
    try:
        hf_token = getpass("Enter your Hugging Face token (press Enter to skip): ").strip()
    except Exception:
        try:
            hf_token = input("Enter your Hugging Face token (press Enter to skip): ").strip()
        except Exception:
            # No stdin available (e.g. headless "Run All"): the login is optional,
            # so continue without a token instead of aborting the cell.
            print("No interactive prompt available; continuing without a Hugging Face token.")
            hf_token = ""
if hf_token:
    try:
        hf_login(token=hf_token)
    except Exception as e:
        # Best effort: the token is still passed explicitly to from_pretrained below.
        print(f"Warning: Hugging Face login failed: {e}")

# Extra kwargs forwarded to every from_pretrained call; empty when no token was given.
_token_arg = {"token": hf_token} if hf_token else {}

# ----- Clean up any previous models before loading a new one
if device.type == "cuda":
    cuda_mem()
free_vram()
if device.type == "cuda":
    cuda_mem()

# ----- Tokenizer for the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, use_fast=True, trust_remote_code=True, **_token_arg
)

# ----- Model weights: precision and placement depend on the active device
if device.type != "cuda":
    # CPU: full precision, with every module mapped to the CPU.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        device_map={"": "cpu"},
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **_token_arg
    )
else:
    # GPU: half precision (bf16 when the device supports it, fp16 otherwise) and
    # device_map="auto" so the loader decides where each layer is placed.
    dtype = torch.float16
    if torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **_token_arg
    )

# Inference only: switch the module to eval mode.
model.eval()

# ----- Make sure padding is defined for both the tokenizer and the model config
def _ensure_pad():
    """Give the tokenizer a pad token if it lacks one and mirror its id on the model.

    Reuses the EOS token when one exists; otherwise registers a new ``[PAD]``
    token and resizes the model embeddings to the new vocabulary size.
    """
    if tokenizer.pad_token_id is None:
        if tokenizer.eos_token_id is None:
            tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            model.resize_token_embeddings(len(tokenizer))
        else:
            tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

_ensure_pad()

# Report GPU memory once the model is resident.
if device.type == "cuda":
    cuda_mem()


Enter your Hugging Face token (press Enter to skip):  ········


CUDA allocated: 0.00 GiB | reserved: 0.00 GiB
VRAM cleanup done.
CUDA allocated: 0.00 GiB | reserved: 0.00 GiB


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.50s/it]


CUDA allocated: 5.98 GiB | reserved: 6.00 GiB


## Minimal prompts (compact, predictable)

In [5]:
# 05 - Minimal prompts (compact, predictable)

# Instruction header shared by every zero-shot prompt.
INSTR_ZS = (
    "Decide if the MWE is used idiomatically (figurative) in the Target sentence.\n"
    "Output only one digit: 1 (idiomatic) or 0 (literal)."
)

def make_zero_shot_prompt(mwe: str, ctx_block: str) -> str:
    """Build the zero-shot prompt: instruction, MWE, context block, ``Answer:`` cue."""
    sections = (INSTR_ZS, f"MWE: {mwe}", f"{ctx_block}", "Answer:")
    return "\n".join(sections)

# Instruction header shared by every one-shot prompt.
INSTR_1S = (
    "You will see two labeled examples for the SAME MWE, then a new instance.\n"
    "Output only one digit: 1 (idiomatic) or 0 (literal)."
)

def make_one_shot_prompt(mwe: str, pos_example: str, neg_example: str, ctx_block: str) -> str:
    """Build the one-shot prompt: instruction, one exemplar per label, then the query.

    Both exemplars and the query are rendered under the same ``MWE: {mwe}`` line.
    """
    mwe_line = f"MWE: {mwe}"
    sections = (
        INSTR_1S,
        "Example+(Label=1)", mwe_line, f"{pos_example}",
        "Example-(Label=0)", mwe_line, f"{neg_example}",
        "Now classify the new instance.",
        mwe_line, f"{ctx_block}",
        "Answer:",
    )
    return "\n".join(sections)


## Batched 0/1 classification via next-token logits

In [6]:
# 06 - Batched 0/1 classification via next-token logits

BATCH_GEN = 8  # adjust for memory/speed

def _apply_chat_or_plain_batch(texts: List[str]) -> dict:
    # Build input_ids and attention_mask with correct padding on CPU/GPU
    if hasattr(tokenizer, "apply_chat_template"):
        messages_batch = [[
            {"role": "system", "content": "You are a concise assistant. Respond with 0 or 1."},
            {"role": "user", "content": t}
        ] for t in texts]
        input_ids = tokenizer.apply_chat_template(
            messages_batch,
            add_generation_prompt=True,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
    else:
        input_ids = tokenizer(
            texts, return_tensors="pt", padding=True, truncation=True
        ).input_ids
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    return {
        "input_ids": input_ids.to(device, non_blocking=(device.type=="cuda")),
        "attention_mask": attention_mask.to(device, non_blocking=(device.type=="cuda"))
    }

# Token ids of the answer digits "0" and "1"; filled in by _init_digit_ids().
_id0 = None
_id1 = None

def _candidate_token_id_for_digit(d: str) -> Optional[int]:
    """Return the id of a single token encoding ``d``, or ``None`` if there is none.

    Tries the bare digit, then the space-prefixed digit, then the digit followed
    by a newline, and returns the first spelling that encodes to exactly one token.
    """
    for spelling in (d, " " + d, d + "\n"):
        pieces = tokenizer.encode(spelling, add_special_tokens=False)
        if len(pieces) == 1:
            return pieces[0]
    return None

def _init_digit_ids():
    """Resolve ``_id0`` / ``_id1`` once; ids that are already set are left alone."""
    global _id0, _id1
    _id0 = _candidate_token_id_for_digit("0") if _id0 is None else _id0
    _id1 = _candidate_token_id_for_digit("1") if _id1 is None else _id1

_init_digit_ids()

@torch.no_grad()
def classify_prompts_logits(prompts: List[str]) -> List[int]:
    enc = _apply_chat_or_plain_batch(prompts)
    logits = model(**enc).logits          # [B, T, V]
    next_logits = logits[:, -1, :]        # [B, V]

    if (_id0 is not None) and (_id1 is not None):
        logit0 = next_logits[:, _id0]
        logit1 = next_logits[:, _id1]
        return (logit1 >= logit0).long().cpu().tolist()

    # Fallback: force-generate 1 token and parse
    gen = model.generate(
        **enc,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    outs = []
    for i in range(gen.size(0)):
        cut = enc["input_ids"][i].shape[-1]
        new_ids = gen[i][cut:]
        text = tokenizer.decode(new_ids, skip_special_tokens=True)
        if "0" in text and "1" in text:
            outs.append(1 if text.index("1") < text.index("0") else 0)
        elif "1" in text:
            outs.append(1)
        elif "0" in text:
            outs.append(0)
        else:
            outs.append(1)
    return outs


## Predictors (single-row helpers called by batch loops)

In [7]:
# 07 - Predictors (helpers building prompts per row)

def _zero_shot_prompt_for_row(r) -> str:
    """Build the zero-shot prompt for one data row (reads MWE, Previous, Target, Next)."""
    expression = r["MWE"]
    context = pack_context(
        r.get("Previous",""),
        r.get("Target",""),
        r.get("Next",""),
        expression,
    )
    return make_zero_shot_prompt(expression, context)

def _one_shot_prompt_for_row(r, oneshot_index, global_pool) -> str:
    """Build the one-shot prompt for one data row.

    Exemplars for labels 1 and 0 come from ``oneshot_index[mwe]`` when present;
    a missing label falls back to the matching entry of ``global_pool``.
    """
    expression = r["MWE"]
    query_ctx = pack_context(r.get("Previous",""), r.get("Target",""), r.get("Next",""), expression)

    same_mwe = oneshot_index.get(expression, {})
    positive = same_mwe.get(1, {}).get("context")
    negative = same_mwe.get(0, {}).get("context")
    # NOTE(review): a fallback exemplar may belong to a different MWE, yet the prompt
    # still renders it under this row's "MWE:" line — confirm this is intended.
    positive = global_pool[1]["context"] if positive is None else positive
    negative = global_pool[0]["context"] if negative is None else negative
    return make_one_shot_prompt(expression, positive, negative, query_ctx)


## Zero-shot (PT): dev eval + eval submission (progress + RESUME, batched, OOM-safe)

In [8]:
# 08 - Zero-shot (PT): dev eval + eval submission (progress + RESUME, batched, OOM-safe)

from time import perf_counter
from tqdm.auto import tqdm

# optional: use math SDPA kernels to reduce peak memory (slower but safer)
# On CUDA this disables the flash and memory-efficient scaled-dot-product-attention
# backends and enables the math backend, process-wide.
# NOTE(review): the math backend is usually the most memory-hungry SDPA path, so the
# "reduce peak memory" rationale above should be confirmed by measurement.
try:
    if device.type == "cuda":
        torch.backends.cuda.enable_flash_sdp(False)
        torch.backends.cuda.enable_mem_efficient_sdp(False)
        torch.backends.cuda.enable_math_sdp(True)
except Exception:
    # Best effort: any failure here is ignored and the current backend settings stay.
    pass

def _load_cache(cache_path: Path) -> dict:
    if cache_path.exists():
        df = pd.read_csv(cache_path, dtype={"ID": str, "Label": int})
        return dict(zip(df["ID"].astype(str), df["Label"].astype(int)))
    return {}

def _append_one(cache_path: Path, rec: tuple):
    _id, _lab = rec
    header_needed = not cache_path.exists()
    with open(cache_path, "a") as f:
        if header_needed: f.write("ID,Label\n")
        f.write(f"{_id},{int(_lab)}\n")

def _classify_micro(prompts: list, micro_bs: int) -> list:
    """Run classification in small micro-batches, retrying on OOM by halving the batch size."""
    out = []
    i = 0
    bs = max(1, micro_bs)
    while i < len(prompts):
        step = min(bs, len(prompts) - i)
        chunk = prompts[i:i+step]
        try:
            with torch.inference_mode():
                out.extend(classify_prompts_logits(chunk))
            i += step
            if device.type == "cuda":
                torch.cuda.empty_cache()
        except RuntimeError as e:
            if "out of memory" in str(e).lower() and step > 1:
                if device.type == "cuda":
                    torch.cuda.empty_cache(); torch.cuda.ipc_collect()
                bs = max(1, step // 2)
                continue
            raise
    return out

def progressive_predict_zeroshot_batched(df: pd.DataFrame,
                                         cache_path: Path,
                                         desc: str,
                                         batch_gen: int = None,
                                         micro_bs: int = None) -> list:
    """Zero-shot predictions for every row of ``df``, resumable through a CSV cache.

    Args:
        df: rows to classify; must provide ``ID`` plus the columns the prompt needs.
        cache_path: CSV of ``ID,Label``; cached IDs are skipped and every new
            prediction is appended as soon as it is produced.
        desc: label used for the progress bar and the log lines.
        batch_gen: rows per outer batch (default 8 on CUDA, 16 on CPU).
        micro_bs: prompts per forward pass (default 2 on CUDA, 8 on CPU).

    Returns:
        Predicted labels aligned with the row order of ``df``.
    """
    frame = df.copy()
    frame["ID"] = frame["ID"].astype(str)

    predictions = _load_cache(cache_path)
    cached_ids = set(predictions)
    pending = [pos for pos, row_id in enumerate(frame["ID"]) if row_id not in cached_ids]

    on_gpu = device.type == "cuda"
    if batch_gen is None:
        batch_gen = 8 if on_gpu else 16
    if micro_bs is None:
        micro_bs = 2 if on_gpu else 8

    print(f"{desc} | Resuming with {len(cached_ids)} cached / {len(frame)} total")
    started = perf_counter()
    for offset in tqdm(range(0, len(pending), batch_gen), desc=desc, leave=True):
        batch_rows = [frame.iloc[pos] for pos in pending[offset:offset + batch_gen]]
        if not batch_rows:
            continue
        prompts = [_zero_shot_prompt_for_row(row) for row in batch_rows]
        row_ids = [row["ID"] for row in batch_rows]

        # Persist each prediction immediately so an interrupted run can resume.
        for row_id, label in zip(row_ids, _classify_micro(prompts, micro_bs)):
            predictions[row_id] = int(label)
            _append_one(cache_path, (row_id, int(label)))

        if on_gpu:
            torch.cuda.empty_cache()

    elapsed = perf_counter() - started
    print(f"{desc} | Newly computed: {len(pending)} | Cached at start: {len(cached_ids)} | Total: {len(frame)} | "
          f"Elapsed: {elapsed:.1f}s | {(elapsed/max(1,len(pending))):.3f}s/example (new only)")
    return [predictions[str(row_id)] for row_id in frame["ID"]]

# ---- DEV (zero-shot PT) ----
# Predict the PT dev rows (resuming from the cache file if it exists) and score them
# against the gold labels merged in by load_train_dev.
train_0s_pt, dev_0s_pt = load_train_dev(language="PT", oneshot=False)
cache_dev_0s = OUT_DIR / "cache_llm_zeroshot_dev_pt.csv"
yhat_dev_0s = progressive_predict_zeroshot_batched(dev_0s_pt, cache_dev_0s, desc="Zero-shot PT (dev)")
ytrue_dev_0s = dev_0s_pt["Label"].tolist()
f1_0s = f1_score(ytrue_dev_0s, yhat_dev_0s, average="macro")
print(f"[LLM Zero-shot PT] Dev macro-F1: {f1_0s:.4f}")
print(classification_report(ytrue_dev_0s, yhat_dev_0s, digits=4))
print(confusion_matrix(ytrue_dev_0s, yhat_dev_0s))

# ---- EVAL (zero-shot PT) ----
# Predict the PT eval rows and write them out as a submission CSV.
eval_pt = load_eval(language="PT")
cache_eval_0s = OUT_DIR / "cache_llm_zeroshot_eval_pt.csv"
yhat_eval_0s = progressive_predict_zeroshot_batched(eval_pt, cache_eval_0s, desc="Zero-shot PT (eval)")

# Submission columns: ID, Language, Setting, Label.
sub_0s = pd.DataFrame({
    "ID": eval_pt["ID"].astype(str),
    "Language": eval_pt["Language"],
    "Setting": ["zero_shot"] * len(eval_pt),
    "Label": yhat_eval_0s
})
sub_0s_path = OUT_DIR / "eval_submission_pt_llm_zeroshot.csv"
sub_0s.to_csv(sub_0s_path, index=False)
print(f"Wrote {sub_0s_path}")


Zero-shot PT (dev) | Resuming with 32 cached / 273 total


Zero-shot PT (dev): 100%|██████████| 31/31 [00:18<00:00,  1.71it/s]


Zero-shot PT (dev) | Newly computed: 241 | Cached at start: 32 | Total: 273 | Elapsed: 18.1s | 0.075s/example (new only)
[LLM Zero-shot PT] Dev macro-F1: 0.5170
              precision    recall  f1-score   support

           0     0.5816    0.5325    0.5559       154
           1     0.4545    0.5042    0.4781       119

    accuracy                         0.5201       273
   macro avg     0.5181    0.5183    0.5170       273
weighted avg     0.5262    0.5201    0.5220       273

[[82 72]
 [59 60]]
Zero-shot PT (eval) | Resuming with 0 cached / 279 total


Zero-shot PT (eval): 100%|██████████| 35/35 [00:14<00:00,  2.43it/s]

Zero-shot PT (eval) | Newly computed: 279 | Cached at start: 0 | Total: 279 | Elapsed: 14.4s | 0.052s/example (new only)
Wrote outputs_pt_llm/eval_submission_pt_llm_zeroshot.csv





## One-shot (PT): build exemplars, dev eval + eval submission (progress + RESUME, batched, OOM-safe)

In [9]:
# 09 - One-shot (PT): build exemplars, dev eval + eval submission (progress + RESUME, batched, OOM-safe)

from time import perf_counter
from tqdm.auto import tqdm

def progressive_predict_oneshot_batched(df: pd.DataFrame,
                                        oneshot_index: dict,
                                        global_pool: dict,
                                        cache_path: Path,
                                        desc: str,
                                        batch_gen: int = None,
                                        micro_bs: int = None) -> list:
    """One-shot predictions for every row of ``df``, resumable through a CSV cache.

    Args:
        df: rows to classify; must provide ``ID`` plus the columns the prompt needs.
        oneshot_index: per-MWE exemplars, as built by ``build_oneshot_index``.
        global_pool: fallback exemplars, as built by ``pick_global_oneshot_fallback``.
        cache_path: CSV of ``ID,Label``; cached IDs are skipped and every new
            prediction is appended as soon as it is produced.
        desc: label used for the progress bar and the log lines.
        batch_gen: rows per outer batch (default 8 on CUDA, 16 on CPU).
        micro_bs: prompts per forward pass (default 2 on CUDA, 8 on CPU).

    Returns:
        Predicted labels aligned with the row order of ``df``.
    """
    frame = df.copy()
    frame["ID"] = frame["ID"].astype(str)

    predictions = _load_cache(cache_path)
    cached_ids = set(predictions)
    pending = [pos for pos, row_id in enumerate(frame["ID"]) if row_id not in cached_ids]

    on_gpu = device.type == "cuda"
    if batch_gen is None:
        batch_gen = 8 if on_gpu else 16
    if micro_bs is None:
        micro_bs = 2 if on_gpu else 8

    print(f"{desc} | Resuming with {len(cached_ids)} cached / {len(frame)} total")
    started = perf_counter()
    for offset in tqdm(range(0, len(pending), batch_gen), desc=desc, leave=True):
        batch_rows = [frame.iloc[pos] for pos in pending[offset:offset + batch_gen]]
        if not batch_rows:
            continue
        prompts = [_one_shot_prompt_for_row(row, oneshot_index, global_pool) for row in batch_rows]
        row_ids = [row["ID"] for row in batch_rows]

        # Persist each prediction immediately so an interrupted run can resume.
        for row_id, label in zip(row_ids, _classify_micro(prompts, micro_bs)):
            predictions[row_id] = int(label)
            _append_one(cache_path, (row_id, int(label)))

        if on_gpu:
            torch.cuda.empty_cache()

    elapsed = perf_counter() - started
    print(f"{desc} | Newly computed: {len(pending)} | Cached at start: {len(cached_ids)} | Total: {len(frame)} | "
          f"Elapsed: {elapsed:.1f}s | {(elapsed/max(1,len(pending))):.3f}s/example (new only)")
    return [predictions[str(row_id)] for row_id in frame["ID"]]

# ---- Exemplars (one-shot PT) ----
# Per-MWE exemplars from the one-shot training split, plus a global fallback pair.
train_1s_pt, dev_1s_pt = load_train_dev(language="PT", oneshot=True)
oneshot_index = build_oneshot_index(train_1s_pt)
global_pool = pick_global_oneshot_fallback(train_1s_pt)

# ---- DEV (one-shot PT) ----
cache_dev_1s = OUT_DIR / "cache_llm_oneshot_dev_pt.csv"
yhat_dev_1s = progressive_predict_oneshot_batched(dev_1s_pt, oneshot_index, global_pool, cache_dev_1s, desc="One-shot PT (dev)")
ytrue_dev_1s = dev_1s_pt["Label"].tolist()
f1_1s = f1_score(ytrue_dev_1s, yhat_dev_1s, average="macro")
print(f"[LLM One-shot PT] Dev macro-F1: {f1_1s:.4f}")
print(classification_report(ytrue_dev_1s, yhat_dev_1s, digits=4))
print(confusion_matrix(ytrue_dev_1s, yhat_dev_1s))

# ---- EVAL (one-shot PT) ----
eval_pt = load_eval(language="PT")
cache_eval_1s = OUT_DIR / "cache_llm_oneshot_eval_pt.csv"
yhat_eval_1s = progressive_predict_oneshot_batched(eval_pt, oneshot_index, global_pool, cache_eval_1s, desc="One-shot PT (eval)")

# Submission columns: ID, Language, Setting, Label. These predictions come from the
# one-shot pipeline, so the Setting must be "one_shot" (it was previously written
# as "zero_shot", a copy-paste from the zero-shot cell).
sub_1s = pd.DataFrame({
    "ID": eval_pt["ID"].astype(str),
    "Language": eval_pt["Language"],
    "Setting": ["one_shot"] * len(eval_pt),
    "Label": yhat_eval_1s
})
sub_1s_path = OUT_DIR / "eval_submission_pt_llm_oneshot.csv"
sub_1s.to_csv(sub_1s_path, index=False)
print(f"Wrote {sub_1s_path}")


One-shot PT (dev) | Resuming with 0 cached / 273 total


One-shot PT (dev): 100%|██████████| 35/35 [00:34<00:00,  1.01it/s]


One-shot PT (dev) | Newly computed: 273 | Cached at start: 0 | Total: 273 | Elapsed: 34.6s | 0.127s/example (new only)
[LLM One-shot PT] Dev macro-F1: 0.4412
              precision    recall  f1-score   support

           0     0.5617    0.8571    0.6787       154
           1     0.4211    0.1345    0.2038       119

    accuracy                         0.5421       273
   macro avg     0.4914    0.4958    0.4412       273
weighted avg     0.5004    0.5421    0.4717       273

[[132  22]
 [103  16]]
One-shot PT (eval) | Resuming with 0 cached / 279 total


One-shot PT (eval): 100%|██████████| 35/35 [00:37<00:00,  1.08s/it]

One-shot PT (eval) | Newly computed: 279 | Cached at start: 0 | Total: 279 | Elapsed: 37.8s | 0.135s/example (new only)
Wrote outputs_pt_llm/eval_submission_pt_llm_oneshot.csv





## Save run metadata + show cache files

In [10]:
# 10 - Save run metadata + show cache files

# Record which model and device produced the files in OUT_DIR.
with open(OUT_DIR / "run_pt_llm.txt", "w") as f:
    f.writelines((
        f"MODEL_NAME={MODEL_NAME}\n",
        f"DEVICE={device.type}\n",
    ))

# List the resume caches so a reader knows what to delete to recompute a split.
print("Resume cache files (delete any to start fresh for that split):")
for p in map(OUT_DIR.joinpath, (
    "cache_llm_zeroshot_dev_pt.csv",
    "cache_llm_zeroshot_eval_pt.csv",
    "cache_llm_oneshot_dev_pt.csv",
    "cache_llm_oneshot_eval_pt.csv",
)):
    print(f"- {p}")


Resume cache files (delete any to start fresh for that split):
- outputs_pt_llm/cache_llm_zeroshot_dev_pt.csv
- outputs_pt_llm/cache_llm_zeroshot_eval_pt.csv
- outputs_pt_llm/cache_llm_oneshot_dev_pt.csv
- outputs_pt_llm/cache_llm_oneshot_eval_pt.csv
