## Setup: device, imports, seeds, paths, model

In [1]:
# 01 - Setup: device, imports, seeds, paths, model

import os
from pathlib import Path
import random
import re
import gc
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd

import torch
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score

# ---- Single toggle: "cpu" or "gpu"
RUN_DEVICE = "gpu"   # change to "cpu" to force CPU

# Disable parallelism inside the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# CUDA is used only when it is both requested and actually available;
# every other combination falls back to CPU.
if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    SELECT_DEVICE = "cuda"
else:
    SELECT_DEVICE = "cpu"
    # Hide GPUs from anything that initialises CUDA later in this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

device = torch.device(SELECT_DEVICE)

# Favour reproducibility over cuDNN autotuning speed.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

if device.type == "cpu":
    # os.cpu_count() returns None when the core count cannot be determined;
    # treat that as a single core instead of failing on `None // 2`.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))

# Seed every RNG used by this notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)

# ---- Single CSV & language setup
# lrl_idioms.csv is expected in the same folder as this notebook.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the kernel; start the kernel from the notebook's folder.
MULTI_CSV = Path("lrl_idioms.csv")

# Language code to evaluate; it is compared against the CSV's "Language"
# column, e.g. "BN", "ML", "PA".
LANGUAGE = "BN"

# Per-language output folder (predictions, resume cache, run metadata).
OUT_DIR = Path(f"outputs_{LANGUAGE.lower()}_llm")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Choose an instruct model that fits in RAM/VRAM.
# Alternative: MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Generation caps (classification reads next-token logits; generation is
# only a fallback, so a single greedy token is enough).
MAX_NEW_TOKENS = 1
DO_SAMPLE = False

# Echo the effective configuration so it is visible in the cell output.
print(f"Using device: {device.type.upper()} | Model: {MODEL_NAME}")
print(f"CSV path: {MULTI_CSV.resolve()}")
print(f"Selected language: {LANGUAGE}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: CUDA | Model: meta-llama/Llama-3.2-3B-Instruct
CSV path: /project/6029407/mhossai6/INTD461-Project/lrl_idioms.csv
Selected language: BN


## Data loading & utilities (single CSV + language filter)

In [2]:
# 02 - Data loading & utilities (single CSV + language filter)

def load_lrl_csv(path: Path) -> pd.DataFrame:
    """
    Read the idiom dataset from `path` into a DataFrame.

    The delimiter is auto-detected and every cell is read as a string.
    Expected columns (at least):
    ID, Language, MWE, Previous, Target, Next, Label

    Column names are stripped of surrounding whitespace. When a Label
    column is present it is converted to int; it is kept for metrics only
    and is never placed in a prompt.
    """
    frame = pd.read_csv(path, sep=None, engine="python", dtype=str)
    frame = frame.rename(columns=str.strip)
    if "Label" not in frame.columns:
        return frame
    frame["Label"] = frame["Label"].astype(int)
    return frame

def mark_first_case_insensitive(text: str, needle: str,
                                ltag="<mwe>", rtag="</mwe>") -> str:
    """
    Wrap the first case-insensitive occurrence of `needle` in `text` with tags.

    Args:
        text: Sentence to search. Non-string values (e.g. NaN) are returned
            unchanged.
        needle: Literal expression to locate; it is matched verbatim, never
            as a regular expression.
        ltag: Opening tag inserted before the match.
        rtag: Closing tag inserted after the match.

    Returns:
        `text` with the first match wrapped in `ltag`/`rtag`, or `text`
        unchanged when either argument is not a string, `needle` is empty,
        or there is no match.
    """
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    if not needle:
        # An empty needle "matches" at offset 0 and would only add an empty tag pair.
        return text

    # Search the original string directly. Looking the needle up in
    # text.lower() and reusing that offset on `text` is wrong whenever
    # lowercasing changes the string length (e.g. "İ" becomes two code points).
    hit = re.search(re.escape(needle), text, flags=re.IGNORECASE)
    if hit is None:
        return text
    start, end = hit.span()
    return f"{text[:start]}{ltag}{text[start:end]}{rtag}{text[end:]}"

def pack_context(prev: str, target: str, nxt: str, mwe: str) -> str:
    """
    Build the three-line context block for the LLM. Note: NO label is included.

    Args:
        prev: Sentence preceding the target; missing values become "".
        target: Sentence containing the expression; missing values become "".
        nxt: Sentence following the target; missing values become "".
        mwe: The multi-word expression; its first occurrence in `target`
            is wrapped in <mwe>...</mwe>.

    Returns:
        A string of the form "Previous: ...\\nTarget: ...\\nNext: ...".
    """
    prev = "" if pd.isna(prev) else prev
    nxt = "" if pd.isna(nxt) else nxt
    # Treat a missing target like the other two fields; otherwise the prompt
    # would contain the literal text "nan".
    target = "" if pd.isna(target) else target
    tgt = mark_first_case_insensitive(target, mwe)
    # Keep instructions in English; content can be BN/ML/PA/etc.
    return f"Previous: {prev}\nTarget: {tgt}\nNext: {nxt}"

# Load the full LRL dataset (all languages) and show a quick preview.
full_df = load_lrl_csv(MULTI_CSV)
print("Loaded lrl_idioms.csv (all languages):")
print(full_df.head())
print(f"Total examples (all languages): {len(full_df)}")

# Keep only rows whose Language value equals LANGUAGE exactly
# (the comparison is case- and whitespace-sensitive).
lang_df = full_df[full_df["Language"] == LANGUAGE].copy()
print(f"\nFiltered to language = {LANGUAGE}:")
print(lang_df.head())
print(f"Total examples for {LANGUAGE}: {len(lang_df)}")

# Fail early instead of running the model on an empty selection.
if len(lang_df) == 0:
    raise ValueError(f"No rows found for LANGUAGE={LANGUAGE}. Check your CSV.")


FileNotFoundError: [Errno 2] No such file or directory: 'lrl_idioms.csv'

## Load LLM (CPU/GPU) and tokenizer with HF login + VRAM-safe setup

In [None]:
# 03 - Load LLM (CPU/GPU) and tokenizer with HF login + VRAM-safe setup

from getpass import getpass
from huggingface_hub import login as hf_login

# ----- VRAM helpers (safe on CPU too)
def free_vram():
    """
    Drop every module-level model/tensor and release cached CUDA memory.

    Every global in this module that is an `nn.Module` or a `torch.Tensor`
    is removed from the namespace, then the garbage collector runs and,
    when CUDA is available, the CUDA caching allocator is emptied.
    Safe to call on CPU-only machines.
    """
    import torch.nn as nn

    namespace = globals()
    # Snapshot the items first: the namespace must not change size while it
    # is being iterated. The comprehension also keeps its loop variables
    # local, so no reference to a deleted object outlives this statement.
    doomed = [
        name
        for name, value in list(namespace.items())
        if isinstance(value, (nn.Module, torch.Tensor))
    ]
    for name in doomed:
        namespace.pop(name, None)

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    print("VRAM cleanup done.")

def cuda_mem():
    """Print allocated and reserved CUDA memory in GiB, or a notice when CUDA is absent."""
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return
    gib = 1024**3
    allocated_gib = torch.cuda.memory_allocated() / gib
    reserved_gib = torch.cuda.memory_reserved() / gib
    print(f"CUDA allocated: {allocated_gib:.2f} GiB | reserved: {reserved_gib:.2f} GiB")

# Prefer the expandable-segments allocator on CUDA to reduce fragmentation.
# setdefault keeps any value the user already exported.
# NOTE(review): presumably this must be set before the first CUDA allocation
# to take effect - confirm against the PyTorch memory-management notes.
if device.type == "cuda":
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# ----- Optional Hugging Face login (supports gated repos)
# The token is taken from the HF_TOKEN environment variable when set;
# otherwise the user is prompted and may press Enter to skip the login.
hf_token = os.getenv("HF_TOKEN", "").strip()
if not hf_token:
    try:
        hf_token = getpass("Enter your Hugging Face token (press Enter to skip): ").strip()
    except Exception:
        # Some environments don't like getpass; fall back to a visible prompt.
        hf_token = input("Enter your Hugging Face token (press Enter to skip): ").strip()
if hf_token:
    try:
        hf_login(token=hf_token)
    except Exception as e:
        # Best effort: a failed login is reported but does not stop the run.
        print(f"Warning: Hugging Face login failed: {e}")

# Extra keyword arguments forwarded to from_pretrained(); empty when no token was given.
_token_arg = {"token": hf_token} if hf_token else {}

# ----- Clean up any previous models before loading a new one
# Memory usage is printed before and after the cleanup so its effect is visible.
if device.type == "cuda":
    cuda_mem()
free_vram()
if device.type == "cuda":
    cuda_mem()

# ----- Tokenizer (fast implementation; token forwarded only when one was provided)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, use_fast=True, trust_remote_code=True, **_token_arg
)

# ----- Model: half precision with automatic placement on CUDA, float32 pinned to CPU otherwise
if device.type == "cuda":
    # bf16 when the GPU supports it, fp16 as the fallback
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# The conditional expressions only evaluate `dtype` on the CUDA path,
# which is the only path where it has been assigned.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype if device.type == "cuda" else torch.float32,
    device_map="auto" if device.type == "cuda" else {"": "cpu"},
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    **_token_arg
)

# Inference only: switch off dropout and other training-time behaviour.
model.eval()

# ----- Ensure pad token and attention_mask behavior are well-defined
def _ensure_pad():
    """
    Make sure the tokenizer has a pad token and the model config agrees with it.

    A missing pad token is aliased to the EOS token when one exists; otherwise
    a new "[PAD]" token is added and the embedding matrix is resized to match
    the enlarged vocabulary.
    """
    pad_missing = tokenizer.pad_token_id is None
    if pad_missing and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    elif pad_missing:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id


_ensure_pad()

# Report memory once the model is resident.
if device.type == "cuda":
    cuda_mem()


## Minimal zero-shot prompt

In [None]:
# 04 - Minimal prompts (zero-shot only)

# Task instruction shared by every zero-shot prompt.
INSTR_ZS = (
    "Decide if the MWE is used idiomatically (figurative) in the Target sentence.\n"
    "Output only one digit: 1 (idiomatic) or 0 (literal)."
)


def make_zero_shot_prompt(mwe: str, ctx_block: str) -> str:
    """
    Assemble the zero-shot prompt: instruction, the MWE, its context block,
    and a trailing "Answer:" cue, each on its own line.
    """
    prompt_lines = [INSTR_ZS, f"MWE: {mwe}", f"{ctx_block}", "Answer:"]
    return "\n".join(prompt_lines)

def _zero_shot_prompt_for_row(r) -> str:
    """
    Turn one dataset row into its zero-shot prompt.

    Only MWE, Previous, Target and Next are read (the context fields default
    to "" when absent). The Label column is deliberately never touched here.
    """
    expression = r["MWE"]
    previous, target, following = (
        r.get(column, "") for column in ("Previous", "Target", "Next")
    )
    context_block = pack_context(previous, target, following, expression)
    return make_zero_shot_prompt(expression, context_block)


## Batched 0/1 classification via next-token logits

In [None]:
# 05 - Batched 0/1 classification via next-token logits

# Nominal number of prompts per batch; adjust for memory/speed.
# NOTE(review): no visible code reads this constant -
# progressive_predict_zeroshot_batched picks its own `batch_gen` default.
BATCH_GEN = 8  # adjust for memory/speed

def _apply_chat_or_plain_batch(texts: List[str]) -> dict:
    """
    Convert a batch of strings to model inputs (input_ids, attention_mask).

    When the tokenizer defines a chat template, each text becomes the user
    turn of a two-message conversation (fixed system message + user text)
    with the generation prompt appended; otherwise the texts are tokenized
    as plain strings. The batch is padded to a common length.

    Args:
        texts: Prompt strings, one per example.

    Returns:
        A dict with "input_ids" and "attention_mask" tensors moved to `device`.
    """
    # Every recent tokenizer has an `apply_chat_template` method, so testing
    # for the method alone never selects the plain path. What matters is
    # whether a template is actually configured.
    has_template = hasattr(tokenizer, "apply_chat_template") and bool(
        getattr(tokenizer, "chat_template", None)
    )
    if has_template:
        messages_batch = [[
            {"role": "system", "content": "You are a concise assistant. Respond with 0 or 1."},
            {"role": "user", "content": t}
        ] for t in texts]
        enc = tokenizer.apply_chat_template(
            messages_batch,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,   # also return the tokenizer's own attention mask
            padding=True,
            truncation=True
        )
    else:
        enc = tokenizer(
            texts, return_tensors="pt", padding=True, truncation=True
        )

    input_ids = enc["input_ids"]
    if "attention_mask" in enc:
        # Use the mask produced by the padding step. Deriving it from
        # `input_ids != pad_token_id` is wrong when the pad token is an alias
        # of EOS: genuine end-of-turn tokens in the prompt would be masked.
        attention_mask = enc["attention_mask"].long()
    else:
        # Last resort for tokenizers that do not return a mask.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()

    on_cuda = device.type == "cuda"
    return {
        "input_ids": input_ids.to(device, non_blocking=on_cuda),
        "attention_mask": attention_mask.to(device, non_blocking=on_cuda)
    }

# Cache token ids for digits "0" and "1".
# Both start as None and are filled in by _init_digit_ids(); a value that is
# still None afterwards means no single-token encoding was found, in which
# case classification falls back to generating one token.
_id0 = None
_id1 = None

def _candidate_token_id_for_digit(d: str) -> Optional[int]:
    """
    Find a single token id that stands for the digit `d`.

    Three surface forms are tried in order: the bare digit, the digit with a
    leading space, and the digit followed by a newline. The first form that
    encodes to exactly one token wins; None is returned when none does.
    """
    for surface in (d, " " + d, d + "\n"):
        token_ids = tokenizer.encode(surface, add_special_tokens=False)
        if len(token_ids) == 1:
            return token_ids[0]
    return None

def _init_digit_ids():
    """Fill the module-level digit-id cache, leaving already-resolved ids untouched."""
    global _id0, _id1
    _id0 = _candidate_token_id_for_digit("0") if _id0 is None else _id0
    _id1 = _candidate_token_id_for_digit("1") if _id1 is None else _id1


_init_digit_ids()

@torch.no_grad()
def classify_prompts_logits(prompts: List[str]) -> List[int]:
    """
    Given a batch of prompts, classify with 0/1 using next-token logits.

    The model runs once over the padded batch. For every prompt the logits
    at its last attended (non-padding) position are read, and the scores of
    the "0" and "1" tokens are compared; ties go to 1.

    If no single-token id could be resolved for either digit, one token is
    generated per prompt instead and the first "0"/"1" found in the decoded
    text is used (defaulting to 1 when neither appears).

    Args:
        prompts: Fully formatted prompt strings.

    Returns:
        One label (0 or 1) per prompt, in input order.
    """
    enc = _apply_chat_or_plain_batch(prompts)

    if (_id0 is not None) and (_id1 is not None):
        logits = model(**enc).logits          # [B, T, V]

        # Position of the last attended token in each row. Reading index -1
        # is only correct for left-padded batches; with right padding the
        # shorter rows end in padding. Flipping the mask and taking the first
        # 1 handles both padding sides.
        mask = enc["attention_mask"]
        last_idx = (mask.size(1) - 1) - mask.flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(logits.size(0), device=logits.device)
        next_logits = logits[rows, last_idx.to(logits.device)]   # [B, V]

        logit0 = next_logits[:, _id0]
        logit1 = next_logits[:, _id1]
        return (logit1 >= logit0).long().cpu().tolist()

    # Fallback: force-generate 1 token and parse.
    # NOTE(review): generation with a decoder-only model generally expects
    # left padding; confirm the tokenizer's padding side if this path is used.
    gen = model.generate(
        **enc,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    # All rows share the padded prompt length, so new tokens start at the same offset.
    cut = enc["input_ids"].shape[-1]
    outs = []
    for i in range(gen.size(0)):
        new_ids = gen[i][cut:]
        text = tokenizer.decode(new_ids, skip_special_tokens=True)
        if "0" in text and "1" in text:
            outs.append(1 if text.index("1") < text.index("0") else 0)
        elif "1" in text:
            outs.append(1)
        elif "0" in text:
            outs.append(0)
        else:
            outs.append(1)  # default to 1 if ambiguous
    return outs


## Progressive prediction (zero-shot, with caching)

In [None]:
# 06 - Progressive prediction (zero-shot, with caching)

from time import perf_counter
from tqdm.auto import tqdm

# Optional: on CUDA, force scaled-dot-product attention onto the plain "math"
# backend by disabling the flash and memory-efficient kernels.
# NOTE(review): the original intent was to reduce peak memory, but the math
# backend is generally the slowest and most memory-hungry of the three -
# confirm this toggle is still wanted.
try:
    if device.type == "cuda":
        torch.backends.cuda.enable_flash_sdp(False)
        torch.backends.cuda.enable_mem_efficient_sdp(False)
        torch.backends.cuda.enable_math_sdp(True)
except Exception:
    # Best effort: any failure in these toggles is ignored.
    pass

def _load_cache(cache_path: Path) -> dict:
    """
    Load cached predictions if they exist.
    """
    if cache_path.exists():
        df = pd.read_csv(cache_path, dtype={"ID": str, "Label": int})
        return dict(zip(df["ID"].astype(str), df["Label"].astype(int)))
    return {}

def _append_one(cache_path: Path, rec: tuple):
    _id, _lab = rec
    header_needed = not cache_path.exists()
    with open(cache_path, "a", encoding="utf-8") as f:
        if header_needed:
            f.write("ID,Label\n")
        f.write(f"{_id},{int(_lab)}\n")

def _classify_micro(prompts: list, micro_bs: int) -> list:
    """
    Classify `prompts` a few at a time and return the labels in input order.

    Each micro-batch holds at most `micro_bs` prompts (at least one). When a
    micro-batch of more than one prompt fails with an out-of-memory
    RuntimeError, the batch size is halved and the same prompts are retried;
    any other error, or an OOM on a single prompt, is re-raised.
    """
    results = []
    total = len(prompts)
    on_cuda = device.type == "cuda"
    batch_size = max(1, micro_bs)
    cursor = 0
    while cursor < total:
        step = min(batch_size, total - cursor)
        try:
            with torch.inference_mode():
                results.extend(classify_prompts_logits(prompts[cursor:cursor + step]))
            cursor += step
            if on_cuda:
                torch.cuda.empty_cache()
        except RuntimeError as err:
            recoverable = step > 1 and "out of memory" in str(err).lower()
            if not recoverable:
                raise
            if on_cuda:
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()
            # Retry the same slice with half as many prompts.
            batch_size = max(1, step // 2)
    return results

def progressive_predict_zeroshot_batched(df: pd.DataFrame,
                                         cache_path: Path,
                                         desc: str,
                                         batch_gen: int = None,
                                         micro_bs: int = None) -> list:
    """
    Predict a 0/1 label for every row of `df`, resuming from `cache_path`.

    Rows whose ID is already in the cache are skipped; every new prediction
    is appended to the cache as soon as it is available, so an interrupted
    run can pick up where it stopped. The caller's dataframe is not modified.

    Args:
        df: Rows with columns ID, MWE, Previous, Target, Next.
        cache_path: CSV file holding previously computed (ID, label) pairs.
        desc: Label used for the progress bar and the log lines.
        batch_gen: Prompts built per outer step (default 8 on CUDA, 16 on CPU).
        micro_bs: Prompts per forward pass (default 2 on CUDA, 8 on CPU).

    Returns:
        Predicted labels, one per row, in the order of `df`.
    """
    frame = df.copy()
    frame["ID"] = frame["ID"].astype(str)

    preds_map = _load_cache(cache_path)
    cached_ids = set(preds_map)
    pending = [pos for pos, row_id in enumerate(frame["ID"]) if row_id not in cached_ids]

    # defaults tuned for T4; CPU can go larger
    on_cuda = device.type == "cuda"
    if batch_gen is None:
        batch_gen = 8 if on_cuda else 16
    if micro_bs is None:
        micro_bs = 2 if on_cuda else 8

    n_cached = len(cached_ids)
    n_total = len(frame)
    n_new = len(pending)

    print(f"{desc} | Resuming with {n_cached} cached / {n_total} total")
    started = perf_counter()
    for offset in tqdm(range(0, n_new, batch_gen), desc=desc, leave=True):
        batch_rows = [frame.iloc[pos] for pos in pending[offset:offset + batch_gen]]
        if not batch_rows:
            continue
        prompts = [_zero_shot_prompt_for_row(row) for row in batch_rows]
        ids = [row["ID"] for row in batch_rows]

        # Persist each prediction immediately so a crash loses at most one batch.
        for row_id, label in zip(ids, _classify_micro(prompts, micro_bs)):
            preds_map[row_id] = int(label)
            _append_one(cache_path, (row_id, int(label)))

        if on_cuda:
            torch.cuda.empty_cache()

    elapsed = perf_counter() - started
    print(
        f"{desc} | Newly computed: {n_new} | Cached at start: {n_cached} | "
        f"Total: {n_total} | Elapsed: {elapsed:.1f}s | "
        f"{(elapsed / max(1, n_new)):.3f}s/example (new only)"
    )
    return [preds_map[str(row_id)] for row_id in frame["ID"]]


## Run zero-shot classification for chosen language, compute metrics, save outputs

In [None]:
# 07 - Run zero-shot classification for chosen language, compute metrics, save outputs

# Resume cache for the selected language (one file per language).
cache_lang_0s = OUT_DIR / f"cache_llm_zeroshot_{LANGUAGE.lower()}.csv"

# Predict a label for every row of the selected language; rows already in
# the cache are not recomputed.
yhat_lang_0s = progressive_predict_zeroshot_batched(
    lang_df,
    cache_lang_0s,
    desc=f"Zero-shot {LANGUAGE} (full dataset)"
)

# Ground-truth labels (used ONLY here, AFTER prediction; never sent to model)
if "Label" not in lang_df.columns:
    raise ValueError("No 'Label' column found in lrl_idioms.csv; cannot compute metrics.")

# Same row order as the predictions above, so the two lists line up.
ytrue_lang = lang_df["Label"].astype(int).tolist()

# Headline metrics.
f1_macro = f1_score(ytrue_lang, yhat_lang_0s, average="macro")
acc = accuracy_score(ytrue_lang, yhat_lang_0s)

print(f"[LLM Zero-shot {LANGUAGE}] Macro-F1: {f1_macro:.4f}")
print(f"[LLM Zero-shot {LANGUAGE}] Accuracy: {acc:.4f}")
print("\nClassification report:")
print(classification_report(ytrue_lang, yhat_lang_0s, digits=4))
print("Confusion matrix:")
print(confusion_matrix(ytrue_lang, yhat_lang_0s))

# Save predictions together with original data (adds a PredLabel column).
lang_out = lang_df.copy()
lang_out["PredLabel"] = yhat_lang_0s
pred_path = OUT_DIR / f"{LANGUAGE.lower()}_predictions_llm_zeroshot.csv"
lang_out.to_csv(pred_path, index=False, encoding="utf-8")
print(f"\nSaved predictions to: {pred_path}")


## Save run metadata & show cache file

In [None]:
# 08 - Save run metadata & show cache file

# Record the settings of this run next to its outputs, as KEY=VALUE lines.
with open(OUT_DIR / f"run_{LANGUAGE.lower()}_llm.txt", "w", encoding="utf-8") as f:
    f.write(f"MODEL_NAME={MODEL_NAME}\n")
    f.write(f"DEVICE={device.type}\n")
    f.write(f"DATA={MULTI_CSV}\n")
    f.write(f"LANGUAGE={LANGUAGE}\n")

# Point the user at the resume cache; this is the same path the prediction
# step uses for the selected language.
print("Resume cache file (delete it to start fresh for this language):")
print(f"- {OUT_DIR / f'cache_llm_zeroshot_{LANGUAGE.lower()}.csv'}")
