## Setup: CPU/GPU, imports, seeds, paths, model choice

In [1]:
# 01 - Setup: single switch for CPU/GPU, imports, seeds, paths, model choice

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random
import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import torch

RUN_DEVICE = "gpu"  # "gpu" to use CUDA if available, otherwise CPU

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    torch.set_num_threads(max(1, os.cpu_count() // 2))
    print("Using CPU")

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

from sklearn.metrics import (
    f1_score,
    recall_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- Single CSV with all languages; same folder as this notebook
MULTI_CSV = Path("lrl_idioms.csv")

# Output directory (shared, with per-language files)
OUT_DIR = Path("outputs_lrl_llama")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# LLaMA model
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Batch size for batched logits classification
BATCH_GEN_ZERO = 8 if device.type == "cuda" else 4

print(f"CSV path: {MULTI_CSV.resolve()}")
print(f"Model: {MODEL_NAME}")
print(f"Device: {device.type}")


Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb


  from .autonotebook import tqdm as notebook_tqdm


CSV path: /project/6029407/mhossai6/INTD461-Project/lrl_idioms.csv
Model: meta-llama/Llama-3.2-3B-Instruct
Device: cuda


## Data loading & text utilities

In [2]:
# 02 - Data loading & text utilities

def load_lrl_csv(path: Path) -> pd.DataFrame:
    """
    Assumes CSV with columns at least:
    ID, Language, MWE, Previous, Target, Next, Label

    We'll keep Label only for metrics, not for prompts.
    """
    df = pd.read_csv(path, sep=None, engine="python", dtype=str)
    df.columns = [c.strip() for c in df.columns]
    if "Label" in df.columns:
        df["Label"] = df["Label"].astype(int)
    return df

def mark_first_case_insensitive(text: str, needle: str,
                                ltag="<mwe>", rtag="</mwe>") -> str:
    if not isinstance(text, str) or not isinstance(needle, str):
        return text
    lt = text.lower()
    ln = needle.lower()
    i = lt.find(ln)
    if i == -1:
        return text
    return text[:i] + ltag + text[i:i+len(needle)] + rtag + text[i+len(needle):]

def pack_context(prev: str, target: str, nxt: str, mwe: str) -> str:
    """
    Build the context block for the LLM. Note: NO label is included.
    """
    prev = "" if pd.isna(prev) else prev
    nxt = "" if pd.isna(nxt) else nxt
    tgt = mark_first_case_insensitive(target, mwe)
    # Keep instructions in English; content can be BN/ML/PA/etc.
    return f"Previous: {prev}\nTarget: {tgt}\nNext: {nxt}"

# Load full LRL dataset
full_df = load_lrl_csv(MULTI_CSV)
print("Loaded lrl_idioms.csv (all languages):")
print(full_df.head())
print(f"Total examples (all languages): {len(full_df)}")

# Get unique languages present
languages = sorted(full_df["Language"].dropna().unique().tolist())
print("\nLanguages found in dataset:", languages)

if len(languages) == 0:
    raise ValueError("No languages found in lrl_idioms.csv (Language column empty?).")


Loaded lrl_idioms.csv (all languages):
     ID Language                          MWE  \
0  bn_1       BN                   বৃষ্টির জল   
1  bn_2       BN  সংসার সুখের হয় রমণীর গুণে ।   
2  bn_3       BN                 যাচ্ছা তাই ।   
3  bn_4       BN         উলুবনে মুক্ত ছড়ানো ।   
4  bn_5       BN                     নিজের ঘর   

                                            Previous  \
0                 আকাশে অনেকক্ষণ ধরে কালো মেঘ জমছিল।   
1  ছোট বাসা, অল্প রোজগার—সব মিলিয়ে খুব অভাবের জীব...   
2  পাড়ার ক্লাবের অনুষ্ঠানের দায়িত্ব যাঁর হাতে দেও...   
3  প্রফেসর গভীর মনোযোগ দিয়ে কঠিন তত্ত্বগুলো বুঝিয়...   
4   সারাদিন বাইরে থাকায় সে ভীষণ ক্লান্ত হয়ে গিয়েছিল।   

                                              Target  \
0           হঠাৎ টুপটাপ করে বৃষ্টির জল পড়া শুরু হলো।   
1  তবু হাসিমুখে সব সামলে রাখত তারা; লোকজন বলত, সং...   
2  ফলে মঞ্চ, সাউন্ড, আলো—সব মিলিয়ে শেষ দিন একেবার...   
3  কিন্তু ছাত্ররা মোবাইলে ব্যস্ত, কেউ শুনছেই না—এ...   
4    বাড়ি ফিরে সে সোজা নিজের ঘরে গিয়ে দরজা বন্ধ ক

## Load LLaMA (CPU/GPU) and tokenizer with VRAM-safe setup

In [3]:
# 03 - Load LLM (CPU or GPU) and tokenizer with VRAM-safe setup

from getpass import getpass
from huggingface_hub import login as hf_login
import gc

# ----- VRAM helpers (safe on CPU too)
def free_vram():
    """
    Basic cleanup: run garbage collection and clear CUDA cache.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    print("VRAM cleanup done.")

def cuda_mem():
    if torch.cuda.is_available():
        a = torch.cuda.memory_allocated() / (1024**3)
        r = torch.cuda.memory_reserved() / (1024**3)
        print(f"CUDA allocated: {a:.2f} GiB | reserved: {r:.2f} GiB")
    else:
        print("CUDA not available.")

# Prefer segmented allocator on CUDA to reduce fragmentation
if device.type == "cuda":
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# ----- Optional Hugging Face login (supports gated repos)
hf_token = os.getenv("HF_TOKEN", "").strip()
if not hf_token:
    try:
        hf_token = getpass("Enter your Hugging Face token (or press Enter to skip): ").strip()
    except Exception:
        hf_token = input("Enter your Hugging Face token (or press Enter to skip): ").strip()
if hf_token:
    try:
        hf_login(token=hf_token)
    except Exception as e:
        print(f"Warning: Hugging Face login failed: {e}")

_token_arg = {"token": hf_token} if hf_token else {}

# ----- Clean up any previous models before loading a new one
if device.type == "cuda":
    cuda_mem()
free_vram()
if device.type == "cuda":
    cuda_mem()

# ----- Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    trust_remote_code=True,
    **_token_arg
)

# ----- Choose dtype & placement to avoid OOM
if device.type == "cuda":
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=dtype,
        device_map="auto",      # let HF shard / offload as needed
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **_token_arg
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        device_map={"": "cpu"},  # all on CPU
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **_token_arg
    )

# ----- Ensure pad token and attention_mask behavior are well-defined
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

model.eval()

if device.type == "cuda":
    cuda_mem()


CUDA allocated: 0.00 GiB | reserved: 0.00 GiB
VRAM cleanup done.
CUDA allocated: 0.00 GiB | reserved: 0.00 GiB


`torch_dtype` is deprecated! Use `dtype` instead!
2025-12-03 19:11:31.233570: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-03 19:11:31.247201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764817891.257618 2477136 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764817891.260255 2477136 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764817891.269359 2477136 computation_placer.cc:177] computation placer already r

CUDA allocated: 5.98 GiB | reserved: 6.00 GiB





## Zero-shot prompt builder

In [4]:
# 04 - Minimal zero-shot prompt builder

INSTR_ZS = (
    "Decide if the MWE is used idiomatically (figurative) in the Target sentence.\n"
    "Output only one digit: 1 (idiomatic) or 0 (literal)."
)

def make_zero_shot_prompt(mwe: str, ctx_block: str) -> str:
    return (
        f"{INSTR_ZS}\n"
        f"MWE: {mwe}\n"
        f"{ctx_block}\n"
        f"Answer:"
    )


## Batched next-token logits classification (0/1 only)

In [5]:
# 05 - Batched next-token logits classification (0/1 only), CPU/GPU

def _apply_chat_or_plain_batch(texts: List[str]) -> Dict[str, torch.Tensor]:
    """
    Turn raw prompts into input_ids + attention_mask.
    """
    if hasattr(tokenizer, "apply_chat_template"):
        messages_batch = [[
            {"role": "system", "content": "You are a helpful, concise assistant."},
            {"role": "user", "content": t}
        ] for t in texts]
        input_ids = tokenizer.apply_chat_template(
            messages_batch,
            add_generation_prompt=True,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
    else:
        input_ids = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).input_ids
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    return {"input_ids": input_ids, "attention_mask": attention_mask}

_id0: Optional[int] = None
_id1: Optional[int] = None

def _candidate_token_id_for_digit(d: str) -> Optional[int]:
    ids = tokenizer.encode(d, add_special_tokens=False)
    if len(ids) == 1:
        return ids[0]
    ids = tokenizer.encode(" " + d, add_special_tokens=False)
    if len(ids) == 1:
        return ids[0]
    ids = tokenizer.encode(d + "\n", add_special_tokens=False)
    if len(ids) == 1:
        return ids[0]
    return None

def _init_digit_ids():
    global _id0, _id1
    if _id0 is None:
        _id0 = _candidate_token_id_for_digit("0")
    if _id1 is None:
        _id1 = _candidate_token_id_for_digit("1")

_init_digit_ids()

def classify_prompts_logits(prompts: List[str]) -> List[int]:
    enc = _apply_chat_or_plain_batch(prompts)

    # Move inputs to the same device as the model
    model_device = next(model.parameters()).device
    enc = {k: v.to(model_device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits  # [B, T, V]
        next_logits = logits[:, -1, :]
        if _id0 is not None and _id1 is not None:
            logit0 = next_logits[:, _id0]
            logit1 = next_logits[:, _id1]
            return (logit1 >= logit0).long().detach().cpu().tolist()

    # Fallback: generate one token and parse
    outs: List[int] = []
    with torch.no_grad():
        gen = model.generate(
            **enc,
            max_new_tokens=1,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    for i in range(gen.size(0)):
        cut = enc["input_ids"][i].shape[-1]
        new_ids = gen[i][cut:]
        text = tokenizer.decode(new_ids, skip_special_tokens=True)
        if "0" in text and "1" in text:
            outs.append(1 if text.index("1") < text.index("0") else 0)
        elif "1" in text:
            outs.append(1)
        elif "0" in text:
            outs.append(0)
        else:
            outs.append(1)
    return outs


## Caching helpers

In [6]:
# 06 - Caching helpers

from time import perf_counter
from tqdm.auto import tqdm

def _load_cache(cache_path: Path) -> Dict[str, int]:
    """
    Load a simple ID -> Label cache from CSV, if it exists.
    """
    if cache_path.exists():
        df = pd.read_csv(cache_path, dtype={"ID": str, "Label": int})
        return dict(zip(df["ID"].astype(str), df["Label"].astype(int)))
    return {}

def _append_one(cache_path: Path, rec: Tuple[str, int]):
    """
    Append a single (ID, Label) record to the cache CSV (creating header if needed).
    """
    _id, _lab = rec
    header_needed = not cache_path.exists()
    with open(cache_path, "a", encoding="utf-8") as f:
        if header_needed:
            f.write("ID,Label\n")
        f.write(f"{_id},{int(_lab)}\n")


## Zero-shot prediction for a single language (batched, resumable)

In [7]:
# 07 - Zero-shot prediction for a single language (batched, resumable)

def progressive_predict_zero_shot_batched_for_language(
    df_lang: pd.DataFrame,
    language: str,
    cache_path: Path,
    desc: str = "Zero-shot",
) -> List[int]:
    df = df_lang.copy()
    df["ID"] = df["ID"].astype(str)

    preds_map = _load_cache(cache_path)
    done = set(preds_map.keys())
    todo_idx = [i for i, _id in enumerate(df["ID"]) if _id not in done]

    print(f"{desc} [{language}] | Resuming with {len(done)} cached / {len(df)} total")
    t0 = perf_counter()

    for start in tqdm(range(0, len(todo_idx), BATCH_GEN_ZERO), desc=f"{desc} [{language}]", leave=True):
        batch_rows = todo_idx[start:start + BATCH_GEN_ZERO]
        prompts: List[str] = []
        ids: List[str] = []

        for j in batch_rows:
            r = df.iloc[j]
            _id = r["ID"]
            mwe = r["MWE"]
            ctx = pack_context(
                r.get("Previous", ""),
                r.get("Target", ""),
                r.get("Next", ""),
                mwe,
            )
            prompts.append(make_zero_shot_prompt(mwe, ctx))
            ids.append(_id)

        if not prompts:
            continue

        labels = classify_prompts_logits(prompts)
        for _id, lab in zip(ids, labels):
            preds_map[_id] = int(lab)
            _append_one(cache_path, (_id, int(lab)))

        if device.type == "cuda":
            torch.cuda.empty_cache()

    elapsed = perf_counter() - t0
    print(
        f"{desc} [{language}] | Newly computed: {len(todo_idx)} | Cached at start: {len(done)} | "
        f"Total: {len(df)} | Elapsed: {elapsed:.1f}s | "
        f"{(elapsed / max(1, len(todo_idx))):.3f}s/example (new only)"
    )

    yhat = [preds_map[str(i)] for i in df["ID"]]
    return yhat


## Run zero-shot evaluation for all languages and collect results

In [8]:
# 08 - Run zero-shot evaluation for ALL languages and collect results

all_results = []  # will store per-language metric summaries

for lang in languages:
    print("\n" + "="*60)
    print(f"Processing language: {lang}")
    print("="*60)

    df_lang = full_df[full_df["Language"] == lang].copy()
    if df_lang.empty:
        print(f"No rows for language {lang}, skipping.")
        continue

    if "Label" not in df_lang.columns:
        print(f"No 'Label' column for language {lang}, skipping (no metrics possible).")
        continue

    cache_zero = OUT_DIR / f"cache_llama_zeroshot_{lang}.csv"

    yhat = progressive_predict_zero_shot_batched_for_language(
        df_lang,
        language=lang,
        cache_path=cache_zero,
        desc="Zero-shot LLaMA"
    )

    ytrue = df_lang["Label"].astype(int).tolist()

    f1_macro = f1_score(ytrue, yhat, average="macro")
    recall_macro = recall_score(ytrue, yhat, average="macro")
    acc = accuracy_score(ytrue, yhat)

    print(f"\n[LLAMA Zero-shot {lang}] Macro-F1: {f1_macro:.4f}")
    print(f"[LLAMA Zero-shot {lang}] Macro-recall: {recall_macro:.4f}")
    print(f"[LLAMA Zero-shot {lang}] Accuracy: {acc:.4f}")

    print("\nClassification report:")
    print(classification_report(ytrue, yhat, digits=4))
    print("Confusion matrix:")
    print(confusion_matrix(ytrue, yhat))

    # Save per-language predictions
    out_df = df_lang.copy()
    out_df["PredLabel_ZeroShot"] = yhat
    pred_path = OUT_DIR / f"{lang}_llama_zeroshot_predictions.csv"
    out_df.to_csv(pred_path, index=False, encoding="utf-8")
    print(f"\nSaved predictions for {lang} to: {pred_path}")

    all_results.append({
        "Language": lang,
        "NumExamples": len(df_lang),
        "MacroF1": f1_macro,
        "MacroRecall": recall_macro,
        "Accuracy": acc,
    })

print("\nFinished all languages.")



Processing language: BN
Zero-shot LLaMA [BN] | Resuming with 0 cached / 168 total


Zero-shot LLaMA [BN]: 100%|██████████| 21/21 [00:05<00:00,  3.55it/s]


Zero-shot LLaMA [BN] | Newly computed: 168 | Cached at start: 0 | Total: 168 | Elapsed: 5.9s | 0.035s/example (new only)

[LLAMA Zero-shot BN] Macro-F1: 0.3940
[LLAMA Zero-shot BN] Macro-recall: 0.6024
[LLAMA Zero-shot BN] Accuracy: 0.3988

Classification report:
              precision    recall  f1-score   support

           0     0.2887    1.0000    0.4481        41
           1     1.0000    0.2047    0.3399       127

    accuracy                         0.3988       168
   macro avg     0.6444    0.6024    0.3940       168
weighted avg     0.8264    0.3988    0.3663       168

Confusion matrix:
[[ 41   0]
 [101  26]]

Saved predictions for BN to: outputs_lrl_llama/BN_llama_zeroshot_predictions.csv

Processing language: ML
Zero-shot LLaMA [ML] | Resuming with 0 cached / 50 total


Zero-shot LLaMA [ML]: 100%|██████████| 7/7 [00:01<00:00,  5.06it/s]


Zero-shot LLaMA [ML] | Newly computed: 50 | Cached at start: 0 | Total: 50 | Elapsed: 1.4s | 0.028s/example (new only)

[LLAMA Zero-shot ML] Macro-F1: 0.3432
[LLAMA Zero-shot ML] Macro-recall: 0.5048
[LLAMA Zero-shot ML] Accuracy: 0.3600

Classification report:
              precision    recall  f1-score   support

           0     0.3023    0.8667    0.4483        15
           1     0.7143    0.1429    0.2381        35

    accuracy                         0.3600        50
   macro avg     0.5083    0.5048    0.3432        50
weighted avg     0.5907    0.3600    0.3011        50

Confusion matrix:
[[13  2]
 [30  5]]

Saved predictions for ML to: outputs_lrl_llama/ML_llama_zeroshot_predictions.csv

Processing language: PA
Zero-shot LLaMA [PA] | Resuming with 0 cached / 50 total


Zero-shot LLaMA [PA]: 100%|██████████| 7/7 [00:01<00:00,  6.21it/s]

Zero-shot LLaMA [PA] | Newly computed: 50 | Cached at start: 0 | Total: 50 | Elapsed: 1.1s | 0.023s/example (new only)

[LLAMA Zero-shot PA] Macro-F1: 0.3064
[LLAMA Zero-shot PA] Macro-recall: 0.3718
[LLAMA Zero-shot PA] Accuracy: 0.3400

Classification report:
              precision    recall  f1-score   support

           0     0.3590    0.6364    0.4590        22
           1     0.2727    0.1071    0.1538        28

    accuracy                         0.3400        50
   macro avg     0.3159    0.3718    0.3064        50
weighted avg     0.3107    0.3400    0.2881        50

Confusion matrix:
[[14  8]
 [25  3]]

Saved predictions for PA to: outputs_lrl_llama/PA_llama_zeroshot_predictions.csv

Finished all languages.





## Show summary table and save metadata

In [None]:
# 09 - Show summary table and save metadata

if all_results:
    summary_df = pd.DataFrame(all_results)
    print("\n===== Summary over all languages =====")
    display(summary_df)

    summary_path = OUT_DIR / "llama_zeroshot_summary_all_languages.csv"
    summary_df.to_csv(summary_path, index=False, encoding="utf-8")
    print(f"\nSaved summary CSV to: {summary_path}")
else:
    print("No results collected (check dataset / labels).")

# Save simple run metadata
with open(OUT_DIR / "run_lrl_llama_zeroshot.txt", "w", encoding="utf-8") as f:
    f.write(f"MODEL_NAME={MODEL_NAME}\n")
    f.write(f"DEVICE={device.type}\n")
    f.write(f"BATCH_GEN_ZERO={BATCH_GEN_ZERO}\n")
    f.write(f"DATA={MULTI_CSV}\n")
    if all_results:
        for row in all_results:
            f.write(
                f"LANG={row['Language']},N={row['NumExamples']},"
                f"MacroF1={row['MacroF1']:.4f},"
                f"MacroRecall={row['MacroRecall']:.4f},"
                f"Acc={row['Accuracy']:.4f}\n"
            )

print("\nResume cache files (delete to start fresh per language):")
for lang in languages:
    cache_path = OUT_DIR / f"cache_llama_zeroshot_{lang}.csv"
    if cache_path.exists():
        print(f"- {cache_path}")



===== Summary over all languages =====


Unnamed: 0,Language,NumExamples,MacroF1,MacroRecall,Accuracy
0,BN,168,0.393978,0.602362,0.39881
1,ML,50,0.343186,0.504762,0.36
2,PA,50,0.306431,0.371753,0.34



Saved summary CSV to: outputs_lrl_llama/llama_zeroshot_summary_all_languages.csv

Resume cache files (delete to start fresh per language):
- outputs_lrl_llama/cache_llama_zeroshot_BN.csv
- outputs_lrl_llama/cache_llama_zeroshot_ML.csv
- outputs_lrl_llama/cache_llama_zeroshot_PA.csv


: 