In [8]:
# =========================
# 6) Choose EVAL file, load & run eval (auto-detect fields + progress bars)
# =========================
from tqdm.auto import tqdm
import pandas as pd
import time

def prompt_for_jsonl(default="/content/drive/MyDrive/Dissertation/datasets/test.jsonl"):
    """Interactively ask for the path of the test JSONL file.

    An empty answer selects *default*. Answering ``upload`` (any letter case)
    opens the Google Colab upload widget and uses the first key of the
    mapping it returns as the path.

    Args:
        default: Path used when the user just presses Enter.

    Returns:
        The chosen path, after it has been checked with ``os.path.exists``.

    Raises:
        RuntimeError: ``upload`` was requested but ``google.colab`` cannot be
            imported, or the upload widget returned nothing.
        FileNotFoundError: The resulting path does not exist.
    """
    path = input(
        f"Enter path to TEST JSONL (or type 'upload' to pick a local file)\n[{default}]: "
    ).strip()
    if not path:
        path = default
    if path.lower() == "upload":
        try:
            from google.colab import files
        except ImportError as err:
            # Only a failed import means "not running in Colab"; chain the
            # original error so the root cause stays visible in the traceback.
            raise RuntimeError("Local upload is only available in Google Colab.") from err
        uploaded = files.upload()
        if not uploaded:
            raise RuntimeError("No file uploaded.")
        path = next(iter(uploaded))
        print(f"[INFO] Uploaded and using file: {path}")
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    return path

# Resolve the evaluation file interactively (blocks on input()); the loading
# step below reads this path.
TEST_JSONL = prompt_for_jsonl()

def read_jsonl(path):
    """Load every non-blank line of a JSONL file as a parsed JSON value.

    The file is read twice: a first pass counts the non-blank lines so the
    progress bar has a real total, a second pass parses them.

    Args:
        path: Path of a UTF-8 encoded JSONL file. A leading byte-order mark
            is tolerated.

    Returns:
        A list with one decoded value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON. The message
            is prefixed with the file path and 1-based line number.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM, which
    # json.loads would otherwise reject on the first record.
    encoding = "utf-8-sig"

    # Pass 1: count non-blank lines for a proper progress bar.
    with open(path, "r", encoding=encoding) as f:
        total = sum(1 for line in f if line.strip())

    # Pass 2: parse with progress reporting.
    rows = []
    with open(path, "r", encoding=encoding) as f, tqdm(total=total, desc="Loading JSONL", unit="ex") as pbar:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but say which line of which file failed.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
            pbar.update(1)
    return rows

data = read_jsonl(TEST_JSONL)
# Validate with an explicit raise instead of `assert`: assert statements are
# stripped when Python runs with -O, which would let an empty dataset through
# to the indexing and division further down.
if not data:
    raise ValueError(f"No data found in {TEST_JSONL}")
print(f"[INFO] Loaded {len(data)} examples")

# ---- Auto-detect dataset format/fields ----
# Candidate key names, tried in list order; the first one present in the
# first record is used.
CAND_PROMPT = ["prompt","instruction","input","question","text","query","user","source","src"]
CAND_LABEL  = ["label","output","answer","response","target","completion","gold","ground_truth","dst","destination"]

FORMAT = "text"
PROMPT_KEY = None
LABEL_KEY  = None

# Detection inspects the first record only; the remaining records are
# presumed to share its layout.
sample = data[0]
if not isinstance(sample, dict):
    # Without this guard a string record would be substring-matched against
    # the aliases and a list record would crash on `.keys()` below.
    raise TypeError(
        f"Expected each JSONL record to be a JSON object, "
        f"got {type(sample).__name__} for the first record."
    )

if isinstance(sample.get("messages"), list):
    FORMAT = "chat"
else:
    PROMPT_KEY = next((k for k in CAND_PROMPT if k in sample), None)
    LABEL_KEY = next((k for k in CAND_LABEL if k in sample), None)
    if PROMPT_KEY is None or LABEL_KEY is None:
        # NOTE(review): the message names PROMPT_FIELD/LABEL_FIELD, but this
        # cell only assigns PROMPT_KEY/LABEL_KEY — confirm which names the
        # user is actually expected to set.
        raise KeyError(
            f"Couldn't infer fields. Example keys: {list(sample.keys())}\n"
            f"Set PROMPT_FIELD/LABEL_FIELD manually or add more aliases."
        )

if FORMAT == "chat":
    print("[INFO] Detected chat-style dataset with `messages` list.")
else:
    print(f"[INFO] Using fields -> prompt: '{PROMPT_KEY}', label: '{LABEL_KEY}'")

def build_inputs_and_gold(example):
    """Turn one dataset record into an ``(input_text, gold_text)`` pair.

    Behaviour is selected by the module-level ``FORMAT``:

    * any value other than ``"chat"``: the values stored under ``PROMPT_KEY``
      and ``LABEL_KEY`` are returned, each converted with ``str``.
    * ``"chat"``: the final entry of ``example["messages"]`` must have role
      ``"assistant"``; its content is the gold text and all earlier entries
      form the prompt.

    Raises:
        KeyError: In chat mode, when the message list is empty or its last
            entry is not an assistant message.
    """
    if FORMAT != "chat":
        return str(example[PROMPT_KEY]), str(example[LABEL_KEY])

    turns = example["messages"]
    if len(turns) < 1 or turns[-1].get("role") != "assistant":
        raise KeyError("Chat format detected but the last message is not from 'assistant'.")

    reference = turns[-1].get("content","")
    history = turns[:-1]

    # Short-circuits left to right, so `tokenizer` is only touched when
    # templating has been requested via USE_CHAT_TEMPLATE.
    templating_available = (
        USE_CHAT_TEMPLATE
        and hasattr(tokenizer, "apply_chat_template")
        and tokenizer.apply_chat_template
    )
    if templating_available:
        rendered = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
        return rendered, reference

    # Plain fallback: one "role: content" line per turn, then an open
    # assistant turn for the model to complete.
    transcript = [f"{m.get('role','user')}: {m.get('content','')}" for m in history]
    return "\n".join(transcript) + "\nassistant:", reference

def generate(model, tokenizer, prompt_text: str):
    """Run ``model.generate`` on *prompt_text* and return the decoded continuation.

    The prompt is tokenised, moved to the device of the model's first
    parameter, and passed to ``model.generate`` together with the
    module-level ``GEN_KW`` options. The leading ``input_ids`` length is then
    sliced off the first returned sequence before decoding.

    Returns:
        The decoded text with special tokens skipped and surrounding
        whitespace stripped.
    """
    target_device = next(model.parameters()).device
    encoded = tokenizer(prompt_text, return_tensors="pt").to(target_device)

    with torch.inference_mode():
        sequences = model.generate(**encoded, **GEN_KW)

    # Assumes the returned sequence starts with the prompt tokens, so
    # everything past the prompt length is newly generated — TODO confirm for
    # the model in use.
    prompt_len = encoded["input_ids"].shape[1]
    continuation_ids = sequences[0][prompt_len:]

    decoded = tokenizer.decode(continuation_ids, skip_special_tokens=True)
    return decoded.strip()

# ---- Evaluation with live EM progress ----
# Four parallel lists: position k in each one belongs to data[k].
preds, golds, prompts, matches = [], [], [], []
running_correct = 0
t0 = time.time()

with tqdm(total=len(data), desc="Evaluating", unit="ex") as bar:
    for i, ex in enumerate(data):
        prompt_text, gold = build_inputs_and_gold(ex)
        pred = generate(model, tokenizer, prompt_text)
        match = exact_match(pred, gold)

        # Store this example's outcome in every result list.
        for store, value in (
            (preds, pred),
            (golds, gold),
            (prompts, prompt_text),
            (matches, match),
        ):
            store.append(value)
        running_correct += int(match)

        # Live stats shown next to the bar: running exact-match and throughput.
        seen = i + 1
        em_so_far = running_correct / seen
        elapsed = time.time() - t0
        rate = seen / max(1e-9, elapsed)  # floor avoids dividing by zero
        bar.set_postfix(em=f"{em_so_far:.3f}", rate=f"{rate:.2f} ex/s")
        bar.update(1)

# ---- Final metrics & save ----
# Overall exact-match is the sum of the per-example match values over the
# number of evaluated examples.
n_correct, n_total = sum(matches), len(matches)
em = n_correct / n_total
print("\n===== RESULTS =====")
print(f"Exact-Match (EM): {em:.4f}  ({n_correct}/{n_total})")

# One CSV row per example, written without the DataFrame index.
columns = {"prompt": prompts, "gold": golds, "prediction": preds, "match": matches}
df = pd.DataFrame(columns)
df.to_csv(SAVE_PRED_CSV, index=False)
print(f"[INFO] Saved predictions to: {SAVE_PRED_CSV}")


Enter path to TEST JSONL (or type 'upload' to pick a local file)
[/content/drive/MyDrive/Dissertation/datasets/test.jsonl]: upload


Saving test.jsonl to test (1).jsonl
[INFO] Uploaded and using file: test (1).jsonl


Loading JSONL:   0%|          | 0/750 [00:00<?, ?ex/s]

[INFO] Loaded 750 examples
[INFO] Detected chat-style dataset with `messages` list.


Evaluating:   0%|          | 0/750 [00:00<?, ?ex/s]


===== RESULTS =====
Exact-Match (EM): 0.9760  (732/750)
[INFO] Saved predictions to: /content/predictions.csv
