# Are you using the right version of Python?

In [None]:
import sys

# Interpreter release this notebook expects; sys.version starts with the
# release number, so a prefix comparison is enough to identify it.
required = "3.11.10"

if sys.version.startswith(required):
    print("✅ Python version OK:", sys.version)
else:
    # Stop immediately so later cells never run on an unexpected interpreter.
    raise RuntimeError(
        f"❌ Wrong Python version.\n"
        f"You are using: {sys.version}\n"
        f"This project requires: Python {required}.\n"
    )


# Cell 1 - Imports & Data registry

In [65]:
import os
import json
import time
from typing import Tuple, Type, Dict, Any

from adapters.base import DatasetAdapter
from adapters.cybermetric import CyberMetricAdapter
from adapters.cipherbank import CipherBankAdapter
from adapters.cipherbench import CipherBenchAdapter
from models.base import get_model

# Dataset registry: name -> (AdapterClass, path).
# Every data file lives at datasets/<name>/<file>, so the path is assembled
# from the dataset name and the file name listed next to its adapter class.
DATASETS = {
    name: (adapter_cls, os.path.join("datasets", name, filename))
    for name, adapter_cls, filename in (
        ("cybermetric", CyberMetricAdapter, "cybermetric.json"),
        ("cipherbank", CipherBankAdapter, "cipherbank.jsonl"),
        ("cipherbench", CipherBenchAdapter, "cipherbench.jsonl"),
    )
}

# Per-dataset generation keyword arguments, looked up by dataset name.
# The two cipher datasets get a larger output-token budget than cybermetric.
GEN_CONFIG = {
    "cipherbench": {"max_output_tokens": 1536},
    "cipherbank": {"max_output_tokens": 1536},
    "cybermetric": {"max_output_tokens": 128},
}



# Cell 2 - Helpers & constants

In [66]:
def classify_output(raw: str) -> Dict[str, Any]:
    """
    Turn a raw model response into a ``{"status", "output"}`` payload; never raises.

    Statuses:
      - "error":        ``raw`` is not a string (an "error_message" key is added).
      - "rate_limited": the stripped text starts with "[Rate-limited".
      - "empty":        the stripped text starts with "[Empty content]" or
                        "[Unexpected format]".
      - "ok":           anything else; "output" holds the stripped text.

    (Kept for compatibility if you want to use it later.)
    """
    if isinstance(raw, str):
        text = raw.strip()
        if text.startswith("[Rate-limited"):
            return {"status": "rate_limited", "output": ""}
        # str.startswith accepts a tuple, so both sentinels are checked in one call.
        if text.startswith(("[Empty content]", "[Unexpected format]")):
            return {"status": "empty", "output": ""}
        return {"status": "ok", "output": text}

    return {"status": "error", "output": "", "error_message": f"Non-string output: {type(raw)}"}


def _is_blank(s: str) -> bool:
    """Return True when ``s`` is not a string, or is empty / whitespace-only."""
    if isinstance(s, str):
        return s.strip() == ""
    return True


# Retry policy for unusable generations; both values can be overridden
# through environment variables.
CONTENT_RETRIES = int(os.environ.get("EVAL_CONTENT_RETRIES", "2"))
CONTENT_BACKOFF = float(os.environ.get("EVAL_CONTENT_BACKOFF", "1.5"))  # seconds

# Suffix appended to the prompt on retries; nudges models that return whitespace.
ANSWER_CUE = "\n\nPlaintext:"

# Placeholder written when no attempt succeeded; ensures every record has an output.
FALLBACK_OUTPUT = "[NO_ANSWER]"


# Cell 3 - safe_generate

In [67]:
def _attempt_generate(model, prompt: str, gen_kwargs: dict) -> tuple:
    """Run one generation and return ``(status, text, error_message)``.

    ``status`` is "ok" for a usable answer (``text`` is the stripped output),
    otherwise "error", "rate_limited" or "empty" with an empty ``text``.
    """
    try:
        raw = model.generate(prompt, **gen_kwargs)
    except Exception as exc:
        # KeyboardInterrupt derives from BaseException, not Exception, so it
        # still propagates and lets the caller stop the run.
        return "error", "", str(exc)

    # Reuse the notebook's classifier so sentinel strings are detected on the
    # stripped text and "[Empty content]" / "[Unexpected format]" count as failures.
    verdict = classify_output(raw)
    status = verdict["status"]
    if status == "ok" and _is_blank(verdict["output"]):
        status = "empty"  # whitespace-only answers are not usable
    if status == "ok":
        return "ok", verdict["output"], ""
    return status, "", verdict.get("error_message", "blank_or_rate_limited")


def safe_generate(model, prompt: str, **gen_kwargs) -> dict:
    """Generate an answer without ever raising, retrying when the output is unusable.

    The original prompt is tried once. If that fails, the prompt with
    ``ANSWER_CUE`` appended is tried up to ``CONTENT_RETRIES`` more times,
    sleeping ``CONTENT_BACKOFF`` seconds before each retry.

    Args:
        model: Object exposing ``generate(prompt, **gen_kwargs)``.
        prompt: Prompt text for the first attempt.
        **gen_kwargs: Forwarded unchanged to ``model.generate``.

    Returns:
        On success: ``{"status": "ok", "output", "attempts", "used_cue"}``.
        On failure: the same keys plus ``"error_message"``, with ``"output"``
        set to ``FALLBACK_OUTPUT`` and ``"status"`` reflecting the last failed
        attempt ("error", "rate_limited" or "empty").
        ``"used_cue"`` is True only if a cue-augmented retry was actually sent.

    NOTE(review): the cue text ("Plaintext:") looks cipher-specific but is
    appended for every dataset; confirm that is intended.
    """
    attempts = 1
    status, text, err = _attempt_generate(model, prompt, gen_kwargs)

    if status != "ok":
        retry_prompt = prompt + ANSWER_CUE
        for _ in range(CONTENT_RETRIES):
            time.sleep(CONTENT_BACKOFF)
            attempts += 1
            status, text, err = _attempt_generate(model, retry_prompt, gen_kwargs)
            if status == "ok":
                break

    # Every attempt after the first uses the cue-augmented prompt.
    used_cue = attempts > 1

    if status == "ok":
        return {"status": "ok", "output": text, "attempts": attempts, "used_cue": used_cue}

    return {
        "status": status,
        "output": FALLBACK_OUTPUT,
        "error_message": err,
        "attempts": attempts,
        "used_cue": used_cue,
    }


# Cell 4 - Config

In [68]:
# ==== CONFIG – EDIT THESE BETWEEN RUNS ====

# Dataset to evaluate: cipherbank, cipherbench, cybermetric
DATASET_NAME = "cybermetric"
# Model to evaluate: gemini-2.5-flash-lite, gpt-oss-20b, llama-3.3-70b-instruct, mistral-7b-instruct
MODEL_NAME = "mistral-7b-instruct"

# Maximum number of items to process in one run (to stay under 1000 RPD).
LIMIT = 6

# Dataset index to start from; None = auto-resume from the results file.
START_INDEX = None

# Results file path; None = default results/<dataset>__<model>.jsonl.
OUT_PATH = None

# Print a progress line every this many processed items.
PROGRESS_EVERY = 25
# Pause between items, overridable through the environment.
PER_ITEM_SLEEP = float(os.environ.get("EVAL_MIN_SECS_BETWEEN_CALLS", "0"))

# Item count per dataset, used by the planning cell to report remaining work.
DATASET_SIZES = dict(cipherbank=2358, cipherbench=2400, cybermetric=1500)


# Cell 5 - infer_start_index() function

In [69]:
def infer_start_index(dataset_name: str, model_name: str, out_path: str | None = None) -> int:
    """
    If a results file already exists, return the number of lines in it.
    That corresponds to how many items you've already processed.
    """
    if out_path is None:
        out_path = os.path.join("results", f"{dataset_name}__{model_name}.jsonl")
    if not os.path.exists(out_path):
        return 0
    count = 0
    with open(out_path, "r", encoding="utf-8") as f:
        for _ in f:
            count += 1
    return count


# Cell 6 - Planning cell (shows what will run)

In [70]:
# Resolve START_INDEX: when left as None, resume after whatever the results
# file already contains.
if START_INDEX is None:
    START_INDEX = infer_start_index(DATASET_NAME, MODEL_NAME, OUT_PATH)

already_done = START_INDEX
total_size = DATASET_SIZES.get(DATASET_NAME)
# Remaining work is only known when the dataset size is registered.
remaining = max(total_size - already_done, 0) if total_size is not None else None

print(f"Dataset: {DATASET_NAME}")
print(f"Model:   {MODEL_NAME}")
print(f"Already done (in results file): {already_done}")

if total_size is None:
    print("(No dataset size info; only showing already_done.)")
else:
    print(f"Total size: {total_size}")
    print(f"Remaining after this start index: {remaining}")
    if LIMIT is not None:
        # A run never processes more than what is left in the dataset.
        will_process = min(LIMIT, remaining)
        print(f"Will process this run (LIMIT): {will_process}")
        print(f"Remaining AFTER this run: {remaining - will_process}")

print("\nIf this looks wrong, adjust START_INDEX or LIMIT before running eval.")


Dataset: cybermetric
Model:   mistral-7b-instruct
Already done (in results file): 0
Total size: 1500
Remaining after this start index: 1500
Will process this run (LIMIT): 6
Remaining AFTER this run: 1494

If this looks wrong, adjust START_INDEX or LIMIT before running eval.


# Cell 7 - run_eval() function

In [71]:
def run_eval(
    dataset_name: str,
    model_name: str,
    limit: int | None = None,
    start_index: int = 0,
    out_path: str | None = None,
    progress_every: int = 25,
    per_item_sleep: float = 0.0,
):
    """Evaluate one model on one dataset, writing one JSON record per item.

    Items with a dataset index below ``start_index`` are skipped. Each
    evaluated item is generated through ``safe_generate``, scored by the
    adapter only when the generation status is "ok" (otherwise it counts as
    incorrect), and written as one JSON line to the results file. A summary
    is printed at the end, also after a Ctrl-C interruption.

    Args:
        dataset_name: Key into ``DATASETS``.
        model_name: Name passed to ``get_model``.
        limit: Maximum number of items to evaluate in this call; None = no limit.
        start_index: Dataset index of the first item to evaluate.
        out_path: Results file; defaults to
            ``results/<dataset_name>__<model.name>.jsonl``.
        progress_every: Print a progress line every N evaluated items
            (0 or None disables it).
        per_item_sleep: Seconds to sleep after each item (0 disables it).

    Returns:
        None. Results go to the output file and the summary to stdout.

    Raises:
        ValueError: If ``dataset_name`` is not registered in ``DATASETS``.
    """
    # Load dataset adapter
    if dataset_name not in DATASETS:
        raise ValueError(f"Unknown dataset: {dataset_name}")
    AdapterCls, ds_path = DATASETS[dataset_name]
    adapter: DatasetAdapter = AdapterCls(ds_path)

    # Load model
    model = get_model(model_name)

    # Prepare output: create the directory that will actually hold the file,
    # so a custom out_path in a not-yet-existing directory also works.
    out_path = out_path or os.path.join("results", f"{dataset_name}__{model.name}.jsonl")
    out_dir = os.path.dirname(out_path)
    if out_dir:  # empty when out_path is a bare file name in the working directory
        os.makedirs(out_dir, exist_ok=True)

    # If resuming (start_index > 0) and file exists, append instead of overwrite
    mode = "a" if start_index > 0 and os.path.exists(out_path) else "w"
    if mode == "a":
        print(f"Appending to existing file: {out_path} (starting at dataset index {start_index})")
    else:
        print(f"Writing new results file: {out_path}")

    # Generation settings depend only on the dataset, so look them up once.
    gen_cfg = GEN_CONFIG.get(dataset_name, {})

    correct = 0
    processed = 0  # number of items actually evaluated in THIS run

    try:
        with open(out_path, mode, encoding="utf-8") as fout:
            for ds_idx, item in enumerate(adapter.iter_items()):
                # Skip until we reach start_index
                if ds_idx < start_index:
                    continue

                # Enforce session limit (counted from start_index, not from 0)
                if limit is not None and processed >= limit:
                    break

                prompt = adapter.build_prompt(item)

                # Call model safely
                result = safe_generate(model, prompt, **gen_cfg)
                status = result["status"]
                output = result.get("output", "")
                error_message = result.get("error_message", "")
                attempts = result.get("attempts", 1)
                used_cue = result.get("used_cue", False)

                # Only score "ok" outputs; others count as incorrect
                if status == "ok":
                    label = adapter.score(item, output)
                else:
                    label = 0

                correct += label
                processed += 1

                rec = {
                    "id": item.get("id", ds_idx),
                    "dataset": dataset_name,
                    "model": model.name,
                    "status": status,          # as reported by safe_generate
                    "prompt": prompt,
                    "output": output,
                    "correct": label,
                    "attempts": attempts,
                    "used_cue": used_cue,
                }
                if error_message:
                    rec["error_message"] = error_message

                # Include extra metadata if present
                for key in ("algorithm", "ciphertext", "prompt_text", "question"):
                    if key in item:
                        rec[key] = item[key]

                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
                # Push each record to disk right away: if the process is killed,
                # finished generations are kept and a line-count resume stays accurate.
                fout.flush()

                # Optional progress logging
                if progress_every and processed % progress_every == 0:
                    acc_so_far = correct / processed if processed else 0.0
                    print(
                        f"[processed={processed} | ds_idx={ds_idx}] "
                        f"running acc={acc_so_far:.4f} (last status={status})"
                    )

                # Optional gentle pacing (useful for free-tier models)
                if per_item_sleep > 0:
                    time.sleep(per_item_sleep)

    except KeyboardInterrupt:
        # Leaving the with-block has already closed the file; fall through to the summary.
        print("\nInterrupted by user. Finalizing...")

    acc = correct / processed if processed else 0.0
    print("-" * 60)
    print(f"Dataset: {dataset_name}")
    print(f"Model:   {model.name}")
    print(f"Start index: {start_index}")
    print(f"Processed this run: {processed}")
    print(f"Accuracy this run: {acc:.4f}")
    print(f"Wrote results to: {out_path}")


# Cell 8 - Run cell

In [72]:
# Launch the evaluation with the notebook-level settings.
run_eval(
    DATASET_NAME,
    MODEL_NAME,
    limit=LIMIT,
    start_index=START_INDEX,
    out_path=OUT_PATH,
    progress_every=PROGRESS_EVERY,
    per_item_sleep=PER_ITEM_SLEEP,
)


Writing new results file: results/cybermetric__mistral-7b-instruct.jsonl
------------------------------------------------------------
Dataset: cybermetric
Model:   mistral-7b-instruct
Start index: 0
Processed this run: 6
Accuracy this run: 1.0000
Wrote results to: results/cybermetric__mistral-7b-instruct.jsonl
