# SRL Benchmarking (Colab)
Run vLLM-based benchmarks for SRL/SFT models, saving results for later plotting.

## 0. Setup
- Runtime: GPU (A100 recommended)
- HF token: set `HF_TOKEN` env var if needed for private models/datasets.
- Repo: cloned into `/content/SRL-reasoning`.

In [None]:
# --- Colab setup (run this first in a fresh runtime) ---
# Clones the repository (if it is not already present), switches the notebook's
# working directory into it, and installs it in editable mode.
REPO_URL = "https://github.com/iroblesrazzaq/SRL-reasoning.git"  # update if needed
BRANCH = "main"                               # update if needed
WORKDIR = "/content/SRL-reasoning"

# Upgrade pip before installing the project.
%pip install -U pip
import os

# Clone only when WORKDIR does not exist yet, so re-running this cell does not
# attempt a second clone into the same directory.
if not os.path.exists(WORKDIR):
    !git clone --branch $BRANCH $REPO_URL $WORKDIR
# Relative paths used later (e.g. the results directory) resolve against WORKDIR.
%cd $WORKDIR
# Editable install of the cloned repo.
# NOTE(review): presumably this is what makes the `benchmarks` package importable — confirm.
%pip install -e .

from pathlib import Path

print("Ready. Repo at", WORKDIR)

## 1. (Optional) Mount Drive for saving results/checkpoints

In [None]:

# Toggle: True mounts Google Drive so results can be copied there later;
# False skips mounting entirely.
MOUNT_DRIVE = True

if MOUNT_DRIVE:
    from google.colab import drive

    drive.mount('/content/drive')

# None means "no Drive destination"; the copy cells check this value and skip
# copying when it is falsy.
DRIVE_RESULTS_DIR = (
    '/content/drive/MyDrive/srl_bench_results' if MOUNT_DRIVE else None
)


## 2. Configure run

In [None]:
# --- Model under test (required) ---
# MODEL_PATH is a Hugging Face Hub id or a local checkpoint path,
# e.g. "yourname/srl-lora" or "/content/ckpts/best".
MODEL_PATH = "Qwen/Qwen3-4B-Instruct-2507"
# MODEL_NAME is the display name stored with each result. Leave it empty to
# fall back to the basename of MODEL_PATH.
MODEL_NAME = "Qwen3-4B-Instruct-2507"

# --- Benchmark settings ---
BENCHMARK = "aime24"  # one of: amc23, aime24, aime25
MODE = "avg32"  # one of: greedy, avg32
MODEL_TYPE = "base"  # "srl" (has <think>) or "base" (no <think>)

# --- Runtime / output ---
# Handed to the evaluator as gpu_memory_utilization; 0.7 leaves GPU headroom.
GPU_MEMORY_UTILIZATION = 0.7
# Directory that saved result files are written to (relative to the repo root
# once the setup cell has changed into it).
RESULTS_DIR = "benchmarks/results"

# Recorded alongside every saved result.
SEED = 42

## 3. Run benchmark and save result

In [None]:
# Load the selected benchmark's problems and construct the evaluator that the
# following run cells reuse.
import time
from pathlib import Path
from benchmarks import load_benchmark_data, MathEvaluator, BenchmarkResult, data_loader
import os as _os

# Export the GPU memory setting as an environment variable. setdefault() keeps
# a value that is already present, so re-running this cell after editing
# GPU_MEMORY_UTILIZATION does not change the variable.
_gpu_util = str(GPU_MEMORY_UTILIZATION)
_os.environ.setdefault("VLLM_GPU_MEMORY_UTILIZATION", _gpu_util)

# Add (or overwrite) the "aime25" entry before any data is loaded.
# NOTE(review): the tuple looks like (dataset id, split) — confirm against data_loader.
data_loader.BENCHMARK_CONFIGS["aime25"] = ("math-ai/aime25", "test")

print(f"Loading benchmark '{BENCHMARK}'...")
data = load_benchmark_data(BENCHMARK)
print(f"Loaded {len(data)} problems")

print("Initializing evaluator (vLLM)...")
# eval_start is captured before the evaluator is constructed, so any duration
# measured from it also includes evaluator initialization.
eval_start = time.time()
evaluator = MathEvaluator(
    MODEL_PATH,
    model_type=MODEL_TYPE,
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION
)



In [None]:
# Evaluate the already-loaded benchmark in the configured MODE and save the
# score as a BenchmarkResult under RESULTS_DIR.
print(f"Running mode='{MODE}' ...")

# Time only this evaluation. Measuring from the notebook-wide `eval_start`
# would also count evaluator initialization and any idle time between running
# the previous cell and this one. perf_counter() is monotonic, so the duration
# cannot be skewed by wall-clock adjustments.
run_start = time.perf_counter()
score = evaluator.evaluate(data, mode=MODE)
eval_end = time.perf_counter() - run_start  # elapsed seconds for this run only
print(f"Score: {score:.4f} | elapsed: {eval_end/60:.1f} min")

# Label stored with the result; anything other than "avg32" is reported as Greedy.
benchmark_type = "Avg32" if MODE == "avg32" else "Greedy"
# Fall back to the last component of MODEL_PATH when no display name is set.
model_display = MODEL_NAME or Path(MODEL_PATH).name

result = BenchmarkResult(
    benchmark=BENCHMARK,
    benchmark_type=benchmark_type,
    score=score,
    model_name=model_display,
    model_path=MODEL_PATH,
    num_questions=len(data),
    eval_time_seconds=eval_end,
    seed=SEED,
)

path = result.save(RESULTS_DIR)
print("Saved result to", path)


In [None]:
# Re-run the currently loaded benchmark with greedy decoding and save the
# score as a BenchmarkResult under RESULTS_DIR.
MODE = 'greedy'
print(f"Running mode='{MODE}' ...")

# Start a fresh timer for this run. The shared `eval_start` was set when the
# evaluator was created, so using it here would add the duration of every
# earlier run (and idle notebook time) to this run's eval_time_seconds.
run_start = time.perf_counter()
score = evaluator.evaluate(data, mode=MODE)
eval_end = time.perf_counter() - run_start  # elapsed seconds for this run only
print(f"Score: {score:.4f} | elapsed: {eval_end/60:.1f} min")

# Label stored with the result; anything other than "avg32" is reported as Greedy.
benchmark_type = "Avg32" if MODE == "avg32" else "Greedy"
# Fall back to the last component of MODEL_PATH when no display name is set.
model_display = MODEL_NAME or Path(MODEL_PATH).name

result = BenchmarkResult(
    benchmark=BENCHMARK,
    benchmark_type=benchmark_type,
    score=score,
    model_name=model_display,
    model_path=MODEL_PATH,
    num_questions=len(data),
    eval_time_seconds=eval_end,
    seed=SEED,
)

path = result.save(RESULTS_DIR)
print("Saved result to", path)


In [None]:
# Intermediate backup: copy the results saved so far to Drive, so they are
# kept even if the upcoming amc23 run crashes.

import shutil
from pathlib import Path

if not DRIVE_RESULTS_DIR:
    print("Drive not mounted; skipping copy.")
else:
    dest = Path(DRIVE_RESULTS_DIR)
    dest.mkdir(parents=True, exist_ok=True)
    # copy2 with a directory target keeps each file's own name and metadata.
    for f in Path(RESULTS_DIR).glob("*.json"):
        shutil.copy2(f, dest)
    print("Copied results to", dest)


In [None]:
# Switch to the amc23 benchmark, evaluate it in avg32 mode, and save the score
# as a BenchmarkResult under RESULTS_DIR.
BENCHMARK = "amc23"          # choices: amc23, aime24, aime25
MODE = "avg32"                # choices: greedy, avg32

data = load_benchmark_data(BENCHMARK)

print(f"Running mode='{MODE}' ...")

# Start a fresh timer for this run. The shared `eval_start` was set when the
# evaluator was created, so using it here would add the duration of every
# earlier run (and idle notebook time) to this run's eval_time_seconds.
run_start = time.perf_counter()
score = evaluator.evaluate(data, mode=MODE)
eval_end = time.perf_counter() - run_start  # elapsed seconds for this run only
print(f"Score: {score:.4f} | elapsed: {eval_end/60:.1f} min")

# Label stored with the result; anything other than "avg32" is reported as Greedy.
benchmark_type = "Avg32" if MODE == "avg32" else "Greedy"
# Fall back to the last component of MODEL_PATH when no display name is set.
model_display = MODEL_NAME or Path(MODEL_PATH).name

result = BenchmarkResult(
    benchmark=BENCHMARK,
    benchmark_type=benchmark_type,
    score=score,
    model_name=model_display,
    model_path=MODEL_PATH,
    num_questions=len(data),
    eval_time_seconds=eval_end,
    seed=SEED,
)

path = result.save(RESULTS_DIR)
print("Saved result to", path)



In [None]:
# Switch back to the aime24 benchmark, evaluate it with greedy decoding, and
# save the score as a BenchmarkResult under RESULTS_DIR.
BENCHMARK = "aime24"          # choices: amc23, aime24, aime25
data = load_benchmark_data(BENCHMARK)

MODE = "greedy"                # choices: greedy, avg32

print(f"Running mode='{MODE}' ...")

# Start a fresh timer for this run. The shared `eval_start` was set when the
# evaluator was created, so using it here would add the duration of every
# earlier run (and idle notebook time) to this run's eval_time_seconds.
run_start = time.perf_counter()
score = evaluator.evaluate(data, mode=MODE)
eval_end = time.perf_counter() - run_start  # elapsed seconds for this run only
print(f"Score: {score:.4f} | elapsed: {eval_end/60:.1f} min")

# Label stored with the result; anything other than "avg32" is reported as Greedy.
benchmark_type = "Avg32" if MODE == "avg32" else "Greedy"
# Fall back to the last component of MODEL_PATH when no display name is set.
model_display = MODEL_NAME or Path(MODEL_PATH).name

result = BenchmarkResult(
    benchmark=BENCHMARK,
    benchmark_type=benchmark_type,
    score=score,
    model_name=model_display,
    model_path=MODEL_PATH,
    num_questions=len(data),
    eval_time_seconds=eval_end,
    seed=SEED,
)

path = result.save(RESULTS_DIR)
print("Saved result to", path)



## 4. (Optional) Copy results to Drive

In [None]:

import shutil
from pathlib import Path

# Copy every saved JSON result into the Drive folder; does nothing except
# print a notice when no Drive destination was configured.
if not DRIVE_RESULTS_DIR:
    print("Drive not mounted; skipping copy.")
else:
    dest = Path(DRIVE_RESULTS_DIR)
    dest.mkdir(parents=True, exist_ok=True)
    # Passing the directory as the target makes copy2 reuse each source filename.
    for f in Path(RESULTS_DIR).glob("*.json"):
        shutil.copy2(f, dest)
    print("Copied results to", dest)


## 5. Inspect saved results

In [None]:

from benchmarks import load_all_results, summarize_results

# Materialize the loaded results so they can be counted, listed, and summarized.
results = [*load_all_results(RESULTS_DIR)]
print(f"Loaded {len(results)} result file(s)")

# One line per saved run.
for r in results:
    print(f"- {r.benchmark} | {r.benchmark_type} | {r.model_name} | score={r.score:.4f} | run_id={r.run_id}")

# summarize_results returns a mapping keyed by (benchmark, model_name) pairs.
best = summarize_results(results)
print("Best scores (benchmark, model_name) -> score:")
for key, sc in best.items():
    bench, model = key
    print(f"{bench} / {model}: {sc:.4f}")
