## Setup

In [290]:
import pandas as pd
import json
import re
import subprocess
import tempfile
import os
import sys
import time
from pathlib import Path
from datetime import datetime
from functools import partial

import anyio
from tqdm.notebook import tqdm

from utils import LLMPool, RuntimeConfig

## Configuration

In [291]:
# API Configuration
API_BASE_URL = "http://127.0.0.1:8080/v1"
API_KEY = "sk-local"
MODEL_NAME = "qlora0"

# Evaluation Configuration
KEEP_ONLY = True # only use audited to "keep"
N_PROBLEMS = 0

# Test file
# TEST_PATH = Path('/home/larcanio/AIMO3_v2/data/datasets/splits/algebra_specialist/test.jsonl')
# Optional: use GSM8K test.jsonl directly (set to None to use the normalized format below)
GSM8K_TEST_FILE = '/home/larcanio/AIMO3_v2/data/GSM8I/test.jsonl' #Path('/home/larcanio/AIMO3_v2/data/GSM8I/test.jsonl')

# Generation Parameters
MAX_TOKENS = 1024
GREEDY_TEMPERATURE = 0
GREEDY_TOP_P = 1.0
SAMPLED_TEMPERATURE = 0.6
SAMPLED_TOP_P = 0.8
N_SAMPLES = 3

# Output Configuration
DATASET_NAME = "qwen"
OUTPUT_DIR = Path("./evaluations")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Live-tunable settings (edit config.json while running)
CONFIG_FILE = "config.json"
cfg = RuntimeConfig(CONFIG_FILE, defaults={
    "MAX_CONCURRENT": 10,
    "EXECUTION_TIMEOUT": 30,
    "REQUEST_TIMEOUT": 50,
    "SAVE_EVERY": 200,
})

print(f"Model: {MODEL_NAME}")
print(f"Evaluating {N_PROBLEMS} problems")
print(f"GSM8K test file: {GSM8K_TEST_FILE}")
print(f"Output: {OUTPUT_DIR}")
print(cfg)

[config] reloaded: MAX_CONCURRENT: 10 -> 6, SAVE_EVERY: 200 -> 300
Model: qlora0
Evaluating 0 problems
GSM8K test file: /home/larcanio/AIMO3_v2/data/GSM8I/test.jsonl
Output: evaluations
RuntimeConfig(MAX_CONCURRENT=6, EXECUTION_TIMEOUT=30, REQUEST_TIMEOUT=50, SAVE_EVERY=300)


## Load Evaluation Dataset

In [292]:
if GSM8K_TEST_FILE is not None:
    # Load GSM8K test.jsonl directly: {"question": "...", "answer": "reasoning\n#### number"}
    datapoints = []
    with open(GSM8K_TEST_FILE, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if not line.strip():
                continue
            dp = json.loads(line)
            answer_text = dp['answer']
            if '####' not in answer_text:
                continue
            numeric_str = answer_text.split('####')[-1].strip().replace(',', '')
            numeric_match = re.search(r'-?\d+\.?\d*', numeric_str)
            if not numeric_match:
                continue
            datapoints.append({
                'problem_id': str(i),
                'problem': dp['question'],
                'answer': numeric_match.group(),
                'domain': 'unknown',
                'level': -1,
            })
    print(f"Loaded {len(datapoints)} problems from GSM8K test file")
else:
    # Load evaluation dataset (normalized format)
    jsonl_file = TEST_PATH

    datapoints = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                try:
                    dp = json.loads(line)
                    datapoints.append({
                        'problem_id': dp['problem_id'],
                        'problem': dp["problem"]['text'],
                        'answer': dp["problem"]['expected_answer'],
                        'domain': dp.get('math_structure', {}).get('from_text', {}).get('domain', 'unknown'),
                        'level': dp.get('effective_difficulty', {}).get('level', -1),
                    })
                except json.JSONDecodeError:
                    continue

if N_PROBLEMS > 0:
    eval_df = pd.DataFrame(datapoints).head(N_PROBLEMS).reset_index(drop=True)
else:
    eval_df = pd.DataFrame(datapoints).reset_index(drop=True)

print(f"Loaded {len(eval_df)} problems")
print(f"Domains: {eval_df['domain'].value_counts().to_dict()}")
print(f"Levels:  {eval_df['level'].value_counts().sort_index().to_dict()}")
print(f"Sample: {eval_df.iloc[0]['problem'][:100]}...")

Loaded 1319 problems from GSM8K test file
Loaded 1319 problems
Domains: {'unknown': 1319}
Levels:  {-1: 1319}
Sample: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for ...


## Code Extraction & Execution

In [293]:
import signal


def extract_code_from_response(text):
    if not text:
        return None
    for pattern in [r'```python\s*(.*?)\s*```', r'```\s*(.*?)\s*```']:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()
    return None


def _kill_proc_tree(proc):
    """Kill a process and its entire process group, then close pipes."""
    # Kill the whole process group so orphan children can't hold pipes open
    try:
        os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
    except OSError:
        try:
            proc.kill()
        except OSError:
            pass
    # Close pipe handles so wait() can't block on lingering children
    for pipe in (proc.stdout, proc.stderr, proc.stdin):
        if pipe:
            try:
                pipe.close()
            except OSError:
                pass
    try:
        proc.wait(timeout=5)
    except Exception:
        pass


def execute_code_with_timeout(code: str, timeout_seconds: int = 30):
    """Execute code in a fresh subprocess with proper process-group cleanup.

    Uses Popen + start_new_session so we can killpg() the entire tree
    if the process (or its children) exceeds the timeout.  This prevents
    orphan child processes from holding stdout/stderr pipes open and
    blocking the calling thread indefinitely.
    """
    tmp_path = None
    start_time = time.time()
    proc = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            print(json.dumps(code))
            print("--------------------------------")
            f.write(code)
            tmp_path = f.name

        proc = subprocess.Popen(
            [sys.executable, tmp_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,   # new process group → killpg works
        )

        try:
            stdout, stderr = proc.communicate(timeout=timeout_seconds)
        except subprocess.TimeoutExpired:
            _kill_proc_tree(proc)
            duration = time.time() - start_time
            return None, f"Timeout after {timeout_seconds}s", True, duration

        duration = time.time() - start_time

        if proc.returncode != 0:
            return None, (stderr or "")[:500] or "Non-zero exit", False, duration
        stdout = (stdout or "").strip()
        if stdout:
            result = stdout.split('\n')[-1].strip()
            return result, None, False, duration
        return None, "No output", False, duration
    except Exception as e:
        if proc is not None:
            _kill_proc_tree(proc)
        duration = time.time() - start_time
        return None, str(e)[:500], False, duration
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass


def check_answer(predicted, expected):
    if predicted is None or expected is None:
        return False
    pred_str = str(predicted).strip()
    exp_str = str(expected).strip()
    if pred_str == exp_str:
        return True
    try:
        return abs(float(pred_str) - float(exp_str)) < 1e-6
    except (ValueError, TypeError):
        return False

print("Utils loaded")

Utils loaded


## Prompt Template

In [294]:
SYSTEM_MESSAGE = "You are a mathematician writing Python code to solve problems."

def format_prompt(problem: str) -> str:
    return f"""Write a Python program that computes the correct answer to the following math problem.

Requirements:
- The program must compute the answer programmatically (do NOT hard-code the final value).
- The program must be fully self-contained and executable.
- The program must print ONLY the final numerical answer (no extra text).

Output format:
- Output exactly one Python code block, starting with ```python and ending with ```.
- Do not include any text outside the code block.

Problem:
{problem}"""

## Evaluation Engine

In [295]:
async def evaluate_single_problem(pool: LLMPool, problem_row: dict, idx: int,
                                  exec_limiter: anyio.CapacityLimiter | None = None,
                                  verbose: bool = False):
    problem_id = problem_row['problem_id']
    problem_text = problem_row['problem']
    expected_answer = str(problem_row['answer'])
    
    if verbose:
        print(f"[{idx}] Starting problem {problem_id}", flush=True)
    
    prompt = format_prompt(problem_text)
    messages = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": prompt}
    ]
    
    record = {
        'problem_id': problem_id,
        'problem': problem_text,
        'answer': expected_answer,
        'domain': problem_row.get('domain', 'unknown'),
        'level': problem_row.get('level', -1),
    }
    
    tok_in = 0
    tok_out = 0

    async def _run_code(code: str):
        """Run code in a thread, gated by exec_limiter to prevent thread-pool exhaustion."""
        if exec_limiter is not None:
            async with exec_limiter:
                return await anyio.to_thread.run_sync(
                    partial(execute_code_with_timeout, code, cfg.EXECUTION_TIMEOUT)
                )
        return await anyio.to_thread.run_sync(
            partial(execute_code_with_timeout, code, cfg.EXECUTION_TIMEOUT)
        )
    
    # Greedy Pass@1
    try:
        resp = await pool.request(messages, temperature=GREEDY_TEMPERATURE, max_tokens=MAX_TOKENS, top_p=GREEDY_TOP_P)
        tok_in += resp.prompt_tokens
        tok_out += resp.completion_tokens
        code = extract_code_from_response(resp.content)
        record['code_greedy'] = code
        if code:
            result, error, is_timeout, duration = await _run_code(code)
            record['exec_ok_greedy'] = (error is None and not is_timeout)
            record['result_greedy'] = result
            record['error_greedy'] = error
            record['correct_greedy'] = check_answer(result, expected_answer)
        else:
            record['exec_ok_greedy'] = False
            record['result_greedy'] = None
            record['error_greedy'] = "No code"
            record['correct_greedy'] = False
    except Exception as e:
        print(f"LLM error (problem {idx}, greedy): {type(e).__name__} - {str(e)[:200]}", flush=True)
        record['code_greedy'] = None
        record['exec_ok_greedy'] = False
        record['result_greedy'] = None
        record['error_greedy'] = f"API error: {str(e)}"
        record['correct_greedy'] = False
    
    # Sampled Pass@N — early exit on first correct
    for sample_idx in range(1, N_SAMPLES + 1):
        try:
            resp = await pool.request(messages, temperature=SAMPLED_TEMPERATURE, max_tokens=MAX_TOKENS, top_p=SAMPLED_TOP_P)
            tok_in += resp.prompt_tokens
            tok_out += resp.completion_tokens
            code = extract_code_from_response(resp.content)
            record[f'code_{sample_idx}'] = code
            if code:
                result, error, is_timeout, duration = await _run_code(code)
                record[f'exec_ok_{sample_idx}'] = (error is None and not is_timeout)
                record[f'result_{sample_idx}'] = result
                record[f'error_{sample_idx}'] = error
                record[f'correct_{sample_idx}'] = check_answer(result, expected_answer)
            else:
                record[f'exec_ok_{sample_idx}'] = False
                record[f'result_{sample_idx}'] = None
                record[f'error_{sample_idx}'] = "No code"
                record[f'correct_{sample_idx}'] = False
        except Exception as e:
            print(f"LLM error (problem {idx}, sample {sample_idx}): {type(e).__name__} - {str(e)[:200]}", flush=True)
            record[f'code_{sample_idx}'] = None
            record[f'exec_ok_{sample_idx}'] = False
            record[f'result_{sample_idx}'] = None
            record[f'error_{sample_idx}'] = f"API error: {str(e)}"
            record[f'correct_{sample_idx}'] = False
        
        # Early exit: no need for more samples once we have a pass
        if record.get(f'correct_{sample_idx}', False):
            for remaining in range(sample_idx + 1, N_SAMPLES + 1):
                record[f'code_{remaining}'] = None
                record[f'exec_ok_{remaining}'] = False
                record[f'result_{remaining}'] = None
                record[f'error_{remaining}'] = "Skipped (early exit)"
                record[f'correct_{remaining}'] = False
            break
    
    # Derived metrics
    record['pass@1'] = record['correct_greedy']
    record['pass@3'] = any([record.get(f'correct_{j}', False) for j in range(1, N_SAMPLES + 1)])
    record['first_success_attempt'] = None
    for j in range(1, N_SAMPLES + 1):
        if record.get(f'correct_{j}', False):
            record['first_success_attempt'] = j
            break
    
    return record, tok_in, tok_out


def _save_checkpoint(results: list, completed: int, total: int, save_num: int):
    """Write partial results to disk atomically."""
    tmp = OUTPUT_DIR / f"{MODEL_NAME}_{DATASET_NAME}_partial.tmp"
    out = OUTPUT_DIR / f"{MODEL_NAME}_{DATASET_NAME}_partial.csv"
    pd.DataFrame(results).to_csv(tmp, index=False)
    tmp.rename(out)
    print(f"\n[checkpoint {save_num}] {completed}/{total} saved to {out.name}")


async def run_evaluation(eval_df: pd.DataFrame):
    results = []
    completed = 0
    pass1_count = 0
    pass2_count = 0
    pass3_count = 0
    total_tok_in = 0
    total_tok_out = 0
    save_count = 0
    max_problems = len(eval_df)
    
    pbar = tqdm(total=max_problems, desc="Evaluating")
    write_lock = anyio.Lock()
    # Limit live coroutines so the event loop stays responsive
    spawn_limit = anyio.Semaphore(cfg.MAX_CONCURRENT * 3)
    # Cap concurrent subprocess executions to avoid thread-pool exhaustion.
    # Each problem may run up to (1 + N_SAMPLES) code executions, so we limit
    # total threads consumed by subprocesses independently of coroutine count.
    exec_limiter = anyio.CapacityLimiter(cfg.MAX_CONCURRENT * 2)
    
    async def process_one_with_progress(pool, idx, row):
        nonlocal completed, pass1_count, pass2_count, pass3_count
        nonlocal total_tok_in, total_tok_out, save_count
        
        t0 = time.monotonic()
        result, tok_in, tok_out = await evaluate_single_problem(
            pool, row.to_dict(), idx, exec_limiter=exec_limiter
        )
        elapsed = time.monotonic() - t0
        
        async with write_lock:
            results.append(result)
            completed += 1
            total_tok_in += tok_in
            total_tok_out += tok_out
            
            if result.get('correct_greedy', False):
                pass1_count += 1
            if any([result.get(f'correct_{j}', False) for j in range(1, min(3, N_SAMPLES + 1))]):
                pass2_count += 1
            if any([result.get(f'correct_{j}', False) for j in range(1, N_SAMPLES + 1)]):
                pass3_count += 1
            
            pbar.set_postfix({
                'P@1': f'{pass1_count}/{completed} ({pass1_count/completed*100:.1f}%)',
                'P@2': f'{pass2_count}/{completed} ({pass2_count/completed*100:.1f}%)',
                'P@3': f'{pass3_count}/{completed} ({pass3_count/completed*100:.1f}%)'
            })
            pbar.update(1)
            
            status = "Pass" if result['pass@1'] else "Fail"
            print(
                f"{completed}/{max_problems} -> {status}"
                f" -> P@1={result['pass@1']}, P@3={result['pass@3']},"
                f" req={elapsed:.1f}s,"
                f" tokens: in={tok_in}, out={tok_out}, total={tok_in+tok_out}",
                flush=True
            )
            
            # Periodic checkpoint
            if completed % cfg.SAVE_EVERY == 0:
                save_count += 1
                _save_checkpoint(list(results), completed, max_problems, save_count)
    
    async with LLMPool(
        base_url=API_BASE_URL,
        api_key=API_KEY,
        model=MODEL_NAME,
        max_inflight=cfg.MAX_CONCURRENT,
        timeout=cfg.REQUEST_TIMEOUT
    ) as pool:
        async with anyio.create_task_group() as tg:
            for idx, row in eval_df.iterrows():
                async def _run(pool=pool, idx=idx, row=row):
                    async with spawn_limit:
                        await process_one_with_progress(pool, idx, row)
                tg.start_soon(_run)
    
    pbar.close()
    print(f"\nTotal tokens: in={total_tok_in}, out={total_tok_out}, total={total_tok_in+total_tok_out}")
    return results

print("Evaluation engine ready")

Evaluation engine ready


In [296]:
# Sync test - bypasses event loop entirely
import httpx as _httpx
with _httpx.Client(timeout=10) as c:
    r = c.post(f"{API_BASE_URL}/chat/completions", json={
        "model": MODEL_NAME,
        "messages": [{"role":"user","content":"2+2?"}],
        "max_tokens": 16, "temperature": 0.0,
    }, headers={"Authorization": f"Bearer {API_KEY}"})
    print(f"Status: {r.status_code}")
    print(r.json()["choices"][0]["message"]["content"])

Status: 200
2+2 equals 4.


In [297]:
# Quick connectivity test - run this first
import httpx

async def test_api():
    body = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "What is 2+2? Reply with just the number."}
        ],
        "temperature": 0.0,
        "max_tokens": 32,
    }
    print(f"POST {API_BASE_URL}/chat/completions")
    print(f"model={MODEL_NAME}, timeout={cfg.REQUEST_TIMEOUT}s")
    
    async with httpx.AsyncClient(timeout=httpx.Timeout(30, connect=5)) as client:
        t0 = time.monotonic()
        try:
            resp = await client.post(
                f"{API_BASE_URL}/chat/completions",
                json=body,
                headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"},
            )
            elapsed = time.monotonic() - t0
            print(f"Status: {resp.status_code} in {elapsed:.2f}s")
            if resp.status_code == 200:
                data = resp.json()
                print(f"Content: {data['choices'][0]['message']['content']}")
                print(f"Tokens: in={data.get('usage',{}).get('prompt_tokens')}, out={data.get('usage',{}).get('completion_tokens')}")
            else:
                print(f"Body: {resp.text[:500]}")
        except Exception as e:
            elapsed = time.monotonic() - t0
            print(f"FAILED after {elapsed:.2f}s: {type(e).__name__}: {e}")

await test_api()

POST http://127.0.0.1:8080/v1/chat/completions
model=qlora0, timeout=50s
Status: 200 in 0.07s
Content: 4
Tokens: in=30, out=2


## Run Evaluation

In [298]:
print(f"Starting evaluation of {len(eval_df)} problems")
print(f"Greedy: temp={GREEDY_TEMPERATURE}")
print(f"Sampled: temp={SAMPLED_TEMPERATURE}, n={N_SAMPLES}\n")

start_time = time.time()
results = await run_evaluation(eval_df)
elapsed = time.time() - start_time

print(f"\nCompleted in {elapsed:.1f}s ({elapsed/len(eval_df):.2f}s per problem)")

Starting evaluation of 1319 problems
Greedy: temp=0
Sampled: temp=0.6, n=3



Evaluating:   0%|          | 0/1319 [00:00<?, ?it/s]

"total = 3 * 20\nprint(total)"
--------------------------------
"sprints = 3\nmeters_per_sprint = 60\ntotal_meters = sprints * meters_per_sprint\nprint(total_meters)"
--------------------------------
"cost_per_glass = 5\nprice_per_second = cost_per_glass * 0.6\ntotal_cost = cost_per_glass * 16\nprint(total_cost)"
--------------------------------
"blue_bolts = 2\nwhite_bolts = blue_bolts // 2\ntotal_bolts = blue_bolts + white_bolts\nprint(total_bolts)"
--------------------------------
"initial = 80000\nreparations = 50000\nprofit = initial + reparations - initial * 150 // 100\nprint(profit)"
--------------------------------
"eggs_per_day = 16\nbreakfast = 3\nmuffins = 4\ntotal_baked = breakfast + muffins\nremaining = total_baked - eggs_per_day\nselling = remaining * 2\nprint(selling)"
--------------------------------
"seattle = 20\ncharleston = 4 * Seattle\ntoulouse = 2 * Charleston\ntotal = toulouse + Charleston + Seattle\nprint(total)"
--------------------------------
"downloads_first

## Compute Metrics

In [299]:
results_df = pd.DataFrame(results)
N = len(results_df)

pass_at_1 = results_df['pass@1'].sum() / N
pass_at_3 = results_df['pass@3'].sum() / N
greedy_exec_rate = results_df['exec_ok_greedy'].sum() / N
sampled_exec_rate = sum([results_df[f'exec_ok_{j}'].sum() for j in range(1, N_SAMPLES+1)]) / (N * N_SAMPLES)

# New metrics
exec_ok_rate = results_df['exec_ok_greedy'].sum() / N  # Fraction where 1st greedy gen executes without errors
exec_ok_cases = results_df[results_df['exec_ok_greedy'] == True]
correct_given_exec = exec_ok_cases['correct_greedy'].sum() / len(exec_ok_cases) if len(exec_ok_cases) > 0 else 0.0

passed = results_df[results_df['pass@3'] == True]
mean_first_success = passed['first_success_attempt'].mean() if len(passed) > 0 else None

summary = {
    'model': MODEL_NAME,
    'dataset': DATASET_NAME,
    'n_problems': N,
    'pass@1': pass_at_1,
    'pass@3': pass_at_3,
    'greedy_exec_rate': greedy_exec_rate,
    'sampled_exec_rate': sampled_exec_rate,
    'exec_ok_rate': exec_ok_rate,
    'correct_given_exec': correct_given_exec,
    'mean_first_success_attempt': mean_first_success,
    'timestamp': datetime.now().isoformat()
}

print("\n" + "="*60)
print("EVALUATION SUMMARY")
print("="*60)
print(f"Model: {summary['model']}")
print(f"Dataset: {summary['dataset']}")
print(f"Problems: {summary['n_problems']}")
print()
print(f"Verified Pass@1:     {summary['pass@1']:.2%}")
print(f"Verified Pass@3:     {summary['pass@3']:.2%}")
print()
print(f"Greedy exec rate:    {summary['greedy_exec_rate']:.2%}")
print(f"Sampled exec rate:   {summary['sampled_exec_rate']:.2%}")
print()
print(f"Exec OK rate:        {summary['exec_ok_rate']:.2%}  (1st greedy gen)")
print(f"Correct given exec:  {summary['correct_given_exec']:.2%}  (P(correct | exec_ok))")
if summary['mean_first_success_attempt']:
    print(f"Mean first success:  {summary['mean_first_success_attempt']:.2f}")
print("="*60)


EVALUATION SUMMARY
Model: qlora0
Dataset: qwen
Problems: 1319

Verified Pass@1:     22.97%
Verified Pass@3:     34.34%

Greedy exec rate:    92.57%
Sampled exec rate:   73.82%

Exec OK rate:        92.57%  (1st greedy gen)
Correct given exec:  24.82%  (P(correct | exec_ok))
Mean first success:  1.45


In [None]:

# ============================================================
# EVALUATION SUMMARY
# ============================================================
# Model: qwen
# Dataset: qwen
# Problems: 1319

# Verified Pass@1:     17.97%
# Verified Pass@3:     28.05%

# Greedy exec rate:    91.51%
# Sampled exec rate:   78.67%

# Exec OK rate:        91.51%  (1st greedy gen)
# Correct given exec:  19.64%  (P(correct | exec_ok))
# Mean first success:  1.52
# ============================================================

# ============================================================
# EVALUATION SUMMARY
# ============================================================
# Model: qlora0
# Dataset: qwen
# Problems: 1319

# Verified Pass@1:     22.97%
# Verified Pass@3:     34.34%

# Greedy exec rate:    92.57%
# Sampled exec rate:   73.82%

# Exec OK rate:        92.57%  (1st greedy gen)
# Correct given exec:  24.82%  (P(correct | exec_ok))
# Mean first success:  1.45
# ============================================================

## Per-Domain & Per-Level Breakdown

In [301]:
def _build_breakdown(df, group_col, label):
    """Build a pass/fail breakdown table grouped by group_col."""
    # Compute pass@2 (any of first 2 samples correct)
    df = df.copy()
    df['pass@2'] = df.apply(
        lambda r: any(r.get(f'correct_{j}', False) for j in range(1, min(3, N_SAMPLES + 1))), axis=1
    )

    rows = []
    for name, grp in df.groupby(group_col, sort=True):
        n = len(grp)
        p1 = grp['pass@1'].sum()
        p2 = grp['pass@2'].sum()
        p3 = grp['pass@3'].sum()
        rows.append({
            label: name,
            'N': n,
            'Pass@1': int(p1),
            'Pass@1%': f'{p1/n:.1%}',
            'Fail@1': int(n - p1),
            'Pass@2': int(p2),
            'Pass@2%': f'{p2/n:.1%}',
            'Fail@2': int(n - p2),
            'Pass@3': int(p3),
            'Pass@3%': f'{p3/n:.1%}',
            'Fail@3': int(n - p3),
            'Greedy Pass': int(grp['correct_greedy'].sum()),
            'Greedy Pass%': f'{grp["correct_greedy"].sum()/n:.1%}',
        })
    # Totals row
    n = len(df)
    p1 = df['pass@1'].sum()
    p2 = df['pass@2'].sum()
    p3 = df['pass@3'].sum()
    gp = df['correct_greedy'].sum()
    rows.append({
        label: 'TOTAL',
        'N': n,
        'Pass@1': int(p1), 'Pass@1%': f'{p1/n:.1%}', 'Fail@1': int(n - p1),
        'Pass@2': int(p2), 'Pass@2%': f'{p2/n:.1%}', 'Fail@2': int(n - p2),
        'Pass@3': int(p3), 'Pass@3%': f'{p3/n:.1%}', 'Fail@3': int(n - p3),
        'Greedy Pass': int(gp), 'Greedy Pass%': f'{gp/n:.1%}',
    })
    return pd.DataFrame(rows)

# --- Per-Domain breakdown ---
print("=" * 80)
print("PER-DOMAIN BREAKDOWN")
print("=" * 80)
domain_table = _build_breakdown(results_df, 'domain', 'Domain')
display(domain_table)

# --- Per-Level breakdown ---
print("\n" + "=" * 80)
print("PER-LEVEL BREAKDOWN")
print("=" * 80)
level_table = _build_breakdown(results_df, 'level', 'Level')
display(level_table)

PER-DOMAIN BREAKDOWN


Unnamed: 0,Domain,N,Pass@1,Pass@1%,Fail@1,Pass@2,Pass@2%,Fail@2,Pass@3,Pass@3%,Fail@3,Greedy Pass,Greedy Pass%
0,unknown,1319,303,23.0%,1016,400,30.3%,919,453,34.3%,866,303,23.0%
1,TOTAL,1319,303,23.0%,1016,400,30.3%,919,453,34.3%,866,303,23.0%



PER-LEVEL BREAKDOWN


Unnamed: 0,Level,N,Pass@1,Pass@1%,Fail@1,Pass@2,Pass@2%,Fail@2,Pass@3,Pass@3%,Fail@3,Greedy Pass,Greedy Pass%
0,-1,1319,303,23.0%,1016,400,30.3%,919,453,34.3%,866,303,23.0%
1,TOTAL,1319,303,23.0%,1016,400,30.3%,919,453,34.3%,866,303,23.0%


In [302]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_dir = OUTPUT_DIR / f"{MODEL_NAME}_{DATASET_NAME}_{timestamp}"
run_dir.mkdir(parents=True, exist_ok=True)

# --- Save full CSV + summary JSON ---
csv_path = run_dir / f"results.csv"
json_path = run_dir / f"summary.json"

results_df.to_csv(csv_path, index=False)
with open(json_path, 'w') as f:
    json.dump(summary, f, indent=2)

# --- Save per-domain & per-level tables ---
domain_table.to_csv(run_dir / "breakdown_domain.csv", index=False)
level_table.to_csv(run_dir / "breakdown_level.csv", index=False)

# --- Build JSONL records for passed / failed ---
def _best_attempt(row):
    """Return (code, result, attempt_label) for the best successful attempt, or the greedy attempt if all failed."""
    # Check greedy first
    if row.get('correct_greedy'):
        return row.get('code_greedy'), row.get('result_greedy'), 'greedy'
    # Check sampled attempts
    for j in range(1, N_SAMPLES + 1):
        if row.get(f'correct_{j}'):
            return row.get(f'code_{j}'), row.get(f'result_{j}'), f'sample_{j}'
    # All failed — return greedy as the representative attempt
    return row.get('code_greedy'), row.get('result_greedy'), 'greedy'

passed_records = []
failed_records = []

for _, row in results_df.iterrows():
    code, result, attempt_label = _best_attempt(row)

    rec = {
        'problem_id': row['problem_id'],
        'domain': row.get('domain', 'unknown'),
        'level': int(row.get('level', -1)),
        'problem': row['problem'],
        'expected_answer': row['answer'],
        'predicted_answer': result,
        'best_attempt': attempt_label,
        'code': code,
        'greedy_pass': bool(row.get('correct_greedy', False)),
        'pass@1': bool(row.get('pass@1', False)),
        'pass@3': bool(row.get('pass@3', False)),
        'greedy_code': row.get('code_greedy'),
        'greedy_result': row.get('result_greedy'),
        'greedy_error': row.get('error_greedy'),
    }
    # Include all sampled attempt details
    for j in range(1, N_SAMPLES + 1):
        rec[f'sample_{j}_code'] = row.get(f'code_{j}')
        rec[f'sample_{j}_result'] = row.get(f'result_{j}')
        rec[f'sample_{j}_correct'] = bool(row.get(f'correct_{j}', False))
        rec[f'sample_{j}_error'] = row.get(f'error_{j}')

    if row.get('pass@3', False):
        passed_records.append(rec)
    else:
        failed_records.append(rec)

# Write JSONL files
passed_path = run_dir / "passed.jsonl"
failed_path = run_dir / "failed.jsonl"

for path, records in [(passed_path, passed_records), (failed_path, failed_records)]:
    with open(path, 'w') as f:
        for rec in records:
            f.write(json.dumps(rec, default=str) + '\n')

print(f"Evaluation saved to: {run_dir}/")
print(f"  results.csv          ({len(results_df)} rows)")
print(f"  summary.json")
print(f"  breakdown_domain.csv")
print(f"  breakdown_level.csv")
print(f"  passed.jsonl         ({len(passed_records)} problems)")
print(f"  failed.jsonl         ({len(failed_records)} problems)")

print("\nSample (first 5):")
cols = ['problem_id', 'domain', 'level', 'pass@1', 'pass@3', 'correct_greedy', 'correct_1', 'correct_2', 'correct_3']
display(results_df[cols].head())

Evaluation saved to: evaluations/qlora0_qwen_20260208_022016/
  results.csv          (1319 rows)
  summary.json
  breakdown_domain.csv
  breakdown_level.csv
  passed.jsonl         (453 problems)
  failed.jsonl         (866 problems)

Sample (first 5):


Unnamed: 0,problem_id,domain,level,pass@1,pass@3,correct_greedy,correct_1,correct_2,correct_3
0,3,unknown,-1,False,True,False,True,False,False
1,1,unknown,-1,True,True,True,True,False,False
2,10,unknown,-1,False,True,False,True,False,False
3,0,unknown,-1,False,True,False,True,False,False
4,16,unknown,-1,True,True,True,False,True,False


In [303]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path = OUTPUT_DIR / f"{MODEL_NAME}_{DATASET_NAME}_eval_{timestamp}.csv"
json_path = OUTPUT_DIR / f"{MODEL_NAME}_{DATASET_NAME}_summary_{timestamp}.json"

results_df.to_csv(csv_path, index=False)
with open(json_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nResults saved to: {csv_path}")
print(f"Summary saved to: {json_path}")

print("\nSample (first 5):")
cols = ['problem_id', 'pass@1', 'pass@3', 'correct_greedy', 'correct_1', 'correct_2', 'correct_3']
display(results_df[cols].head())


Results saved to: evaluations/qlora0_qwen_eval_20260208_022016.csv
Summary saved to: evaluations/qlora0_qwen_summary_20260208_022016.json

Sample (first 5):


Unnamed: 0,problem_id,pass@1,pass@3,correct_greedy,correct_1,correct_2,correct_3
0,3,False,True,False,True,False,False
1,1,True,True,True,True,False,False
2,10,False,True,False,True,False,False
3,0,False,True,False,True,False,False
4,16,True,True,True,False,True,False


## Analysis

In [304]:
improved = results_df[(results_df['pass@1'] == False) & (results_df['pass@3'] == True)]
print(f"\nImproved by sampling: {len(improved)}/{N} ({len(improved)/N:.1%})")

if len(improved) > 0:
    print("\nFirst success distribution:")
    print(improved['first_success_attempt'].value_counts().sort_index())

failed_all = results_df[(results_df['pass@1'] == False) & (results_df['pass@3'] == False)]
print(f"\nFailed all attempts: {len(failed_all)}/{N}")

print("\nExecution failures:")
print(f"Greedy: {(~results_df['exec_ok_greedy']).sum()}")
for j in range(1, N_SAMPLES + 1):
    print(f"Sample {j}: {(~results_df[f'exec_ok_{j}']).sum()}")


Improved by sampling: 178/1319 (13.5%)

First success distribution:
first_success_attempt
1.0    83
2.0    57
3.0    38
Name: count, dtype: int64

Failed all attempts: 838/1319

Execution failures:
Greedy: 98
Sample 1: 123
Sample 2: 416
Sample 3: 497


In [305]:
results_df[results_df['exec_ok_greedy'] == False].head()

Unnamed: 0,problem_id,problem,answer,domain,level,code_greedy,exec_ok_greedy,result_greedy,error_greedy,correct_greedy,...,error_2,correct_2,code_3,exec_ok_3,result_3,error_3,correct_3,pass@1,pass@3,first_success_attempt
12,6,Toulouse has twice as many sheep as Charleston...,260,unknown,-1,seattle = 20\ncharleston = 4 * Seattle\ntoulou...,False,,"Traceback (most recent call last):\n File ""/t...",False,...,"Traceback (most recent call last):\n File ""/t...",False,seattle_sheep = 20\ncharleston_sheep = 4 * Sea...,False,,"Traceback (most recent call last):\n File ""/t...",False,False,False,
33,29,Gloria is shoe shopping when she comes across ...,104,unknown,-1,cost_heels = 33\ncost_highheels = cost_heels *...,False,,"Traceback (most recent call last):\n File ""/t...",False,...,"Traceback (most recent call last):\n File ""/t...",False,cost_heels = 33\ncost_high_heel = 2 * cost_hee...,False,,"Traceback (most recent call last):\n File ""/t...",False,False,False,
37,37,John plans to sell all his toys and use the mo...,2,unknown,-1,total_spent = 15 * 13 + 20 * 8\nremaining = 5\...,False,,"Traceback (most recent call last):\n File ""/t...",False,...,,False,total_revenue = 13 * 15\ntotal_spent = 8 * 20\...,True,30.0,,False,False,False,
44,41,"The great dragon, Perg, sat high atop mount Fa...",200,unknown,-1,distance = 400\ngold_javelin = 1000\nsapphire_...,False,,"Traceback (most recent call last):\n File ""/t...",False,...,"Traceback (most recent call last):\n File ""/t...",False,distance = 400\nhit_distance = distance - 3\np...,True,397.0,,False,False,False,
50,46,Candice put 80 post-it notes in her purse befo...,163,unknown,-1,total = 80 + 23\nassert total % 220 == 0\npack...,False,,"Traceback (most recent call last):\n File ""/t...",False,...,,False,remaining = 23\ntotal_cups = 220\nnotes_per_cu...,False,,"Traceback (most recent call last):\n File ""/t...",False,False,False,
