In [1]:
import datasets

train_ds = datasets.load_dataset("nuprl/engineering-llm-systems", "mbpp-rkt-correct-executions")
test_ds = datasets.load_dataset("nuprl/engineering-llm-systems", "mbpp-rkt-test-problems")


mbpp-rkt-correct-executions/train-00000-(…):   0%|          | 0.00/455k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2646 [00:00<?, ? examples/s]

mbpp-rkt-test-problems/train-00000-of-00(…):   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

In [2]:
from pprint import pprint

train_split = train_ds["train"]
test_split = test_ds["train"]  # their test set is published as the 'train' split

print(len(train_split), "train items")
print(len(test_split), "test items")

pprint(train_split[0].keys())
pprint(test_split[0].keys())


2646 train items
50 test items
dict_keys(['task_id', 'code', 'test_cases', 'lang', 'timeout_s', 'result', 'stdout', 'stderr', 'exit_code', 'description', 'input_format', 'output_format'])
dict_keys(['description', 'input_format', 'output_format', 'tests', 'task_id'])


In [3]:
def build_train_example(ex):
    desc = (ex.get("description") or "").strip()
    io_spec = (ex.get("io_spec") or "").strip()
    code = (ex.get("solution_rkt") or ex.get("reference_rkt") or ex.get("code") or "").strip()

    header = "\n".join([
        "#lang racket",
        ";; Write a Racket program that solves the following problem.",
        f";; Problem: {desc}" if desc else ";; Problem: (missing)",
        f";; I/O: {io_spec}" if io_spec else ";; I/O: (unspecified)",
        ";; The program must read from stdin and print the correct outputs to stdout.",
        ";; Provide only runnable Racket code below.",
        "",
    ])
    return header, code

def build_test_prompt(ex):
    desc = (ex.get("description") or "").strip()
    io_spec = (ex.get("io_spec") or "").strip()
    header = "\n".join([
        "#lang racket",
        ";; Write a Racket program that solves the following problem.",
        f";; Problem: {desc}" if desc else ";; Problem: (missing)",
        f";; I/O: {io_spec}" if io_spec else ";; I/O: (unspecified)",
        ";; The program must read from stdin and print the correct outputs to stdout.",
        ";; Provide only runnable Racket code below.",
        "",
    ])
    return header


In [4]:
import os, tempfile, subprocess, textwrap

RACKET_EXECUTABLE = "racket"
EVAL_TIMEOUT_SEC = 6

def run_racket_program(racket_source: str, stdin_payload: str, timeout_sec: int = EVAL_TIMEOUT_SEC):
    with tempfile.NamedTemporaryFile(mode="w", suffix=".rkt", delete=False) as f:
        f.write(racket_source)
        path = f.name
    try:
        proc = subprocess.run(
            [RACKET_EXECUTABLE, path],
            input=stdin_payload,
            text=True,
            capture_output=True,
            timeout=timeout_sec,
            check=False,
        )
        if proc.returncode == 0:
            return True, (proc.stdout or "").strip()
        return False, (proc.stderr or proc.stdout or "").strip()
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    except FileNotFoundError:
        return False, "RACKET_NOT_FOUND"
    finally:
        try: os.remove(path)
        except OSError: pass

def normalize(s: str) -> str:
    return "\n".join(line.rstrip() for line in (s or "").strip().splitlines()).strip()

def evaluate_problem(prompt, completions, tests):
    results = []
    any_pass = [False]*len(completions)
    for i, comp in enumerate(completions):
        prog = prompt + comp
        ok_all = True
        for t in tests:
            ok, out = run_racket_program(prog, t.get("input",""))
            if not ok or normalize(out) != normalize(t.get("output","")):
                ok_all = False
                break
        any_pass[i] = ok_all
        results.append({"idx": i, "passed": ok_all})
    pass_at_1 = 1.0 if (len(any_pass)>0 and any_pass[0]) else 0.0
    pass_at_5 = 1.0 if any(any_pass) else 0.0
    return pass_at_1, pass_at_5, results


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

MODEL_NAME = "Qwen/Qwen3-1.7B-Base"
TEMPERATURE = 0.2
TOP_P = 0.95
NUM_SAMPLES = 5
MAX_NEW_TOKENS = 512
SEED = 7

set_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=dtype, device_map="auto").eval()

@torch.no_grad()
def generate_completions(prompt: str, num_samples=NUM_SAMPLES):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=num_samples,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    prompt_len = inputs["input_ids"].shape[1]
    comps = []
    for i in range(outputs.shape[0]):
        gen = outputs[i, prompt_len:]
        comps.append(tokenizer.decode(gen, skip_special_tokens=True))
    return comps

# evaluate all test problems
total_p1, total_p5 = 0.0, 0.0
per_item = []
for i, ex in enumerate(test_split):
    prompt = build_test_prompt(ex)
    comps = generate_completions(prompt, NUM_SAMPLES)
    p1, p5, _ = evaluate_problem(prompt, comps, ex["tests"])
    total_p1 += p1
    total_p5 += p5
    per_item.append({"task_id": ex.get("task_id", i), "pass@1": p1, "pass@5": p5})
    if (i+1) % 5 == 0:
        print(f"{i+1}/{len(test_split)} | pass@1={total_p1/(i+1):.3f} pass@5={total_p5/(i+1):.3f}")

raw_pass1 = total_p1/len(test_split)
raw_pass5 = total_p5/len(test_split)
print("RAW model:", {"pass@1": raw_pass1, "pass@5": raw_pass5})


2025-10-04 13:17:08.636809: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-04 13:17:08.636850: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-04 13:17:08.636894: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-04 13:17:08.644288: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

5/50 | pass@1=0.000 pass@5=0.000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

10/50 | pass@1=0.000 pass@5=0.000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

15/50 | pass@1=0.000 pass@5=0.000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av