In [1]:
import subprocess
import time
import socket
import atexit
import argparse
import os
import sys
import json
import requests
from tqdm import tqdm
from torch.utils.data import Dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Wait until the server is up
def wait_until_ready(port=8080, timeout=30):
    """Block until a TCP connection to localhost:``port`` can be opened.

    Each connection attempt has its own 1-second connect timeout, and failed
    attempts are followed by a 0.2-second pause before retrying.

    Args:
        port: TCP port on localhost to probe.
        timeout: Overall time budget in seconds, measured with ``time.time()``.

    Raises:
        TimeoutError: If no connection succeeded within ``timeout`` seconds.
    """
    began = time.time()
    while True:
        # The budget is checked before every attempt, so a non-positive
        # timeout fails without ever opening a socket.
        if not (time.time() - began < timeout):
            raise TimeoutError("Server did not start in time.")
        try:
            probe = socket.create_connection(("localhost", port), timeout=1)
        except OSError:
            # Nothing is accepting connections yet; pause briefly and retry.
            time.sleep(0.2)
            continue
        # Connected: the port is live. Release the probe socket and report success.
        probe.close()
        return

def start_coq_verification_server():
    """Ensure the CoqStoq checker container is running and reachable.

    Safe to call repeatedly. If a container named ``coqstoq-server`` is
    already running, no second ``docker run`` is attempted (it would fail on
    the duplicate name) and no additional ``atexit`` handler is registered.
    Otherwise the container is launched detached with port 8080 published,
    and a handler that stops it is registered for interpreter exit.

    In both cases the function then waits, via ``wait_until_ready``, until
    the port accepts connections.

    Raises:
        subprocess.CalledProcessError: If ``docker run`` exits non-zero.
        TimeoutError: Propagated from ``wait_until_ready`` when the server
            does not become reachable in time.
    """
    container = "coqstoq-server"

    # `docker container inspect` exits non-zero when the container does not
    # exist; with this format string it prints "true" for a running one.
    probe = subprocess.run(
        ["docker", "container", "inspect", "--format", "{{.State.Running}}", container],
        capture_output=True,
        text=True,
    )
    already_running = probe.returncode == 0 and probe.stdout.strip() == "true"

    if not already_running:
        subprocess.run([
                "docker", "run", "--rm", "-d",
                "--name", container,
                "-p", "8080:8080",
                "coqstoq-full",
                "poetry", "run", "python3",
                "coqstoq/checker_server/server.py", "test", "77785", "."
            ], check=True)

        # Stop the server automatically when the script ends. Registered only
        # by the call that actually launched the container, so repeated calls
        # do not pile up duplicate handlers.
        atexit.register(lambda: subprocess.run(["docker", "stop", container]))

    wait_until_ready()

def check_proof(proof: str) -> dict:
    """
    Send ``proof`` to the verification server and return its reply.

    Calls ``start_coq_verification_server()`` and then ``wait_until_ready()``
    before issuing the request. The proof is wrapped in a JSON-RPC 2.0
    ``check_proof`` call and POSTed to http://localhost:8080 with a
    30-second request timeout.

    Returns the decoded JSON body as a dict, e.g.
    {"result": {"score": 1, "messages": []}, "id": 1, "jsonrpc": "2.0"}.
    An HTTP error status is raised by ``raise_for_status()``.
    """
    start_coq_verification_server()
    wait_until_ready()

    rpc_params = {"proof": proof}
    rpc_request = {
        "jsonrpc": "2.0",
        "method": "check_proof",
        "params": rpc_params,
        "id": 1,
    }

    response = requests.post("http://localhost:8080", json=rpc_request, timeout=30)
    response.raise_for_status()  # surface HTTP-level failures as exceptions
    return response.json()


  from .autonotebook import tqdm as notebook_tqdm


INFO 07-04 00:13:09 [__init__.py:244] Automatically detected platform cuda.


2025-07-04 00:13:11,318	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
# Argument parsing
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="/home/t-ilshapiro/CoqStoq/fstarcoq-qwq-32b-singleturn-sft") # path that points to the directory with the model name (e.g. fstarcoq-qwq-32b...)
parser.add_argument("--sample_n", type=int, default=1) # how many times we sample for each prompt (i.e. sample on same input)
parser.add_argument("--temperature", type=float, default=0.7)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--num_gpus", type=int, default=2)
args, _ = parser.parse_known_args() # patch for Jupyter notebooks

# Tunable limits, named once instead of being repeated as bare literals.
MAX_PROMPT_TOKENS = 8192   # prompts that tokenize to more than this are skipped
MAX_MODEL_LEN = 16384      # max_model_len handed to the vLLM engine
MAX_NEW_TOKENS = 16384     # max_tokens handed to SamplingParams

# Nothing in this cell launches the verification server; check_proof() calls
# start_coq_verification_server() itself. The previous messages here claimed
# the server was already running, which was not true at this point.
print("Coqstoq verification server will be started on demand by check_proof().")

# Load validation data
print("Loading validation data...")
valid_data = []
with open("coq-test-data.jsonl", encoding="utf-8") as file:
    for line in file:
        # A blank or trailing empty line would make json.loads raise.
        if not line.strip():
            continue
        valid_data.append(json.loads(line))
if args.debug:
    valid_data = valid_data[:100]

# Load tokenizer and vLLM engine
print(f"Loading tokenizer and checkpoint from {args.model_name}... ", end="")
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.padding_side = "left"
llm = LLM(model=args.model_name, dtype="bfloat16", max_model_len=MAX_MODEL_LEN, tensor_parallel_size=args.num_gpus)

# Prepare prompts
print("Preparing prompts...")
prompts = []
prompt_to_index = []  # (datum_idx, sample_idx), parallel to `prompts`
num_skipped = 0
for datum_idx, datum in enumerate(tqdm(valid_data)):
    prompt = datum["user_prompt"]
    if len(tokenizer(prompt).input_ids) > MAX_PROMPT_TOKENS:
        num_skipped += 1
        continue
    # The same prompt is submitted sample_n times; each copy is generated with n=1.
    for sample_idx in range(args.sample_n):
        prompts.append(prompt)
        prompt_to_index.append((datum_idx, sample_idx))
if num_skipped:
    # Report dropped examples so downstream statistics are not silently skewed.
    print(f"Skipped {num_skipped} prompts longer than {MAX_PROMPT_TOKENS} tokens.")

# Generate with vLLM
print(f"Sampling responses... {args.sample_n} samples per prompt, temp={args.temperature}")
sampling_params = SamplingParams(temperature=args.temperature, max_tokens=MAX_NEW_TOKENS, n=1)
outputs = llm.generate(prompts, sampling_params)
# zip() below would silently truncate on a length mismatch, so fail loudly instead.
assert len(outputs) == len(prompt_to_index), "vLLM returned an unexpected number of outputs"

# Organize responses into valid_data
for datum in valid_data:
    # Holds at most sample_n entries: skipped prompts contribute none, and
    # responses without both <answer> tags are dropped below.
    datum["model_generated_response"] = []

for output, (datum_idx, _) in zip(outputs, prompt_to_index):
    response = output.outputs[0].text
    if "<answer>" in response and "</answer>" in response:
        valid_data[datum_idx]["model_generated_response"].append(response) # datum_idx is the position of the record in valid_data



Starting Coqstoq verification server...
Server is running and ready!
Loading validation data...
Loading tokenizer and checkpoint from /home/t-ilshapiro/CoqStoq/fstarcoq-qwq-32b-singleturn-sft... INFO 07-04 00:16:23 [config.py:823] This model supports multiple tasks: {'reward', 'embed', 'generate', 'classify', 'score'}. Defaulting to 'generate'.
INFO 07-04 00:16:23 [config.py:1946] Defaulting to use mp for distributed inference
INFO 07-04 00:16:23 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 07-04 00:16:25 [core.py:455] Waiting for init message from front-end.
INFO 07-04 00:16:25 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='/home/t-ilshapiro/CoqStoq/fstarcoq-qwq-32b-singleturn-sft', speculative_config=None, tokenizer='/home/t-ilshapiro/CoqStoq/fstarcoq-qwq-32b-singleturn-sft', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=tor

Loading safetensors checkpoint shards:   0% Completed | 0/14 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   7% Completed | 1/14 [00:00<00:05,  2.52it/s]
Loading safetensors checkpoint shards:  14% Completed | 2/14 [00:00<00:05,  2.02it/s]
Loading safetensors checkpoint shards:  21% Completed | 3/14 [00:01<00:05,  1.88it/s]
Loading safetensors checkpoint shards:  29% Completed | 4/14 [00:02<00:05,  1.91it/s]
Loading safetensors checkpoint shards:  36% Completed | 5/14 [00:02<00:03,  2.30it/s]
Loading safetensors checkpoint shards:  43% Completed | 6/14 [00:02<00:03,  2.17it/s]
Loading safetensors checkpoint shards:  50% Completed | 7/14 [00:03<00:03,  2.00it/s]
Loading safetensors checkpoint shards:  57% Completed | 8/14 [00:03<00:03,  1.95it/s]
Loading safetensors checkpoint shards:  64% Completed | 9/14 [00:04<00:02,  1.87it/s]
Loading safetensors checkpoint shards:  71% Completed | 10/14 [00:05<00:02,  1.86it/s]
Loading safetensors checkpoint shards:  79% Completed | 11/14

[1;36m(VllmWorker rank=1 pid=1448855)[0;0m INFO 07-04 00:16:34 [default_loader.py:272] Loading weights took 7.10 seconds


Loading safetensors checkpoint shards: 100% Completed | 14/14 [00:07<00:00,  1.85it/s]
Loading safetensors checkpoint shards: 100% Completed | 14/14 [00:07<00:00,  1.93it/s]
[1;36m(VllmWorker rank=0 pid=1448852)[0;0m 


[1;36m(VllmWorker rank=0 pid=1448852)[0;0m INFO 07-04 00:16:34 [default_loader.py:272] Loading weights took 7.36 seconds
[1;36m(VllmWorker rank=1 pid=1448855)[0;0m INFO 07-04 00:16:35 [gpu_model_runner.py:1624] Model loading took 30.7118 GiB and 7.404882 seconds
[1;36m(VllmWorker rank=0 pid=1448852)[0;0m INFO 07-04 00:16:35 [gpu_model_runner.py:1624] Model loading took 30.7118 GiB and 7.663283 seconds
[1;36m(VllmWorker rank=1 pid=1448855)[0;0m INFO 07-04 00:16:48 [backends.py:462] Using cache directory: /home/t-ilshapiro/.cache/vllm/torch_compile_cache/2c592b1f5b/rank_1_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=1448855)[0;0m INFO 07-04 00:16:48 [backends.py:472] Dynamo bytecode transform time: 13.31 s
[1;36m(VllmWorker rank=0 pid=1448852)[0;0m INFO 07-04 00:16:48 [backends.py:462] Using cache directory: /home/t-ilshapiro/.cache/vllm/torch_compile_cache/2c592b1f5b/rank_0_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=0 pid=1448852)[0;0m INFO 07-04 00:16:48

  9%|▉         | 919/10396 [00:11<02:02, 77.17it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (885702 > 131072). Running this sequence through the model will result in indexing errors
 78%|███████▊  | 8114/10396 [01:42<00:34, 66.30it/s] 