# Search with base models

## Goal

Can we solve ARC tasks using base models with access to a DSL?

## Imports

In [None]:
import os
import logging
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

# Initialise logging through the project's arc25.logging helper
# (the exact handlers/format are defined in that module, not here).
configure_logging()
# os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

# # Add VLLM specific environment variables to avoid common issues
# os.environ['VLLM_USE_MODELSCOPE'] = 'False'
# os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

In [None]:
import time
import importlib
import inspect
import json
import gc
import subprocess
import signal
import pandas as pd

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from vllm import LLM, SamplingParams

from arc25.training_tasks import *
from arc25.encoders import create_grid_encoder
from arc25.prompting import pretty_print_prompt, Template
from arc25.metrics import pixel_similarity_score, correct_grids_score

## Code

### Prompt

https://github.com/flowersteam/SOAR/blob/main/soar/prompt.py

In [None]:
def extract_footprint(module_name: str, show_types: bool = False) -> str:
    """
    Load a module by name and list its public top-level functions.

    The result is a newline-separated string with one bullet per function,
    in the order returned by ``inspect.getmembers`` (sorted by name):

      - dsl.func_name(arg1, arg2)

    Functions imported into the module from elsewhere and functions whose
    name starts with an underscore are skipped.

    Args:
        module_name: Importable dotted name of the module to inspect.
        show_types: If True, the full signature (annotations, defaults and
            return annotation) is shown. If False, annotations and defaults
            are stripped but the call shape is preserved, i.e. ``*args``,
            ``**kwargs`` and the ``*`` / ``/`` separators are kept.

    Returns:
        The bullet list as a single string (empty if nothing qualifies).
    """
    mod = importlib.import_module(module_name)
    footprints = []

    for name, fn in inspect.getmembers(mod, inspect.isfunction):
        # skip imports from elsewhere and private helpers
        if fn.__module__ != module_name or name.startswith("_"):
            continue

        sig = inspect.signature(fn)
        if not show_types:
            # Strip annotations/defaults on the Signature object itself rather
            # than joining bare names, so variadic and keyword-only markers
            # are not silently dropped from the rendered footprint.
            bare_params = [
                p.replace(
                    annotation=inspect.Parameter.empty,
                    default=inspect.Parameter.empty,
                )
                for p in sig.parameters.values()
            ]
            sig = sig.replace(
                parameters=bare_params,
                return_annotation=inspect.Signature.empty,
            )

        footprints.append(f"- dsl.{name}{str(sig)}")

    return "\n".join(footprints)

print(extract_footprint('arc25.BARC_dsl', show_types=True))

In [None]:
# Load the ARC training challenges once; get_task() looks tasks up in this dict.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (JSON files are UTF-8).
with open('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json', 'r', encoding='utf-8') as f:
    training_challenges = json.load(f)

def get_task(task_name):
    """
    Build a ``Task`` from the 'train' samples of one training challenge.

    The inputs and outputs of every train sample are wrapped in ``Img``;
    the task is created with an empty ``code`` and ``name=task_name``.

    Raises:
        ValueError: if ``task_name`` is not a key of ``training_challenges``.
    """
    if task_name not in training_challenges:
        raise ValueError(f"Task {task_name} not found in training challenges.")

    train_pairs = training_challenges[task_name]['train']
    inputs = [Img(pair['input']) for pair in train_pairs]
    outputs = [Img(pair['output']) for pair in train_pairs]
    return Task(inputs=inputs, outputs=outputs, code='', name=task_name)

In [None]:
# System message sent as the first chat turn (see create_prompt_from_task).
system_prompt = """You are an advanced AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks."""


# User-message template. It is rendered with two variables:
#   - dsl: the text listing of the available DSL functions
#   - train_samples: a list of dicts with 'input' and 'output' text grids
# The placeholder syntax ({{ ... }}, {% for %}, loop.index) looks Jinja-style;
# the actual engine is whatever arc25.prompting.Template wraps — TODO confirm.
# NOTE(review): the prompt text contains wording slips (e.g. "write the
# implemented the transformation", "F.e."). They are part of the runtime
# string, so they are left untouched here; changing them changes what the
# model sees.
prompt_template = Template(
"""You are tasked with solving a transformation problem from the Abstraction and Reasoning Challenge (ARC).
Implement the transformation rules as a Python function.
You should only write the implemented the transformation in code.
You must write code in triple backticks (```python and then ```). You must write a function called `transform` which takes a single argument, the input grid as `list[list[int]]`, and returns the transformed grid (also as `list[list[int]]`).

## Key Priors:

- **Objectness**: Consider the grid as containing objects (groups of connected cells) rather than just individual pixels.
- **Goal-Directed**: The transformation should achieve a specific goal, such as creating symmetry or changing the color of specific objects.
- **Numbers & Counting**: Keep track of the number of objects, sizes, and their relative positions.
- **Geometry & Topology**: Use spatial relationships such as adjacency, enclosure, or symmetry.

Carefully analyze the examples and find the underlying transformation logic.

## Domain Specific Primitive Functions

You can use the already implemented following functions to manipulate the grid:

{{ dsl }}

The dsl has been already imported, so just simply call the functions as needed. F.e. dsl.foo()
Do not import the dsl again, just use it directly.

## Examples

Below are several input-output examples that illustrate the transformation.
Your function should generalize the pattern from these examples to solve any input following the same logic.

{% for sample in train_samples %}
### Example {{ loop.index }}

#### Input

{{ sample.input }}

#### Output

{{ sample.output }}
{% endfor %}
""")


def create_prompt_from_task(task, grid_encoder, tokenizer):
    """
    Render the full chat prompt (as a string) for one task.

    Every (input, output) pair of the task is converted to text with
    ``grid_encoder.to_text`` and injected into ``prompt_template`` together
    with the typed footprint of ``arc25.BARC_dsl``. The system and user
    messages are then formatted with the tokenizer's chat template, without
    tokenizing and with the generation prompt appended.
    """
    train_samples = []
    for grid, output in zip(task.inputs, task.outputs):
        train_samples.append({
            'input': grid_encoder.to_text(grid),
            'output': grid_encoder.to_text(output),
        })

    user_content = prompt_template.render(
        train_samples=train_samples,
        dsl=extract_footprint('arc25.BARC_dsl', show_types=True),
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

### Model

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1, max_model_len=32000):
    """
    Create a vLLM engine and load the matching Hugging Face tokenizer.

    Args:
        model_path: Local path of the model. If it ends with ``.gguf`` the
            tokenizer is read from a ``tokenizer`` directory located next to
            the file; otherwise it is read from ``model_path`` itself.
        use_4bit_quantization: When True, vLLM is asked to use
            ``bitsandbytes`` quantization; otherwise no quantization is set.
        tensor_parallel_size: Forwarded to vLLM's ``tensor_parallel_size``.
        max_model_len: Forwarded to vLLM's ``max_model_len``.

    Returns:
        Tuple ``(llm, tokenizer)``.
    """
    logging.info(f"Loading model from {model_path}")
    # Free cached GPU memory first so vLLM can reserve its share.
    cleanup_gpu()
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.9,  # fraction of GPU memory vLLM may reserve
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization="bitsandbytes" if use_4bit_quantization else None,
        enable_prefix_caching=True, # Seems that it is true by default, but let's be explicit
        max_model_len=max_model_len,
    )
    if model_path.endswith('.gguf'):
        tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer')
    else:
        tokenizer_path = model_path
    # Keep the tokenizer consistent with the engine above: models that ship
    # custom tokenizer code need trust_remote_code here as well.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
    return llm, tokenizer


def cleanup_gpu():
    """Run the garbage collector and release cached CUDA memory before loading VLLM."""
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    # Wait for outstanding CUDA work to finish.
    torch.cuda.synchronize()

### Code

In [None]:
def parse_python_code(text):
    """
    Return the contents of the first ```python fenced block in ``text``.

    An empty string is returned when there is no opening ```python fence, or
    when no closing ``` appears before the next ```python fence (or the end
    of the text). The extracted code is stripped of surrounding whitespace.
    """
    opening_fence, closing_fence = '```python', '```'
    segments = text.split(opening_fence)
    if len(segments) < 2:
        return ''

    body, fence, _ = segments[1].partition(closing_fence)
    if not fence:
        return ''
    return body.strip()

In [None]:
def curate_python_code(code):
    """
    Clean generated code before execution.

    Every line containing ``import dsl``, ``from dsl import `` or ``print(``
    is dropped, then the ``typing`` and ``numpy`` imports are prepended.
    The result is stripped of leading/trailing whitespace.
    """
    banned_fragments = ('import dsl', 'from dsl import ', 'print(')
    kept_lines = []
    for line in code.split('\n'):
        if any(fragment in line for fragment in banned_fragments):
            continue
        kept_lines.append(line)

    header_lines = ['from typing import List, Tuple', 'import numpy']
    return '\n'.join(header_lines + kept_lines).strip()

### Validations

In [None]:
def validate_outputs(outputs):
    """Validate each output in order and return the list of resulting arrays."""
    validated = []
    for candidate in outputs:
        validated.append(_validate_output(candidate))
    return validated

def _validate_output(output):
    """
    Convert one output to a numpy array and require it to be 2D.

    Raises:
        ValueError: if ``output`` is None or the array is not two-dimensional.
    """
    if output is None:
        raise ValueError("Output is None")
    # Normalise to an array: outputs may otherwise mix lists and numpy arrays.
    grid = np.array(output)
    if grid.ndim == 2:
        return grid
    raise ValueError(f"Output is not a 2D array: {grid.shape}")

In [None]:
import hashlib

def fingerprint(prediction):
    """
    Return a SHA-256 hex digest identifying a list of matrices.

    Each matrix contributes, in order, its shape, its dtype string and its
    raw bytes, so that e.g. a (2×2) and a (4×1) matrix holding the same
    values get different digests.
    """
    digest = hashlib.sha256()
    for matrix in prediction:
        chunks = (
            str(matrix.shape).encode(),   # shape, in a reproducible text form
            matrix.dtype.str.encode(),    # dtype
            matrix.tobytes(),             # raw data bytes
        )
        for chunk in chunks:
            digest.update(chunk)
    return digest.hexdigest()

## First steps

In [None]:
# Local checkpoint used in this section; the same path is also passed as the
# `model` name in the requests sent to the vLLM server below.
model_path = "/home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct"
# proc = subprocess.Popen([
#     "vllm", "serve", model_path,
#     "--data-parallel-size", "2",
#     "--tensor-parallel-size", "1",
#     "--gpu-memory-utilization", "0.9",
#     "--trust-remote-code",
#     "--dtype", "bfloat16",
#     # "--quantization", "bitsandbytes",
#     "--enable-prefix-caching",
#     "--max-model-len", "32000"
# ],
#     stdout=subprocess.PIPE,
#     stderr=subprocess.STDOUT,
#     text=True
# )

# vllm serve /home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct --data-parallel-size 2 --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --trust-remote-code --dtype bfloat16 --enable-prefix-caching --max-model-len 32000

In [None]:
import httpx

# Poll the server's /models endpoint until it answers 200, giving up after
# `timeout` seconds. A monotonic clock is used so the deadline is unaffected
# by system clock adjustments, and the HTTP client is closed when polling
# ends (on success or on failure).
start = time.monotonic()
timeout = 120.0
with httpx.Client(base_url="http://localhost:8000/v1", timeout=5.0) as client:
    while True:
        try:
            resp = client.get("/models")
            if resp.status_code == 200:
                print("✅ vLLM server is ready!")
                break
        except httpx.RequestError:
            # still starting up
            pass

        if time.monotonic() - start > timeout:
            # proc.kill()
            raise RuntimeError(f"vLLM failed to start within {timeout:.0f}s")
        time.sleep(0.5)

# print('\n'.join(proc.stdout.readlines()))

In [None]:
from openai import OpenAI

# Smoke test of the local server through the OpenAI SDK: one plain
# completion and one chat completion, each printed.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

# — Completions
completion = client.completions.create(
    prompt="Once upon a time,",
    model=model_path,
    temperature=0.7,
    max_tokens=50,
)
first_choice = completion.choices[0]
print(first_choice.text)

# — Chat
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user",   "content": "Tell me a joke."},
]
chat_resp = client.chat.completions.create(model=model_path, messages=chat_messages)
print(chat_resp.choices[0].message.content)


In [None]:
# Build one chat-formatted prompt per training task, using the tokenizer of
# the served model to apply the chat template.
task_ids = list(training_challenges)
sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=2048)
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
prompts = []
for task_id in task_ids:
    task_prompt = create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer)
    prompts.append(task_prompt)

In [None]:
# Single blocking request carrying every prompt, asking for n=8 samples each
# with the same sampling settings as `sampling_params`.
completion = client.completions.create(
    model=model_path,
    prompt=prompts,
    max_tokens=2048,
    temperature=1.0,
    top_p=0.95,
    n=8,
)
# Only the first returned choice is printed.
print(completion.choices[0].text)

```
# vllm serve /home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct --data-parallel-size 2 --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --trust-remote-code --dtype bfloat16 --enable-prefix-caching --max-model-len 32000

GPU usage is not good: it oscillates between 100% and 0%. I don't expect to see good numbers.
2025-07-27 09:11:51,475 - openai._base_client - INFO - _sleep_for_retry - Retrying request to /completions in 0.464033 seconds
2025-07-27 09:21:52,091 - openai._base_client - INFO - _sleep_for_retry - Retrying request to /completions in 0.945855 seconds

Maybe it is not designed for long calls.

# vllm serve /home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct --data-parallel-size 1 --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --trust-remote-code --dtype bfloat16 --enable-prefix-caching --max-model-len 32000
expecting around 744s of inference
```

In [None]:
import asyncio
from tqdm.notebook import tqdm
from openai import AsyncOpenAI

async def async_generate_completions_openai(
    prompts,
    api_key=None,
    model="gpt-3.5-turbo",
    max_tokens=2048,
    temperature=1.0,
    top_p=0.95,
    base_url="http://localhost:8000/v1",
):
    """
    Send one completion request per prompt concurrently and return the texts.

    All requests are started at once (no concurrency limit) against an
    OpenAI-compatible endpoint, and a progress bar advances as each finishes.

    Args:
        prompts (List[str]): Prompt strings to generate completions for.
        api_key (str, optional): API key sent to the endpoint. If None, the
            placeholder key "token-abc123" is used.
        model (str): Model name to use for completions.
        max_tokens (int): Maximum tokens to generate per prompt.
        temperature (float): Sampling temperature.
        top_p (float): Nucleus-sampling probability mass.
        base_url (str): Base URL of the OpenAI-compatible server.

    Returns:
        List[str]: Stripped text of the first choice for each prompt, in the
        same order as ``prompts``.

    Raises:
        Exception: the error of the first failed request that is observed.
            Requests still in flight are cancelled before it propagates.
    """
    client = AsyncOpenAI(
        base_url=base_url,
        api_key=api_key or "token-abc123",
    )

    tasks = []
    task_to_idx = {}
    outputs = [None] * len(prompts)
    try:
        for idx, prompt in enumerate(prompts):
            task = asyncio.create_task(
                client.completions.create(
                    model=model,
                    prompt=prompt,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    n=1,
                )
            )
            tasks.append(task)
            task_to_idx[task] = idx

        # Use asyncio.wait to handle first-completed tasks and update a progress bar
        with tqdm(total=len(tasks), desc="Generating") as pbar:
            pending = set(tasks)
            while pending:
                done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
                for fut in done:
                    resp = fut.result()  # re-raises the request's error, if any
                    outputs[task_to_idx[fut]] = resp.choices[0].text.strip()
                    pbar.update(1)
    finally:
        # On failure, stop the requests that are still running and collect
        # every task's outcome so no "Task exception was never retrieved"
        # warnings are logged. On success all tasks are already done and this
        # returns immediately.
        for task in tasks:
            if not task.done():
                task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        await client.close()

    return outputs

# Example usage in a Jupyter notebook:
# prompts = ["Hello there!", "How are you?", "Tell me a joke."]
# results = asyncio.run(async_generate_completions_openai(prompts, api_key="YOUR_API_KEY"))
# for prompt, output in zip(prompts, results):
#     print(f"Prompt: {prompt}\nOutput: {output}\n")


In [None]:
# Time one concurrent run over all prompts. The top-level `await` relies on
# the notebook's already-running event loop.
t0 = time.time()
results = await async_generate_completions_openai(prompts, model=model_path)
print(f"Generated {len(results)} completions in {time.time() - t0:.2f} seconds")

# 1 GPU, 10 tasks -> 36.2s
# 2 GPUs, 10 tasks, 17.6s

# 1 GPU, 100 tasks, 95s
# 2 GPUs, 100 tasks, 68s

# 1 GPU, 400 tasks

Generating:   0%|          | 0/400 [00:00<?, ?it/s]

2025-07-27 10:12:52,102 - asyncio - ERROR - default_exception_handler - Task exception was never retrieved
future: <Task finished name='Task-7' coro=<AsyncCompletions.create() done, defined at /home/gbarbadillo/miniconda3/envs/arc25/lib/python3.10/site-packages/openai/resources/completions.py:1062> exception=BadRequestError("Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}")>
Traceback (most recent call last):
  File "/home/gbarbadillo/miniconda3/envs/arc25/lib/python3.10/site-packages/openai/resources/completions.py", line 1091, in create
    return await self._post(
  File "/home/gbarbadillo/miniconda3/envs/arc25/lib/python3.10/site-packages/openai/_base_client.py", line 1742, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
  File "/home/gbarbadillo/miniconda3/envs/arc25/lib/python3.10/site-packages/openai/_base_client.py", line 1549, in request
    raise self._make_st

In [None]:
# Smaller batched request: the first 5 prompts with n=2 samples each
# (top_p is not set here, unlike the earlier requests).
completion = client.completions.create(
    model=model_path,
    prompt=prompts[:5],
    max_tokens=2048,
    temperature=1.0,
    n=2,
)

In [None]:
# Inspect the text of the third returned choice.
print(completion.choices[2].text)

In [None]:
# Number of choices returned for the batched request
# (presumably len(prompts[:5]) * n — TODO confirm against the output).
len(completion.choices)

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably a deliberate stop so "Run All" does not continue
# into the cells below — confirm.
raise

In [None]:
# Ask the server process to shut down with SIGINT.
# NOTE(review): `proc` is only created by the subprocess.Popen block that is
# currently commented out earlier in this notebook; as written, this cell
# fails with NameError unless that block is re-enabled.
proc.send_signal(signal.SIGINT)
# (Optional) Wait for it to terminate and grab any remaining output
proc.wait(timeout=5)
print("Server exited with code", proc.returncode)

In [None]:
# In-process path: create the vLLM engine and tokenizer directly with
# load_model() instead of talking to an HTTP server.
model_path = "/home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

# model_path = "/home/gbarbadillo/models/Qwen3-4B"
# llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

# model_path = '/home/gbarbadillo/models/Qwen2.5-Coder-14B-Instruct-GGUF/qwen2.5-coder-14b-instruct-q4_k_m.gguf' # Needs 2 GPUs
# llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=2, max_model_len=16000)

In [None]:
# Build one prompt per training task and sample n completions for each.
task_ids = list(training_challenges.keys())
sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=2048)
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')
prompts = [create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer) for task_id in task_ids]
if 'Qwen3' in model_path:
    # disable thinking without using the chat template
    prompts = [prompt + '<think>\n\n</think>\n\n' for prompt in prompts]

t0 = time.time()
outputs = llm.generate(prompts, sampling_params)
# Stop the clock right after generation so that counting tokens below is not
# included in the reported inference time.
inference_time = time.time() - t0
total_tokens = sum(len(sample.token_ids) for output in outputs for sample in output.outputs)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(outputs):.2f} seconds")
# This value is divided by both the number of tasks and the n samples per
# task, so it is an average per prediction rather than per task.
print(f"Average tokens per prediction: {total_tokens / len(outputs) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

In [None]:
import arc25.BARC_dsl as dsl

# For every task, extract the code of each sampled response, clean it, run it
# on the task inputs and keep the validated outputs. Responses without a
# parseable code block are skipped; execution/validation failures are logged
# and the code is still recorded in predicted_code.
predicted_code = {}
predicted_outputs = {}
for key in task_ids:
    predicted_code[key] = []
    predicted_outputs[key] = []

for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if not code:
            continue
        code = curate_python_code(code)
        predicted_code[task_id].append(code)
        try:
            task_predicted_outputs = safe_code_execution(code, task.inputs, func_name='transform', dsl=dsl)
            task_predicted_outputs = validate_outputs(task_predicted_outputs)
        except Exception as e:
            logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")
        else:
            predicted_outputs[task_id].append(task_predicted_outputs)

In [None]:
# One row per task; every rate below is normalised by the number of samples
# requested per task (sampling_params.n).
df = pd.DataFrame(columns=['valid code', 'valid outputs', 'unique outputs', 'dsl usage', 'pixel similarity', 'correct grids', 'solved task'])
for task_id in task_ids:
    # Fraction of responses that contained a parseable code block.
    df.loc[task_id, 'valid code'] = len(predicted_code[task_id])/sampling_params.n
    # Fraction of responses whose code ran and produced outputs that passed validation.
    df.loc[task_id, 'valid outputs'] = len(predicted_outputs[task_id])/sampling_params.n
    # Distinct predictions, compared through their fingerprint hash.
    df.loc[task_id, 'unique outputs'] = len(set(fingerprint(output) for output in predicted_outputs[task_id]))/sampling_params.n
    # Fraction of responses whose code mentions 'dsl.' at least once.
    df.loc[task_id, 'dsl usage'] = sum(1 for code in predicted_code[task_id] if 'dsl.' in code)/sampling_params.n

    task = get_task(task_id)
    task_predicted_outputs = predicted_outputs[task_id]
    # Per prediction: mean pixel similarity over the task's output grids;
    # the column stores the mean of those values (0.0 when nothing is valid).
    scores = sorted([np.mean([pixel_similarity_score(output, pred) for output, pred in zip(task.outputs, predictions)]) for predictions in task_predicted_outputs])
    df.loc[task_id, 'pixel similarity'] = np.mean(scores) if scores else 0.0

    task_outputs = [np.array(output) for output in task.outputs]
    scores = sorted([correct_grids_score(task_outputs, predictions) for predictions in task_predicted_outputs])
    df.loc[task_id, 'correct grids'] = np.mean(scores) if scores else 0.0
    # A task counts as solved when at least one prediction scores exactly 1.
    df.loc[task_id, 'solved task'] = int(np.max(scores) == 1) if scores else 0

# Append a summary row holding the column-wise mean over all tasks.
df.loc['MEAN'] = df.mean(axis=0)
df

In [None]:
# Show only the last row, i.e. the 'MEAN' summary row appended above.
df.iloc[-1:]

In [None]:
# Save the table (including the 'MEAN' row) to a CSV in the working directory.
# The file name encodes the model name, task count, samples per task and the
# inference runtime in whole seconds.
output_path = f'{os.path.basename(model_path)}_{len(task_ids)}tasks_{sampling_params.n}preds_{int(inference_time)}runtime.csv'
df.to_csv(output_path, index_label='task_id')
print(f"Results saved to {output_path}")

## TODO

- [x] Create a prompt with the available DSL functions and the training ARC task
- [x] Fix VLLM initialization issues with proper memory management
- [x] Verify the effect of caching
- [x] Generate some code that can be used to test the new BARC dsl
- [x] Update the library to be able to select which DSL to use when executing code
- [x] Verify that I can execute the code generated with the BARC dsl
- [x] Add security checks to code, I have seen some input required in the code
- [ ] Try to solve some easy task with independent sampling
  - [x] How frequently is the dsl used?
  - [x] Influence of the model
  - [x] Implement output validation, and simplify the metric. Those are different responsibilities
  - [x] Correct grids
  - [x] Unique outputs
  - [x] Everything into a dataframe
  - [ ] Metrics distribution
  - [ ] Visualization of the predictions
  - [ ] Global metrics vs task specific analysis
- [ ] Create a refine prompt
- [ ] Make a more complex tree search