# Search with base models

## Goal

Can we solve ARC tasks using base models with access to a DSL?

## Imports

In [None]:
import os
import logging
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

configure_logging()
os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

# Add VLLM specific environment variables to avoid common issues
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

In [None]:
import time
import importlib
import inspect
import json
import gc
import random
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import BeamSearchParams

from arc25.training_tasks import *
from arc25.encoders import create_grid_encoder
from arc25.prompting import pretty_print_prompt, Template
from arc25.metrics import pixel_similarity_score, correct_grids_score
from arc25.utils import get_timestamp
import arc25.BARC_dsl as dsl

## Code

### Prompt

https://github.com/flowersteam/SOAR/blob/main/soar/prompt.py

In [None]:
def extract_footprint(module_name: str, show_types: bool = False) -> str:
    """
    Import `module_name` and describe the public functions it defines.

    Every function contributes one line of the form:

      - dsl.func_name(arg1, arg2)

    and the lines are joined with newlines. Functions that were imported
    into the module from elsewhere, and names starting with an underscore,
    are left out.

    If show_types=True the full signature text is used (annotations,
    defaults and return annotation); otherwise only the bare parameter
    names are listed.
    """
    module = importlib.import_module(module_name)

    def render_signature(function) -> str:
        signature = inspect.signature(function)
        if show_types:
            return str(signature)
        # keep only the parameter names, dropping annotations and defaults
        return "(" + ", ".join(signature.parameters) + ")"

    lines = [
        f"- dsl.{name}{render_signature(function)}"
        for name, function in inspect.getmembers(module, inspect.isfunction)
        if function.__module__ == module_name and not name.startswith("_")
    ]
    return "\n".join(lines)

print(extract_footprint('arc25.BARC_dsl', show_types=True))

In [None]:
# Load the challenge files once; get_task() looks tasks up in these three dicts.
# Training challenges from the arc-prize-2024 data folder.
with open('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json', 'r') as f:
    training_challenges = json.load(f)

# Evaluation challenges from the arc-prize-2024 data folder.
with open('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_evaluation_challenges.json', 'r') as f:
    evaluation_challenges = json.load(f)

# Evaluation challenges from the arc-prize-2025 data folder.
with open('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2025/arc-agi_evaluation_challenges.json', 'r') as f:
    evaluation_challenges_2025 = json.load(f)

def get_task(task_name):
    """
    Build a `Task` from the 'train' samples of the challenge `task_name`.

    The name is searched in the training challenges first, then in the
    evaluation challenges and finally in the 2025 evaluation challenges.
    The returned task carries the train inputs and outputs wrapped in `Img`,
    an empty `code` string and `task_name` as its name.

    Raises:
        ValueError: if none of the loaded challenge sets contains `task_name`.
    """
    for challenges in (training_challenges, evaluation_challenges, evaluation_challenges_2025):
        if task_name in challenges:
            train_samples = challenges[task_name]['train']
            break
    else:
        # no dataset contained the task
        raise ValueError(f"Task {task_name} not found in training or evaluation challenges.")

    return Task(
        inputs=[Img(sample['input']) for sample in train_samples],
        outputs=[Img(sample['output']) for sample in train_samples],
        code='',
        name=task_name,
    )

In [None]:
# https://huggingface.co/barc0/Llama-3.1-ARC-Potpourri-Induction-8B
system_prompt = """You are a world-class puzzle solver with exceptional pattern recognition skills and expertise in Python programming. Your task is to analyze puzzles and provide Python solutions."""

prompt_template_text = """Given input-output grid pairs as reference examples, carefully observe the patterns to predict the output grid for new test input. Each pair follows the same transformation rule. Grids are 2D arrays represented as strings, with cells (colors) separated by spaces and rows by newlines.
Here are the input and output grids for the reference examples:
{% for sample in train_samples %}Example {{ loop.index }}
Input:
{{ sample.input }}

Output:
{{ sample.output }}

{% endfor %}
Here is the input grid for the test example:
{{ test }}

Write a Python function `transform` that can convert any given input grid to its corresponding output grid based on the pattern observed in the reference examples.
"""


prompt_template = Template(prompt_template_text)


def create_prompt_from_task(task, grid_encoder, tokenizer, shuffle_train_samples=False, remove_last_train_sample=False):
    """
    Render the chat prompt for `task`.

    All input/output pairs except the last one are shown as reference
    examples; the last input of the task is shown as the test grid. Grids
    are converted to text with `grid_encoder.to_text`.

    Args:
        task: object exposing `inputs` and `outputs` sequences of grids.
        grid_encoder: encoder used to turn each grid into text.
        tokenizer: tokenizer whose chat template wraps the messages.
        shuffle_train_samples: shuffle the reference examples in place.
        remove_last_train_sample: drop the last reference example (after the
            optional shuffle), but only when more than one is available.

    Returns:
        The prompt as a string, with the generation prompt appended.
    """
    train_samples = []
    for grid, output in zip(task.inputs[:-1], task.outputs[:-1]):
        train_samples.append({
            'input': grid_encoder.to_text(grid),
            'output': grid_encoder.to_text(output),
        })

    if shuffle_train_samples:
        random.shuffle(train_samples)
    if remove_last_train_sample and len(train_samples) > 1:
        train_samples.pop()

    user_message = prompt_template.render(
        train_samples=train_samples,
        test=grid_encoder.to_text(task.inputs[-1]),
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

### Model

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1, max_model_len=32000):
    """
    Create a vLLM engine for `model_path` together with its tokenizer.

    Args:
        model_path: path of the model to load. When it ends with '.gguf' the
            tokenizer is read from a 'tokenizer' folder next to the file,
            otherwise from `model_path` itself.
        use_4bit_quantization: request "bitsandbytes" quantization from vLLM.
        tensor_parallel_size: forwarded to vLLM unchanged.
        max_model_len: maximum context length given to vLLM.

    Returns:
        Tuple `(llm, tokenizer)`.
    """
    logging.info(f"Loading model from {model_path}")
    cleanup_gpu()

    quantization = "bitsandbytes" if use_4bit_quantization else None
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.9,  # fraction of GPU memory vLLM is allowed to use
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization=quantization,
        enable_prefix_caching=True,  # set explicitly rather than relying on the default
        max_model_len=max_model_len,
    )

    is_gguf = model_path.endswith('.gguf')
    tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer') if is_gguf else model_path
    return llm, AutoTokenizer.from_pretrained(tokenizer_path)


def cleanup_gpu():
    """Run garbage collection and empty the CUDA cache before loading VLLM."""
    gc.collect()
    cuda = torch.cuda
    cuda.empty_cache()
    if not cuda.is_available():
        return
    # wait for pending GPU work to finish
    cuda.synchronize()

### Code

In [None]:
def parse_python_code(text):
    """
    Return the contents of the first ```python fenced block found in `text`.

    An empty string is returned when there is no ```python fence, or when no
    closing ``` appears before the next ```python fence (or the end of text).
    The extracted code is stripped of surrounding whitespace.
    """
    opening = '```python'
    start = text.find(opening)
    if start == -1:
        return ''
    start += len(opening)

    # only look at the text up to the next opening fence, if there is one
    next_opening = text.find(opening, start)
    segment = text[start:] if next_opening == -1 else text[start:next_opening]

    end = segment.find('```')
    if end == -1:
        return ''
    return segment[:end].strip()

In [None]:
def curate_python_code(code):
    """
    Drop every line of `code` that contains a dsl/common import or a print call.

    A line is removed as soon as it contains one of the banned substrings,
    wherever it appears in the line. The remaining lines are joined again and
    the result is stripped of leading and trailing whitespace.
    """
    # NOTE(review): removing a `print(` line that is the only statement of an
    # indented block leaves that block empty - confirm this is acceptable.
    banned_fragments = ('import dsl', 'from dsl import ', 'print(', 'from common import *')
    kept_lines = []
    for line in code.split('\n'):
        if any(fragment in line for fragment in banned_fragments):
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

def add_additional_imports(code):
    """
    Prepend the typing and numpy imports to `code`.

    When `code` is empty only the import lines are returned, without a
    trailing newline.
    """
    header = '\n'.join((
        'from typing import List, Tuple',
        'import numpy as np',
        'import numpy',
    ))
    if not code:
        return header
    return f"{header}\n{code}"

### Validations

In [None]:
def validate_outputs(outputs):
    """
    Validate every grid in `outputs` and return them as numpy arrays.

    Raises:
        ValueError: if `outputs` is empty or any grid fails validation.
    """
    if not outputs:
        raise ValueError("Outputs list is empty")
    return [_validate_output(output) for output in outputs]

def _validate_output(output, max_side=35):
    """
    Convert `output` to a numpy array and check that it is a plausible grid.

    The grid must be 2D, non-empty, have no side longer than `max_side` and
    contain only values in the range [0, 9].

    Args:
        output: grid-like object (nested lists or array).
        max_side: largest allowed height or width. The default keeps the
            limit that was previously hard-coded in the check.

    Returns:
        The grid as a numpy array.

    Raises:
        ValueError: if any of the checks fails.
    """
    if output is None:
        raise ValueError("Output is None")
    output = np.array(output) # otherwise I see weird outputs that mix list and numpy arrays
    if output.ndim != 2:
        raise ValueError(f"Output is not a 2D array. Output shape: {output.shape}")
    if max(output.shape) > max_side:
        # report the limit that is actually enforced (the message used to say 30x30)
        raise ValueError(f"Output is too large, the maximum allowed shape is {max_side}x{max_side}. Output shape: {output.shape}")
    if min(output.shape) == 0:
        raise ValueError(f"Output has zero dimension, it is empty. Output shape: {output.shape}")
    if np.max(output) > 9 or np.min(output) < 0:
        raise ValueError(f"Output contains invalid values, expected values in range [0, 9]. Output max: {np.max(output)}, min: {np.min(output)}")
    return output

In [None]:
import hashlib

def fingerprint(prediction):
    """
    Return a SHA-256 hex digest identifying a list of matrices.

    For each matrix the shape, the dtype string and the raw bytes are fed to
    the hash, so that e.g. a (2×2) and a (4×1) matrix holding the same
    values get different fingerprints.
    """
    digest = hashlib.sha256()
    for matrix in prediction:
        # shape and dtype go first, followed by the raw data bytes
        header = str(matrix.shape) + matrix.dtype.str
        digest.update(header.encode())
        digest.update(matrix.tobytes())
    return digest.hexdigest()

### Metrics

In [None]:
def compute_search_metrics(task_ids, predicted_code, predicted_outputs, n_preds):
    """
    Summarize the search results in a DataFrame with one row per task.

    Args:
        task_ids: ids of the tasks to report; each must be a key of both
            `predicted_code` and `predicted_outputs`.
        predicted_code: dict task_id -> list of code strings.
        predicted_outputs: dict task_id -> list of predictions, where each
            prediction is a list of grids (one per task output).
        n_preds: number of predictions requested per task; used as the
            denominator of the ratio columns.

    Returns:
        DataFrame indexed by task id with the columns 'valid code',
        'valid outputs', 'unique outputs', 'dsl usage', 'pixel similarity',
        'correct grids' and 'solved task', plus a final 'MEAN' row holding
        the column-wise mean over all tasks.
    """
    df = pd.DataFrame(columns=['valid code', 'valid outputs', 'unique outputs', 'dsl usage', 'pixel similarity', 'correct grids', 'solved task'])
    for task_id in task_ids:
        # ratios relative to the number of requested predictions
        df.loc[task_id, 'valid code'] = len(predicted_code[task_id])/n_preds
        df.loc[task_id, 'valid outputs'] = len(predicted_outputs[task_id])/n_preds
        # predictions with the same fingerprint are counted once
        df.loc[task_id, 'unique outputs'] = len(set(fingerprint(output) for output in predicted_outputs[task_id]))/n_preds
        # code counts as using the DSL when it contains the substring 'dsl.'
        df.loc[task_id, 'dsl usage'] = sum(1 for code in predicted_code[task_id] if 'dsl.' in code)/n_preds

        task = get_task(task_id)
        task_predicted_outputs = predicted_outputs[task_id]
        # one score per prediction: mean pixel similarity over the (expected, predicted) grid pairs.
        # The sort does not affect the mean taken below.
        scores = sorted([np.mean([pixel_similarity_score(output, pred) for output, pred in zip(task.outputs, predictions)]) for predictions in task_predicted_outputs])
        df.loc[task_id, 'pixel similarity'] = np.mean(scores) if scores else 0.0

        task_outputs = [np.array(output) for output in task.outputs]
        # one score per prediction, as returned by correct_grids_score
        scores = sorted([correct_grids_score(task_outputs, predictions) for predictions in task_predicted_outputs])
        df.loc[task_id, 'correct grids'] = np.mean(scores) if scores else 0.0
        # a task is solved when at least one prediction reaches a score of exactly 1
        df.loc[task_id, 'solved task'] = int(np.max(scores) == 1) if scores else 0

    # append the average of every column as an extra row
    df.loc['MEAN'] = df.mean(axis=0)
    return df

## Independent search

In [None]:
task_ids = list(training_challenges.keys())
filepath = '/mnt/hdd0/Kaggle/arc25/predictions/training_8preds_2025_08_18_21_26_04_predictions.json'
with open(filepath, 'r') as f:
    predictions = json.load(f)

In [None]:
raise

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

In [None]:
# Dataset selection: only the LAST assignment to task_ids takes effect.
# Comment out or reorder these lines to run on a different dataset.
task_ids = list(training_challenges.keys())
task_ids = list(evaluation_challenges.keys())
task_ids = list(evaluation_challenges_2025.keys())
grid_encoder = create_grid_encoder('ColorNameEncoder()')
# one prompt per task, in the same order as task_ids
prompts = [create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer) for task_id in task_ids]
# show the first prompt as a sanity check
pretty_print_prompt(prompts[0], default_color='white')

In [None]:
# Generate n responses per prompt, report throughput and save the raw
# responses to a timestamped JSON file so they can be reprocessed later.
for n in [8]:
    sampling_params = SamplingParams(n=n, temperature=1.0, top_p=0.95, max_tokens=2048)
    t0 = time.time()
    outputs = llm.generate(prompts, sampling_params)
    # stop the clock right after generation so only inference is timed
    inference_time = time.time() - t0
    total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in outputs)
    print(f"Total tokens generated: {total_tokens}")
    print(f"Time taken: {inference_time:.2f} seconds")
    print(f"Average time per task: {inference_time / len(outputs):.2f} seconds")
    # dividing by the number of tasks AND by n gives tokens per prediction, not per task
    print(f"Average tokens per prediction: {total_tokens / len(outputs) / sampling_params.n:.2f} tokens")
    print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

    # task_id -> list with the text of each of the n responses
    predictions = {task_id: [output.text for output in output.outputs] for task_id, output in zip(task_ids, outputs)}
    # output_filepath = f'/mnt/hdd0/Kaggle/arc25/predictions/training_{sampling_params.n}preds_{get_timestamp()}_predictions.json'
    output_filepath = f'/mnt/hdd0/Kaggle/arc25/predictions/evaluation_2025_{sampling_params.n}preds_{get_timestamp()}_predictions.json'
    with open(output_filepath, 'w') as f:
        json.dump(predictions, f, indent=2)
    print(f"Predictions saved to {output_filepath}")

In [None]:
# Parse and execute every response.
# predicted_code collects the curated code of each response that contained a python block;
# predicted_outputs collects the validated output grids of each response whose code ran successfully.
predicted_code = {key: [] for key in task_ids}
predicted_outputs = {key: [] for key in task_ids}
# NOTE(review): every key of `predictions` must also be in `task_ids`, otherwise the appends below raise KeyError
for task_id, outputs in predictions.items():
    task = get_task(task_id)
    for i, output in enumerate(outputs):
        code = parse_python_code(output)
        if code:
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                # the input arrays are rebuilt for every response
                # NOTE(review): presumably so generated code cannot affect later runs by mutating its input - confirm
                input_grids = [np.array(input_grid) for input_grid in task.inputs]
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), input_grids, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # generated code may fail in arbitrary ways; log the error and continue with the next response
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

In [None]:
print(predictions['045e512c'][3])

In [18]:
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, len(list(predictions.values())[0]))
df

Unnamed: 0,valid code,valid outputs,unique outputs,dsl usage,pixel similarity,correct grids,solved task
007bbfb7,1.0,0.75,0.75,0.0,0.699177,0.0,0
00d62c1b,1.0,1.0,0.375,0.0,0.987986,0.475,1
017c7c7b,1.0,0.875,0.625,0.0,0.680776,0.428571,1
025d127b,1.0,0.875,0.625,0.0,0.803855,0.0,0
045e512c,1.0,0.5,0.5,0.0,0.741119,0.0,0
...,...,...,...,...,...,...,...
fcc82909,1.0,1.0,1.0,0.0,0.749167,0.0,0
feca6190,1.0,0.875,0.75,0.0,0.131206,0.0,0
ff28f65a,1.0,0.75,0.25,0.0,0.898148,0.5625,0
ff805c23,1.0,0.875,0.875,0.0,0.176743,0.0,0


In [19]:
df.iloc[-1:]

Unnamed: 0,valid code,valid outputs,unique outputs,dsl usage,pixel similarity,correct grids,solved task
MEAN,1.0,0.824063,0.656563,0.0,0.615033,0.148643,0.2975


In [None]:
raise

## WIP

In [None]:
print(predicted_code['007bbfb7'][0])

In [None]:
print("Given input-output grid pairs as reference examples, carefully observe the patterns to predict the output grid for new test input. Each pair follows the same transformation rule. Grids are 2D arrays represented as strings, with cells (colors) separated by spaces and rows by newlines.\nHere are the input and output grids for the reference examples:\nExample 1\nInput:\nGray Black Black Gray Black\nGray Black Black Gray Black\nGray Black Gray Gray Gray\nGray Gray Gray Black Black\nBlack Black Gray Black Black\nBlack Black Gray Gray Gray\nBlack Black Black Gray Black\nGray Gray Gray Gray Black\nBlack Gray Black Black Black\nBlack Gray Black Black Black\nBlack Gray Gray Gray Black\nBlack Black Black Gray Black\nBlack Gray Gray Gray Gray\nGray Gray Black Black Black\nBlack Gray Black Black Black\n\nOutput:\nGray Black Black Gray Black\nGray Black Black Gray Black\nGray Black Gray Gray Gray\nGray Gray Gray Black Black\nBlack Black Gray Black Black\nBlack Black Gray Gray Gray\nBlack Black Black Gray Purple\nGray Gray Gray Gray Purple\nBlack Gray Purple Purple Purple\nBlack Gray Purple Purple Purple\nBlack Gray Gray Gray Purple\nBlack Black Black Gray Purple\nBlack Gray Gray Gray Gray\nGray Gray Black Black Black\nOrange Gray Black Black Black\n\n\nExample 2\nInput:\nBlack Black Gray Black Black Gray Black Black Black\nBlack Black Gray Gray Gray Gray Black Black Black\nGray Gray Gray Black Black Black Black Black Black\nBlack Gray Black Black Black Black Black Black Black\nBlack Gray Black Black Black Gray Gray Gray Gray\nBlack Gray Gray Gray Gray Gray Black Black Black\nGray Gray Black Black Black Gray Gray Gray Gray\nBlack Black Black Black Black Gray Black Black Black\nGray Gray Gray Gray Gray Gray Black Black Black\nBlack Black Black Black Black Gray Black Black Black\n\nOutput:\nBlack Black Gray Orange Orange Gray Purple Purple Purple\nBlack Black Gray Gray Gray Gray Purple Purple Purple\nGray Gray Gray Purple Purple Purple Purple Purple Purple\nBlack Gray 
Purple Purple Purple Purple Purple Purple Purple\nBlack Gray Purple Purple Purple Gray Gray Gray Gray\nBlack Gray Gray Gray Gray Gray Black Black Black\nGray Gray Black Black Black Gray Gray Gray Gray\nBlack Black Black Black Black Gray Black Black Black\nGray Gray Gray Gray Gray Gray Black Black Black\nBlack Black Black Black Black Gray Black Black Black\n\n\nExample 3\nInput:\nBlack Gray Black Black Gray Black Black Black Black Gray Black Black\nBlack Gray Black Black Gray Gray Gray Black Black Gray Black Black\nBlack Gray Gray Gray Gray Black Gray Black Black Gray Black Black\nBlack Black Gray Black Black Black Gray Gray Gray Gray Black Black\nGray Gray Gray Black Black Black Gray Black Black Gray Gray Gray\nBlack Black Black Black Black Black Gray Black Black Black Black Black\nBlack Black Black Gray Gray Gray Gray Black Black Black Black Black\nGray Gray Gray Gray Black Black Gray Black Black Black Black Black\nBlack Black Black Gray Black Black Gray Gray Gray Black Black Black\nBlack Black Black Gray Black Black Black Black Gray Black Black Black\n\nOutput:\nBlack Gray Orange Orange Gray Black Black Black Black Gray Black Black\nBlack Gray Orange Orange Gray Gray Gray Black Black Gray Black Black\nBlack Gray Gray Gray Gray Black Gray Black Black Gray Black Black\nBlack Black Gray Black Black Black Gray Gray Gray Gray Black Black\nGray Gray Gray Black Black Black Gray Purple Purple Gray Gray Gray\nBlack Black Black Black Black Black Gray Purple Purple Purple Purple Purple\nBlack Black Black Gray Gray Gray Gray Purple Purple Purple Purple Purple\nGray Gray Gray Gray Black Black Gray Purple Purple Purple Purple Purple\nBlack Black Black Gray Black Black Gray Gray Gray Purple Purple Purple\nBlack Black Black Gray Black Black Black Black Gray Purple Purple Purple\n\n\nHere is the input grid for the test example:\nInput:\nBlack Gray Black Black Black Black Black Gray Black Black Gray Black\nBlack Gray Black Black Black Gray Gray Gray Black Gray Gray Black\nGray 
Gray Gray Black Black Gray Black Gray Gray Gray Black Black\nBlack Black Gray Gray Gray Gray Black Gray Black Gray Gray Black\nBlack Black Black Gray Black Black Black Gray Black Black Gray Black\n\nWrite a Python function `transform` that can convert any given input grid to its corresponding output grid based on the pattern observed in the reference examples.")


In [None]:
len(tokenizer.tokenize(grid_encoder.to_text(np.zeros((1, 20), dtype=np.int32))))

In [None]:
tokenizer.tokenize(grid_encoder.to_text(np.zeros((2, 2), dtype=np.int32)))

In [None]:
print(tokenizer.tokenize('Input:\nGray Black Black Gray Black\nGray Black Black Gray Black\n'))

In [None]:
grid_encoder.to_text([np.arange(10).tolist()])

## Learnings

The first runs took about 3 seconds per task to generate 8 predictions. A rough extrapolation from that
rate suggests that around 2048 predictions per task are feasible when running on 4 GPUs.

Training tasks

```
8 preds
Average time per task: 2.99 seconds
         valid code	valid outputs	unique outputs	dsl usage	pixel similarity	correct grids	solved task
MEAN	1.0	0.652188	0.524687	0.0	0.572869	0.129519	0.245

16 preds
Average time per task: 5.60 seconds
MEAN	1.0	0.657969	0.473125	0.0	0.595426	0.131211	0.3075

32 preds
Average time per task: 11.24 seconds
MEAN	0.25	0.161973	0.104766	0.0	0.610798	0.13038	0.35

64 preds
Average time per task: 22.60 seconds
MEAN	0.49998	0.326367	0.187559	0.0	0.605516	0.123638	0.38

128 preds
Average time per task: 45.14 seconds
MEAN	0.99998	0.657676	0.336602	0.0	0.604492	0.130672	0.4475
```

Evaluation tasks:

```
8 preds
Average time per task: 3.64 seconds
	valid code	valid outputs	unique outputs	dsl usage	pixel similarity	correct grids	solved task
MEAN	1.0	0.62125	0.551562	0.0	0.547859	0.032785	0.0475

16 preds
Average time per task: 6.65 seconds
MEAN	1.0	0.609062	0.493906	0.0	0.575911	0.032614	0.0575

32 preds
Average time per task: 13.19 seconds
MEAN	1.0	0.615313	0.464687	0.0	0.569353	0.031603	0.1025

64 preds
Average time per task: 26.53 seconds
MEAN	0.999961	0.614297	0.421953	0.0	0.578199	0.031114	0.115
```

Evaluation 2025:

```
8 preds
Average time per task: 4.58 seconds
MEAN	1.0	0.560417	0.515625	0.0	0.505795	0.000333	0.0

```

### Execution improvements



```
    valid code	valid outputs	unique outputs	dsl usage	pixel similarity	correct grids	solved task
# baseline
MEAN	1.0	0.652188	0.524687	0.0	0.572869	0.129519	0.245
# Add purple and brown to Color.
MEAN	1.0	0.769687	0.617188	0.0	0.612253	0.144651	0.2825
# change how the outputs are created
MEAN	1.0	0.797188	0.637188	0.0	0.60988	0.144386	0.285
# use np.array instead of Img as input
MEAN	1.0	0.824063	0.656563	0.0	0.615033	0.148643	0.2975
```

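Adding the missing colors alone increases valid outputs by about 12 percentage points (65.2% → 77.0%) and solved tasks from 24.5% to 28.25%. With all three changes, valid outputs reach 82.4% and solved tasks 29.75%.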


## TODO:

- [x] Implement new grid encoder
- [x] Use the correct prompt
- [x] Save predictions to file so I can later reprocess them
- [ ] Update code execution to match the code generated by BARC model
- [ ] Check code execution to verify that exceptions are legit and not easily solvable
  - [x] Add missing colors to Color object
  - [x] Code execution fails when there are auxiliary functions. `Error executing code for task 025d127b, response 5: <class 'NameError'> name 'blend_colors' is not defined`
- [ ] Validate that I get the same scores as the paper
- [ ] Evaluate on different datasets
- [ ] Data augmentation

8: "Purple",
9: "Brown" 

TEAL = 8
MAROON = 9