# Search with base models

## Goal

Can we solve ARC tasks using base models with access to a DSL?

## Imports

In [None]:
import os
import logging
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

# Configure project logging first so later cells log through it.
configure_logging()
# Expose only one GPU to this process, the one returned by get_least_used_gpu_index().
# NOTE(review): presumably this has to run before torch/vllm are imported in the next cell - confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

# Add VLLM specific environment variables to avoid common issues
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

In [None]:
import time
import importlib
import inspect
import json
import gc
import random
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from vllm import LLM, SamplingParams

from arc25.training_tasks import *
from arc25.encoders import create_grid_encoder
from arc25.prompting import pretty_print_prompt, Template
from arc25.metrics import pixel_similarity_score, correct_grids_score
import arc25.BARC_dsl as dsl

## Code

### Prompt

https://github.com/flowersteam/SOAR/blob/main/soar/prompt.py

In [None]:
def extract_footprint(module_name: str, show_types: bool = False) -> str:
    """
    Import ``module_name`` and list its public top-level functions.

    The result is a newline-separated string with one bullet per function:

      - dsl.func_name(arg1, arg2)

    Only functions whose ``__module__`` equals ``module_name`` are listed
    (functions imported from elsewhere are skipped), and names starting with
    an underscore are ignored. With show_types=True the full
    ``str(inspect.signature(fn))`` is used, so annotations and defaults are
    shown; otherwise only the bare parameter names are joined with ", ".
    """
    module = importlib.import_module(module_name)

    def _render_signature(fn):
        signature = inspect.signature(fn)
        if show_types:
            return str(signature)
        # iterating the parameters mapping yields the parameter names in order
        return "(" + ", ".join(signature.parameters) + ")"

    bullets = [
        f"- dsl.{name}{_render_signature(fn)}"
        for name, fn in inspect.getmembers(module, inspect.isfunction)
        if fn.__module__ == module_name and not name.startswith("_")
    ]
    return "\n".join(bullets)

# Preview the DSL listing produced with the same arguments the prompt builders use.
print(extract_footprint('arc25.BARC_dsl', show_types=True))

In [None]:
# Load the ARC training challenges once; get_task() looks tasks up in this dict.
# The encoding is explicit so decoding does not depend on the machine's locale default.
with open('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json', 'r', encoding='utf-8') as f:
    training_challenges = json.load(f)

def get_task(task_name):
    """
    Build a Task from the 'train' samples of a training challenge.

    Each sample's 'input' and 'output' grid is wrapped in Img; the Task is
    created with an empty code string and named after the challenge.

    Raises:
        ValueError: if task_name is not a key of training_challenges.
    """
    if task_name not in training_challenges:
        raise ValueError(f"Task {task_name} not found in training challenges.")
    train_samples = training_challenges[task_name]['train']
    return Task(
        inputs=[Img(sample['input']) for sample in train_samples],
        outputs=[Img(sample['output']) for sample in train_samples],
        code='',
        name=task_name,
    )

In [None]:
# System message shared by every prompt builder in this notebook.
system_prompt = """You are an advanced AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks."""

# User-message template. It is rendered with two variables:
#   dsl           -> text listing of the available DSL functions
#   train_samples -> list of dicts with 'input' and 'output' grid texts
# NOTE(review): the wording is sent to the model verbatim, including rough spots such as
# "write the implemented the transformation" and "F.e."; it is left untouched here so
# earlier results stay comparable.
prompt_template_text ="""You are tasked with solving a transformation problem from the Abstraction and Reasoning Challenge (ARC).
Implement the transformation rules as a Python function.
You should only write the implemented the transformation in code.
You must write code in triple backticks (```python and then ```). You must write a function called `transform` which takes a single argument, the input grid as `list[list[int]]`, and returns the transformed grid (also as `list[list[int]]`).

## Key Priors:

- **Objectness**: Consider the grid as containing objects (groups of connected cells) rather than just individual pixels.
- **Goal-Directed**: The transformation should achieve a specific goal, such as creating symmetry or changing the color of specific objects.
- **Numbers & Counting**: Keep track of the number of objects, sizes, and their relative positions.
- **Geometry & Topology**: Use spatial relationships such as adjacency, enclosure, or symmetry.

Carefully analyze the examples and find the underlying transformation logic.

## Domain Specific Primitive Functions

You can use the already implemented following functions to manipulate the grid:

{{ dsl }}

The dsl has been already imported, so just simply call the functions as needed. F.e. dsl.foo()
Do not import the dsl again, just use it directly.

## Examples

Below are several input-output examples that illustrate the transformation.
Your function should generalize the pattern from these examples to solve any input following the same logic.

{% for sample in train_samples %}
### Example {{ loop.index }}

#### Input

{{ sample.input }}

#### Output

{{ sample.output }}
{% endfor %}
"""

# Template object used by create_prompt_from_task.
prompt_template = Template(prompt_template_text)


def create_prompt_from_task(task, grid_encoder, tokenizer, shuffle_train_samples=False):
    """
    Render the base chat prompt for a task.

    Every (input, output) pair of the task is converted to text with
    grid_encoder.to_text, optionally shuffled in place with random.shuffle,
    and rendered into prompt_template together with the typed listing of
    'arc25.BARC_dsl'. The system and user messages are then passed through
    tokenizer.apply_chat_template (tokenize=False, add_generation_prompt=True)
    and the resulting prompt is returned.
    """
    train_samples = []
    for grid, output in zip(task.inputs, task.outputs):
        train_samples.append({'input': grid_encoder.to_text(grid), 'output': grid_encoder.to_text(output)})
    if shuffle_train_samples:
        random.shuffle(train_samples)

    user_content = prompt_template.render(
        train_samples=train_samples,
        dsl=extract_footprint('arc25.BARC_dsl', show_types=True),
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    # enable_thinking=False is intentionally not passed to the chat template
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
# Prompt variants that append the previously generated code to the base prompt and ask
# for a different solution. Each one renders an extra variable, previous_predictions
# (an iterable of code strings). The percentages in the comments are the observed
# shares of exact code repetitions.
# v1. One problem of v1 is that 13% of the tasks have predicted the exact same code. So let's rewrite a prompt to avoid that
prompt_novel_tasks_text_v1 = prompt_template_text + """
## Already generated code

Below are python functions already generated to try to solve the task. Take them into account when generating the new code.
Your code should be different from the already generated code, it should generate a new and different solution to the task.
The functions are given to avoid repeating the same code. Your code should be a new and different solution to the task.

{% for prediction in previous_predictions %}
### Code {{ loop.index }}

```python
{{ prediction }}
```

{% endfor %}
"""

# v2, 8% exact repetitions
prompt_novel_tasks_text_v2 = prompt_template_text + """
## Already generated code

Below are python functions already generated to try to solve the task. 
Do not repeat them, make sure to generate a new and different solution to the task.

{% for prediction in previous_predictions %}
### Code sample{{ loop.index }}

```python
{{ prediction }}
```

{% endfor %}

## New solution

Now implement a new and original solution to the task.
"""

# v3, 5% of repeated code
prompt_novel_tasks_text_v3 = prompt_template_text + """
## Already generated code

Below are python functions already generated to try to solve the task. 
Do not repeat them, make sure to generate a new and different solution to the task.

{% for prediction in previous_predictions %}
### Code sample{{ loop.index }}

DO NOT REPEAT THIS CODE, generate a new and different solution to the task.

```python
{{ prediction }}
```

{% endfor %}

## New solution

Now implement a new and original solution to the task.
Write some code that solves the task using a different approach than the previous ones.
"""

# system_prompt = """You are an advanced AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks.
# Do not repeat any of the code snippets provided in the previous predictions. Always generate a new and different solution to the task."""

# Template used by create_prompt_for_novel_solutions.
# NOTE(review): v1 is the active variant although the comments above report fewer
# repetitions for v2 and v3 - confirm this is intended.
prompt_novel_tasks_template = Template(prompt_novel_tasks_text_v1)
# TODO: add scores to the prompt
#Pixel similarity scores: {{ prediction.pixel_similarity_scores }}
#Correct grids scores: {{ prediction.correct_grids_scores }}

def create_prompt_for_novel_solutions(task, grid_encoder, tokenizer, previous_predictions):
    """
    Render the chat prompt that asks for a solution different from earlier ones.

    The task's (input, output) pairs are converted to text with
    grid_encoder.to_text and rendered into prompt_novel_tasks_template
    together with the typed listing of 'arc25.BARC_dsl' and
    previous_predictions (the code generated so far). The messages go through
    tokenizer.apply_chat_template (tokenize=False, add_generation_prompt=True)
    and the resulting prompt is returned.
    """
    train_samples = []
    for grid, output in zip(task.inputs, task.outputs):
        train_samples.append({'input': grid_encoder.to_text(grid), 'output': grid_encoder.to_text(output)})

    user_content = prompt_novel_tasks_template.render(
        train_samples=train_samples,
        dsl=extract_footprint('arc25.BARC_dsl', show_types=True),
        previous_predictions=previous_predictions,
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    # enable_thinking=False is intentionally not passed to the chat template
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
# Refinement prompt: the base prompt plus the previously generated code, presented as
# being on the right track but not solving the task, with a request for a refined
# function. Rendered with the extra variable previous_predictions (code strings).
refine_prompt_v1 = prompt_template_text + """
## Code to be refined

Below are python functions already generated to try to solve the task.
Those functions do not solve the task, but the direction of the solution is correct.
Refine them into a new function that solves the task.

{% for prediction in previous_predictions %}
### Code sample {{ loop.index }}

```python
{{ prediction }}
```

{% endfor %}

## Refined solution

Now implement a new function that refines the previous code and solves the task.
"""

# Template used by create_refine_prompt.
refine_prompt_template = Template(refine_prompt_v1)


def create_refine_prompt(task, grid_encoder, tokenizer, previous_predictions):
    """
    Render the chat prompt that asks the model to refine earlier code.

    The task's (input, output) pairs are converted to text with
    grid_encoder.to_text and rendered into refine_prompt_template together
    with the typed listing of 'arc25.BARC_dsl' and previous_predictions (the
    code to refine). The messages go through tokenizer.apply_chat_template
    (tokenize=False, add_generation_prompt=True) and the resulting prompt is
    returned.
    """
    train_samples = []
    for grid, output in zip(task.inputs, task.outputs):
        train_samples.append({'input': grid_encoder.to_text(grid), 'output': grid_encoder.to_text(output)})

    user_content = refine_prompt_template.render(
        train_samples=train_samples,
        dsl=extract_footprint('arc25.BARC_dsl', show_types=True),
        previous_predictions=previous_predictions,
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    # enable_thinking=False is intentionally not passed to the chat template
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

### Model

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1, max_model_len=32000):
    """
    Create a vLLM engine and the matching tokenizer for model_path.

    Args:
        model_path: path handed to LLM(model=...). When it ends with '.gguf'
            the tokenizer is loaded from a 'tokenizer' directory located next
            to the file; otherwise from model_path itself.
        use_4bit_quantization: when True, pass quantization="bitsandbytes";
            otherwise no quantization argument value (None).
        tensor_parallel_size: forwarded to LLM unchanged.
        max_model_len: forwarded to LLM unchanged.

    Returns:
        (llm, tokenizer)
    """
    logging.info(f"Loading model from {model_path}")
    cleanup_gpu()

    quantization = "bitsandbytes" if use_4bit_quantization else None
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.9,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization=quantization,
        # passed explicitly even though it seems to be the default
        enable_prefix_caching=True,
        max_model_len=max_model_len,
    )

    tokenizer_path = model_path
    if model_path.endswith('.gguf'):
        tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return llm, tokenizer


def cleanup_gpu():
    """Run the garbage collector, empty the CUDA cache and, if CUDA is available, synchronize."""
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

### Code

In [None]:
def parse_python_code(text):
    """
    Return the contents of the first ```python fenced block in text.

    An empty string is returned when there is no ```python marker, or when
    the text between the first and the second ```python marker (or the end of
    the text) contains no closing ``` fence. The extracted code is stripped
    of surrounding whitespace.
    """
    opening_fence = '```python'
    fence = '```'
    if opening_fence not in text:
        return ''
    after_opening = text.split(opening_fence)[1]
    if fence not in after_opening:
        return ''
    return after_opening.split(fence)[0].strip()

In [None]:
def curate_python_code(code):
    """
    Drop every line that contains 'import dsl', 'from dsl import ' or 'print('.

    The remaining lines are joined back with newlines and the result is
    stripped of leading and trailing whitespace.
    """
    banned_fragments = ('import dsl', 'from dsl import ', 'print(')
    kept_lines = []
    for line in code.split('\n'):
        if any(fragment in line for fragment in banned_fragments):
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

def add_additional_imports(code):
    """
    Prepend the typing and numpy imports to code.

    When code is empty (falsy) only the import lines are returned.
    """
    header = '\n'.join((
        'from typing import List, Tuple',
        'import numpy as np',
    ))
    if not code:
        return header
    return header + '\n' + code

### Validations

In [None]:
def validate_outputs(outputs):
    """Validate every output with _validate_output and return the resulting arrays as a list."""
    validated = []
    for output in outputs:
        validated.append(_validate_output(output))
    return validated

def _validate_output(output):
    """
    Convert one output to a numpy array and require it to be two-dimensional.

    Raises:
        ValueError: if output is None or the converted array is not 2D.
    """
    if output is None:
        raise ValueError("Output is None")
    # Normalise to an array: outputs otherwise show up as a mix of lists and numpy arrays.
    # NOTE(review): np is not imported explicitly in this file; presumably it comes from
    # the star import of arc25.training_tasks - confirm.
    grid = np.array(output)
    if grid.ndim == 2:
        return grid
    raise ValueError(f"Output is not a 2D array: {grid.shape}")

In [None]:
import hashlib

def fingerprint(prediction):
    """
    Return a SHA-256 hex digest identifying a sequence of arrays.

    For every array the string form of its shape, its dtype string and its
    raw bytes are fed to the hash, in that order, so arrays with the same
    bytes but a different shape or dtype get different digests.
    """
    digest = hashlib.sha256()
    for matrix in prediction:
        # shape and dtype first, then the raw data bytes
        header = str(matrix.shape).encode() + matrix.dtype.str.encode()
        digest.update(header + matrix.tobytes())
    return digest.hexdigest()

### Metrics

In [None]:
def compute_search_metrics(task_ids, predicted_code, predicted_outputs, n_preds):
    """
    Summarise the predictions of a search run in a DataFrame, one row per task.

    Columns:
        valid code:       number of stored code strings / n_preds
        valid outputs:    number of stored output lists / n_preds
        unique outputs:   number of distinct fingerprint() values / n_preds
        dsl usage:        number of code strings containing 'dsl.' / n_preds
        pixel similarity: mean over predictions of the mean pixel_similarity_score
                          against the task outputs (0.0 without predictions)
        correct grids:    mean over predictions of correct_grids_score
                          (0.0 without predictions)
        solved task:      1 if the best correct_grids_score equals 1, else 0

    A final 'MEAN' row holds the column means. predicted_code and
    predicted_outputs are dicts keyed by task id.
    """
    metric_names = ['valid code', 'valid outputs', 'unique outputs', 'dsl usage', 'pixel similarity', 'correct grids', 'solved task']
    df = pd.DataFrame(columns=metric_names)
    for task_id in task_ids:
        task_code = predicted_code[task_id]
        task_predicted_outputs = predicted_outputs[task_id]
        distinct_fingerprints = {fingerprint(output) for output in task_predicted_outputs}
        dsl_users = sum('dsl.' in code for code in task_code)

        df.loc[task_id, 'valid code'] = len(task_code)/n_preds
        df.loc[task_id, 'valid outputs'] = len(task_predicted_outputs)/n_preds
        df.loc[task_id, 'unique outputs'] = len(distinct_fingerprints)/n_preds
        df.loc[task_id, 'dsl usage'] = dsl_users/n_preds

        task = get_task(task_id)
        pixel_scores = []
        for predictions in task_predicted_outputs:
            per_grid = [pixel_similarity_score(output, pred) for output, pred in zip(task.outputs, predictions)]
            pixel_scores.append(np.mean(per_grid))
        pixel_scores.sort()
        df.loc[task_id, 'pixel similarity'] = np.mean(pixel_scores) if pixel_scores else 0.0

        task_outputs = [np.array(output) for output in task.outputs]
        grid_scores = sorted(correct_grids_score(task_outputs, predictions) for predictions in task_predicted_outputs)
        df.loc[task_id, 'correct grids'] = np.mean(grid_scores) if grid_scores else 0.0
        if grid_scores:
            df.loc[task_id, 'solved task'] = int(np.max(grid_scores) == 1)
        else:
            df.loc[task_id, 'solved task'] = 0

    df.loc['MEAN'] = df.mean(axis=0)
    return df

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably placed here to stop "Run All" before the experiment cells - confirm.
raise

## Independent search

In [None]:
# Active model for the independent search: loaded without 4-bit quantization and
# with tensor_parallel_size=1.
model_path = "/home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

# Alternative models, kept commented out.
# model_path = "/home/gbarbadillo/models/Qwen3-4B"
# llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

# model_path = '/home/gbarbadillo/models/Qwen2.5-Coder-14B-Instruct-GGUF/qwen2.5-coder-14b-instruct-q4_k_m.gguf' # Needs 2 GPUs
# llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=2, max_model_len=16000)

In [None]:
# Independent search: one prompt per training task, sampling_params.n completions each.
task_ids = list(training_challenges.keys())
sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=2048)
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')
prompts = [create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer) for task_id in task_ids]
if 'Qwen3' in model_path:
    # disable thinking without using the chat template
    prompts = [prompt + '<think>\n\n</think>\n\n' for prompt in prompts]

t0 = time.time()
outputs = llm.generate(prompts, sampling_params)
# Stop the clock right after generation so the token counting below is not timed.
inference_time = time.time() - t0
total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in outputs)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(outputs):.2f} seconds")
# Dividing by the number of tasks and by n gives the mean length of a single completion.
print(f"Average tokens per prediction: {total_tokens / len(outputs) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

In [None]:
# Turn every sampled completion into code and, when it runs, into validated outputs.
#   predicted_code[task_id]    -> curated code of each response that contained a python block
#   predicted_outputs[task_id] -> validated outputs of the responses whose code ran without error
predicted_code = {key: [] for key in task_ids}
predicted_outputs = {key: [] for key in task_ids}
for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if code:
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                # NOTE(review): safe_code_execution is not imported explicitly; presumably it
                # comes from the star import of arc25.training_tasks - confirm.
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # Generated code is expected to fail often: log the failure and move on.
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

In [None]:
# Per-task metrics normalised by the number of completions sampled per task.
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, sampling_params.n)
df

In [None]:
# Show only the last row, which compute_search_metrics fills with the column means ('MEAN').
df.iloc[-1:]

In [None]:
# Save the metrics to a CSV in the working directory; the file name encodes the model
# directory name, the number of tasks, predictions per task and the runtime in whole seconds.
output_path = f'{os.path.basename(model_path)}_{len(task_ids)}tasks_{sampling_params.n}preds_{int(inference_time)}runtime.csv'
df.to_csv(output_path, index_label='task_id')
print(f"Results saved to {output_path}")

## Sequential search

Let's give the model the context of all the functions generated so far to try to induce more diversity.

### Prompt tuning

#### Configuration

In [None]:
# Model used for the prompt-tuning experiments of the sequential search.
model_path = "/home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

In [None]:
# Fresh state for the sequential search: empty prediction stores and one completion
# per prompt (n=1).
task_ids = list(training_challenges.keys())
predicted_code = {key: [] for key in task_ids}
predicted_outputs = {key: [] for key in task_ids}
sampling_params = SamplingParams(n=1, temperature=1.0, top_p=0.95, max_tokens=2048)
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')

#### Initialization

As a first step, let's make sure that every task has one prediction whose code produces a valid output.

In [None]:
# first prediction initialization
# Keep sampling for the tasks that still have no stored code until every task has one.
# Code is stored only after it executed and its outputs passed validation, so
# predicted_code and predicted_outputs stay aligned in this cell.
# NOTE(review): there is no retry limit; a task that never yields runnable code would
# keep this loop running - confirm this is acceptable.
while True:
    prompts, epoch_task_ids = [], []
    for task_id in task_ids:
        if not predicted_code[task_id]:  # only create prompt if no code has been predicted yet
            prompt = create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer)
            prompts.append(prompt)
            epoch_task_ids.append(task_id)
    if not prompts:
        break

    outputs = llm.generate(prompts, sampling_params)

    for task_id, responses in zip(epoch_task_ids, outputs):
        task = get_task(task_id)
        for i, output in enumerate(responses.outputs):
            code = parse_python_code(output.text)
            if code:
                code = curate_python_code(code)
                try:
                    task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                    task_predicted_outputs = validate_outputs(task_predicted_outputs)
                    predicted_code[task_id].append(code)
                    predicted_outputs[task_id].append(task_predicted_outputs)
                except Exception as e:
                    # Failed attempts are only logged; the task is retried in the next pass.
                    logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

In [None]:
# Metrics after initialization, normalised by one prediction per task.
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, 1)
df

#### Independent search Baseline

In [None]:
# Baseline: add a second, independently sampled prediction to every task.
# Start from the first stored prediction of each task only.
predicted_outputs = {key: values[:1] for key, values in predicted_outputs.items()}
predicted_code = {key: values[:1] for key, values in predicted_code.items()}

prompts = []
for task_id in task_ids:
    prompt = create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer)
    prompts.append(prompt)

outputs = llm.generate(prompts, sampling_params)
for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if code:
            # Curate before storing: the first prediction of each task was stored curated,
            # so the stored code must be curated too for comparisons and the 'dsl usage'
            # metric to be like-for-like (and to match what is actually executed).
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # Generated code is expected to fail often: log the failure and move on.
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

# Two predictions per task: the initial one plus the new sample.
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, 2)
df

#### Compare with sequential search

In [None]:
# Sequential search: show the model the first prediction and ask for a different one.
# Start from the first stored prediction of each task only.
predicted_outputs = {key: values[:1] for key, values in predicted_outputs.items()}
predicted_code = {key: values[:1] for key, values in predicted_code.items()}

sampling_params = SamplingParams(n=1, temperature=1.0, top_p=0.95, max_tokens=2048, repetition_penalty=1.0)

prompts = []
for task_id in task_ids:
    prompt = create_prompt_for_novel_solutions(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer, previous_predictions=predicted_code[task_id])
    prompts.append(prompt)

outputs = llm.generate(prompts, sampling_params)
for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if code:
            # Curate before storing: the first prediction was stored curated, so storing
            # the raw text here would make the exact-repetition check compare curated
            # code against uncurated code and miss repeats.
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # Generated code is expected to fail often: log the failure and move on.
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

# Two predictions per task: the initial one plus the new sample.
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, 2)
df

In [None]:
# Count the tasks whose second prediction is character-for-character equal to the first.
total, equal = 0, 0
repeated_code_task_ids = []
for task_id in task_ids:
    if len(predicted_code[task_id]) > 1:
        total += 1
        if predicted_code[task_id][0] == predicted_code[task_id][1]:
            repeated_code_task_ids.append(task_id)
            equal += 1
# Guard the ratio: no task may have two predictions (e.g. nothing could be parsed).
equal_ratio = equal/total if total else 0.0
print(f"Total tasks with multiple predictions: {total}")
print(f"Tasks with exactly equal predictions: {equal} ({equal_ratio:.2%})")
print(f"Tasks with repeated code: {repeated_code_task_ids}")

In [None]:
# Show the prompt of the first task that repeated its code; prompts is aligned with task_ids.
# Skip cleanly when no task repeated its code instead of failing with IndexError.
if repeated_code_task_ids:
    pretty_print_prompt(prompts[task_ids.index(repeated_code_task_ids[0])], default_color='white')
else:
    print("No task repeated its code.")

In [None]:
# Display the stored predictions of the first task with repeated code.
# Raises IndexError when repeated_code_task_ids is empty.
predicted_code[repeated_code_task_ids[0]]

#### Compare with refine prompt approach

In [None]:
# Refine approach: show the model the first prediction and ask it to refine it.
# Start from the first stored prediction of each task only.
predicted_outputs = {key: values[:1] for key, values in predicted_outputs.items()}
predicted_code = {key: values[:1] for key, values in predicted_code.items()}

sampling_params = SamplingParams(n=1, temperature=1.0, top_p=0.95, max_tokens=2048, repetition_penalty=1.0)

prompts = []
for task_id in task_ids:
    prompt = create_refine_prompt(get_task(task_id), grid_encoder=grid_encoder, tokenizer=tokenizer, previous_predictions=predicted_code[task_id])
    prompts.append(prompt)

outputs = llm.generate(prompts, sampling_params)
for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if code:
            # Curate before storing: the first prediction was stored curated, so storing
            # the raw text here would make the exact-repetition check compare curated
            # code against uncurated code and miss repeats.
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # Generated code is expected to fail often: log the failure and move on.
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

# Two predictions per task: the initial one plus the refined one.
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, 2)
df

In [None]:
# Count the tasks whose refined prediction is character-for-character equal to the first.
total, equal = 0, 0
repeated_code_task_ids = []
for task_id in task_ids:
    if len(predicted_code[task_id]) > 1:
        total += 1
        if predicted_code[task_id][0] == predicted_code[task_id][1]:
            repeated_code_task_ids.append(task_id)
            equal += 1
# Guard the ratio: no task may have two predictions (e.g. nothing could be parsed).
equal_ratio = equal/total if total else 0.0
print(f"Total tasks with multiple predictions: {total}")
print(f"Tasks with exactly equal predictions: {equal} ({equal_ratio:.2%})")
print(f"Tasks with repeated code: {repeated_code_task_ids}")

In [None]:
# Show the prompt of the first task that repeated its code; prompts is aligned with task_ids.
# Skip cleanly when no task repeated its code instead of failing with IndexError.
if repeated_code_task_ids:
    pretty_print_prompt(prompts[task_ids.index(repeated_code_task_ids[0])], default_color='white')
else:
    print("No task repeated its code.")

In [None]:
# Display the stored predictions of the first task with repeated code.
# Raises IndexError when repeated_code_task_ids is empty.
predicted_code[repeated_code_task_ids[0]]

## Increase search diversity

I'm going to run experiments with 8 predictions per task, which should take around 30 minutes per experiment. Hopefully that gives enough resolution to measure changes.

### Default Configuration

In [None]:
# Model used for the search-diversity experiments.
model_path = "/home/gbarbadillo/models/Qwen2.5-Coder-7B-Instruct"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

In [None]:
# Shared defaults for the diversity experiments; n_preds is the number of predictions
# per task, and each experiment cell below redefines sampling_params as it needs.
n_preds = 8
task_ids = list(training_challenges.keys())
sampling_params = SamplingParams(n=1, temperature=1.0, top_p=0.95, max_tokens=2048)
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')

### Baseline

In [None]:
# Baseline: n_preds completions per task from one fixed prompt (train samples not shuffled).
sampling_params = SamplingParams(n=n_preds, temperature=1.0, top_p=0.95, max_tokens=2048)
predicted_code = {key: [] for key in task_ids}
predicted_outputs = {key: [] for key in task_ids}
t0 = time.time()
prompts = []
for task_id in task_ids:
    prompt = create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder,
                                     tokenizer=tokenizer, shuffle_train_samples=False)
    prompts.append(prompt)

outputs = llm.generate(prompts, sampling_params)
for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if code:
            # Store the curated code, i.e. the version that is actually executed, so the
            # 'dsl usage' metric is computed the same way as in the first search cell.
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # Generated code is expected to fail often: log the failure and move on.
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

# The timing covers prompt creation, generation and code execution.
inference_time = time.time() - t0
print(f'Inference time: {inference_time:.1f} seconds')
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, n_preds)
df

### Shuffle train samples

In [None]:
# Shuffle experiment: n_preds rounds of one completion per task, each round with a
# freshly shuffled order of the train samples in the prompt.
sampling_params = SamplingParams(n=1, temperature=1.0, top_p=0.95, max_tokens=2048)
predicted_code = {key: [] for key in task_ids}
predicted_outputs = {key: [] for key in task_ids}
t0 = time.time()
print(f"Generating {n_preds} predictions for {len(task_ids)} tasks...")
for epoch in tqdm(range(n_preds), smoothing=0, desc="Generating predictions"):
    prompts = []
    for task_id in task_ids:
        prompt = create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder,
                                         tokenizer=tokenizer, shuffle_train_samples=True)
        prompts.append(prompt)

    outputs = llm.generate(prompts, sampling_params)
    for task_id, responses in zip(task_ids, outputs):
        task = get_task(task_id)
        for i, output in enumerate(responses.outputs):
            code = parse_python_code(output.text)
            if code:
                # Store the curated code, i.e. the version that is actually executed, so the
                # 'dsl usage' metric is computed the same way as in the first search cell.
                code = curate_python_code(code)
                predicted_code[task_id].append(code)
                try:
                    task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                    task_predicted_outputs = validate_outputs(task_predicted_outputs)
                    predicted_outputs[task_id].append(task_predicted_outputs)
                except Exception as e:
                    # Generated code is expected to fail often: log the failure and move on.
                    logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

# The timing covers prompt creation, generation and code execution of all rounds.
inference_time = time.time() - t0
print(f'Inference time: {inference_time:.1f} seconds')
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, n_preds)
df

### Effect of temperature

In [None]:
# Temperature experiment: 16 completions per task sampled at temperature 1.2.
n_preds = 16
sampling_params = SamplingParams(n=n_preds, temperature=1.2, top_p=0.95, max_tokens=2048)
predicted_code = {key: [] for key in task_ids}
predicted_outputs = {key: [] for key in task_ids}
t0 = time.time()
prompts = []
for task_id in task_ids:
    prompt = create_prompt_from_task(get_task(task_id), grid_encoder=grid_encoder,
                                     tokenizer=tokenizer, shuffle_train_samples=False)
    prompts.append(prompt)

outputs = llm.generate(prompts, sampling_params)
for task_id, responses in zip(task_ids, outputs):
    task = get_task(task_id)
    for i, output in enumerate(responses.outputs):
        code = parse_python_code(output.text)
        if code:
            # Store the curated code, i.e. the version that is actually executed, so the
            # 'dsl usage' metric is computed the same way as in the first search cell.
            code = curate_python_code(code)
            predicted_code[task_id].append(code)
            try:
                task_predicted_outputs = safe_code_execution(add_additional_imports(code), task.inputs, func_name='transform', dsl=dsl)
                task_predicted_outputs = validate_outputs(task_predicted_outputs)
                predicted_outputs[task_id].append(task_predicted_outputs)
            except Exception as e:
                # Generated code is expected to fail often: log the failure and move on.
                logging.error(f"Error executing code for task {task_id}, response {i}: {type(e)} {e}")

# The timing covers prompt creation, generation and code execution.
inference_time = time.time() - t0
print(f'Inference time: {inference_time:.1f} seconds')
df = compute_search_metrics(task_ids, predicted_code, predicted_outputs, n_preds)
df

## TODO

- [x] Create a prompt with the available DSL functions and the training ARC task
- [x] Fix VLLM initialization issues with proper memory management
- [x] Verify the effect of caching
- [x] Generate some code that can be used to test the new BARC dsl
- [x] Update the library to be able to select which DSL to use when executing code
- [x] Verify that I can execute the code generated with the BARC dsl
- [x] Add security checks to code, I have seen some input required in the code
- [ ] Try to solve some easy task with independent sampling
  - [x] How frequently is the dsl used?
  - [x] Influence of the model
  - [x] Implement output validation, and simplify the metric. Those are different responsibilities
  - [x] Correct grids
  - [x] Unique outputs
  - [x] Everything into a dataframe
  - [ ] Metrics distribution
  - [ ] Visualization of the predictions
  - [ ] Global metrics vs task specific analysis
- [ ] Create a refine prompt
- [ ] Make a more complex tree search
- [x] Validate the tokenizer on the input grids

```python
for i in range(10):
    print(i, {word for word in tokenizer.vocab if str(i) in word})
```