# Search with BARC induction models

## Goal

Can we solve ARC tasks using base models with access to a DSL?

## Imports

In [None]:
import os
import logging
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

# Configure logging for the notebook session.
configure_logging()
# Restrict CUDA to the single device index returned by get_least_used_gpu_index().
# This is set here, before the later cell that imports torch and vllm.
os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

# Add VLLM specific environment variables to avoid common issues
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

In [None]:
import time
import importlib
import inspect
import json
import gc
import random
import glob
import pandas as pd
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Markdown, display


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import BeamSearchParams

from arc25.training_tasks import *
from arc25.encoders import create_grid_encoder
from arc25.prompting import pretty_print_prompt, Template
from arc25.metrics import pixel_similarity_score, correct_grids_score
from arc25.utils import get_timestamp
from arc25.plot import plot_task
import arc25.BARC_dsl as dsl

In [None]:
# Draw and immediately close an empty figure, then set the default plot style for the notebook.
plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)  # default figure size (width, height)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

### Data

In [None]:
def load_arc_data_with_solutions(filepath):
    """
    Load an ARC challenges JSON file and attach the test outputs from its solutions file.

    The solutions path is obtained by replacing 'challenges.json' with 'solutions.json' in
    `filepath`. If that produces a different path that exists on disk, every test sample
    receives an 'output' entry taken from the solutions (matched by task id and position).

    Returns the loaded dict. Raises ValueError (through verify_that_all_samples_have_output)
    when some train/test sample still has no 'output'.
    """
    with open(filepath, 'r') as challenges_file:
        data = json.load(challenges_file)

    solutions_filepath = filepath.replace('challenges.json', 'solutions.json')
    has_separate_solutions = solutions_filepath != filepath and os.path.exists(solutions_filepath)
    if has_separate_solutions:
        with open(solutions_filepath, 'r') as solutions_file:
            solutions = json.load(solutions_file)
        for sample_id, task in data.items():
            for position, test_sample in enumerate(task['test']):
                test_sample['output'] = solutions[sample_id][position]

    verify_that_all_samples_have_output(data)
    return data


def verify_that_all_samples_have_output(data):
    """
    Check every task in `data` with verify_that_task_has_outputs.

    Each value may be a single task dict or a list of task dicts; values of any other type
    are ignored.
    """
    for entry in data.values():
        if isinstance(entry, dict):
            subtasks = [entry]
        elif isinstance(entry, list):
            subtasks = entry
        else:
            continue
        for subtask in subtasks:
            verify_that_task_has_outputs(subtask)


def verify_that_task_has_outputs(task):
    """
    Raise ValueError if any sample of the 'train' or 'test' partition lacks an 'output' key.

    Other keys of `task` are not inspected.
    """
    for partition in task:
        if partition in ('train', 'test'):
            if any('output' not in sample for sample in task[partition]):
                raise ValueError('Not all samples have output')

In [None]:
# Load the three challenge files (training and evaluation from the arc-prize-2024 folder,
# evaluation from the arc-prize-2025 folder) together with their solutions.
training_challenges = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json')
evaluation_challenges = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_evaluation_challenges.json')
evaluation_challenges_2025 = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2025/arc-agi_evaluation_challenges.json')
# Single lookup table used by get_task; on duplicated task ids the later dict wins.
all_challenges = {**training_challenges, **evaluation_challenges, **evaluation_challenges_2025}

In [None]:
def get_task(task_id):
    """
    Return the task stored in `all_challenges` under `task_id` with every sample value
    converted to a numpy array.

    The result maps each partition name to a list of samples, each sample being a dict with
    the same keys as the stored one. Raises ValueError when `task_id` is unknown.
    """
    if task_id not in all_challenges:
        raise ValueError(f'Task ID {task_id} not found in challenges')
    raw_task = all_challenges[task_id]
    task = {}
    for partition, samples in raw_task.items():
        task[partition] = [{key: np.array(value) for key, value in sample.items()} for sample in samples]
    return task

### Prompt

https://github.com/flowersteam/SOAR/blob/main/soar/prompt.py

In [None]:
# Prompt pieces for the BARC induction model:
# https://huggingface.co/barc0/Llama-3.1-ARC-Potpourri-Induction-8B
# System message used as the first chat message in create_prompt_from_task.
system_prompt = """You are a world-class puzzle solver with exceptional pattern recognition skills and expertise in Python programming. Your task is to analyze puzzles and provide Python solutions."""

# Template for the user message. It is rendered with `train_samples` (a list of dicts with
# 'input' and 'output' text) and `test` (the text of one test input grid).
prompt_template_text = """Given input-output grid pairs as reference examples, carefully observe the patterns to predict the output grid for new test input. Each pair follows the same transformation rule. Grids are 2D arrays represented as strings, with cells (colors) separated by spaces and rows by newlines.
Here are the input and output grids for the reference examples:
{% for sample in train_samples %}Example {{ loop.index }}
Input:
{{ sample.input }}

Output:
{{ sample.output }}

{% endfor %}
Here is the input grid for the test example:
{{ test }}

Write a Python function `transform` that can convert any given input grid to its corresponding output grid based on the pattern observed in the reference examples.
"""

# I have verified that all responses start with this prefix
# It is sent as the beginning of the assistant message, so the model continues from it.
# The trailing comment on the next line is a longer prefix variant that is currently disabled.
common_prefix = "Let's solve this puzzle using Python code with the common library functions. We'll first reason about the problem and then write the code to solve it. The `transform` function will take the input grid and return the output grid. Here is the Python code with the comments describing how to solve the problem:\n" #```python\nfrom common import *\n"

# Template object built from prompt_template_text; create_prompt_from_task calls its render().
prompt_template = Template(prompt_template_text)

def create_prompt_from_task(task, grid_encoder, tokenizer, shuffle_train_samples=True):
    """
    Build the chat prompt (as text) for one task.

    All train samples are encoded with `grid_encoder.to_text` and optionally shuffled; one
    test sample is picked at random and only its input is shown. The chat ends with an
    assistant message containing `common_prefix`, and the chat template is applied with
    `continue_final_message=True` so generation continues that message.
    """
    train_samples = []
    for sample in task['train']:
        train_samples.append({
            'input': grid_encoder.to_text(sample['input']),
            'output': grid_encoder.to_text(sample['output']),
        })
    if shuffle_train_samples:
        random.shuffle(train_samples)

    chosen_test_sample = random.choice(task['test'])
    user_message = prompt_template.render(
        train_samples=train_samples,
        test=grid_encoder.to_text(chosen_test_sample['input']),
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": common_prefix},
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
        continue_final_message=True,
    )

### Model

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1, max_model_len=32000):
    """
    Create a vLLM engine and the matching tokenizer for `model_path`.

    GPU memory is cleaned up first with cleanup_gpu(). The engine runs in bfloat16 with
    prefix caching enabled; bitsandbytes quantization is requested only when
    `use_4bit_quantization` is true. For a '.gguf' model file the tokenizer is loaded from
    the 'tokenizer' directory located next to the file, otherwise from `model_path` itself.

    Returns a tuple (llm, tokenizer).
    """
    logging.info(f"Loading model from {model_path}")
    cleanup_gpu()
    quantization = "bitsandbytes" if use_4bit_quantization else None
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.92,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization=quantization,
        enable_prefix_caching=True,  # passed explicitly rather than relying on the default
        max_model_len=max_model_len,
    )
    is_gguf = model_path.endswith('.gguf')
    tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer') if is_gguf else model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return llm, tokenizer


def cleanup_gpu():
    """Run the garbage collector and empty the CUDA cache; synchronize when CUDA is available."""
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

### Code

In [None]:
def parse_python_code(text):
    """
    Return the stripped content of the first ```python fenced block found in `text`.

    An empty string is returned when there is no ```python marker, or when the text between
    the first marker and the next ```python marker (or the end of the text) contains no
    closing ``` fence.
    """
    opening_marker, fence = '```python', '```'
    pieces = text.split(opening_marker)
    if len(pieces) < 2:
        return ''
    candidate = pieces[1]
    closing_position = candidate.find(fence)
    if closing_position == -1:
        return ''
    return candidate[:closing_position].strip()

In [None]:
def curate_python_code(code):
    """
    Drop every line of `code` that contains one of the unwanted fragments (dsl/common
    imports and print calls) and return the remaining text stripped of outer whitespace.
    """
    banned_fragments = ('import dsl', 'from dsl import ', 'print(', 'from common import *')
    kept_lines = []
    for line in code.split('\n'):
        if any(fragment in line for fragment in banned_fragments):
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

def add_additional_imports(code):
    """
    Prepend the typing and numpy imports to `code`.

    When `code` is empty only the import lines are returned, without a trailing newline.
    """
    header = '\n'.join((
        'from typing import List, Tuple',
        'import numpy as np',
        'import numpy',
    ))
    if code:
        return header + '\n' + code
    return header

In [None]:
def run_code_from_predictions(predictions: dict[str, dict], log_errors: bool = True, n_jobs: int = -1):
    """
    Execute, in parallel, the code found in every text prediction of every task.

    Args:
        predictions: maps task id to a dict with two index-aligned lists:
            'text_predictions' (model responses) and 'data_augmentation_params'
            (the augmentation parameters of each response, or None).
        log_errors: when true, log the error of every prediction that failed.
        n_jobs: number of joblib workers; -1 uses all cores.

    Returns:
        (predicted_code, predicted_outputs): two dicts keyed by task id. Each holds one
        entry per prediction, in prediction order; an entry is None when the code could not
        be parsed (code) or could not be executed/validated (outputs).
    """
    # Inputs of every task: all train inputs followed by all test inputs.
    task_inputs = dict()
    for task_id in predictions:
        task = get_task(task_id)
        task_inputs[task_id] = [sample['input'] for sample in task['train']] + [sample['input'] for sample in task['test']]

    # Flatten all predictions into a single work list so that tasks share the worker pool.
    work = [
        (tid, i, text_pred, task_inputs[tid], prediction_data['data_augmentation_params'][i])
        for tid, prediction_data in predictions.items()
        for i, text_pred in enumerate(prediction_data['text_predictions'])
    ]

    predicted_code = {tid: [] for tid in predictions}
    predicted_outputs = {tid: [] for tid in predictions}

    with tqdm_joblib(total=len(work), desc="Executing predictions", unit="pred", smoothing=0):
        results = Parallel(
            n_jobs=n_jobs,
            backend="loky",
            prefer="processes",
            batch_size="auto",
        )(delayed(_run_one)(*args) for args in work)

    # Rebuild the per-task lists; the code is kept even when its execution failed.
    for task_id, i, code, outs, err in results:
        predicted_code[task_id].append(code)
        predicted_outputs[task_id].append(outs)
        if err and log_errors:
            logging.error(f"Error executing code for task {task_id}, response {i}: {err}")

    return predicted_code, predicted_outputs


def _run_one(task_id, i, text_prediction, input_grids, data_augmentation_params=None):
    """
    Parse and execute the code of a single text prediction.

    When `data_augmentation_params` is given, the input grids are augmented with it before
    running the code and the validated outputs are reverted afterwards.

    Returns a tuple (task_id, i, code, outputs, error):
    - parsing failed: code and outputs are None and error is "parse_failed";
    - any exception while augmenting, executing, validating or reverting: outputs is None
      and error is "<ExceptionType>: <message>";
    - success: error is None.
    """
    code = parse_python_code(text_prediction)
    if not code:
        return (task_id, i, None, None, "parse_failed")

    use_augmentation = data_augmentation_params is not None
    try:
        if use_augmentation:
            input_grids = [apply_data_augmentation_to_grid(grid, **data_augmentation_params) for grid in input_grids]
        executable_code = add_additional_imports(curate_python_code(code))
        outs = validate_outputs(safe_code_execution(executable_code, input_grids, func_name="transform"))
        if use_augmentation:
            outs = [np.array(revert_data_augmentation(output, **data_augmentation_params)) for output in outs]
    except Exception as e:
        return (task_id, i, code, None, f"{type(e).__name__}: {e}")
    return (task_id, i, code, outs, None)

# tiny_predictions = {'00576224': predictions['00576224']}
# predicted_code, predicted_outputs = run_code_from_predictions(tiny_predictions, log_errors=True)
# df = compute_search_metrics(list(tiny_predictions.keys()), predicted_code, predicted_outputs, n_preds)
# df.round(3)
# predicted_code, predicted_outputs = run_code_from_predictions(predictions, log_errors=False)

### Validations

In [None]:
def validate_outputs(outputs):
    """
    Validate every grid in `outputs` with _validate_output and return the validated list.

    Raises ValueError when `outputs` is empty (or None) or when a grid is invalid.
    """
    if outputs:
        return [_validate_output(output) for output in outputs]
    raise ValueError("Outputs list is empty")

def _validate_output(output):
    if output is None:
        raise ValueError("Output is None")
    output = np.array(output) # otherwise I see weird outputs that mix list and numpy arrays
    if output.ndim != 2:
        raise ValueError(f"Output is not a 2D array. Output shape: {output.shape}")
    if max(output.shape) > 35:
        raise ValueError(f"Output is too large, the maximum allowed shape is 30x30. Output shape: {output.shape}")
    if min(output.shape) == 0:
        raise ValueError(f"Output has zero dimension, it is empty. Output shape: {output.shape}")
    if np.max(output) > 9 or np.min(output) < 0:
        raise ValueError(f"Output contains invalid values, expected values in range [0, 9]. Output max: {np.max(output)}, min: {np.min(output)}")
    return output

In [None]:
import hashlib

def fingerprint(prediction):
    """
    Return the SHA-256 hex digest of a list of matrices.

    For every matrix the shape, the dtype string and the raw bytes are hashed, in that
    order, so matrices with the same bytes but a different shape or dtype get different
    fingerprints (e.g. 2x2 versus 4x1).
    """
    digest = hashlib.sha256()
    for matrix in prediction:
        parts = (
            str(matrix.shape).encode(),
            matrix.dtype.str.encode(),
            matrix.tobytes(),
        )
        for part in parts:
            digest.update(part)
    return digest.hexdigest()

### Metrics

In [None]:
def compute_search_metrics(task_ids, predicted_code, predicted_outputs, n_preds):
    """
    Build a DataFrame with one row of metrics per task plus a final 'MEAN' row.

    Per task:
    - 'valid code', 'valid outputs', 'unique outputs': counts of parsed programs, executed
      programs and distinct output fingerprints, each divided by `n_preds`;
    - 'pixel similarity', 'correct grids': mean over the valid predictions, computed against
      the train outputs followed by the test outputs;
    - 'pass_rate' / 'pass@n': fraction of valid predictions with a 'correct grids' score
      of 1, and whether at least one exists;
    - 'train_pass_rate' / 'train_pass@n': the same, using only the train samples.
    Score metrics are 0 when the task has no valid outputs.
    """
    columns = ['n_preds', 'valid code', 'valid outputs', 'unique outputs', 'pixel similarity', 'correct grids', 
               'train_pass_rate', 'train_pass@n', 'pass_rate', 'pass@n']
    df = pd.DataFrame(columns=columns)
    for task_id in task_ids:
        df.loc[task_id, 'n_preds'] = n_preds
        n_valid_code = sum(code is not None for code in predicted_code[task_id])
        df.loc[task_id, 'valid code'] = n_valid_code/n_preds
        valid_outputs = [output for output in predicted_outputs[task_id] if output is not None]
        df.loc[task_id, 'valid outputs'] = len(valid_outputs)/n_preds
        unique_fingerprints = {fingerprint(output) for output in valid_outputs}
        df.loc[task_id, 'unique outputs'] = len(unique_fingerprints)/n_preds

        task = get_task(task_id)
        train_outputs = [sample['output'] for sample in task['train']]
        task_outputs = train_outputs + [sample['output'] for sample in task['test']]

        similarity_scores = sorted(
            np.mean([pixel_similarity_score(output, pred) for output, pred in zip(task_outputs, predictions)])
            for predictions in valid_outputs
        )
        df.loc[task_id, 'pixel similarity'] = np.mean(similarity_scores) if similarity_scores else 0.0

        grid_scores = sorted(correct_grids_score(task_outputs, predictions) for predictions in valid_outputs)
        df.loc[task_id, 'correct grids'] = np.mean(grid_scores) if grid_scores else 0.0
        df.loc[task_id, 'pass_rate'] = np.mean(np.array(grid_scores) == 1) if grid_scores else 0
        df.loc[task_id, 'pass@n'] = int(np.max(grid_scores) == 1) if grid_scores else 0

        # Train-only scores use the first len(train_outputs) predicted grids.
        train_scores = sorted(
            correct_grids_score(train_outputs, predictions[:len(train_outputs)])
            for predictions in valid_outputs
        )
        df.loc[task_id, 'train_pass_rate'] = np.mean(np.array(train_scores) == 1) if train_scores else 0
        df.loc[task_id, 'train_pass@n'] = int(np.max(train_scores) == 1) if train_scores else 0

    df.loc['MEAN'] = df.mean(axis=0)
    return df.astype(float)

### Data augmentation

In [None]:
def apply_data_augmentation(task, hflip, n_rot90, color_map=None):
    """
    Return a new task dict in which every grid of every sample, in every partition, has been
    transformed with apply_data_augmentation_to_grid using the given parameters.
    """
    augmented_task = {}
    for partition, samples in task.items():
        augmented_task[partition] = [
            {key: apply_data_augmentation_to_grid(grid, hflip, n_rot90, color_map) for key, grid in sample.items()}
            for sample in samples
        ]
    return augmented_task


def apply_data_augmentation_to_grid(grid, hflip, n_rot90, color_map=None):
    """
    Apply the geometric augmentation and then, if `color_map` is given, the color mapping.

    Returns the augmented grid as a numpy array.
    """
    augmented = geometric_augmentation(grid, hflip, n_rot90)
    if color_map is None:
        return np.array(augmented)
    return np.array(apply_colormap(augmented, color_map))


def revert_data_augmentation(grid, hflip, n_rot90, color_map=None):
    """
    Undo an augmentation made with the same parameters: revert the geometric transformation
    and then, if `color_map` is given, map the colors back with revert_color_swap.
    """
    restored = revert_geometric_augmentation(grid, hflip, n_rot90)
    if color_map is None:
        return restored
    return revert_color_swap(restored, color_map)


def geometric_augmentation(grid, hflip, n_rot90):
    """
    Copy `grid` into a numpy array, flip it along axis 1 when `hflip` is true, and rotate
    the result `n_rot90` times by 90 degrees with np.rot90.
    """
    array = np.array(grid)
    flipped = np.flip(array, axis=1) if hflip else array
    return np.rot90(flipped, k=n_rot90)


def revert_geometric_augmentation(grid, hflip, n_rot90):
    """
    Inverse of geometric_augmentation: rotate back by `-n_rot90` quarter turns first and
    then flip along axis 1 when `hflip` is true.
    """
    unrotated = np.rot90(np.array(grid), k=-n_rot90)
    return np.flip(unrotated, axis=1) if hflip else unrotated


def revert_color_swap(grid, color_map):
    """
    Map the colors of `grid` back through the inverse of `color_map`.

    The keys of `color_map` are converted to int when the inverse mapping is built.
    """
    inverse_map = {}
    for original_color, new_color in color_map.items():
        inverse_map[new_color] = int(original_color)
    return np.vectorize(inverse_map.get)(grid)


def swap_task_colors(task, color_map=None, change_background_probability=0.1):
    """
    Return a new Task whose input and output grids have their colors replaced via `color_map`.

    A random color map is drawn with get_random_color_map(change_background_probability)
    when none is given. The new task keeps the name of the original and has empty code.

    NOTE(review): this function reads `task.inputs`, `task.outputs` and `task.name`, so it
    expects a Task-like object rather than the dict returned by get_task.
    """
    if color_map is None:
        color_map = get_random_color_map(change_background_probability)
    recolor = np.vectorize(color_map.get)
    swapped_inputs = [recolor(grid) for grid in task.inputs]
    swapped_outputs = [recolor(grid) for grid in task.outputs]
    return Task(inputs=swapped_inputs, outputs=swapped_outputs, code='', name=task.name)


def apply_colormap(grid, color_map):
    """Replace every value of `grid` with `color_map.get(value)`, element by element."""
    return np.vectorize(color_map.get)(grid)


def get_random_data_augmentation_params():
    """
    Draw random augmentation parameters: the geometric ones ('hflip', 'n_rot90') first and
    then a random 'color_map'.
    """
    return dict(get_random_geometric_augmentation_params(), color_map=get_random_color_map())


def get_random_geometric_augmentation_params():
    """Draw a random horizontal-flip flag and a random number (0 to 3) of 90 degree rotations."""
    hflip = random.choice([True, False])
    n_rot90 = random.choice([0, 1, 2, 3])
    return {'hflip': hflip, 'n_rot90': n_rot90}


def get_random_color_map(change_background_probability=0.1):
    """
    Return a random permutation of the colors 0-9 as a dict {original: new}.

    With probability `change_background_probability` all ten colors are shuffled; otherwise
    color 0 is kept fixed and only colors 1-9 are shuffled.
    """
    if random.random() < change_background_probability:
        new_colors = list(range(10))
        random.shuffle(new_colors)
    else:
        foreground = list(range(1, 10))
        random.shuffle(foreground)
        new_colors = [0, *foreground]
    return dict(zip(range(10), new_colors))

### Utils

In [None]:
def load_all_predictions(path_pattern):
    """
    Load and merge the prediction files that match `path_pattern`.

    Two file formats are supported, per task id:
    - dict with 'text_predictions' (list of responses) and, optionally,
      'data_augmentation_params' (a single dict shared by all the responses of that entry);
    - plain list of responses (old format, no augmentation).

    Returns a dict mapping task id to
    {'text_predictions': [...], 'data_augmentation_params': [...]}, where both lists have
    the same length and the params entry is None for responses without augmentation.
    Color map keys and values are converted to int, because JSON stores the keys as strings.
    """
    # Sorted so that the order of the merged predictions does not depend on the file system.
    filepaths = sorted(glob.glob(path_pattern))
    predictions = dict()
    for filepath in tqdm(filepaths, desc="Loading predictions"):
        with open(filepath, 'r') as f:
            preds = json.load(f)
        for task_id, outputs in preds.items():
            task_predictions = predictions.setdefault(
                task_id, dict(text_predictions=[], data_augmentation_params=[]))
            if isinstance(outputs, dict):
                text_predictions = outputs['text_predictions']
                params = outputs.get('data_augmentation_params', None)
                # Entries saved without augmentation, or without a color map, are valid too.
                if params is not None and params.get('color_map') is not None:
                    color_map = {int(k): int(v) for k, v in params['color_map'].items()}
                    params = {**params, 'color_map': color_map}
            else:
                text_predictions = outputs
                params = None  # Assuming no params for old format
            task_predictions['text_predictions'].extend(text_predictions)
            task_predictions['data_augmentation_params'].extend([params] * len(text_predictions))
    return predictions

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably placed here on purpose so that "Run all" stops after the
# definitions above and the experiment cells below are executed manually — confirm.
raise

## Data augmentation

Does using data augmentation increase the diversity of the predictions and improve the pass@n metric?

### Make predictions

In [None]:
# Load the vLLM engine and the tokenizer from the local copy of the BARC induction model.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

In [None]:
# Task ids of each loaded challenge set, keyed by dataset name.
dataset_to_task_ids = {
    'training': list(training_challenges.keys()),
    'evaluation': list(evaluation_challenges.keys()),
    'evaluation-2025': list(evaluation_challenges_2025.keys())
}

# Experiment configuration; experiment_name and dataset are part of the predictions output path.
experiment_name = '2025-08-22_add-common-prefix'
dataset = 'training'
task_ids = dataset_to_task_ids[dataset]
grid_encoder = create_grid_encoder('ColorNameEncoder()')

In [None]:
# Build and display the prompt of the first task as a visual sanity check.
sample_prompt = create_prompt_from_task(get_task(task_ids[0]), grid_encoder, tokenizer)
pretty_print_prompt(sample_prompt, default_color='white')

In [None]:
# Generate predictions for every task of the selected dataset and save them to a JSON file.
# One file is written per value of n (number of sampled responses per prompt).
for n in [8]:
    sampling_params = SamplingParams(n=n, temperature=1.0, top_p=0.95, max_tokens=2048)

    # One prompt per task. Each task gets its own random augmentation, which is therefore
    # shared by the n responses sampled for that prompt.
    prompts, data_augmentation_params = [], []
    for task_id in task_ids:
        params = get_random_data_augmentation_params()
        data_augmentation_params.append(params)
        task = get_task(task_id)
        task = apply_data_augmentation(task, **params)
        prompt = create_prompt_from_task(
            task, grid_encoder=grid_encoder, tokenizer=tokenizer, shuffle_train_samples=True)
        prompts.append(prompt)

    # Run inference and report throughput statistics.
    t0 = time.time()
    text_predictions = llm.generate(prompts, sampling_params)
    total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in text_predictions)
    inference_time = time.time() - t0
    print(f"Total tokens generated: {total_tokens}")
    print(f"Time taken: {inference_time:.2f} seconds")
    print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
    print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
    print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

    # Store, per task, the responses and the single augmentation dict used for its prompt.
    # load_all_predictions expands that dict to one entry per response when reading the file.
    predictions = dict()
    for task_id, output, params in zip(task_ids, text_predictions, data_augmentation_params):
        predictions[task_id] = {
            'text_predictions': [output.text for output in output.outputs],
            'data_augmentation_params': params,
        }

    # The color_map keys are ints; JSON stores them as strings and the loader converts them back.
    output_filepath = f'/mnt/hdd0/Kaggle/arc25/predictions/{experiment_name}/{dataset}_{sampling_params.n}preds_{get_timestamp()}_predictions.json'
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    with open(output_filepath, 'w') as f:
        json.dump(predictions, f, indent=2)
    print(f"Predictions saved to {output_filepath}")

```
training
8 preds, Average time per task: 3.25 seconds
Average time per task: 2.88 seconds, when adding the common prefix

evaluation
8 preds, Average time per task: 4.17 seconds
(previously it was 3.64 when using n-1 training samples)

evaluation-2025
8 preds, Average time per task: 5.57 seconds
```

### Evaluation

In [None]:
# Load and merge all the prediction files of one experiment/dataset. The commented lines
# are alternative experiments that can be evaluated instead.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_fix-bug/evaluation_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_fix-bug/training_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-18_barc-first-steps/evaluation-2025_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-21_data-augmentation/evaluation_8preds_2025_08_21_13_58_47_predictions.json')
# n_preds is taken from the first task only, i.e. it assumes all tasks have the same number
# of predictions.
n_preds = len(list(predictions.values())[0]['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

In [None]:
# Parse and execute the code of every prediction; errors are not logged.
predicted_code, predicted_outputs = run_code_from_predictions(predictions, log_errors=False)

In [None]:
# Per-task metrics plus a final 'MEAN' row, displayed with three decimals.
df = compute_search_metrics(list(predictions.keys()), predicted_code, predicted_outputs, n_preds)
df.round(3)

In [None]:
# Show only the last row, which is the 'MEAN' row appended by compute_search_metrics.
df.iloc[-1:]

In [None]:
# Estimate pass@n from the per-task pass rate p as the mean over tasks of 1 - (1 - p)**n,
# for n = 2**0, 2**1, ..., 2**(int(log2(n_preds)) + 1).
scores = []
n_preds_range = 2**np.arange(0, int(np.log2(n_preds)) + 2)
# [:-1] drops the last row of df (the 'MEAN' row), so only real tasks are used.
fail_prob = 1 - df['pass_rate'].values[:-1]
for n in n_preds_range:
    scores.append(float(np.mean(1 - fail_prob**n)))

import matplotlib.pyplot as plt

plt.plot(n_preds_range, scores, marker='o')
plt.xscale('log')
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('pass@n vs Number of Predictions')
plt.grid()
plt.show()
# Cell output: the curve as plain lists. The dict key is a fixed label and does not follow
# the `dataset` variable.
dict(evaluation_data_augmentation=(n_preds_range.tolist(), scores))

### Compare with and without data augmentation

#### evaluation ARC-AGI-1 Comparison

In [None]:
# Hard-coded pass@n curves as (n_preds_range, scores) pairs, one per experiment.
# NOTE(review): presumably pasted from the output of the pass@n cell above — confirm.
metrics = {
'baseline': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.01996212663472896,
   0.02831747138692302,
   0.03977086910606171,
   0.05542900033336087,
   0.07674247281753263,
   0.10320479610157163,
   0.13093643779484748,
   0.15594708997780551,
   0.1770796829272273,
   0.19471691227422966,
   0.20712048546783934]),
'+ data augmentation': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.019813790637075775,
   0.029064191088443207,
   0.04100375897586597,
   0.05671308781660802,
   0.07779377587518771,
   0.10484135687838109,
   0.13591735274813102,
   0.1675520359797603,
   0.19696420491620842,
   0.22203487375136557,
   0.23821892651950458])
}

# Plot one curve per experiment on a base-2 logarithmic x axis.
# Note that the loop rebinds the notebook-level names n_preds_range and scores.
keys = list(metrics.keys())
plt.figure(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    plt.plot(n_preds_range, scores, marker='o', label=key)
plt.xscale('log', base=2)
# plt.grid(which='both', axis='both')
plt.grid()
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('Evaluation ARC-AGI-1')
plt.legend()

plt.tight_layout()

#### Bias of the number of predictions

In [None]:
# Hard-coded pass@n curves as (n_preds_range, scores) pairs; each key gives the number of
# predictions per task from which that curve was estimated.
# NOTE(review): presumably pasted from the output of the pass@n cell above — confirm.
metrics = {
'64 preds': ([1, 2, 4, 8, 16, 32, 64, 128],
  [0.019106336951575514,
   0.028082339121390357,
   0.039026953359591325,
   0.05221714653327929,
   0.06785124181952008,
   0.08534713350917737,
   0.10178389655716932,
   0.11195929946811921]),
'112 preds': ([1, 2, 4, 8, 16, 32, 64, 128],
  [0.019387411477336133,
   0.02887665434462053,
   0.04103250559571824,
   0.05662528637189867,
   0.07657867399726664,
   0.1006969445110981,
   0.12613859060350577,
   0.14808143720453426]),
'184 preds': ([1, 2, 4, 8, 16, 32, 64, 128, 256],
  [0.019316797975629206,
   0.02854922966041234,
   0.0405175566715264,
   0.056039023485551356,
   0.07638928013313853,
   0.10206766052179019,
   0.13087569581761588,
   0.15844583931075643,
   0.1786899038965614]),
'584 preds': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.019813790637075775,
   0.029064191088443207,
   0.04100375897586597,
   0.05671308781660802,
   0.07779377587518771,
   0.10484135687838109,
   0.13591735274813102,
   0.1675520359797603,
   0.19696420491620842,
   0.22203487375136557,
   0.23821892651950458])
}

# Plot one curve per experiment on a base-2 logarithmic x axis.
# Note that the loop rebinds the notebook-level names n_preds_range and scores.
keys = list(metrics.keys())
plt.figure(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    plt.plot(n_preds_range, scores, marker='o', label=key)
plt.xscale('log', base=2)
# plt.grid(which='both', axis='both')
plt.grid()
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('Evaluation ARC-AGI-1')
plt.legend()

plt.tight_layout()

#### evaluation arc-agi-2

In [None]:
# Hard-coded pass@n curves as (n_preds_range, scores) pairs. The two curves have different
# lengths: 'baseline' goes up to n=2048 and 'data_augmentation' up to n=512.
# NOTE(review): presumably pasted from the output of the pass@n cell above — confirm.
metrics = {'baseline': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024,
   2048],
  [0.000525120200263568,
   0.0010290649730527559,
   0.001977218777084015,
   0.0036586995036725616,
   0.006324151121543866,
   0.009785455928199816,
   0.013112192645441874,
   0.015422166919107317,
   0.016485239578931988,
   0.016662717947703534,
   0.016666664795580977,
   0.016666666666666247]),
'data_augmentation': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.0004395694358616827,
   0.0008607058465457645,
   0.001651027938816423,
   0.003045200797763149,
   0.005231079244621852,
   0.00801093382400977,
   0.010647211929364896,
   0.012848270575498736,
   0.014933881468222887,
   0.016306378321672744])}

# Plot one curve per experiment on a base-2 logarithmic x axis.
# Note that the loop rebinds the notebook-level names n_preds_range and scores.
keys = list(metrics.keys())
plt.figure(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    plt.plot(n_preds_range, scores, marker='o', label=key)
plt.xscale('log', base=2)
# plt.grid(which='both', axis='both')
plt.grid()
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('Evaluation ARC-AGI-2')
plt.legend()

plt.tight_layout()

### Distribution of prediction length

In [None]:
# Only the tokenizer is needed to measure prediction lengths, so the model is not loaded here.
tokenizer = AutoTokenizer.from_pretrained("/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B")

In [None]:
# Histogram of the number of tokens of the saved predictions, one histogram per dataset.
# Lengths are measured by tokenizing the text of each prediction with `tokenizer`.
# The bins cover 0 to 2000 tokens, while the label reports the true maximum length.
# NOTE(review): this loop overwrites the notebook-level `predictions` variable, so cells
# that read `predictions` afterwards see the last dataset loaded here — confirm this is
# intended before running the cells below.
for key in ['training', 'evaluation', 'evaluation-2025']:
    predictions = load_all_predictions(f'/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_fix-bug/{key}_*.json')
    prediction_length_distribution = {tid: [len(tokens) for tokens in tokenizer(preds['text_predictions'])['input_ids']] \
                                      for tid, preds in tqdm(predictions.items(), desc="Computing prediction lengths", total=len(predictions))}
    all_lengths = [length for lengths in prediction_length_distribution.values() for length in lengths]
    label = f"{key} (max output tokens: {max(all_lengths)}, median output tokens: {int(np.median(all_lengths))})"
    bins = np.linspace(0, 2000, 100)
    plt.hist(all_lengths, bins=bins, label=label, alpha=0.5, density=True)
plt.legend()
plt.xlabel('Number of tokens')
plt.title('Distribution of prediction lengths')

### Inspect correct solutions

In [None]:
# For every task with at least one correct prediction, show the (augmented) task and the
# text of the first prediction that solves all of its train and test samples.
# The 'MEAN' summary row of df is not a task, so it is skipped.
solved_task_ids = [task_id for task_id in df[df['pass_rate'] > 0].index.values if task_id != 'MEAN']
for task_id in solved_task_ids:
    print(f'https://arcprize.org/play?task={task_id} pass rate: {df.loc[task_id, "pass_rate"]:.2%}')
    task = get_task(task_id)
    # get_task returns a dict of partitions; the expected grids are built in the same order
    # used by compute_search_metrics (train outputs followed by test outputs).
    task_outputs = [sample['output'] for sample in task['train']] + [sample['output'] for sample in task['test']]
    correct_solution_found = False
    for idx, output in enumerate(predicted_outputs[task_id]):
        if output is None:
            continue
        if correct_grids_score(task_outputs, output) == 1:
            correct_solution_found = True
            data_augmentation_params = predictions[task_id]['data_augmentation_params'][idx]
            text_pred = predictions[task_id]['text_predictions'][idx]
            print(data_augmentation_params)
            # Show the task as the model saw it, i.e. with the augmentation of that prediction.
            augmented_task = apply_data_augmentation(task, **data_augmentation_params) if data_augmentation_params is not None else task
            plot_task(augmented_task); plt.show()
            display(Markdown(text_pred + '\n\n---\n\n'))
            break
    if not correct_solution_found:
        raise ValueError("Could not find correct solution")

I'm impressed by the tasks that the model is able to solve. The reasoning is correct. This is a powerful model to experiment with.

## TODO:

- [x] Load all ARC data
- [x] Modify prompt generation to use all the training data, and a random sample from the test samples.
- [x] Update the data augmentation pipeline to use dicts instead of tasks
- [x] Modify the code execution to use all the data
- [x] Add a new metric to check if the code is correct for the train samples but incorrect for the test samples
- [x] Refactor
- [ ] Convert to python script so I can make predictions remotely
- [ ] Evaluate the datasets
- [x] Check if the answers always start with the same prefix; if that is the case I could speed up inference.

- [x] Implement new grid encoder
- [x] Use the correct prompt
- [x] Save predictions to file so I can later reprocess them
- [x] Update code execution to match the code generated by BARC model
- [x] Check code execution to verify that exceptions are legit and not easily solvable
  - [x] Add missing colors to Color object
  - [x] Code execution fails when there are auxiliary functions. `Error executing code for task 025d127b, response 5: <class 'NameError'> name 'blend_colors' is not defined`
  - [x] Arrays as inputs
- [x] Remove dsl usage metric
- [x] Add correct task metric
- [x] Parallelize code execution
- [x] Refactor code
- [x] Plots showing the effect of increasing the number of predictions
- [x] Validate that I get the same scores as the paper
- [x] Evaluate on different datasets
- [x] Improve metrics
- [x] Data augmentation