# Search with BARC induction models

## Goal

Can we solve ARC tasks using base models with access to a DSL?

## Imports

In [None]:
import os
import logging
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

configure_logging()
os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

# Add VLLM specific environment variables to avoid common issues
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

In [None]:
import time
import importlib
import inspect
import json
import gc
import random
import glob
from collections import namedtuple
import pandas as pd
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Markdown, display

def display_python_code(code):
    """Show ``code`` in the notebook output as a fenced Python markdown block."""
    fenced_source = f"```python\n{code}\n```"
    display(Markdown(fenced_source))


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

from arc25.encoders import create_grid_encoder
from arc25.prompting import pretty_print_prompt, Template
from arc25.metrics import pixel_similarity_score, correct_grids_score
from arc25.utils import get_timestamp
from arc25.plot import plot_task
from arc25.data_augmentation import apply_data_augmentation, revert_data_augmentation, get_random_data_augmentation_params
from arc25.code_execution import safe_code_execution

In [None]:
plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

### Data

In [None]:
def load_arc_data_with_solutions(filepath):
    """Load an ARC challenges JSON file and attach the test outputs to it.

    The solutions file is looked up by replacing ``challenges.json`` with
    ``solutions.json`` in ``filepath``. When that sibling file exists (and is
    not the same path), its entries are written into the ``'output'`` field
    of the matching test samples.

    Returns:
        The loaded dictionary, keyed by task id.

    Raises:
        ValueError: if, after merging, some sample still has no output.
    """
    with open(filepath, 'r') as challenges_file:
        data = json.load(challenges_file)

    solutions_filepath = filepath.replace('challenges.json', 'solutions.json')
    has_separate_solutions = filepath != solutions_filepath and os.path.exists(solutions_filepath)
    if has_separate_solutions:
        with open(solutions_filepath, 'r') as solutions_file:
            solutions = json.load(solutions_file)
        # Solutions are stored per task as a list aligned with the test samples.
        for sample_id, task in data.items():
            for position, test_sample in enumerate(task['test']):
                test_sample['output'] = solutions[sample_id][position]

    verify_that_all_samples_have_output(data)
    return data


def verify_that_all_samples_have_output(data):
    """Check every task stored in ``data`` for missing outputs.

    Each value of ``data`` may be a single task (a dict) or a list of tasks;
    values of any other type are ignored. The actual check is delegated to
    ``verify_that_task_has_outputs``, which raises on the first failure.
    """
    for entry in data.values():
        if isinstance(entry, dict):
            tasks_to_check = [entry]
        elif isinstance(entry, list):
            tasks_to_check = entry
        else:
            tasks_to_check = []
        for candidate in tasks_to_check:
            verify_that_task_has_outputs(candidate)


def verify_that_task_has_outputs(task):
    """Ensure all samples in the 'train' and 'test' partitions have an output.

    Partitions with any other name are skipped.

    Raises:
        ValueError: if a train or test sample lacks the ``'output'`` key.
    """
    for partition_name, partition_samples in task.items():
        if partition_name in ('train', 'test'):
            if any('output' not in sample for sample in partition_samples):
                raise ValueError('Not all samples have output')

In [None]:
# Load the three datasets used in this notebook (each call also merges the
# matching solutions file and validates that every sample has an output).
training_challenges = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json')
evaluation_challenges = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_evaluation_challenges.json')
evaluation_challenges_2025 = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2025/arc-agi_evaluation_challenges.json')
# Single lookup table for get_task(); on a duplicated task id the later
# dataset overrides the earlier one.
all_challenges = {**training_challenges, **evaluation_challenges, **evaluation_challenges_2025}

In [None]:
def get_task(task_id):
    """Return the task ``task_id`` with every grid converted to a numpy array.

    The task is looked up in the module-level ``all_challenges`` mapping. A
    new dictionary is built, so the stored challenge data is left untouched.

    Raises:
        ValueError: if ``task_id`` is not present in ``all_challenges``.
    """
    if task_id not in all_challenges:
        raise ValueError(f'Task ID {task_id} not found in challenges')

    converted = {}
    for partition, samples in all_challenges[task_id].items():
        converted[partition] = [
            {key: np.array(value) for key, value in sample.items()}
            for sample in samples
        ]
    return converted

In [None]:
# Sanity check: every task must be convertible to numpy arrays by get_task().
# Failures are only reported, not raised, so that all broken tasks get listed.
for task_id in all_challenges:
    try:
        _ = get_task(task_id)
    except Exception as e:
        print(f"Error loading task {task_id}: {e}")

### Prompt

https://github.com/flowersteam/SOAR/blob/main/soar/prompt.py

In [None]:
# https://huggingface.co/barc0/Llama-3.1-ARC-Potpourri-Induction-8B
system_prompt = """You are a world-class puzzle solver with exceptional pattern recognition skills and expertise in Python programming. Your task is to analyze puzzles and provide Python solutions."""

prompt_template_text = """Given input-output grid pairs as reference examples, carefully observe the patterns to predict the output grid for new test input. Each pair follows the same transformation rule. Grids are 2D arrays represented as strings, with cells (colors) separated by spaces and rows by newlines.
Here are the input and output grids for the reference examples:
{% for sample in train_samples %}Example {{ loop.index }}
Input:
{{ sample.input }}

Output:
{{ sample.output }}

{% endfor %}
Here is the input grid for the test example:
{{ test }}

Write a Python function `transform` that can convert any given input grid to its corresponding output grid based on the pattern observed in the reference examples.
"""

# I have verified that all responses start with this prefix
common_prefix = "Let's solve this puzzle using Python code with the common library functions. We'll first reason about the problem and then write the code to solve it. The `transform` function will take the input grid and return the output grid. Here is the Python code with the comments describing how to solve the problem:\n" #```python\nfrom common import *\n"

prompt_template = Template(prompt_template_text)

def create_prompt_from_task(task, grid_encoder, tokenizer, shuffle_train_samples=True):
    """Build the chat-formatted prompt string for one task.

    Args:
        task: dict with 'train' and 'test' lists of samples; train samples
            need 'input' and 'output', the chosen test sample needs 'input'.
        grid_encoder: object whose ``to_text`` turns a grid into text.
        tokenizer: tokenizer providing ``apply_chat_template``.
        shuffle_train_samples: when True the encoded train samples are
            shuffled in place with ``random.shuffle``.

    Returns:
        The untokenized prompt. The conversation ends with an assistant turn
        holding ``common_prefix`` and is rendered with
        ``continue_final_message=True``.

    Note:
        One test sample is picked with ``random.choice``, so the result
        depends on the state of the global ``random`` generator.
    """
    train_samples = []
    for sample in task['train']:
        train_samples.append({
            'input': grid_encoder.to_text(sample['input']),
            'output': grid_encoder.to_text(sample['output']),
        })
    if shuffle_train_samples:
        random.shuffle(train_samples)

    test_sample = random.choice(task['test'])
    user_message = prompt_template.render(
        train_samples=train_samples,
        test=grid_encoder.to_text(test_sample['input']),
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": common_prefix},
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
        continue_final_message=True,
    )

### Model

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1,
               max_model_len=32000, enable_lora=False, max_lora_rank=16):
    """Create the vLLM engine and the matching tokenizer for ``model_path``.

    Args:
        model_path: path of the model; for a ``.gguf`` file the tokenizer is
            read from a ``tokenizer`` folder located next to the file.
        use_4bit_quantization: pass ``quantization="bitsandbytes"`` to vLLM
            when True, otherwise no quantization.
        tensor_parallel_size: forwarded to vLLM.
        max_model_len: forwarded to vLLM.
        enable_lora: forwarded to vLLM.
        max_lora_rank: forwarded to vLLM.

    Returns:
        Tuple ``(llm, tokenizer)``.
    """
    logging.info(f"Loading model from {model_path}")
    cleanup_gpu()

    quantization = "bitsandbytes" if use_4bit_quantization else None
    engine_kwargs = dict(
        model=model_path,
        gpu_memory_utilization=0.92,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization=quantization,
        # Seems to be enabled by default, but it is requested explicitly.
        enable_prefix_caching=True,
        max_model_len=max_model_len,
        enable_lora=enable_lora,
        max_lora_rank=max_lora_rank,
    )
    llm = LLM(**engine_kwargs)

    tokenizer_path = model_path
    if model_path.endswith('.gguf'):
        tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return llm, tokenizer


def cleanup_gpu():
    """Release Python garbage and cached CUDA memory before loading vLLM."""
    gc.collect()
    cuda = torch.cuda
    cuda.empty_cache()
    if not cuda.is_available():
        return
    cuda.synchronize()

### Code

In [None]:
def parse_python_code(text):
    """Extract the first fenced Python block from ``text``.

    Returns the stripped text found right after the first ```` ```python ````
    marker up to the next fence. An empty string is returned when there is
    no opening marker, or when no closing fence appears before the next
    opening marker (or before the end of the text).
    """
    segments = text.split('```python', 2)
    if len(segments) < 2:
        return ''
    # segments[1] ends at the next opening marker, if there is one.
    body, closing_fence, _ = segments[1].partition('```')
    if not closing_fence:
        return ''
    return body.strip()

In [None]:
def curate_python_code(code):
    """Drop DSL/common imports and print calls from generated code.

    Any line containing one of the banned fragments is removed entirely; the
    remaining lines are joined again and the result is stripped.
    """
    banned_fragments = ('import dsl', 'from dsl import ', 'print(', 'from common import *')
    kept_lines = []
    for line in code.split('\n'):
        if any(fragment in line for fragment in banned_fragments):
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

def add_additional_imports(code):
    """Prepend the standard import header to ``code``.

    When ``code`` is empty (or otherwise falsy) only the header is returned.
    """
    header = '\n'.join((
        'from typing import List, Tuple',
        'import numpy as np',
        'import numpy',
        'from arc25.BARC_dsl import *',
    ))
    if not code:
        return header
    return header + '\n' + code

In [None]:
ExecutionResult = namedtuple("ExecutionResult", ["task_id", "index", "code", "outputs", "error_type", "error_message"])

def run_code_from_predictions(predictions: dict[str, dict[str, list]], batch_size=32000, n_jobs=-1):
    """Execute every predicted program on the inputs of its task, in parallel.

    Args:
        predictions: mapping task id -> dict with two lists of equal length,
            'text_predictions' (raw model responses) and
            'data_augmentation_params' (one entry per response, indexed by
            the position of the response). This is the structure returned by
            ``load_all_predictions``.
        batch_size: number of predictions submitted to joblib at once.
        n_jobs: forwarded to ``joblib.Parallel``.

    Returns:
        Mapping task id -> list of ``ExecutionResult``, one per prediction.
    """
    # Inputs of each task: all train inputs followed by all test inputs.
    task_inputs = dict()
    for task_id in predictions:
        task = get_task(task_id)
        task_inputs[task_id] = [sample['input'] for sample in task['train']] + [sample['input'] for sample in task['test']]

    # Flatten all predictions into a work list of _run_one argument tuples
    work = [
        (tid, i, text_pred, task_inputs[tid], prediction_data['data_augmentation_params'][i])
        for tid, prediction_data in predictions.items()
        for i, text_pred in enumerate(prediction_data['text_predictions'])
    ]

    # sort the work by prediction index first and the task id second, I believe this will improve resource allocation
    # because some tasks are more resource intensive than others
    work.sort(key=lambda x: (x[1], x[0]))

    # The work is submitted in slices of batch_size predictions, each slice
    # with its own progress bar.
    results = []
    for i in tqdm(range(0, len(work), batch_size), desc="Executing predictions", unit="batch"):
        batch = work[i:i+batch_size]
        with tqdm_joblib(total=len(batch), desc=f"Executing predictions for batch {i//batch_size}", unit="pred", smoothing=0):
            batch_results = Parallel(
                n_jobs=n_jobs,
                backend="loky",
                prefer="processes",
                batch_size='auto', # previously was 1
            )(delayed(_run_one)(*args) for args in batch)
            results.extend(batch_results)

    # Regroup by task; tasks keep the key order of predictions.
    results_by_task = {tid: [] for tid in predictions}
    for result in results:
        results_by_task[result.task_id].append(result)
    return results_by_task


def _run_one(task_id, i, text_prediction, input_grids, data_augmentation_params=None):
    """Parse, execute and validate a single predicted program.

    Args:
        task_id: id of the task the prediction belongs to.
        i: index of the prediction within its task.
        text_prediction: raw model response expected to hold a fenced
            Python block defining ``transform``.
        input_grids: grids the program is run on.
        data_augmentation_params: when not None, the inputs are augmented
            with these parameters before execution and the outputs are
            reverted afterwards.

    Returns:
        An ``ExecutionResult``. On success ``outputs`` holds the validated
        grids and both error fields are None. On failure ``outputs`` is None,
        ``error_type`` is the exception class name (or "ParsingCodeFailed")
        and ``error_message`` is the error text as a plain string.
    """
    code = parse_python_code(text_prediction)
    if not code:
        return ExecutionResult(task_id, i, None, None, "ParsingCodeFailed", '')
    try:
        if data_augmentation_params is not None:
            # Apply data augmentation to the input grids
            input_grids = apply_data_augmentation(input_grids, **data_augmentation_params)
        outs = safe_code_execution(
            add_additional_imports(curate_python_code(code)),
            input_grids,
            func_name="transform",
            execution_method='exec',
        )
        outs = validate_outputs(outs)
        if data_augmentation_params is not None:
            outs = revert_data_augmentation(outs, **data_augmentation_params)
        return ExecutionResult(task_id, i, code, outs, None, None)
    except BaseException as e:
        # BaseException is deliberate: it was adopted so that
        # TimeoutException is caught as well.
        # Store the message as text, consistent with the parsing-failure
        # branch. The result is returned from a worker process, and an
        # arbitrary exception object raised by generated code may not
        # survive serialisation.
        return ExecutionResult(task_id, i, code, None, type(e).__name__, str(e))

# tiny_predictions = {'00576224': predictions['00576224']}
# predicted_code, predicted_outputs = run_code_from_predictions(tiny_predictions, log_errors=True)
# df = compute_search_metrics(list(tiny_predictions.keys()), predicted_code, predicted_outputs, n_preds)
# df.round(3)
# predicted_code, predicted_outputs = run_code_from_predictions(predictions, log_errors=False)

### Validations

In [None]:
def validate_outputs(outputs, max_side=35):
    """Validate a list of predicted grids and convert them to int arrays.

    Args:
        outputs: non-empty list of grids (nested lists or arrays).
        max_side: largest allowed number of rows or columns.

    Returns:
        List of 2D integer numpy arrays, one per grid.

    Raises:
        ValueError: if the list is empty or any grid is invalid.
    """
    if not outputs:
        raise ValueError("Outputs list is empty")
    return [_validate_output(output, max_side=max_side) for output in outputs]


def _validate_output(output, max_side=35):
    """Convert one grid to an int array, checking shape and values.

    A valid grid is 2D, non-empty, has no side longer than ``max_side`` and
    only contains values in the range [0, 9].

    NOTE(review): the default limit of 35 is the value the check has always
    enforced, while the previous error message announced 30x30. Confirm
    whether the extra slack is intentional.
    """
    if output is None:
        raise ValueError("Output is None")
    # Force a homogeneous integer array, otherwise outputs that mix lists
    # and numpy arrays show up.
    output = np.array(output, dtype=int)
    if output.ndim != 2:
        raise ValueError(f"Output is not a 2D array. Output shape: {output.shape}")
    if max(output.shape) > max_side:
        raise ValueError(f"Output is too large, the maximum allowed side length is {max_side}. Output shape: {output.shape}")
    # Must run before np.max/np.min, which fail on empty arrays.
    if min(output.shape) == 0:
        raise ValueError(f"Output has zero dimension, it is empty. Output shape: {output.shape}")
    highest, lowest = np.max(output), np.min(output)
    if highest > 9 or lowest < 0:
        raise ValueError(f"Output contains invalid values, expected values in range [0, 9]. Output max: {highest}, min: {lowest}")
    return output

In [None]:
import hashlib

def fingerprint(prediction):
    """
    Return a SHA-256 hex digest summarising a list of matrices.

    For each matrix the digest absorbs, in order, its shape, its dtype
    string and its raw bytes, so matrices holding the same bytes with a
    different shape or dtype get different fingerprints.
    """
    digest = hashlib.sha256()
    for matrix in prediction:
        shape_bytes = str(matrix.shape).encode()
        dtype_bytes = matrix.dtype.str.encode()
        digest.update(b''.join((shape_bytes, dtype_bytes, matrix.tobytes())))
    return digest.hexdigest()

### Metrics

In [None]:
def compute_search_metrics(results):
    """Summarise execution results into one row of metrics per task.

    Args:
        results: mapping task id -> list of execution results. Each result
            exposes ``code`` (None when no code could be parsed) and
            ``outputs`` (None when execution or validation failed, otherwise
            the predicted grids for the train samples followed by the test
            samples).

    Returns:
        Float DataFrame indexed by task id, plus a final 'MEAN' row with the
        column averages. Columns:

        - n_preds: number of predictions of the task.
        - valid code / valid outputs / unique outputs: fraction of all
          predictions with parsed code, with valid outputs, and number of
          distinct output fingerprints divided by ``n_preds``.
        - pixel similarity / correct grids: average score over the
          predictions that produced valid outputs (0 when there are none).
        - pass_rate / train_pass_rate: fraction of ALL predictions whose
          outputs are fully correct on train+test / on train only.
        - pass@n / train_pass@n: 1 when at least one prediction is fully
          correct, else 0.
    """
    columns = ['n_preds', 'valid code', 'valid outputs', 'unique outputs', 'pixel similarity', 'correct grids',
               'train_pass_rate', 'train_pass@n', 'pass_rate', 'pass@n']
    rows = {}
    for task_id, task_results in results.items():
        n_preds = len(task_results)
        # Guard against a task without predictions (all rates become 0).
        denominator = max(n_preds, 1)
        valid_outputs = [result.outputs for result in task_results if result.outputs is not None]
        row = {
            'n_preds': n_preds,
            'valid code': sum(result.code is not None for result in task_results) / denominator,
            'valid outputs': len(valid_outputs) / denominator,
            'unique outputs': len({fingerprint(output) for output in valid_outputs}) / denominator,
        }

        task = get_task(task_id)
        train_outputs = [sample['output'] for sample in task['train']]
        task_outputs = train_outputs + [sample['output'] for sample in task['test']]

        pixel_scores = [
            np.mean([pixel_similarity_score(output, pred) for output, pred in zip(task_outputs, predictions)])
            for predictions in valid_outputs
        ]
        row['pixel similarity'] = np.mean(pixel_scores) if pixel_scores else 0.0

        grid_scores = [correct_grids_score(task_outputs, predictions) for predictions in valid_outputs]
        row['correct grids'] = np.mean(grid_scores) if grid_scores else 0.0
        # Pass rates are divided by the total number of predictions: a
        # prediction without valid outputs counts as a failure.
        row['pass_rate'] = sum(score == 1 for score in grid_scores) / denominator
        row['pass@n'] = int(any(score == 1 for score in grid_scores))

        # The first len(train_outputs) predicted grids belong to the train samples.
        train_scores = [
            correct_grids_score(train_outputs, predictions[:len(train_outputs)])
            for predictions in valid_outputs
        ]
        row['train_pass_rate'] = sum(score == 1 for score in train_scores) / denominator
        row['train_pass@n'] = int(any(score == 1 for score in train_scores))
        rows[task_id] = row

    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns).astype(float)
    df.loc['MEAN'] = df.mean(axis=0)
    return df

In [None]:
def error_analysis(results):
    """Report how often executions fail, per task and per error type.

    Prints and displays the 20 most frequent error types over all tasks and
    returns a float DataFrame indexed by task id (plus a final 'MEAN' row)
    holding the number of predictions, the overall error rate and the rate
    of each tracked error type. Rates are relative to the number of
    predictions of the task and are 0 for a task without predictions.
    """
    errors_to_check = ['TimeoutException', 'NonDeterministicCode', 'UnsafeCode', 'ParsingCodeFailed']

    df = pd.DataFrame(columns=['n_preds', 'error_rate'] + errors_to_check)
    all_errors = []
    for task_id, task_results in results.items():
        task_errors = []
        for result in task_results:
            if result.error_type is not None:
                task_errors.append(result.error_type)
        all_errors.extend(task_errors)

        n_results = len(task_results)
        df.loc[task_id, 'n_preds'] = n_results
        if task_results:
            df.loc[task_id, 'error_rate'] = len(task_errors) / n_results
        else:
            df.loc[task_id, 'error_rate'] = 0.0
        for error_type in errors_to_check:
            if task_results:
                df.loc[task_id, error_type] = task_errors.count(error_type) / n_results
            else:
                df.loc[task_id, error_type] = 0.0
    df.loc['MEAN'] = df.mean(axis=0)

    error_counts = pd.Series(all_errors).value_counts()
    print("Most common errors:")
    display(error_counts.head(20))
    return df.astype(float)

### Utils

In [None]:
def load_all_predictions(path_pattern):
    """Load and merge every predictions file matching ``path_pattern``.

    Two per-task file formats are supported:

    - dict with 'text_predictions' and optionally 'data_augmentation_params'
      (a single parameter set shared by all the predictions of that entry);
    - plain list of text predictions (old format, no data augmentation).

    Files are read in sorted path order so that the index of each
    prediction is reproducible between runs.

    Returns:
        Mapping task id -> dict with two aligned lists, 'text_predictions'
        and 'data_augmentation_params' (one entry per prediction, None when
        the prediction has no parameters).
    """
    filepaths = sorted(glob.glob(path_pattern))
    predictions = dict()
    for filepath in tqdm(filepaths, desc="Loading predictions", disable=len(filepaths)<=1):
        with open(filepath, 'r') as f:
            preds = json.load(f)
        for task_id, outputs in preds.items():
            merged = predictions.setdefault(task_id, dict(text_predictions=[], data_augmentation_params=[]))
            if isinstance(outputs, dict):
                texts = outputs['text_predictions']
                params = outputs.get('data_augmentation_params', None)
                if params is not None and params.get('color_map') is not None:
                    # JSON turns the integer keys of the color map into
                    # strings; restore them on a copy of the parameters.
                    color_map = {int(k): int(v) for k, v in params['color_map'].items()}
                    params = {**params, 'color_map': color_map}
                merged['text_predictions'].extend(texts)
                merged['data_augmentation_params'].extend([params] * len(texts))
            else:
                merged['text_predictions'].extend(outputs)
                merged['data_augmentation_params'].extend([None] * len(outputs))  # Assuming no params for old format
    return predictions

In [None]:
raise

## Independent search

Does using data augmentation increase the diversity of the predictions and improve the pass@n metric?

### Make predictions

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1)

In [None]:
dataset_to_task_ids = {
    'training': list(training_challenges.keys()),
    'evaluation': list(evaluation_challenges.keys()),
    'evaluation-2025': list(evaluation_challenges_2025.keys())
}

experiment_name = '2025-08-22_add-common-prefix'
dataset = 'training'
task_ids = dataset_to_task_ids[dataset]
grid_encoder = create_grid_encoder('ColorNameEncoder()')

In [None]:
text = create_prompt_from_task(get_task(task_ids[0]), grid_encoder, tokenizer)
pretty_print_prompt(text, default_color='white')

In [None]:
# Generate n predictions per task, using one random data augmentation per
# task, and save them together with the augmentation parameters.
for n in [8]:
    sampling_params = SamplingParams(n=n, temperature=1.0, top_p=0.95, max_tokens=2048)

    prompts, data_augmentation_params = [], []
    for task_id in task_ids:
        params = get_random_data_augmentation_params()
        data_augmentation_params.append(params)
        task = get_task(task_id)
        task = apply_data_augmentation(task, **params)
        prompt = create_prompt_from_task(
            task, grid_encoder=grid_encoder, tokenizer=tokenizer, shuffle_train_samples=True)
        prompts.append(prompt)

    t0 = time.time()
    text_predictions = llm.generate(prompts, sampling_params)
    # Stop the clock right after generation so bookkeeping is not timed.
    inference_time = time.time() - t0
    total_tokens = sum(
        len(completion.token_ids)
        for output in text_predictions
        for completion in output.outputs
    )
    print(f"Total tokens generated: {total_tokens}")
    print(f"Time taken: {inference_time:.2f} seconds")
    print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
    # Dividing by the number of tasks and by n gives a per-prediction average.
    print(f"Average tokens per prediction: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
    print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

    # The n predictions of a task share a single set of augmentation
    # parameters, stored once per task.
    predictions = dict()
    for task_id, output, params in zip(task_ids, text_predictions, data_augmentation_params):
        predictions[task_id] = {
            'text_predictions': [completion.text for completion in output.outputs],
            'data_augmentation_params': params,
        }

    output_filepath = f'/mnt/hdd0/Kaggle/arc25/predictions/{experiment_name}/{dataset}_{sampling_params.n}preds_{get_timestamp()}_predictions.json'
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    with open(output_filepath, 'w') as f:
        json.dump(predictions, f, indent=2)
    print(f"Predictions saved to {output_filepath}")

```
training
8 preds, Average time per task: 3.25 seconds
Average time per task: 2.88 seconds, when adding the common prefix

evaluation
8 preds, Average time per task: 4.17 seconds
(previously it was 3.64 when using n-1 training samples)

evaluation-2025
8 preds, Average time per task: 5.57 seconds
```

### Evaluation

In [None]:
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/evaluation-2025_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/evaluation_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-23_no-data-augmentation/evaluation_*.json')

predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/*.json')
# predictions = {key: predictions[key] for key in list(predictions)[:35]}

# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_8preds_2025_08_23_11_57_11_predictions.json')
# n_preds is taken from the first task only.
# NOTE(review): assumes every task has the same number of predictions — confirm.
n_preds = len(list(predictions.values())[0]['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Execute all the predicted programs and summarise the results per task.
results = run_code_from_predictions(predictions)
df = compute_search_metrics(results)

error_analysis(results);
# Last row of the metrics table is the 'MEAN' over all the tasks.
df.iloc[-1:]

Make inference more robust:

- baseline, 10.4s
- batch size auto, clearly improves speed 6.6s
- BaseException for TimeoutException, 6.4s
- Try subprocess: 45.2s
- Restrict globals: 6.5s
- Redirect prints: 6.5s

```
baseline, 10.4s
	n_preds	valid code	valid outputs	unique outputs	pixel similarity	correct grids	train_pass_rate	train_pass@n	pass_rate	pass@n
MEAN	8.0	1.0	0.765625	0.615938	0.611174	0.148103	0.12081	0.27	0.11822	0.265 # baseline
MEAN	8.0	1.0	0.765	0.615313	0.611206	0.148114	0.12081	0.27	0.11822	0.265 # batch size auto
MEAN	8.0	1.0	0.765312	0.615625	0.611184	0.148103	0.12081	0.27	0.11822	0.265 # BaseException
MEAN	8.0	1.0	0.7625	0.612812	0.610983	0.147886	0.120512	0.27	0.117863	0.2625 # subprocess
MEAN	8.0	1.0	0.765	0.615313	0.611078	0.148103	0.12081	0.27	0.11822	0.265 # restrict globals
MEAN	8.0	1.0	0.765312	0.615625	0.611184	0.148103	0.12081	0.27	0.11822	0.265 # redirect prints

NonDeterministicCode    233
ValueError              160
IndexError              152
AssertionError          112
TypeError                26
TimeoutException         14
AttributeError           14
UnboundLocalError        10
UnsafeCode                7
StopIteration             5
NameError                 5
ZeroDivisionError         5
AxisError                 3
KeyError                  3
SyntaxError               1
Name: count, dtype: int64

```

- Full evaluation, 7m54
- Split in batches, 8m5s

```
n_preds	valid code	valid outputs	unique outputs	pixel similarity	correct grids	train_pass_rate	train_pass@n	pass_rate	pass@n
MEAN	480.0	1.0	0.708432	0.438161	0.564959	0.027705	0.01887	0.225	0.018578	0.2175
MEAN	480.0	1.0	0.708312	0.438109	0.564917	0.027686	0.01889	0.23	0.018598	0.2225


NonDeterministicCode    12993
ValueError              12521
AssertionError          11387
IndexError              11175
TimeoutException         3022
TypeError                1538
AttributeError           1155
UnboundLocalError         618
NameError                 398
StopIteration             288
KeyError                  282
ZeroDivisionError         223
UnsafeCode                167
SyntaxError               131
RecursionError             31
AxisError                  14
OverflowError              12
IndentationError           11
RuntimeError                7
UFuncTypeError              7
Name: count, dtype: int64
```

In [None]:
df.to_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_480.csv')

In [None]:
df.round(3)

Problems with output type
```
n_preds	valid code	valid outputs	unique outputs	pixel similarity	correct grids	train_pass_rate	train_pass@n	pass_rate	pass@n
MEAN	464.0	1.0	0.747236	0.425787	0.570709	0.027715	0.018743	0.2	0.018591	0.195 # baseline
MEAN	464.0	1.0	0.745275	0.425339	0.570379	0.027173	0.018295	0.1975	0.018143	0.1925 # force type to int
MEAN	464.0	1.0	0.726083	0.417619	0.565073	0.027337	0.018675	0.1975	0.018524	0.19 # raise if type is not int
```

In [None]:
# Estimate pass@n for different numbers of predictions from the per-task
# pass rate p: a task is solved with n predictions with probability
# 1 - (1 - p)**n, which is then averaged over the tasks.
scores = []
# Powers of two from 1 up to 2**(floor(log2(n_preds)) + 1), i.e. one power
# of two beyond the number of predictions that were actually made.
n_preds_range = 2**np.arange(0, int(np.log2(n_preds)) + 2)
# [:-1] drops the final 'MEAN' row added by compute_search_metrics.
fail_prob = 1 - df['pass_rate'].values[:-1]
for n in n_preds_range:
    scores.append(float(np.mean(1 - fail_prob**n)))

import matplotlib.pyplot as plt

plt.plot(n_preds_range, scores, marker='o')
plt.xscale('log', base=2)
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('pass@n vs Number of Predictions')
plt.grid()
plt.show()
# Cell output: the curve as a dict entry. NOTE(review): the key is
# hard-coded and does not follow the loaded predictions.
dict(evaluation_data_augmentation=(n_preds_range.tolist(), scores))

In [None]:
{'evaluation_no_data_augmentation': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.018653395632067863,
   0.028123807513897266,
   0.040470801097098115,
   0.05604907442616278,
   0.07517185395071181,
   0.09714182632584087,
   0.12045095653955698,
   0.14292755780979274,
   0.16171510346135057,
   0.1753448200072779]),
   'evaluation_data_augmentation': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.018665553329123646,
   0.02879110915200162,
   0.04224158293230386,
   0.059708013904966675,
   0.08223659237317406,
   0.10940176527483192,
   0.1384685993616617,
   0.1664683605627952,
   0.1908837471080634,
   0.2094882044080468]),
   'evaluation_2025': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.0006127450980392154,
   0.0011804354094579,
   0.0021936594882075557,
   0.0038098619423907474,
   0.005877918122372206,
   0.007609845670347021,
   0.008270521205514126,
   0.008332859889725193,
   0.00833333330643547,
   0.008333333333333333]),
   'training': ([1, 2, 4, 8, 16, 32, 64, 128, 256],
  [0.11879157321399551,
   0.1820998968374452,
   0.2523552039747298,
   0.320614792900346,
   0.3840515764870714,
   0.44540837829332164,
   0.5039219325812776,
   0.5537227577351216,
   0.5891170927046698]),
   'evaluation_6064': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024,
   2048,
   4096,
   8192],
  [0.020675799545352484,
   0.031800090738608865,
   0.04635819085811816,
   0.06506021688568361,
   0.08888375747790557,
   0.11740666375181305,
   0.14840423707784808,
   0.1798639618765866,
   0.21174751560392843,
   0.24540332456877859,
   0.27906954197611594,
   0.30882907544705135,
   0.3307049086550017,
   0.3426571213621735])}

### Compare with and without data augmentation

In [None]:
metrics = {'training-arc-agi-1': ([1, 2, 4, 8, 16, 32, 64, 128, 256],
  [0.1208341472917777,
   0.1852714696641388,
   0.256145417062672,
   0.323924981207185,
   0.3859031489395669,
   0.4455752909442623,
   0.5032849187983576,
   0.553359922920049,
   0.5897329957581053]),
   'evaluation-arc-agi-1': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.019526342790732235,
   0.02979252155031361,
   0.04328670515467564,
   0.060383761781122286,
   0.08205688086630819,
   0.10825312270378742,
   0.13655160734160618,
   0.1640966274942665,
   0.18866504543675547,
   0.20813729316511306]),
  'evaluation-arc-agi-2': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.0006410256410256406,
   0.001232741617357001,
   0.002283125007294328,
   0.003940730838716728,
   0.00601793853224789,
   0.007690006963146114,
   0.008283669075103941,
   0.008333037348707877,
   0.008333333322820506,
   0.008333333333333333])}

# One subplot per entry of metrics, side by side on a single row.
keys = list(metrics.keys())
plt.figure(figsize=(20, 5))
for plot_ids, key in enumerate(metrics):
    plt.subplot(1, len(keys), plot_ids + 1)
    # Each entry is a (number of predictions, pass@n scores) pair.
    n_preds_range, scores = metrics[key]
    plt.plot(n_preds_range, scores, marker='o', label=key)
    plt.xscale('log', base=2)
    plt.grid()
    plt.xlabel('Number of predictions')
    plt.ylabel('pass@n')
    plt.title(key)

plt.suptitle('pass@n vs Number of Predictions')

plt.tight_layout()

In [None]:
2**16 # might solve training-arc-agi-1, this shows that there is room for improvement

In [None]:
metrics = {'baseline': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.01833019880853686,
   0.027800302745602773,
   0.039987770244753335,
   0.05523558985150286,
   0.07429521938067872,
   0.09699108936447659,
   0.12167009514547647,
   0.14566997505731888,
   0.16623501973442983,
   0.18227063267871776]),
'data augmentation': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.019526342790732235,
   0.02979252155031361,
   0.04328670515467564,
   0.060383761781122286,
   0.08205688086630819,
   0.10825312270378742,
   0.13655160734160618,
   0.1640966274942665,
   0.18866504543675547,
   0.20813729316511306]),
}
# Plot all the pass@n curves of metrics on the same axes.
# NOTE(review): keys is not used in the rest of this cell.
keys = list(metrics.keys())
plt.figure(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    plt.plot(n_preds_range, scores, marker='o', label=key)
plt.xscale('log', base=2)
plt.grid()
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('Evaluation ARC-AGI-1')
plt.legend()

plt.tight_layout()

#### evaluation ARC-AGI-1 Comparison

In [None]:
metrics = {
'baseline': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.01996212663472896,
   0.02831747138692302,
   0.03977086910606171,
   0.05542900033336087,
   0.07674247281753263,
   0.10320479610157163,
   0.13093643779484748,
   0.15594708997780551,
   0.1770796829272273,
   0.19471691227422966,
   0.20712048546783934]),
'+ data augmentation': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.019813790637075775,
   0.029064191088443207,
   0.04100375897586597,
   0.05671308781660802,
   0.07779377587518771,
   0.10484135687838109,
   0.13591735274813102,
   0.1675520359797603,
   0.19696420491620842,
   0.22203487375136557,
   0.23821892651950458])
}

# Plot all the pass@n curves of metrics on the same axes.
# NOTE(review): keys is not used in the rest of this cell.
keys = list(metrics.keys())
plt.figure(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    plt.plot(n_preds_range, scores, marker='o', label=key)
plt.xscale('log', base=2)
plt.grid()
plt.xlabel('Number of predictions')
plt.ylabel('pass@n')
plt.title('Evaluation ARC-AGI-1')
plt.legend()

plt.tight_layout()

#### Bias of the number of predictions

In [None]:
metrics = {
'64 preds': ([1, 2, 4, 8, 16, 32, 64, 128],
  [0.019106336951575514,
   0.028082339121390357,
   0.039026953359591325,
   0.05221714653327929,
   0.06785124181952008,
   0.08534713350917737,
   0.10178389655716932,
   0.11195929946811921]),
'112 preds': ([1, 2, 4, 8, 16, 32, 64, 128],
  [0.019387411477336133,
   0.02887665434462053,
   0.04103250559571824,
   0.05662528637189867,
   0.07657867399726664,
   0.1006969445110981,
   0.12613859060350577,
   0.14808143720453426]),
'184 preds': ([1, 2, 4, 8, 16, 32, 64, 128, 256],
  [0.019316797975629206,
   0.02854922966041234,
   0.0405175566715264,
   0.056039023485551356,
   0.07638928013313853,
   0.10206766052179019,
   0.13087569581761588,
   0.15844583931075643,
   0.1786899038965614]),
'584 preds': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.019813790637075775,
   0.029064191088443207,
   0.04100375897586597,
   0.05671308781660802,
   0.07779377587518771,
   0.10484135687838109,
   0.13591735274813102,
   0.1675520359797603,
   0.19696420491620842,
   0.22203487375136557,
   0.23821892651950458])
}

# Overlay one pass@n curve per prediction budget listed in `metrics` on a log2 x-axis.
keys = list(metrics.keys())
fig, ax = plt.subplots(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    ax.plot(n_preds_range, scores, marker='o', label=key)
ax.set_xscale('log', base=2)
ax.grid()
ax.set_xlabel('Number of predictions')
ax.set_ylabel('pass@n')
ax.set_title('Evaluation ARC-AGI-1')
ax.legend()

fig.tight_layout()

#### evaluation arc-agi-2

In [None]:
metrics = {'baseline': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024,
   2048],
  [0.000525120200263568,
   0.0010290649730527559,
   0.001977218777084015,
   0.0036586995036725616,
   0.006324151121543866,
   0.009785455928199816,
   0.013112192645441874,
   0.015422166919107317,
   0.016485239578931988,
   0.016662717947703534,
   0.016666664795580977,
   0.016666666666666247]),
'data_augmentation': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.0004395694358616827,
   0.0008607058465457645,
   0.001651027938816423,
   0.003045200797763149,
   0.005231079244621852,
   0.00801093382400977,
   0.010647211929364896,
   0.012848270575498736,
   0.014933881468222887,
   0.016306378321672744])}

# Overlay one pass@n curve per entry of `metrics` on a log2 x-axis.
keys = list(metrics.keys())
fig, ax = plt.subplots(figsize=(10, 5))
for key, (n_preds_range, scores) in metrics.items():
    ax.plot(n_preds_range, scores, marker='o', label=key)
ax.set_xscale('log', base=2)
ax.grid()
ax.set_xlabel('Number of predictions')
ax.set_ylabel('pass@n')
ax.set_title('Evaluation ARC-AGI-2')
ax.legend()

fig.tight_layout()

### Distribution of prediction length

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B")

In [None]:
# Compare the token-length distribution of the predictions across datasets.
# The same bins are used for every dataset so the histograms are directly comparable.
bins = np.linspace(0, 2000, 100)
for key in ('training', 'evaluation', 'evaluation-2025'):
    predictions = load_all_predictions(f'/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_fix-bug/{key}_*.json')
    prediction_length_distribution = {}
    for tid, preds in tqdm(predictions.items(), desc="Computing prediction lengths", total=len(predictions)):
        # NOTE(review): the tokenizer is called with default settings, so the counts may
        # include special tokens added by the tokenizer — confirm.
        token_ids = tokenizer(preds['text_predictions'])['input_ids']
        prediction_length_distribution[tid] = [len(tokens) for tokens in token_ids]
    all_lengths = []
    for lengths in prediction_length_distribution.values():
        all_lengths.extend(lengths)
    label = f"{key} (max output tokens: {max(all_lengths)}, median output tokens: {int(np.median(all_lengths))})"
    plt.hist(all_lengths, bins=bins, label=label, alpha=0.5, density=True)
plt.legend()
plt.xlabel('Number of tokens')
plt.title('Distribution of prediction lengths')

### Inspect correct solutions

In [None]:
# For every task with a positive pass rate, show the first prediction whose outputs
# match all the task outputs, together with the (possibly augmented) task it was made for.
# The last row of df is excluded (NOTE(review): presumably a summary row — confirm).
for task_id in df[df['pass_rate'] > 0].index.values[:-1]:
    print(f'https://arcprize.org/play?task={task_id} pass rate: {df.loc[task_id, "pass_rate"]:.2%}')
    task = get_task(task_id)
    task_outputs = [sample['output'] for split in ('train', 'test') for sample in task[split]]
    target_grids = [np.array(grid) for grid in task_outputs]
    for idx, output in enumerate(predicted_outputs[task_id]):
        if output is None or correct_grids_score(target_grids, output) != 1:
            continue
        data_augmentation_params = predictions[task_id]['data_augmentation_params'][idx]
        text_pred = predictions[task_id]['text_predictions'][idx]
        print(data_augmentation_params)
        if data_augmentation_params is not None:
            augmented_task = apply_data_augmentation(task, **data_augmentation_params)
        else:
            augmented_task = task
        plot_task(augmented_task)
        plt.show()
        display(Markdown(text_pred + '\n\n---\n\n'))
        break
    else:
        # The pass rate says the task was solved, so a matching prediction must exist.
        raise ValueError("Could not find correct solution")

I'm impressed by the tasks that the model is able to solve. The reasoning is correct. This is a powerful model to experiment with.

## Create dataset for training with hindsight relabel

Using the predictions I'm going to create a dataset to fine-tune the BARC model on the evaluation tasks.

What do I need to generate the training data?

1. The task after applying data augmentation (if used)
2. The code predicted by the model
3. The output after applying the code
4. The output after reverting the data augmentation (so I can cluster identical solutions)

So it seems that I need to modify slightly the function that runs the code to return all that additional information.

### Code

In [None]:
def create_hindsight_relabeled_tasks(predictions: dict[str, dict[str, list]], log_errors: bool = True,
                                     n_jobs: int = -1) -> dict[str, dict]:
    """Execute every predicted program and group the relabeled tasks by task id.

    Args:
        predictions: maps task id to a dict holding 'text_predictions' (one model
            response per prediction) and 'data_augmentation_params' (indexed in
            parallel with 'text_predictions').
        log_errors: when True, log an error for each prediction that could not be
            parsed or executed.
        n_jobs: number of joblib workers; -1 uses all cores.

    Returns:
        ``{task_id: {fingerprint: hr_task}}``. Predictions of the same task that share
        a fingerprint are collapsed into a single entry, keeping the one with the
        shortest code. Tasks without any successful prediction map to an empty dict.
    """
    # Flatten all predictions into a work list
    work = [
        (tid, i, text_pred, get_task(tid), prediction_data['data_augmentation_params'][i])
        for tid, prediction_data in predictions.items()
        for i, text_pred in enumerate(prediction_data['text_predictions'])
    ]
    # Order by prediction index first, then by task id.
    work.sort(key=lambda x: (x[1], x[0]))
    with tqdm_joblib(total=len(work), desc="Executing predictions", unit="pred", smoothing=0):
        results = Parallel(
            n_jobs=n_jobs,
            backend="loky",
            prefer="processes",
            batch_size="auto",
        )(delayed(_create_hindsight_relabeled_task)(*args) for args in work)

    # Rebuild per-task outputs; failed predictions contribute no entry, only an optional log line.
    hindsight_relabeled_tasks = {tid: dict() for tid in predictions}
    for task_id, i, hr_task, err in results:
        if hr_task is not None:
            variations = hindsight_relabeled_tasks[task_id]
            key = hr_task['fingerprint']
            existing = variations.get(key)
            # Among predictions with the same fingerprint keep the shorter code
            if existing is None or len(hr_task['code']) < len(existing['code']):
                variations[key] = hr_task
        if err and log_errors:
            logging.error("Error executing code for task %s, response %s: %s", task_id, i, err)

    return hindsight_relabeled_tasks


def _create_hindsight_relabeled_task(task_id, i, text_prediction, task, data_augmentation_params=None):
    """Run one predicted program on the task inputs and package the result.

    Returns a ``(task_id, i, hr_task, error)`` tuple. On success ``hr_task`` is a dict
    pairing the (possibly augmented) inputs with the outputs produced by the code, plus
    scores against the real task outputs, and ``error`` is None. On failure ``hr_task``
    is None and ``error`` is "parse_failed" or "<ExceptionType>: <message>".
    """
    code = parse_python_code(text_prediction)
    if not code:
        return (task_id, i, None, "parse_failed")
    try:
        uses_augmentation = data_augmentation_params is not None
        input_grids = [sample['input'] for split in ('train', 'test') for sample in task[split]]
        if uses_augmentation:
            # The code was predicted for the augmented task, so run it on augmented inputs.
            input_grids = apply_data_augmentation(input_grids, **data_augmentation_params)
        program = add_additional_imports(curate_python_code(code))
        outs = validate_outputs(safe_code_execution(program, input_grids, func_name="transform"))
        # Scores and fingerprint are computed on outputs with the augmentation reverted.
        if uses_augmentation:
            original_outs = revert_data_augmentation(outs, **data_augmentation_params)
        else:
            original_outs = outs
        expected_outputs = [sample['output'] for split in ('train', 'test') for sample in task[split]]
        pixel_scores = [pixel_similarity_score(expected, predicted)
                        for expected, predicted in zip(expected_outputs, original_outs)]
        n_train = len(task['train'])
        # zip stops at the shorter sequence, so only the first n_train inputs are paired here.
        train_samples = [{'input': inp, 'output': out} for inp, out in zip(input_grids, outs[:n_train])]
        test_samples = [{'input': inp, 'output': out} for inp, out in zip(input_grids[n_train:], outs[n_train:])]
        hr_task = {
            'text_prediction': text_prediction,
            'code': code,
            'train': train_samples,
            'test': test_samples,
            'fingerprint': fingerprint(original_outs),
            'is_correct_solution': correct_grids_score(expected_outputs, original_outs) == 1,
            'mean_pixel_score': float(np.mean(pixel_scores)),
            'correct_grids_ratio': float(np.mean(np.array(pixel_scores) == 1)),
        }
        return (task_id, i, hr_task, None)
    except Exception as e:
        # Any failure is reported to the caller as a message instead of being raised.
        return (task_id, i, None, f"{type(e).__name__}: {e}")

In [None]:
def plot_original_task_and_hindsight_variations(task_id, hindsight_relabeled_tasks):
    """Plot a task followed by each of its hindsight relabeled variations and their code."""
    task = get_task(task_id)
    print(f"Original task {task_id}:")
    plot_task(task)
    plt.show()
    display(Markdown('---'))
    if task_id not in hindsight_relabeled_tasks:
        print("No hindsight relabeled tasks found.")
        return
    variations = hindsight_relabeled_tasks[task_id]
    for i, (key, hr_task) in enumerate(variations.items()):
        print(f"Hindsight relabeled task variation {i+1} (fingerprint: {key}):")
        if hr_task['is_correct_solution']:
            print("This variation is a correct solution.")
        plot_task(hr_task)
        plt.show()
        display_python_code(hr_task['code'])
        display(Markdown('---'))

### Dataset generation

In [None]:
# Load the predictions used to build the hindsight relabeled dataset.
# The commented lines are alternative prediction folders.
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-23_no-data-augmentation/evaluation_*.json')
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/evaluation_*.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-23_no-data-augmentation/evaluation_*.json')


# The prediction count is read from the first task only.
n_preds = len(list(predictions.values())[0]['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

In [None]:
hindsight_relabeled_tasks = create_hindsight_relabeled_tasks(predictions, log_errors=False)

In [None]:
#plot_original_task_and_hindsight_variations('195ba7dc', hindsight_relabeled_tasks) #00576224, 009d5c81, 00dbd492, '195ba7dc'

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B")
grid_encoder = create_grid_encoder('ColorNameEncoder()')

# One training text per hindsight relabeled variation: the prompt built from the relabeled
# task followed by the model response that produced it and the end-of-sequence token.
# All variations are kept (no filtering on hr_task['is_correct_solution']).
training_texts = {}
for task_id in tqdm(hindsight_relabeled_tasks):
    for hr_task in hindsight_relabeled_tasks[task_id].values():
        prompt = create_prompt_from_task(hr_task, grid_encoder, tokenizer)
        text = prompt + hr_task['text_prediction'] + tokenizer.eos_token
        training_texts.setdefault(task_id, []).append(text)

# Show the last generated training text as a sanity check.
pretty_print_prompt(text, default_color='white')

In [None]:
# How many training prompts each task contributes.
prompts_per_task = list(map(len, training_texts.values()))
ax = plt.gca()
ax.hist(prompts_per_task, bins=30)
ax.set_xlabel('Number of prompts per task')
ax.set_ylabel('Number of tasks')
ax.set_title('Distribution of prompts per task')

In [None]:
# Cumulative distribution of the tokenized length of every training prompt.
all_prompts = []
for texts in training_texts.values():
    all_prompts.extend(texts)
tokenized_prompts = tokenizer(all_prompts)['input_ids']
prompt_lengths = list(map(len, tokenized_prompts))
ax = plt.gca()
ax.hist(prompt_lengths, bins=50, cumulative=True, density=True)
ax.set_xlabel('Number of tokens')
ax.set_title(f'Cumulative Distribution of prompt lengths for hindsight relabeled tasks (n={len(all_prompts)})')
ax.set_ylim(0, 1)
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.grid()

We can train on around 90% of the data with a length of 4000 tokens.

In [None]:
# Save the training texts grouped by task id; the number of prompts is part of the filename.
filepath = f'/mnt/hdd0/Kaggle/arc25/data/hindsight_relabeled/2025-08-25_evaluation-{len(all_prompts)}.json'
# Create the output folder if needed, otherwise open() fails on a fresh machine.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
with open(filepath, 'w') as f:
    json.dump(training_texts, f, indent=2)
print(f'Dataset size: {os.path.getsize(filepath) / (1024 * 1024):.1f} MB')

I could create one dataset with correct solutions and another without them. That would allow me to see the impact of training on correct solutions.

### Create filtered dataset (with smaller number of samples)

In [None]:
# Load the predictions for the filtered dataset and execute them to get the hindsight
# relabeled tasks. The commented line is an alternative prediction folder.
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/evaluation_*.json')
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-23_no-data-augmentation/evaluation_*.json')
# The prediction count is read from the first task only.
n_preds = len(list(predictions.values())[0]['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")
# Execution errors are not logged (log_errors=False).
hindsight_relabeled_tasks = create_hindsight_relabeled_tasks(predictions, log_errors=False)

Let's study the metrics distribution.

In [None]:
# Compare the metric distribution of the variations of solved tasks against unsolved tasks.
# A task counts as solved when any of its variations is a correct solution.
metric = 'mean_pixel_score'
bins = np.linspace(0, 1, 50)
solved_tasks_distribution, unsolved_tasks_distribution = [], []
for task_id in tqdm(hindsight_relabeled_tasks):
    tasks = list(hindsight_relabeled_tasks[task_id].values())
    is_solved = any(hr_task['is_correct_solution'] for hr_task in tasks)
    destination = solved_tasks_distribution if is_solved else unsolved_tasks_distribution
    destination.extend(hr_task[metric] for hr_task in tasks)
histograms = (
    (solved_tasks_distribution, 'blue', 'solved tasks'),
    (unsolved_tasks_distribution, 'red', 'unsolved tasks'),
)
for distribution, color, label in histograms:
    plt.hist(distribution, bins=bins, alpha=0.5, density=True, color=color, label=label)
plt.legend()
plt.title(f'Distribution of {metric} for solved and unsolved tasks')
plt.xlabel(metric);

In [None]:
# Same comparison as for the mean pixel score, using the ratio of correct grids and a
# logarithmic count axis.
metric = 'correct_grids_ratio'
bins = np.linspace(0, 1, 10)
solved_tasks_distribution, unsolved_tasks_distribution = [], []
for task_id in tqdm(hindsight_relabeled_tasks):
    tasks = list(hindsight_relabeled_tasks[task_id].values())
    is_solved = any(hr_task['is_correct_solution'] for hr_task in tasks)
    destination = solved_tasks_distribution if is_solved else unsolved_tasks_distribution
    destination.extend(hr_task[metric] for hr_task in tasks)
histograms = (
    (solved_tasks_distribution, 'blue', 'solved tasks'),
    (unsolved_tasks_distribution, 'red', 'unsolved tasks'),
)
for distribution, color, label in histograms:
    plt.hist(distribution, bins=bins, alpha=0.5, density=True, color=color, label=label, log=True)
plt.legend()
plt.title(f'Distribution of {metric} for solved and unsolved tasks')
plt.xlabel(metric);

correct_grids_ratio seems to be a much better way to select samples.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B")
grid_encoder = create_grid_encoder('ColorNameEncoder()')

n_samples_per_task = 8

# Keep only the best variations of each task: highest ratio of correct grids first,
# then highest mean pixel score, then shortest code.
training_texts = dict()
mean_scores = []
for task_id in tqdm(hindsight_relabeled_tasks):
    tasks = sorted(
        hindsight_relabeled_tasks[task_id].values(),
        key=lambda x: (-x['correct_grids_ratio'], -x['mean_pixel_score'], len(x['code'])),
    )[:n_samples_per_task]
    if not tasks:
        # No prediction of this task could be executed: there is nothing to train on and
        # np.mean([]) would add a NaN (with a RuntimeWarning) to the statistics.
        continue
    mean_scores.append(np.mean([task['correct_grids_ratio'] for task in tasks]))
    training_texts[task_id] = [
        create_prompt_from_task(task, grid_encoder, tokenizer) + task['text_prediction'] + tokenizer.eos_token
        for task in tasks
    ]

plt.hist(mean_scores, bins=30, log=True)
plt.grid(which='both', axis='y')
plt.title(f'Mean correct grids ratio of top {n_samples_per_task} variations')
plt.xlabel('Mean correct grids ratio')
plt.ylabel('Number of tasks');

In [None]:
# How many training prompts each task contributes after filtering.
prompts_per_task = list(map(len, training_texts.values()))
ax = plt.gca()
ax.hist(prompts_per_task, bins=30)
ax.set_xlabel('Number of prompts per task')
ax.set_ylabel('Number of tasks')
ax.set_title('Distribution of prompts per task')

In [None]:
# Cumulative distribution of the tokenized length of every selected training prompt.
all_prompts = []
for texts in training_texts.values():
    all_prompts.extend(texts)
tokenized_prompts = tokenizer(all_prompts)['input_ids']
prompt_lengths = list(map(len, tokenized_prompts))
ax = plt.gca()
ax.hist(prompt_lengths, bins=50, cumulative=True, density=True)
ax.set_xlabel('Number of tokens')
ax.set_title(f'Cumulative Distribution of prompt lengths for hindsight relabeled tasks (n={len(all_prompts)})')
ax.set_ylim(0, 1)
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.grid()

In [None]:
# Save the selected training texts grouped by task id.
filepath = f'/mnt/hdd0/Kaggle/arc25/data/hindsight_relabeled/2025-08-25_evaluation-selected{n_samples_per_task}_no-data-augmentation.json'
# Create the output folder if needed, otherwise open() fails on a fresh machine.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
with open(filepath, 'w') as f:
    json.dump(training_texts, f, indent=2)
print(f'Dataset size: {os.path.getsize(filepath) / (1024 * 1024):.1f} MB')

## Evaluate finetuned models

### Make predictions

In [None]:
# Load the base model with LoRA support enabled so adapters can be applied at generation
# time. max_lora_rank matches the rank-32 adapter checkpoint used below.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1,
                            enable_lora=True, max_model_len=16000, max_lora_rank=32)

In [None]:
# Fine-tuned LoRA checkpoint passed to llm.generate through `lora_request`.
adapter_path = '/mnt/hdd0/MEGA/TEMP/2025-08-26-lora-rank/2xA6000--1000steps-8192msl-1e-4lr-lora32/checkpoint-1000'
lora_request = LoRARequest('LoRA', 1, adapter_path)

In [None]:
# Task ids available for each dataset name.
dataset_to_task_ids = {
    'training': list(training_challenges.keys()),
    'evaluation': list(evaluation_challenges.keys()),
    'evaluation-2025': list(evaluation_challenges_2025.keys())
}

# experiment_name and dataset are used to build the path where predictions are saved.
experiment_name = '2025-08-27_first-finetuning-steps'
dataset = 'evaluation'
task_ids = dataset_to_task_ids[dataset]
grid_encoder = create_grid_encoder('ColorNameEncoder()')

In [None]:
text = create_prompt_from_task(get_task(task_ids[0]), grid_encoder, tokenizer)
pretty_print_prompt(text, default_color='white')

In [None]:
# Generate n predictions per task with the LoRA adapter, report throughput and save them.
use_data_augmentation = True
for n in [8]:
    sampling_params = SamplingParams(n=n, temperature=1.0, top_p=0.95, max_tokens=2048)

    # Build one prompt per task, optionally on a randomly augmented version of the task.
    prompts, data_augmentation_params = [], []
    for task_id in task_ids:
        task = get_task(task_id)
        if use_data_augmentation:
            params = get_random_data_augmentation_params()
            task = apply_data_augmentation(task, **params)
        else:
            params = None
        data_augmentation_params.append(params)
        prompt = create_prompt_from_task(
            task, grid_encoder=grid_encoder, tokenizer=tokenizer, shuffle_train_samples=True)
        prompts.append(prompt)

    t0 = time.time()
    text_predictions = llm.generate(prompts, sampling_params, lora_request=lora_request)
    # Stop the clock right after generation so the token counting below is not timed.
    inference_time = time.time() - t0
    total_tokens = sum(len(completion.token_ids) for output in text_predictions for completion in output.outputs)
    print(f"Total tokens generated: {total_tokens}")
    print(f"Time taken: {inference_time:.2f} seconds")
    print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
    # The total is divided by tasks and by predictions per task, so this is a per-prediction average.
    print(f"Average tokens per prediction: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
    print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

    # NOTE(review): a single params dict is stored per task here, while other cells index
    # 'data_augmentation_params' per prediction; presumably load_all_predictions expands
    # it — confirm.
    predictions = dict()
    for task_id, output, params in zip(task_ids, text_predictions, data_augmentation_params):
        predictions[task_id] = {
            'text_predictions': [completion.text for completion in output.outputs],
            'data_augmentation_params': params,
        }

    output_filepath = f'/mnt/hdd0/Kaggle/arc25/predictions/{experiment_name}/{dataset}_{sampling_params.n}preds_{get_timestamp()}_predictions.json'
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    with open(output_filepath, 'w') as f:
        json.dump(predictions, f, indent=2)
    print(f"Predictions saved to {output_filepath}")

Compare inference speed with and without LoRA

```
evaluation without LoRA
8 preds, Average time per task: 4.17 seconds

evaluation with LoRA rank 32
Average time per task: 4.87 seconds, 5.08, 4.59, 5.06
```

### Evaluation

In [None]:
# Load the predictions of the model to evaluate; the commented lines are alternative
# experiments. The active pattern has a wildcard in the experiment folder name, so it can
# match more than one folder.
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-27_first-finetuning-steps/*.json')
# predictions = load_all_predictions('/mnt/hdd0/MEGA/TEMP/predictions/2025-08-27-training-steps/2xH100-8000steps-8192msl-1e-5lr-full-finetuning-continue/evaluation/*.json')
predictions = load_all_predictions('/mnt/hdd0/MEGA/TEMP/predictions/2025-08-29-smaller-datasets/2xA6000-1000steps-8192msl-1e-4lr-lora32*/evaluation/*.json')
# predictions = load_all_predictions('/mnt/hdd0/MEGA/TEMP/predictions/2025-08-29-smaller-datasets-no-data-augmentation/2xA6000-1000steps-8192msl-1e-4lr-lora32/evaluation/*.json')
# The prediction count is read from the first task only.
n_preds = len(list(predictions.values())[0]['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

In [None]:
results = run_code_from_predictions(predictions)
df = compute_search_metrics(results)
df.iloc[-1:]

In [None]:
df.to_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_2025-08-29-smaller-datasets-1000steps_512.csv')

In [None]:
# Estimate pass@n from the per-task pass rate: with n predictions a task stays unsolved
# with probability (1 - pass_rate)**n. Powers of two up to one step beyond
# 2**int(log2(n_preds)) are evaluated.
n_preds_range = 2**np.arange(0, int(np.log2(n_preds)) + 2)
# The last row of df is excluded (NOTE(review): presumably a summary row — confirm).
fail_prob = 1 - df['pass_rate'].values[:-1]
scores = [float(np.mean(1 - fail_prob**n)) for n in n_preds_range]

import matplotlib.pyplot as plt

ax = plt.gca()
ax.plot(n_preds_range, scores, marker='o')
ax.set_xscale('log', base=2)
ax.set_xlabel('Number of predictions')
ax.set_ylabel('pass@n')
ax.set_title('pass@n vs Number of Predictions')
ax.grid()
plt.show()
# Displayed so the values can be copied into the `metrics` dictionaries of the plots.
dict(evaluation_data_augmentation=(n_preds_range.tolist(), scores))

### Plots

#### Data augmentation

In [None]:
metrics = {
   'baseline': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.018665553329123646,
   0.02879110915200162,
   0.04224158293230386,
   0.059708013904966675,
   0.08223659237317406,
   0.10940176527483192,
   0.1384685993616617,
   0.1664683605627952,
   0.1908837471080634,
   0.2094882044080468]),
#    'baseline_6064': ([1,
#    2,
#    4,
#    8,
#    16,
#    32,
#    64,
#    128,
#    256,
#    512,
#    1024,
#    2048,
#    4096,
#    8192],
#   [0.020675799545352484,
#    0.031800090738608865,
#    0.04635819085811816,
#    0.06506021688568361,
#    0.08888375747790557,
#    0.11740666375181305,
#    0.14840423707784808,
#    0.1798639618765866,
#    0.21174751560392843,
#    0.24540332456877859,
#    0.27906954197611594,
#    0.30882907544705135,
#    0.3307049086550017,
#    0.3426571213621735]),
   '2025-08-29-smaller-datasets/2xA6000-20steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.023606329073264366,
   0.0352476921085585,
   0.050501333159261946,
   0.07003085421915217,
   0.09419090612849824,
   0.12113183968221045,
   0.14803950592285675,
   0.1739829401977137,
   0.19875668196923418,
   0.21980857854481178,
   0.2328619884758857]),
   '2025-08-29-smaller-datasets/2xA6000-50steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.022661425810380838,
   0.03481883641545316,
   0.05094183584628108,
   0.0716586140699154,
   0.09738483594465794,
   0.12678086459475907,
   0.15656512697994462,
   0.18444358302270544,
   0.2101792723053778,
   0.232086348106364,
   0.24541442209449776]),
   '2025-08-29-smaller-datasets/2xA6000-100steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.02378096490088292,
   0.03754834234524835,
   0.055772371764274686,
   0.07811478359718953,
   0.10437513965709139,
   0.13375836535645524,
   0.16418840621709735,
   0.19274152853230261,
   0.21843439192234648,
   0.23989066332449474,
   0.25305110403574377]),
   '2025-08-29-smaller-datasets/2xA6000-200steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.026407894401354506,
   0.04183597142086427,
   0.06136782516904767,
   0.08414126336106854,
   0.10926185586165676,
   0.13571473592796823,
   0.1629771841582275,
   0.18954746817746748,
   0.21358035273903242,
   0.23388701626531558,
   0.24738480183743544]),
   '2025-08-29-smaller-datasets/2xA6000-400steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.03117564177009315,
   0.04998497655109415,
   0.07385574233762617,
   0.10109329793913328,
   0.13032582762064335,
   0.16063589178430518,
   0.1903189163633815,
   0.21801216832181225,
   0.24286374668830452,
   0.26271536089726744,
   0.2752622875910397]),
   '2025-08-29-smaller-datasets/2xA6000-1000steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.03393188286719861,
   0.05618490491219927,
   0.08531506508418366,
   0.11875323375283692,
   0.1530693753851221,
   0.1857451548706929,
   0.2151749652079954,
   0.24097422198127702,
   0.26278415057116694,
   0.27943911006045935,
   0.2891937503094295]),
   '2025-08-29-smaller-datasets/2xA6000-2000steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.03598298972600957,
   0.05938001927686663,
   0.0899627911892049,
   0.12510289665136845,
   0.16063207974081734,
   0.19122662010398997,
   0.21507110907071642,
   0.23376178443142487,
   0.24870419044859723,
   0.2603286642701881,
   0.2674460627031699]),
#    'full-finetuning': ([1,
#    2,
#    4,
#    8,
#    16,
#    32,
#    64,
#    128,
#    256,
#    512,
#    1024],
#   [0.020943503218614056,
#    0.03539542169542603,
#    0.05516301589810301,
#    0.07978942217012977,
#    0.10925191308893745,
#    0.14241075863007166,
#    0.17641764413891386,
#    0.20827495116804257,
#    0.23495958143629544,
#    0.2549056629232699,
#    0.26631500511324085])
}

# One pass@n curve per experiment, coloured along the viridis colormap in dictionary
# order. Only the first 9 points of each curve are drawn.
keys = list(metrics.keys())
fig, ax = plt.subplots(figsize=(10, 5))
cmap = plt.get_cmap('viridis', len(keys))
colors = [cmap(i) for i in range(len(keys))]
for color, (key, (n_preds_range, scores)) in zip(colors, metrics.items()):
    if 'steps' in key:
        # Experiment names contain "-<N>steps-": keep just N for the legend.
        label = key.split('steps-')[0].split('-')[-1] + ' finetuning steps'
    else:
        label = key
    ax.plot(n_preds_range[:9], scores[:9], marker='o', label=label, color=color)
ax.set_xscale('log', base=2)
ax.grid()
ax.set_xlabel('Number of predictions')
ax.set_ylabel('pass@n')
ax.set_title('Evaluation ARC-AGI-1')
ax.legend()

fig.tight_layout()

#### No data augmentation

In [None]:
metrics = {
   'baseline': ([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
  [0.018653395632067863,
   0.028123807513897266,
   0.040470801097098115,
   0.05604907442616278,
   0.07517185395071181,
   0.09714182632584087,
   0.12045095653955698,
   0.14292755780979274,
   0.16171510346135057,
   0.1753448200072779],),
   '2025-08-29-smaller-datasets-no-data-augmentation/2xA6000-100steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.024755481392362774,
   0.0367296855115465,
   0.051931907728330916,
   0.071667129777274,
   0.09634285881126135,
   0.12451250069057292,
   0.15435278313874515,
   0.18383258470644478,
   0.20928428503505228,
   0.22709159441068621,
   0.23668065815082812]),
   '2025-08-29-smaller-datasets-no-data-augmentation/2xA6000-200steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.025170263188187957,
   0.03939721859599572,
   0.05702557841718987,
   0.07870673666868633,
   0.10524887049008552,
   0.13491733441516116,
   0.16497592064853683,
   0.19372497684150716,
   0.2195907575602184,
   0.2408218665804371,
   0.2546336295672011]),
   '2025-08-29-smaller-datasets-no-data-augmentation/2xA6000-400steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.028915353428317324,
   0.04635151032425139,
   0.06823145086396849,
   0.09350457920187943,
   0.12129978397972246,
   0.1502646416692249,
   0.17806324360439388,
   0.20372412789366146,
   0.22650050489856888,
   0.24383315592615584,
   0.25380920871243623]),
   '2025-08-29-smaller-datasets-no-data-augmentation/2xA6000-1000steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.06464566983287443,
   0.0960075917248096,
   0.1273494403720554,
   0.15302477552865526,
   0.17240088138783308,
   0.1877761751774143,
   0.2015373151125997,
   0.21536516950023035,
   0.22903181236179837,
   0.23992157059581673,
   0.24569130991579102]),
   '2025-08-29-smaller-datasets-no-data-augmentation/2xA6000-2000steps-8192msl-1e-4lr-lora32': ([1,
   2,
   4,
   8,
   16,
   32,
   64,
   128,
   256,
   512,
   1024],
  [0.08198481692553986,
   0.11571581682183918,
   0.14643463821422242,
   0.16763511016356147,
   0.17986060840918847,
   0.18851721505402538,
   0.196651421697631,
   0.2049564001993034,
   0.21316720212693696,
   0.22049393241597864,
   0.22546817719225815])
}

# One pass@n curve per experiment, coloured along the viridis colormap in dictionary
# order. Only the first 9 points of each curve are drawn.
keys = list(metrics.keys())
fig, ax = plt.subplots(figsize=(10, 5))
cmap = plt.get_cmap('viridis', len(keys))
colors = [cmap(i) for i in range(len(keys))]
for color, (key, (n_preds_range, scores)) in zip(colors, metrics.items()):
    if 'steps' in key:
        # Experiment names contain "-<N>steps-": keep just N for the legend.
        label = key.split('steps-')[0].split('-')[-1] + ' finetuning steps'
    else:
        label = key
    ax.plot(n_preds_range[:9], scores[:9], marker='o', label=label, color=color)
ax.set_xscale('log', base=2)
ax.grid()
ax.set_xlabel('Number of predictions')
ax.set_ylabel('pass@n')
ax.set_title('Evaluation ARC-AGI-1')
ax.legend()

fig.tight_layout()

### Check how significant the improvements are

I want to understand whether the improvements obtained with test-time training are significant. The criticism is that I have made a total of 1024 predictions with those models instead of 512.
Another way to skip the criticism would be to train without the totally correct tasks.

In [None]:
# Preview the saved results of the fine-tuned model. The CSV was written with
# DataFrame.to_csv, so its first column holds the index.
finetuned_df = pd.read_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_2025-08-29-smaller-datasets-1000steps_512.csv', index_col=0)
finetuned_df.tail()

In [None]:
finetuned_df = pd.read_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_2025-08-29-smaller-datasets-1000steps_512.csv', index_col=0)
seed_df = pd.read_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_480.csv', index_col=0)
baseline_df = pd.read_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_6064.csv', index_col=0)

In [None]:
solved_task_ids = finetuned_df[finetuned_df['pass_rate'] > 0].index.values[:-1]
print(f"Number of solved tasks: {len(solved_task_ids)}")

Let's check how many of those were already solved in the seed dataset.

In [None]:
(seed_df.loc[solved_task_ids]['pass_rate'] > 0).sum()

In [None]:
seed_df.tail()

In [None]:
88/400, 88/117

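So 75% (88/117) of the tasks solved by the fine-tuned model were already solved in the seed data. Those 88 tasks correspond to a 22% score (88/400), and the baseline solved 22.5%, so almost all of the tasks solved in the seed data were also solved by the fine-tuned model. Let's focus on the other tasks.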

In [None]:
# Split the tasks solved by the fine-tuned model by whether the seed predictions
# had already solved them (seed pass rate above zero) or not (seed pass rate of zero).
newly_solved_task_ids, already_solved_task_ids = [], []
for task_id in solved_task_ids:
    seed_pass_rate = seed_df.loc[task_id]['pass_rate']
    if seed_pass_rate == 0:
        newly_solved_task_ids.append(task_id)
    elif seed_pass_rate > 0:
        already_solved_task_ids.append(task_id)

In [None]:
# The stored pass_rate is affected by a calculation bug; correct it by scaling with the
# 'valid outputs' column.
# NOTE(review): presumably pass_rate was computed relative to the valid outputs only
# instead of all the predictions — confirm.
baseline_df['fixed_pass_rate'] = baseline_df['pass_rate'] * baseline_df['valid outputs']

In [None]:
# Estimate how many baseline predictions each newly solved task would need as the
# inverse of its baseline pass rate, capping the estimate at twice 6064.
max_estimate = 6064*2
inverse_pass_rates = 1/baseline_df.loc[newly_solved_task_ids]['fixed_pass_rate'].values
values = np.clip(sorted(inverse_pass_rates), None, max_estimate)
ax = plt.gca()
ax.hist(values, bins=np.logspace(np.log10(2**7), np.log10(max_estimate), 30))
ax.set_xscale('log', base=10)
# Reference lines at 1024 and 6064 predictions
for n_predictions, color in ((1024, 'red'), (6064, 'green')):
    ax.axvline(n_predictions, color=color, linestyle='--', label=str(n_predictions))
ax.legend(loc=0)
ax.grid(axis='x', which='both')
ax.set_xlabel('Estimated number of predictions to solve task with baseline model')
ax.set_ylabel('Number of tasks')
ax.set_title('Distribution of estimated predictions to solve newly solved tasks');

In [None]:
(values > 7000).mean()

The fine-tuned model using TTT was able to solve tasks that on average require more than 1024 predictions to be solved (72% of the newly solved tasks). In fact 17% of the tasks were not solved after doing more than 6000 predictions.

## Debugging

### LoRA saving uses a lot of space

In [None]:
import sys
import os
import shutil
sys.path.append('../scripts')
from finetuning import get_model, get_tokenizer, get_lora_model, Accelerator


# Compare the size of the saved LoRA adapter with and without the embedding layers.
model_path = '/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B'
debug_dir = '/mnt/hdd0/Kaggle/arc25/models/debug_PEFT_saving'
adapter_filepath = os.path.join(debug_dir, 'adapter_model.safetensors')
accelerator = Accelerator()

# NOTE(review): device_map is passed as the string 'None', not the None object;
# presumably get_model interprets that string — confirm.
model = get_model(model_path, torch_dtype="bfloat16",
                      use_4bit_quantization=False, device_map='None',
                      use_gradient_checkpointing=False)
tokenizer = get_tokenizer(model_path, model, 'ColorNameEncoder()')
model = get_lora_model(model, None, 32, False, False, 'default')
model.print_trainable_parameters()

# ignore_errors=True so the cell also works when the debug folder does not exist yet.
shutil.rmtree(debug_dir, ignore_errors=True)
model.save_pretrained(debug_dir, safe_serialization=True) #4.3GB
print(os.path.getsize(adapter_filepath) / (1024*1024), 'MB') #4112 MB for rank 32, 4034MB for rank 8, 2108 when not using 4 bit quantization and rank 32
shutil.rmtree(debug_dir, ignore_errors=True)
model.save_pretrained(debug_dir, safe_serialization=True, save_embedding_layers=False)
print(os.path.getsize(adapter_filepath) / (1024*1024), 'MB') #104 MB for rank 32, 26MB for rank 8

### Collator with eos_token=pad_token

In [None]:
from trl import  DataCollatorForCompletionOnlyLM

model_path = '/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B'
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Show which special tokens this tokenizer declares.
print(tokenizer.special_tokens_map)
# Completion-only collator: only a response template is given (the instruction
# template is left disabled below).
data_collator = DataCollatorForCompletionOnlyLM(
            tokenizer=tokenizer,
            # instruction_template='<|start_header_id|>user<|end_header_id|>',
            response_template='<|start_header_id|>assistant<|end_header_id|>',
)

# Minimal sample ending with two <|eot_id|> tokens, to inspect what the
# collator produces for them.
text = '<|begin_of_text|><|start_header_id|>assistant<|end_header_id|> Hello, world! <|eot_id|><|eot_id|>'

# print(tokenizer(text))
print(data_collator([tokenizer(text)]))

Even if I add multiple eos_tokens, they are all masked. So that is not a solution.

In [None]:
print(tokenizer.special_tokens_map)
# Print every vocabulary entry that looks like a special token, skipping the
# reserved placeholders. The grouping is explicit: without it `and` binds
# tighter than `or`, and the reserved_special filter would only apply to the
# '|>' test.
for word in tokenizer.get_vocab():
    looks_special = '▁<|' in word or '|>' in word
    if looks_special and 'reserved_special' not in word:
        print(word, tokenizer.convert_tokens_to_ids(word))

In [None]:
text = '<|begin_of_text|> <|start_header_id|>assistant<|end_header_id|> Hello, world! <|eot_id|><|eot_id|>'

# print(tokenizer(text))
print(data_collator([tokenizer(text)]))

In [None]:
model_path = '/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B'
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Override the pad token with a dedicated one before building the collator,
# then repeat the same collator check on the sample text.
tokenizer.pad_token = '<|finetune_right_pad_id|>'
data_collator = DataCollatorForCompletionOnlyLM(
            tokenizer=tokenizer,
            # instruction_template='<|start_header_id|>user<|end_header_id|>',
            response_template='<|start_header_id|>assistant<|end_header_id|>',
)

# Sample ending with two <|eot_id|> tokens.
text = '<|begin_of_text|><|start_header_id|>assistant<|end_header_id|> Hello, world! <|eot_id|><|eot_id|>'

# print(tokenizer(text))
print(data_collator([tokenizer(text)]))

Is it valid for Llama models?

In [None]:
model_path = '/home/gbarbadillo/models/Llama-3.1-8B'
tokenizer = AutoTokenizer.from_pretrained(model_path)
print(tokenizer.special_tokens_map)
# Print every vocabulary entry that looks like a special token, skipping the
# reserved placeholders. The grouping is explicit: without it `and` binds
# tighter than `or`, and the reserved_special filter would only apply to the
# '|>' test.
for word in tokenizer.get_vocab():
    looks_special = '▁<|' in word or '|>' in word
    if looks_special and 'reserved_special' not in word:
        print(word, tokenizer.convert_tokens_to_ids(word))

In [None]:
model_path = '/home/gbarbadillo/models/Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
print(tokenizer.special_tokens_map)
# Print every vocabulary entry that looks like a special token, skipping the
# reserved placeholders. The grouping is explicit: without it `and` binds
# tighter than `or`, and the reserved_special filter would only apply to the
# '|>' test.
for word in tokenizer.get_vocab():
    looks_special = '▁<|' in word or '|>' in word
    if looks_special and 'reserved_special' not in word:
        print(word, tokenizer.convert_tokens_to_ids(word))

Both models have the `<|finetune_right_pad_id|>` token, so I can make a general rule for llama 3.1 models.

### VLLM LoRA compatibility

https://docs.vllm.ai/en/v0.9.1/features/lora.html

#### First steps

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(
    model_path, use_4bit_quantization=False,
    tensor_parallel_size=1, enable_lora=True)

In [None]:
messages = [
    {"role": "system", "content": 'You are a helpful assistant that always ends your responses with "Have a great day!"'},
    {"role": "user", "content": 'Hi, how are you today?'},]
prompt = tokenizer.apply_chat_template(
    messages,  tokenize=False, add_generation_prompt=True)
print(prompt)

In [None]:
sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=1024)
outputs = llm.generate([prompt], sampling_params)
for idx, output in enumerate(outputs[0].outputs):
    print(f'{idx}. {output.text}')

In [None]:
sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=1024)
outputs = llm.generate([prompt], sampling_params)
for idx, output in enumerate(outputs[0].outputs):
    print(f'{idx}. {output.text}')

In [None]:
from vllm.lora.request import LoRARequest
adapter_path = '/mnt/hdd0/Kaggle/arc25/trainings/2025-08-26-lora-compatibility/qLoRA_8/checkpoint-1'
lora_request = LoRARequest("plain-lora", 1, adapter_path)

In [None]:
outputs = llm.generate([prompt], sampling_params, lora_request=lora_request)
for idx, output in enumerate(outputs[0].outputs):
    print(f'{idx}. {output.text}')

In [None]:
from vllm.lora.request import LoRARequest
adapter_path = '/mnt/hdd0/Kaggle/arc25/trainings/2025-08-26-lora-compatibility/qLoRA_8_dora_rslora/checkpoint-1'
lora_request = LoRARequest("dora", 2, adapter_path)

In [None]:
# ValueError: vLLM does not yet support DoRA.
outputs = llm.generate([prompt], sampling_params, lora_request=lora_request)
for idx, output in enumerate(outputs[0].outputs):
    print(f'{idx}. {output.text}')

In [None]:
from vllm.lora.request import LoRARequest
adapter_path = '/mnt/hdd0/Kaggle/arc25/trainings/2025-08-26-lora-compatibility/qLoRA_8_rslora/checkpoint-1'
lora_request = LoRARequest("rslora", 3, adapter_path)

In [None]:
outputs = llm.generate([prompt], sampling_params, lora_request=lora_request)
for idx, output in enumerate(outputs[0].outputs):
    print(f'{idx}. {output.text}')

vLLM supports LoRA and RSLoRA, but it does not support DoRA. Moreover, I can pass adapters on the fly: the first call seems to be slower, but otherwise the speed looks the same.

#### Speed tests

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(
    model_path, use_4bit_quantization=False,
    tensor_parallel_size=1, enable_lora=True)

In [None]:
from vllm.lora.request import LoRARequest
# Adapter checkpoints to benchmark, keyed by a short display name.
adapter_paths = {
    'LoRA': '/mnt/hdd0/Kaggle/arc25/trainings/2025-08-26-lora-compatibility/qLoRA_8/checkpoint-1',
    'RSLoRA': '/mnt/hdd0/Kaggle/arc25/trainings/2025-08-26-lora-compatibility/qLoRA_8_rslora/checkpoint-1',
}
# One request per adapter, with integer ids counting up from 1.
loras = {}
for lora_id, (lora_name, lora_path) in enumerate(adapter_paths.items(), 1):
    loras[lora_name] = LoRARequest(lora_name, lora_id, lora_path)

In [None]:
messages = [
    {"role": "system", "content": 'You are a helpful assistant that always ends your responses with "Have a great day!"'},
    {"role": "user", "content": 'Hi, how are you today?'},]
prompt = tokenizer.apply_chat_template(
    messages,  tokenize=False, add_generation_prompt=True)
print(prompt)

In [None]:
# Throughput of the base model (no LoRA request) on a single prompt.
sampling_params = SamplingParams(n=800, temperature=1.0, top_p=0.95, max_tokens=10)
# perf_counter is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments as it could with time.time().
t0 = time.perf_counter()
outputs = llm.generate([prompt], sampling_params)
generation_time = time.perf_counter() - t0
# Count the tokens of every sampled completion of the single prompt.
output_tokens = sum(len(output.token_ids) for output in outputs[0].outputs)
print(f"Base model: {output_tokens} tokens generated in {generation_time:.2f} seconds ({output_tokens / generation_time:.2f} tokens/second)")

In [None]:
# Throughput of each LoRA adapter on the same prompt and sampling parameters.
for key, lora in loras.items():
    # perf_counter is monotonic, unlike time.time(), so the interval is
    # unaffected by system clock adjustments.
    t0 = time.perf_counter()
    outputs = llm.generate([prompt], sampling_params, lora_request=lora)
    generation_time = time.perf_counter() - t0
    output_tokens = sum(len(output.token_ids) for output in outputs[0].outputs)
    print(f"{key} model: {output_tokens} tokens generated in {generation_time:.2f} seconds ({output_tokens / generation_time:.2f} tokens/second)")

In [None]:
# Second pass over the same adapters, to compare against the first call of each.
for key, lora in loras.items():
    # perf_counter is monotonic, unlike time.time(), so the interval is
    # unaffected by system clock adjustments.
    t0 = time.perf_counter()
    outputs = llm.generate([prompt], sampling_params, lora_request=lora)
    generation_time = time.perf_counter() - t0
    output_tokens = sum(len(output.token_ids) for output in outputs[0].outputs)
    print(f"{key} model: {output_tokens} tokens generated in {generation_time:.2f} seconds ({output_tokens / generation_time:.2f} tokens/second)")

```
sampling_params = SamplingParams(n=800, temperature=1.0, top_p=0.95, max_tokens=1024)
Base model: 32055 tokens generated in 18.17 seconds (1764.12 tokens/second)
LoRA model: 33062 tokens generated in 28.28 seconds (1169.06 tokens/second)
RSLoRA model: 31708 tokens generated in 19.74 seconds (1606.59 tokens/second)
LoRA model: 32293 tokens generated in 19.14 seconds (1687.62 tokens/second)
RSLoRA model: 32403 tokens generated in 19.16 seconds (1691.01 tokens/second)

sampling_params = SamplingParams(n=800, temperature=1.0, top_p=0.95, max_tokens=10)
Base model: 8000 tokens generated in 4.94 seconds (1618.46 tokens/second)
LoRA model: 8000 tokens generated in 5.30 seconds (1508.31 tokens/second)
RSLoRA model: 8000 tokens generated in 5.21 seconds (1534.20 tokens/second)
LoRA model: 8000 tokens generated in 5.23 seconds (1530.30 tokens/second)
RSLoRA model: 8000 tokens generated in 5.65 seconds (1415.46 tokens/second)

# restart
sampling_params = SamplingParams(n=800, temperature=1.0, top_p=0.95, max_tokens=10)
Base model: 8000 tokens generated in 4.95 seconds (1614.81 tokens/second)
LoRA model: 8000 tokens generated in 5.78 seconds (1384.20 tokens/second)
RSLoRA model: 8000 tokens generated in 6.01 seconds (1330.54 tokens/second)
LoRA model: 8000 tokens generated in 5.26 seconds (1522.17 tokens/second)
RSLoRA model: 8000 tokens generated in 5.29 seconds (1512.97 tokens/second)
```

It seems that the first time a model is called it is slightly slower. And the LoRA model by itself is slightly slower than the base model. But manageable.

### Increase observability

In [None]:
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-27_first-finetuning-steps/evaluation_8preds_2025_08_27_12_30_09_predictions.json')
n_preds = len(list(predictions.values())[0]['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")
results = run_code_from_predictions(predictions)
display(compute_search_metrics(results).iloc[-1:])
display(error_analysis(results).iloc[-1:])

### Variability in evaluation

I have observed that running the evaluation multiple times changes the results. This does not make sense, so let's check it.

In [None]:
# Load and execute the same predictions file twice, keeping the metrics table
# of each run in `dfs` so the two runs can be compared afterwards.
filepath = '/mnt/hdd0/Kaggle/arc25/predictions/2025-08-27_first-finetuning-steps/evaluation_8preds_2025_08_27_12_30_09_predictions.json'
dfs = []
for _ in tqdm(range(2)):
    predictions = load_all_predictions(filepath)
    # Number of predictions of the first task (assumed equal for all tasks -- TODO confirm).
    n_preds = len(list(predictions.values())[0]['text_predictions'])
    predicted_code, predicted_outputs = run_code_from_predictions(predictions, log_errors=False)
    dfs.append(compute_search_metrics(list(predictions.keys()), predicted_code, predicted_outputs, n_preds))
    # Show only the last row of the metrics table (presumably the aggregate row).
    display(dfs[-1].iloc[-1:])

In [None]:
# Rows whose metrics differ between the two runs. Test each cell on its own:
# summing the differences along a row would hide rows where a positive and a
# negative difference cancel out.
diff = dfs[0] - dfs[1]
indices = np.flatnonzero((diff.values != 0).any(axis=1)).tolist()
print(len(indices), diff.iloc[indices].index)
diff.iloc[indices]

In [None]:
filepath = '/mnt/hdd0/Kaggle/arc25/predictions/2025-08-27_first-finetuning-steps/evaluation_8preds_2025_08_27_12_30_09_predictions.json'
predictions = load_all_predictions(filepath)

# Keep only the listed tasks (those present in the file).
keys = ['20818e16', '37d3e8b2', '4acc7107', '69889d6e', '8b28cd80', '93b4f4b3',
       'af22c60d', 'cd3c21df', 'd4b1c2b1', 'de493100', 'e78887d1', 'e9c9d9a1']
predictions = {key: predictions[key] for key in keys if key in predictions}

first_task_predictions = list(predictions.values())[0]
n_preds = len(first_task_predictions['text_predictions'])

# Execute the same predictions twice, storing the metrics and the raw
# (code, outputs) pair of each run.
dfs, rets = [], []
for _ in tqdm(range(2)):
    predicted_code, predicted_outputs = run_code_from_predictions(predictions, log_errors=False)
    run_metrics = compute_search_metrics(list(predictions.keys()), predicted_code, predicted_outputs, n_preds)
    dfs.append(run_metrics)
    rets.append((predicted_code, predicted_outputs))
dfs[0] - dfs[1]

In [None]:
# predicted code seems to be the same
rets[0][0] == rets[1][0]

In [None]:
task_id = '20818e16'
pred_idx = 1
task = get_task(task_id)
# Apply the data augmentation stored for this prediction, if there is one.
augmentation_params = predictions[task_id]['data_augmentation_params'][pred_idx]
if augmentation_params is not None:
    task = apply_data_augmentation(task, **augmentation_params)
# Ground-truth outputs: train samples first, then test samples.
outputs = [sample['output'] for split in ('train', 'test') for sample in task[split]]
print(rets[0][0][task_id][pred_idx])
# Compare the outputs of both runs against the ground truth, sample by sample.
for idx, output in enumerate(outputs):
    print(f"Output {idx}:")
    first_run_output = rets[0][1][task_id][pred_idx][idx]
    second_run_output = rets[1][1][task_id][pred_idx][idx]
    print(pixel_similarity_score(output, first_run_output), pixel_similarity_score(output, second_run_output))
    print(first_run_output.shape, second_run_output.shape)
    print(output)
    # print(rets[1][1][task_id][pred_idx][idx])
    # print()

In [None]:
output

In [None]:
# NOTE(review): both operands come from rets[0], so this compares the first
# run with itself. If the intent was to compare the two runs, the second
# operand should read rets[1]; if it is a deliberate self-check (e.g. for
# NaNs, which make array_equal return False), please say so -- TODO confirm.
np.array_equal(rets[0][1]['20818e16'][0], rets[0][1]['20818e16'][0])

In [None]:
rets[0][1]['20818e16'][0]

In [None]:
# Walk over every prediction of every task and report the sample outputs that
# differ between the first and the second run.
for key, first_run_outputs in rets[0][1].items():
    for idx, out1 in enumerate(first_run_outputs):
        out2 = rets[1][1][key][idx]
        if np.array_equal(out1, out2):
            continue
        print(f"Difference found in task {key} at prediction {idx}")
        # print(rets[0][0][key][idx])
        print(len(out1), len(out2))
        for sample_output1, sample_output2 in zip(out1, out2):
            if np.array_equal(sample_output1, sample_output2):
                continue
            print("Sample outputs differ:")
            print("Sample Output 1:", sample_output1.shape)
            print(sample_output1)
            print("Sample Output 2:", sample_output2.shape)
            print(sample_output2)
        print()
        # raise ValueError("Outputs differ")

In [None]:
rets[0][1] == rets[1][1]

In [None]:
# Rows whose metrics differ between the two runs. Test each cell on its own:
# summing the differences along a row would hide rows where a positive and a
# negative difference cancel out.
diff = dfs[0] - dfs[1]
indices = np.flatnonzero((diff.values != 0).any(axis=1)).tolist()
diff.iloc[indices]

In [None]:
# Detect differences in valid outputs: report predictions where exactly one
# of the two runs produced None.
for key, first_run_outputs in rets[0][1].items():
    for idx, out1 in enumerate(first_run_outputs):
        out2 = rets[1][1][key][idx]
        if (out1 is None) != (out2 is None):
            print(f"Different outputs for task {key}, prediction {idx}")

### Import dsl on each code execution

In [None]:
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-27_first-finetuning-steps/evaluation_8preds_2025_08_27_12_30_09_predictions.json')
#predictions = {key: predictions[key] for key in list(predictions.keys())[:1]}
results = run_code_from_predictions(predictions)
display(compute_search_metrics(results).iloc[-1:])
display(error_analysis(results).iloc[-1:])

### Learn to use subprocess

In [None]:
import os
import sys
import subprocess

# Small script for the child interpreter: import numpy and print its version.
code = """import numpy as np
print(np.__version__)"""

# Run the script with the current interpreter in isolated mode (-I), in its
# own session, with a copy of the current environment.
child_command = [sys.executable, "-I", "-c", code]
child_env = os.environ.copy()
proc = subprocess.Popen(child_command, env=child_env, start_new_session=True)
# Block until the child exits; the value is its return code.
proc.wait()

### New safe execution method

In [None]:
from arc25.code_execution import _safe_code_execution_subprocess as safe_code_execution
predictions = load_all_predictions('/mnt/hdd0/MEGA/TEMP/predictions/2025-08-29-smaller-datasets/2xA6000-20steps-8192msl-1e-4lr-lora32/evaluation/8preds_2025_09_01_09_54_56_predictions.json')
results = run_code_from_predictions(predictions)
df = compute_search_metrics(results)
df.iloc[-1:]

In [None]:
error_analysis(results)

```
# exec
12.2s
	n_preds	valid code	valid outputs	unique outputs	pixel similarity	correct grids	train_pass_rate	train_pass@n	pass_rate	pass@n
MEAN	8.0	1.0	0.7575	0.667813	0.57695	0.038117	0.028202	0.0675	0.027429	0.0625
MEAN	8.0	1.0	0.757188	0.6675	0.575372	0.036742	0.026952	0.065	0.026179	0.06
MEAN	8.0	1.0	0.7575	0.667813	0.576488	0.037784	0.026952	0.065	0.026179	0.06
ValueError              207
IndexError              169
AssertionError          146
NonDeterministicCode    141
TimeoutException         44
TypeError                23
AttributeError           15
UnboundLocalError        10
StopIteration            10
NameError                 4
ZeroDivisionError         4
UnsafeCode                2
KeyError                  1
Name: count, dtype: int64


# subprocess
53.1s
n_preds	valid code	valid outputs	unique outputs	pixel similarity	correct grids	train_pass_rate	train_pass@n	pass_rate	pass@n
MEAN	8.0	1.0	0.759687	0.669375	0.577302	0.038509	0.027577	0.0675	0.026804	0.0625
MEAN	8.0	1.0	0.759687	0.669375	0.577302	0.038509	0.027577	0.0675	0.026804	0.0625

RuntimeError            526
NonDeterministicCode    141
ValueError               63
TimeoutException         37
UnsafeCode                2
Name: count, dtype: int64
```

### Run code from predictions

#### Effect of using a big number of jobs

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions
from arc25.metrics import aggregate_metrics, error_analysis
#predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/*.json')
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_8preds_2025_08_23_11_57_11_predictions.json')
first_task_predictions = list(predictions.values())[0]
n_preds = len(first_task_predictions['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into parallel lists with one entry per prediction.
tasks, task_ids, text_predictions, data_augmentation_params = [], [], [], []
for task_id, task_preds in predictions.items():
    task_texts = task_preds['text_predictions']
    n_task_preds = len(task_texts)
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_texts
    data_augmentation_params += task_preds['data_augmentation_params']

results = run_code_from_predictions(tasks, task_ids, text_predictions, data_augmentation_params, n_jobs=-1)
df = aggregate_metrics(results)

error_analysis(results);
df.iloc[-1:]

In [None]:
results = run_code_from_predictions(tasks, task_ids, text_predictions, data_augmentation_params, n_jobs=20)
df = aggregate_metrics(results)

error_analysis(results);
df.iloc[-1:]

In [None]:
results = run_code_from_predictions(tasks, task_ids, text_predictions, data_augmentation_params, n_jobs=200)
df = aggregate_metrics(results)

error_analysis(results);
df.iloc[-1:]

In [None]:
results = run_code_from_predictions(tasks, task_ids, text_predictions, data_augmentation_params, n_jobs=2000)
df = aggregate_metrics(results)

error_analysis(results);
df.iloc[-1:]

#### Large scale run

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions
from arc25.metrics import aggregate_metrics, error_analysis
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/*.json')
first_task_predictions = list(predictions.values())[0]
n_preds = len(first_task_predictions['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into parallel lists with one entry per prediction.
tasks, task_ids, text_predictions, data_augmentation_params = [], [], [], []
for task_id, task_preds in predictions.items():
    task_texts = task_preds['text_predictions']
    n_task_preds = len(task_texts)
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_texts
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute everything in batches of 5000 predictions.
results = run_code_from_predictions(
    tasks, task_ids, text_predictions, data_augmentation_params,
    n_jobs=-1, timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)

error_analysis(results);
df.iloc[-1:]

In [None]:
df.iloc[-1:]

In [None]:
df.to_csv('/mnt/hdd0/Kaggle/arc25/code_execution/evaluation_6064_bis.csv', index_label='task_id')

It still hangs randomly, without any hint.

```
Loading predictions: 100%
 758/758 [01:06<00:00, 10.89it/s]
Loaded 400 tasks with 6064 predictions each.
Executing predictions:   4%
 3/76 [02:52<1:10:04, 57.60s/batch]
Executing predictions for batch 0: 100%
 32000/32000 [00:37<00:00, 849.46pred/s]
Executing predictions for batch 1: 100%
 32000/32000 [01:29<00:00, 356.84pred/s]
/mnt/hdd0/MEGA/AI/22_Kaggle/arc25/arc25/validation.py:19: RuntimeWarning: invalid value encountered in cast
  output = np.array(output, dtype=int) # otherwise I see weird outputs that mix list and numpy arrays
Executing predictions for batch 2: 100%
 32000/32000 [00:44<00:00, 712.58pred/s]
Executing predictions for batch 3: 100%
 31968/32000 [00:35<00:00, 907.65pred/s]
```

What if I evaluate each file independently?

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions

# Evaluate each predictions file on its own, printing its path first so the
# last printed path identifies the file being processed.
prediction_filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/*.json'))
for filepath in tqdm(prediction_filepaths, desc="Processing files"):
    print(filepath)
    predictions = load_all_predictions(filepath)
    # Flatten the per-task predictions into parallel lists with one entry per prediction.
    tasks, task_ids, text_predictions, data_augmentation_params = [], [], [], []
    for task_id, task_preds in predictions.items():
        task_texts = task_preds['text_predictions']
        n_task_preds = len(task_texts)
        tasks += [get_task(task_id)] * n_task_preds
        task_ids += [task_id] * n_task_preds
        text_predictions += task_texts
        data_augmentation_params += task_preds['data_augmentation_params']
    results = run_code_from_predictions(
        tasks, task_ids, text_predictions, data_augmentation_params,
        n_jobs=-1, timeout_duration=1,
    )

These files produce consistent hangs:

- /mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_08_31_09_47_48_predictions.json, 252, problem on task 2661
- /mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_09_01_13_46_42_predictions.json, 670, problem on task 

These files throw exceptions and are run with subprocess, but the behavior is not consistent:

- /mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_08_28_22_05_23_predictions.json
- /mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_08_28_20_29_40_predictions.json

Apart from that the code is able to evaluate all the other files.

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions

predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_08_28_22_05_23_predictions.json')
tasks, task_ids, text_predictions, data_augmentation_params = [], [], [], []
for task_id, task_preds in predictions.items():
    tasks.extend([get_task(task_id)] * len(task_preds['text_predictions']))
    task_ids.extend([task_id] * len(task_preds['text_predictions']))
    text_predictions.extend(task_preds['text_predictions'])
    data_augmentation_params.extend(task_preds['data_augmentation_params'])

results = run_code_from_predictions(tasks, task_ids, text_predictions, data_augmentation_params,
                                    n_jobs=-1, timeout_duration=1)

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions

predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_08_28_22_05_23_predictions.json')
# Flatten the per-task predictions into parallel lists with one entry per prediction.
tasks, task_ids, text_predictions, data_augmentation_params = [], [], [], []
for task_id, task_preds in predictions.items():
    tasks.extend([get_task(task_id)] * len(task_preds['text_predictions']))
    task_ids.extend([task_id] * len(task_preds['text_predictions']))
    text_predictions.extend(task_preds['text_predictions'])
    data_augmentation_params.extend(task_preds['data_augmentation_params'])

# Execute the predictions one at a time (single-element slices), so the
# progress bar index shows which prediction is running -- presumably to
# locate the prediction that hangs or raises.
for i in tqdm(range(len(tasks))):
    results = run_code_from_predictions(tasks[i:i+1], task_ids[i:i+1], text_predictions[i:i+1], data_augmentation_params[i:i+1],
                                        n_jobs=-1, timeout_duration=1)

In [None]:
print(text_predictions[441])

```python
from common import *

import numpy as np
from typing import *

# concepts:
# pixel patterns, expansion, color swapping

# description:
# In the input you will see a grid containing a colored star shape with a single pixel in the center of a different color.
# To make the output, you should expand the star shape outward in all directions, swapping the colors of the star and the center pixel each time the star expands.

def transform(input_grid):
    # Identify the center pixel and its color
    center_color = None
    center_x = center_y = None
    
    for x in range(input_grid.shape[0]):
        for y in range(input_grid.shape[1]):
            if input_grid[x, y] != Color.BLACK:
                if center_color is None:
                    center_color = input_grid[x, y]
                    center_x, center_y = x, y
                else:
                    # If we find another colored pixel, it must be part of the star
                    star_color = input_grid[x, y]
                    break

    # Prepare output grid
    output_grid = input_grid.copy()
    width, height = input_grid.shape

    # Function to expand the star in a specified direction
    def expand_star(x, y, color_a, color_b, distance):
        for d in range(distance + 1):
            # Expand in all four directions
            if 0 <= x + d < width:
                output_grid[x + d, y] = color_b  # Down
            if 0 <= x - d < width:
                output_grid[x - d, y] = color_b  # Up
            if 0 <= y + d < height:
                output_grid[x, y + d] = color_b  # Right
            if 0 <= y - d < height:
                output_grid[x, y - d] = color_b  # Left
            if d == distance:  # Only at the last distance, swap the colors
                output_grid[x, y] = color_a  # Center pixel gets the original color

    # Expand the star in increasing distances until we hit the edge of the grid
    distance = 0
    while True:
        try:
            expand_star(center_x, center_y, star_color, center_color, distance)
            distance += 1
        except:
            break

    return output_grid
```

Another problem, on task 166:

```python
from common import *

import numpy as np
from typing import *

# concepts:
# concentric patterns, color propagation

# description:
# In the input, you will see a grid with one colored pixel located in the center and other colored pixels surrounding it. 
# To create the output, expand the color of the central pixel outward in concentric layers, coloring adjacent pixels with the same color 
# until a boundary of another color is reached. Each layer should alternate colors based on the surrounding pixels' colors.

def transform(input_grid):
    # Plan:
    # 1. Identify the center pixel and its color.
    # 2. Expand the color of the center pixel outward until a boundary of another color is reached.
    # 3. Alternate colors for each layer based on the surrounding pixels' colors.

    # Step 1: Find the center pixel
    center_x, center_y = input_grid.shape[0] // 2, input_grid.shape[1] // 2
    center_color = input_grid[center_x, center_y]

    # Step 2: Prepare the output grid and define the layer expansion
    output_grid = np.copy(input_grid)
    colors = [center_color]  # List to track the colors we are using

    # To keep track of already colored pixels
    visited = set((center_x, center_y))

    def color_layer(layer):
        # Iterate over the layer to color adjacent pixels
        for dx in range(-layer, layer + 1):
            for dy in range(-layer, layer + 1):
                if abs(dx) == layer or abs(dy) == layer:  # Only the outer layer
                    x, y = center_x + dx, center_y + dy
                    if (x, y) not in visited and (0 <= x < output_grid.shape[0] and 0 <= y < output_grid.shape[1]):
                        if output_grid[x, y] != Color.BLACK and output_grid[x, y] != center_color:  # Hit a boundary
                            break
                        output_grid[x, y] = colors[layer % len(colors)]
                        visited.add((x, y))

    # Step 3: Expand layers until boundaries are reached
    layer = 1
    while True:
        try:
            color_layer(layer)
        except:
            break  # If we hit a boundary, stop
        layer += 1

    return output_grid
```

Task that raises an exception:

```python
from common import *

import numpy as np
from typing import *

# concepts:
# scaling, layering, color matching

# description:
# In the input, you will see a grid containing colored circles of different sizes, each surrounded by a gray border.
# To make the output, you need to scale the largest circle down to the size of the smallest circle and then place it on top of each circle in the same position.
# The output grid should show the original circles with the scaled version on top, while ensuring that the colors of the circles are preserved.

def transform(input_grid):
    # Plan:
    # 1. Extract the circles and determine their sizes and positions.
    # 2. Identify the largest and smallest circles.
    # 3. Scale the largest circle down to the size of the smallest circle.
    # 4. Overlay the scaled circle onto each original circle in the output grid.

    # 1. Extract the circles
    objects = find_connected_components(input_grid, background=Color.BLACK, connectivity=8, monochromatic=False)

    # Determine the sizes of the circles
    sizes = [np.sum(obj != Color.BLACK) for obj in objects]
    positions = [object_position(obj, background=Color.BLACK, anchor='center') for obj in objects]

    # Identify the smallest and largest circles
    smallest_circle = objects[np.argmin(sizes)]
    largest_circle = objects[np.argmax(sizes)]

    # Get the size of the smallest circle
    smallest_circle_size = np.sum(smallest_circle != Color.BLACK)

    # 2. Scale the largest circle down to the size of the smallest circle
    largest_circle_sprite = crop(largest_circle, background=Color.BLACK)
    scaled_largest_circle = scale_sprite(largest_circle_sprite, smallest_circle_size // np.sum(largest_circle_sprite != Color.BLACK))

    # 3. Create the output grid
    output_grid = np.full(input_grid.shape, Color.BLACK)

    # 4. Overlay the scaled circle onto each original circle
    for obj in objects:
        circle_position = object_position(obj, background=Color.BLACK, anchor='center')
        blit_sprite(output_grid, obj, x=circle_position[0] - obj.shape[0] // 2, y=circle_position[1] - obj.shape[1] // 2)
        # Overlay the scaled largest circle
        blit_sprite(output_grid, scaled_largest_circle, x=circle_position[0] - scaled_largest_circle.shape[0] // 2, y=circle_position[1] - scaled_largest_circle.shape[1] // 2)

    return crop(output_grid, background=Color.BLACK)
```

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions
from arc25.metrics import aggregate_metrics, error_analysis

predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/8preds_2025_08_28_22_05_23_predictions.json')
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_*.json')
tasks, task_ids, text_predictions, data_augmentation_params = [], [], [], []
for task_id, task_preds in predictions.items():
    tasks.extend([get_task(task_id)] * len(task_preds['text_predictions']))
    task_ids.extend([task_id] * len(task_preds['text_predictions']))
    text_predictions.extend(task_preds['text_predictions'])
    data_augmentation_params.extend(task_preds['data_augmentation_params'])

results = run_code_from_predictions(tasks, task_ids, text_predictions, data_augmentation_params,
                                    n_jobs=-1, timeout_duration=1)

df = aggregate_metrics(results)

error_analysis(results);
df.iloc[-1:]

#### Timeout effect

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions
from arc25.metrics import aggregate_metrics, error_analysis

# Load a single predictions file; n_preds is read from the first task only.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_8preds_2025_08_23_11_57_11_predictions.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Reference run with timeout_duration=1; the next cell repeats it with a longer timeout.
results = run_code_from_predictions(
    tasks, task_ids, text_predictions, data_augmentation_params,
    n_jobs=-1, timeout_duration=1,
)
df = aggregate_metrics(results)

error_analysis(results);
# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
# Execute the predictions already loaded in the notebook again, now with
# timeout_duration=5, to compare the metrics against the timeout_duration=1 run.
results = run_code_from_predictions(
    tasks, task_ids, text_predictions, data_augmentation_params,
    n_jobs=-1, timeout_duration=5,
)
df = aggregate_metrics(results)
error_analysis(results);
# Display only the last row of the metrics table.
df.iloc[-1:]

### Save training evaluation

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions
from arc25.metrics import aggregate_metrics, error_analysis

# Load every training predictions file matched by the glob pattern;
# n_preds is read from the first task only.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs and aggregate the metrics.
results = run_code_from_predictions(
    tasks, task_ids, text_predictions, data_augmentation_params,
    n_jobs=-1, timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)

error_analysis(results);

# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
# Save the metrics table of the training evaluation; the index column is labelled 'task_id'.
df.to_csv(
    '/mnt/hdd0/Kaggle/arc25/code_execution/training_240.csv',
    index_label='task_id',
)

### Try different parallel implementations

In [None]:
from arc25.parallel_code_execution import run_code_from_predictions
from arc25.metrics import aggregate_metrics, error_analysis

# A single predictions file is enough to compare parallel implementations;
# n_preds is read from the first task only.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_8preds_2025_08_23_11_57_11_predictions.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs and aggregate the metrics.
results = run_code_from_predictions(
    tasks, task_ids, text_predictions, data_augmentation_params,
    n_jobs=-1, timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)

error_analysis(results);

# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
# Inspect the raw execution results stored for a single task id.
results['007bbfb7']

```python
parallel_kwargs = dict(n_jobs=n_jobs, backend="loky", prefer="processes", batch_size='auto')
parallel_kwargs = dict(n_jobs=n_jobs, backend="threading", batch_size='auto')
```

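Error observed when trying these alternatives (presumably with the `threading` backend, since Python only allows `signal` handlers to be set from the main thread):

`signal only works in main thread of the main interpreter`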

### Try the new code runner

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics, error_analysis

# Load every evaluation predictions file of this experiment;
# n_preds is read from the first task only.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-28-base-model/evaluation/*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs with the CodeRunner and aggregate the metrics.
code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, text_predictions, data_augmentation_params,
    timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)

error_analysis(results);
# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics, error_analysis

# Load the evaluation predictions (swap the commented line in to use the training files);
# n_preds is read from the first task only.
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_*.json')
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/evaluation_*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs with the CodeRunner and aggregate the metrics.
code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, text_predictions, data_augmentation_params,
    timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)

error_analysis(results);
# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics, error_analysis

# Load the training predictions of the GRPO debug experiment;
# n_preds is read from the first task only.
# NOTE(review): unlike the other cells, this path has no '/' between 'predictions'
# and the dated experiment folder; confirm that the directory is really named this way.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions2025-09-15-debug-grpo/lr1e-5_small-dataset_80epochs_16gens_continue/training/*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs with the CodeRunner and aggregate the metrics.
code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, text_predictions, data_augmentation_params,
    timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)
# Display only the last row of the metrics table.
df.iloc[-1:]

Note that the model was trained on just the 67 shortest tasks.

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>n_preds</th>
      <th>valid code</th>
      <th>valid outputs</th>
      <th>unique outputs</th>
      <th>train_pixel_score</th>
      <th>train_correct_grids</th>
      <th>train_pass_rate</th>
      <th>train_is_correct</th>
      <th>test_pixel_score</th>
      <th>test_correct_grids</th>
      <th>test_pass_rate</th>
      <th>test_is_correct</th>
      <th>is_correct</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>MEAN</th>
      <td>240.0</td>
      <td>1.0</td>
      <td>0.881615</td>
      <td>0.291</td>
      <td>0.632037</td>
      <td>0.239512</td>
      <td>0.208427</td>
      <td>0.58</td>
      <td>0.621141</td>
      <td>0.23495</td>
      <td>0.232323</td>
      <td>0.6475</td>
      <td>0.575</td>
    </tr>
  </tbody>
</table>
</div>

Baseline:

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>n_preds</th>
      <th>valid code</th>
      <th>valid outputs</th>
      <th>unique outputs</th>
      <th>train_pixel_score</th>
      <th>train_correct_grids</th>
      <th>train_pass_rate</th>
      <th>train_is_correct</th>
      <th>test_pixel_score</th>
      <th>test_correct_grids</th>
      <th>test_pass_rate</th>
      <th>test_is_correct</th>
      <th>is_correct</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>MEAN</th>
      <td>240.0</td>
      <td>1.0</td>
      <td>0.763875</td>
      <td>0.410521</td>
      <td>0.481401</td>
      <td>0.117876</td>
      <td>0.100604</td>
      <td>0.6175</td>
      <td>0.471229</td>
      <td>0.111345</td>
      <td>0.109667</td>
      <td>0.6675</td>
      <td>0.615</td>
    </tr>
  </tbody>
</table>
</div>


In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics, error_analysis

# Load the evaluation predictions of the GRPO debug experiment;
# n_preds is read from the first task only.
# NOTE(review): unlike the other cells, this path has no '/' between 'predictions'
# and the dated experiment folder; confirm that the directory is really named this way.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions2025-09-15-debug-grpo/lr1e-5_small-dataset_80epochs_16gens_continue/evaluation/*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs with the CodeRunner and aggregate the metrics.
code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, text_predictions, data_augmentation_params,
    timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)
# Display only the last row of the metrics table.
df.iloc[-1:]

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>n_preds</th>
      <th>valid code</th>
      <th>valid outputs</th>
      <th>unique outputs</th>
      <th>train_pixel_score</th>
      <th>train_correct_grids</th>
      <th>train_pass_rate</th>
      <th>train_is_correct</th>
      <th>test_pixel_score</th>
      <th>test_correct_grids</th>
      <th>test_pass_rate</th>
      <th>test_is_correct</th>
      <th>is_correct</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>MEAN</th>
      <td>256.0</td>
      <td>1.0</td>
      <td>0.830996</td>
      <td>0.385771</td>
      <td>0.522011</td>
      <td>0.051177</td>
      <td>0.036602</td>
      <td>0.1775</td>
      <td>0.507435</td>
      <td>0.04501</td>
      <td>0.043789</td>
      <td>0.245</td>
      <td>0.175</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics, error_analysis

# Load the training predictions of the RL first-steps experiment;
# n_preds is read from the first task only.
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-09-19-rl-first-steps/lr1e-6_epochs100_16gen_1prompts-per-step_32lora/training/*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs with the CodeRunner and aggregate the metrics.
code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, text_predictions, data_augmentation_params,
    timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)
# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
# Save the metrics table of this run. The index column is labelled 'task_id' so the
# CSV header matches the one written for training_240.csv instead of being left empty.
df.to_csv(
    '/mnt/hdd0/Kaggle/arc25/code_execution/training_2025-09-19-rl-first-steps_lr1e-6_epochs100_16gen_1prompts-per-step_32lora_checkpoint8400.csv',
    index_label='task_id',
)

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>n_preds</th>
      <th>valid code</th>
      <th>valid outputs</th>
      <th>unique outputs</th>
      <th>train_pixel_score</th>
      <th>train_correct_grids</th>
      <th>train_pass_rate</th>
      <th>train_is_correct</th>
      <th>test_pixel_score</th>
      <th>test_correct_grids</th>
      <th>test_pass_rate</th>
      <th>test_is_correct</th>
      <th>is_correct</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>MEAN</th>
      <td>48.0</td>
      <td>0.963958</td>
      <td>0.822083</td>
      <td>0.492188</td>
      <td>0.565531</td>
      <td>0.071803</td>
      <td>0.048854</td>
      <td>0.1875</td>
      <td>0.553833</td>
      <td>0.06224</td>
      <td>0.061458</td>
      <td>0.24</td>
      <td>0.18</td>
    </tr>
  </tbody>
</table>
</div>

## Test memory limit

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics, error_analysis

# Load the evaluation predictions (swap the commented line in to use the training files);
# n_preds is read from the first task only.
# predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/training_*.json')
predictions = load_all_predictions('/mnt/hdd0/Kaggle/arc25/predictions/2025-08-22_add-common-prefix/evaluation_*.json')
first_task_preds = list(predictions.values())[0]
n_preds = len(first_task_preds['text_predictions'])
print(f"Loaded {len(predictions)} tasks with {n_preds} predictions each.")

# Flatten the per-task predictions into four parallel lists, one entry per prediction.
tasks = []
task_ids = []
text_predictions = []
data_augmentation_params = []
for task_id, task_preds in predictions.items():
    n_task_preds = len(task_preds['text_predictions'])
    tasks += [get_task(task_id)] * n_task_preds
    task_ids += [task_id] * n_task_preds
    text_predictions += task_preds['text_predictions']
    data_augmentation_params += task_preds['data_augmentation_params']

# Execute all the predicted programs with the CodeRunner and aggregate the metrics.
code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, text_predictions, data_augmentation_params,
    timeout_duration=1, batch_size=5000,
)
df = aggregate_metrics(results)

error_analysis(results);
# Display only the last row of the metrics table.
df.iloc[-1:]

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import error_analysis

# Probe the memory limit of the code runner with synthetic programs that allocate
# an array of roughly N_MB megabytes (assuming 8-byte elements) before returning.
task_id = '00576224'

code = """```python
import numpy as np

def transform(input_grid):
    a = np.arange(N_MB * 1024 * 1024 // 8)
    return input_grid.copy()
```
"""

# 20 evenly spaced allocation sizes between 10 and 1000; one program per size.
mb_range = np.linspace(10, 1000, 20, dtype=int).tolist()
codes = []
for mb in mb_range:
    codes.append(code.replace('N_MB', str(mb)))
n_codes = len(codes)
task_ids = [task_id] * n_codes
tasks = [get_task(task_id)] * n_codes
data_augmentation_params = [None] * n_codes

code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, codes, data_augmentation_params,
    timeout_duration=10, batch_size=5000,
)

# A result that carries an 'error_message' key is counted as a failed run.
# NOTE(review): assumes results[task_id] keeps the order of the submitted codes; confirm.
successful_runs = []
unsuccessful_runs = []
for result, mb in zip(results[task_id], mb_range):
    if 'error_message' in result:
        unsuccessful_runs.append(mb)
    else:
        successful_runs.append(mb)
print("Successful runs for MB values:", successful_runs)
print("Unsuccessful runs for MB values:", unsuccessful_runs)
if unsuccessful_runs:
    error_analysis(results)

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import error_analysis

# Probe the memory limit of the code runner with synthetic programs that allocate
# an array of roughly N_MB megabytes (assuming 8-byte elements) before returning.
task_id = '00576224'

code = """```python
import numpy as np

def transform(input_grid):
    a = np.arange(N_MB * 1024 * 1024 // 8)
    return input_grid.copy()
```
"""

# 20 evenly spaced allocation sizes between 10 and 4000; one program per size.
mb_range = np.linspace(10, 4000, 20, dtype=int).tolist()
codes = []
for mb in mb_range:
    codes.append(code.replace('N_MB', str(mb)))
n_codes = len(codes)
task_ids = [task_id] * n_codes
tasks = [get_task(task_id)] * n_codes
data_augmentation_params = [None] * n_codes

code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, codes, data_augmentation_params,
    timeout_duration=10, batch_size=5000,
)

# A result that carries an 'error_message' key is counted as a failed run.
# NOTE(review): assumes results[task_id] keeps the order of the submitted codes; confirm.
successful_runs = []
unsuccessful_runs = []
for result, mb in zip(results[task_id], mb_range):
    if 'error_message' in result:
        unsuccessful_runs.append(mb)
    else:
        successful_runs.append(mb)
print("Successful runs for MB values:", successful_runs)
print("Unsuccessful runs for MB values:", unsuccessful_runs)
if unsuccessful_runs:
    error_analysis(results)

In [None]:
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import error_analysis

# Probe the memory limit of the code runner with synthetic programs that allocate
# an array of roughly N_MB megabytes (assuming 8-byte elements) before returning.
task_id = '00576224'

code = """```python
import numpy as np

def transform(input_grid):
    a = np.arange(N_MB * 1024 * 1024 // 8)
    return input_grid.copy()
```
"""

# 20 evenly spaced allocation sizes between 10 and 4000, each submitted twice
# (the whole range is repeated back to back).
mb_range = np.linspace(10, 4000, 20, dtype=int).tolist()
mb_range = mb_range * 2
codes = []
for mb in mb_range:
    codes.append(code.replace('N_MB', str(mb)))
n_codes = len(codes)
task_ids = [task_id] * n_codes
tasks = [get_task(task_id)] * n_codes
data_augmentation_params = [None] * n_codes

code_runner = CodeRunner(n_jobs=-1)
results = code_runner.run(
    tasks, task_ids, codes, data_augmentation_params,
    timeout_duration=10, batch_size=5000,
)

# A result that carries an 'error_message' key is counted as a failed run.
# NOTE(review): assumes results[task_id] keeps the order of the submitted codes; confirm.
successful_runs = []
unsuccessful_runs = []
for result, mb in zip(results[task_id], mb_range):
    if 'error_message' in result:
        unsuccessful_runs.append(mb)
    else:
        successful_runs.append(mb)
print("Successful runs for MB values:", successful_runs)
print("Unsuccessful runs for MB values:", unsuccessful_runs)
if unsuccessful_runs:
    error_analysis(results)

## TODO:

- [x] Load all ARC data
- [x] Modify prompt generation to use all the training data, and a random sample from the test samples.
- [x] Update the data augmentation pipeline to use dicts instead of tasks
- [x] Modify the code execution to use all the data
- [x] Add a new metric to check if the code is correct for the train samples but incorrect for the test samples
- [x] Refactor
- [x] Convert to python script so I can make predictions remotely
- [x] Evaluate the datasets
- [x] Check if the answers always start with the same prefix; if that is the case, I could speed up inference.

- [x] Implement new grid encoder
- [x] Use the correct prompt
- [x] Save predictions to file so I can later reprocess them
- [x] Update code execution to match the code generated by BARC model
- [x] Check code execution to verify that exceptions are legit and not easily solvable
  - [x] Add missing colors to Color object
  - [x] Code execution fails when there are auxiliary functions. `Error executing code for task 025d127b, response 5: <class 'NameError'> name 'blend_colors' is not defined`
  - [x] Arrays as inputs
- [x] Remove dsl usage metric
- [x] Add correct task metric
- [x] Parallelize code execution
- [x] Refactor code
- [x] Plots showing the effect of increasing the number of predictions
- [x] Validate that I get the same scores of the paper
- [x] Evaluate on different datasets
- [x] Improve metrics
- [x] Data augmentation