# Hindsight Experience Replay with real ARC tasks

## Goal

Check if we can solve real ARC tasks using HER

## Imports

In [None]:
import os
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

# Apply the project's logging configuration before anything else logs.
configure_logging()
# Restrict this process to the single GPU reported by get_least_used_gpu_index().
# NOTE(review): presumably this has to run before torch/vllm are imported in the
# next cell for the restriction to take effect — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = str(get_least_used_gpu_index())

In [None]:
import json
from tqdm.auto import tqdm
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
from transformers import AutoTokenizer, AutoConfig, BitsAndBytesConfig
import matplotlib.pyplot as plt
import matplotlib as mpl
from datasets import Dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import logging
from IPython.display import Markdown, display
import torch
import random
from typing import List
from dataclasses import field
import wandb

from arc25.training_tasks import *
from arc25.encoders import create_grid_encoder
from arc25.prompting import create_prompt_from_task, pretty_print_prompt
from arc25.plot import plot_task, plot_grids_with_shape, plot_grid
from arc25.code_execution import safe_code_execution, validate_code
from arc25.utils import set_random_seed, get_timestamp
from arc25.code_analysis import analyze_dsl_usage

import sys
sys.path.append(os.path.realpath("../scripts"))
from finetuning import get_data_collator


# Open and immediately discard an empty figure, then set the notebook-wide
# plotting defaults in one go.
plt.plot()
plt.close('all')
mpl.rcParams.update({
    'figure.figsize': (20, 5),
    'lines.linewidth': 3,
    'font.size': 12,
})

## Code

In [None]:
# ARC 2024 training challenges, keyed by task name.
with open('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json') as challenges_file:
    training_challenges = json.load(challenges_file)

In [None]:
def get_task(task_name):
    """
    Return the task called `task_name`.

    Names found in `training_challenges` are turned into a Task built from the
    'train' samples of that challenge. Any other name is looked up among a
    handful of hand-made drawing tasks that are created on the fly.

    Raises:
        ValueError: if no real or hand-made task has that name.
    """
    if task_name in training_challenges:
        samples = training_challenges[task_name]['train']
        return Task(
            inputs=[Img(sample['input']) for sample in samples],
            outputs=[Img(sample['output']) for sample in samples],
            code='', name=task_name)

    def squares_task(shape, name, n_colors=10):
        # Fill the canvas with 2x2 squares, column by column, cycling through
        # colors 1..n_colors-1 (color 0 is skipped because it is the background).
        source = create_img(shape, color=0)
        target = source.copy()
        color = 0
        for x in range(0, source.shape[1], 2):
            for y in range(0, source.shape[0], 2):
                color = (color + 1) % n_colors
                if color == 0:
                    color = 1
                draw_rectangle(target, (y, x), (y+1, x+1), color=color)
        return Task(inputs=[source], outputs=[target], code='', name=name)

    tasks = []

    # One vertical line per column, each with its own color.
    source = create_img((9, 9), color=0)
    target = source.copy()
    for x in range(source.shape[1]):
        draw_vertical_line(target, x, color=x+1)
    tasks.append(Task(inputs=[source], outputs=[target], code='', name='9-vertical-lines'))

    tasks.append(squares_task((10, 8), '20-squares'))
    tasks.append(squares_task((6, 8), '12-squares'))
    for n in range(4, 11):
        tasks.append(squares_task(
            (2*n, 2*n), f'{n*n}-squares', n_colors=10 if n < 9 else 8))

    # Free-hand drawing that has to be reproduced from an empty canvas.
    source = create_img((10, 10), color=0)
    target = Img([
        [8, 8, 8, 8, 4, 4, 8, 8, 8, 8],
        [8, 8, 4, 4, 4, 4, 4, 4, 8, 8],
        [8, 4, 4, 0, 4, 4, 0, 4, 4, 8],
        [8, 4, 2, 4, 4, 7, 4, 2, 4, 8],
        [8, 4, 4, 4, 7, 7, 4, 4, 4, 8],
        [8, 8, 4, 4, 4, 4, 4, 4, 8, 8],
        [4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
        [4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
        [8, 4, 4, 4, 4, 4, 4, 4, 4, 8],
        [8, 8, 4, 7, 4, 4, 7, 4, 8, 8],
    ])
    tasks.append(Task(inputs=[source], outputs=[target], code='', name='chick'))

    match = next((task for task in tasks if task.name == task_name), None)
    if match is None:
        raise ValueError(f"Task {task_name} not found. Available tasks: {[task.name for task in tasks]}")
    return match

In [None]:
# What one epoch produced: the best relabelled task, one accuracy per valid
# prediction and the tasks with unique outputs.
EpochResults = namedtuple(
    "EpochResults", "best_prediction pixel_accuracies new_tasks")
# Sampling settings for one generate() call.
InferenceParams = namedtuple(
    "InferenceParams", "num_return_sequences temperature top_p")

@log_execution_time
def hindsight_experience_replay(task, cfg, run_name_prefix=None):
    """
    Use hindsight experience replay to try to solve new tasks

    Every epoch samples programs for `task` with `inference`, which relabels
    the task with the outputs those programs really produce, and then
    fine-tunes the model on those relabelled tasks. The loop ends as soon as a
    prediction reaches a pixel accuracy of 1 or after `cfg.max_epochs` epochs
    (no fine-tuning is done after the last one). Progress is logged to wandb.

    Args:
        task: task to solve.
        cfg: run configuration; the attributes read here are wandb_project,
            base_model_path, lora_path, grid_encoder, prompt_version,
            inference_params, max_epochs, use_accuracy_for_sorting,
            only_train_on_novel_tasks, learning_rate and lr_scheduler_type.
        run_name_prefix: optional text prepended to the wandb run name.

    Returns:
        (metrics, unique_generated_tasks): one EpochResults per epoch, and the
        dict filled by `filter_new_tasks` (it stays empty unless
        cfg.only_train_on_novel_tasks is set).
    """
    generations = '_'.join([f'{params.num_return_sequences}gen_t{params.temperature}' for params in cfg.inference_params])
    run_name = f'{task.name}_{cfg.learning_rate:.0e}lr_' + generations + '_' + get_timestamp()
    if run_name_prefix is not None:
        run_name = f'{run_name_prefix}_{run_name}'
    logging.info(f'Run name: {run_name}')
    wandb.init(project=cfg.wandb_project, name=run_name, config=cfg, reinit=True, dir=f'/mnt/hdd0/Kaggle/arc25/trainings/20250510_HER_v2/{run_name}', save_code=True)
    #wandb.run.log_code(os.path.dirname(__file__))
    plot_task(task); plt.suptitle('Task to solve'); plt.tight_layout()
    wandb.log({"task": wandb.Image(plt.gcf())}); plt.show()
    model, tokenizer = load_model(cfg.base_model_path, cfg.lora_path)
    metrics, unique_generated_tasks = [], dict()
    for epoch in range(1, cfg.max_epochs + 1):
        logging.info(f'Starting epoch {epoch}...')
        new_tasks, pixel_accuracies = inference(
            task, model, tokenizer, cfg.grid_encoder, cfg.prompt_version,
            inference_params=cfg.inference_params,
            previously_generated_tasks=unique_generated_tasks)
        # inference returns the tasks sorted by ascending accuracy: the last one is the best
        metrics.append(EpochResults(best_prediction=new_tasks[-1], pixel_accuracies=pixel_accuracies,
                                    new_tasks=new_tasks))
        plot_metrics_evolution(metrics, task)
        log_progress_to_wandb(metrics, epoch)
        display(Markdown(f'# Best prediction code\n\n```python\n{metrics[-1].best_prediction.code}\n```'))
        if np.max(pixel_accuracies) == 1:
            # use the logging module like the rest of the file, no `logger` object is defined here
            logging.info(f'Found a perfect prediction at epoch {epoch}!')
            break
        if not cfg.use_accuracy_for_sorting:
            logging.info('Shuffling the tasks, no information about the accuracy is used')
            random.shuffle(new_tasks)
        if cfg.only_train_on_novel_tasks:
            new_tasks, unique_generated_tasks = filter_new_tasks(new_tasks, unique_generated_tasks)
        if epoch == cfg.max_epochs: break # it makes no sense to fine-tune if we reached the max epochs
        finetuning(new_tasks, model, tokenizer, cfg.grid_encoder, cfg.prompt_version,
                   learning_rate=cfg.learning_rate, lr_scheduler_type=cfg.lr_scheduler_type)
    display(Markdown(f'# Best prediction code\n\n```python\n{metrics[-1].best_prediction.code}\n```'))
    plot_metrics_evolution(metrics, task, log_to_wandb=True)
    wandb.log({
        'num_generations': epoch*sum([params.num_return_sequences for params in cfg.inference_params])})
    wandb.finish()
    return metrics, unique_generated_tasks


@log_execution_time
def load_model(base_model_path, lora_path):
    """
    Load the 4-bit quantized base model and attach the LoRA adapter as trainable.

    The tokenizer is read from `lora_path`, not from the base model folder.

    Returns:
        (model, tokenizer)
    """
    logging.info(f"Loading model from {base_model_path} and LoRA from {lora_path}")
    torch.cuda.empty_cache()

    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        llm_int8_enable_fp32_cpu_offload=True,
        llm_int8_skip_modules=['gate', 'lm_head'],
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype="auto",
        device_map="auto",
        quantization_config=quantization,
    )
    tokenizer = AutoTokenizer.from_pretrained(lora_path)
    peft_model = PeftModel.from_pretrained(base_model, lora_path, is_trainable=True)
    return peft_model, tokenizer


@log_execution_time
def inference(task, model, tokenizer, grid_encoder, prompt_version, inference_params, previously_generated_tasks):
    """
    Sample programs for `task`, run them and relabel the task with their outputs.

    For every entry of `inference_params` a batch of programs is sampled from
    the model. Each program is executed on the task inputs; programs that fail
    to run, validate or score are reported and skipped. Every remaining program
    becomes a new Task whose outputs are what the program really produced.

    Args:
        task: task to solve, its inputs are fed to the generated code.
        model, tokenizer: model used for generation and its tokenizer.
        grid_encoder, prompt_version: forwarded to `create_prompt_from_task`.
        inference_params: iterable of InferenceParams, one generate() call each.
        previously_generated_tasks: dict of tasks seen in earlier epochs; it is
            only used (through a copy) to log how many tasks are novel.

    Returns:
        (tasks, pixel_accuracies): `tasks` holds one task per distinct output
        (the one with the fewest lines of code), sorted by ascending pixel
        accuracy so the best one is last; `pixel_accuracies` holds one value
        per valid prediction, duplicates included.

    Raises:
        ValueError: if none of the generated programs could be executed.
    """
    prompt = create_prompt_from_task(
        task, prompt_version=prompt_version, grid_encoder=grid_encoder, tokenizer=tokenizer, is_train_prompt=False)
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    prompt_length = len(model_inputs.input_ids[0])

    predicted_codes = []
    for params in inference_params:
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=3072,
            do_sample=True,
            temperature=params.temperature,
            top_p=params.top_p,
            num_return_sequences=params.num_return_sequences,
        )
        # keep only the completion, generate() returns prompt + completion
        generated_ids = generated_ids[:, prompt_length:]
        predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        predicted_codes.extend([prediction.replace('\n```', '') for prediction in predictions])

    new_tasks = []
    pixel_accuracies = []
    for predicted_code in tqdm(predicted_codes):
        try:
            predicted_outputs = safe_code_execution(predicted_code, task.inputs)
            validated_code = validate_code(predicted_code, task.inputs)
            new_task = Task(inputs=task.inputs, outputs=predicted_outputs, code=validated_code, name=task.name)
            accuracy = compute_pixel_accuracy(task.outputs, predicted_outputs)
        except Exception as e:
            print(f'Error executing code: {predicted_code}')
            print(e)
            continue
        # append only once everything succeeded, so both lists stay aligned
        new_tasks.append(new_task)
        pixel_accuracies.append(accuracy)

    if not new_tasks:
        raise ValueError(
            f'None of the {len(predicted_codes)} generated programs could be executed on task {task.name}')

    # keep a single task per distinct output: the one with the shortest code
    # (measured by the number of lines), the first one wins on ties
    shortest_task_per_output = {}
    for new_task in new_tasks:
        key = str(new_task.outputs)
        current = shortest_task_per_output.get(key)
        if current is None or len(new_task.code.splitlines()) < len(current.code.splitlines()):
            shortest_task_per_output[key] = new_task

    logging.info(f'Max pixel accuracy: {max(pixel_accuracies)}')
    new_tasks_with_unique_outputs = sorted(
        shortest_task_per_output.values(),
        key=lambda x: compute_pixel_accuracy(task.outputs, x.outputs))

    novel_tasks, _ = filter_new_tasks(new_tasks, previously_generated_tasks.copy())
    sys.stdout.flush()
    logging.info(f'Number of predictions: {len(predicted_codes)}, Valid tasks: {len(new_tasks)}, Unique tasks: {len(new_tasks_with_unique_outputs)}, Novel tasks: {len(novel_tasks)}')

    return new_tasks_with_unique_outputs, pixel_accuracies


def compute_pixel_accuracy(outputs, predicted_outputs):
    """
    Score predicted grids against the expected ones.

    Returns the mean, over the (output, prediction) pairs, of the fraction of
    pixels that match. As soon as a pair with different shapes is found the
    function returns a negative penalty instead: minus the absolute mean
    difference between the shapes, divided by 30.
    """
    accuracy = []
    for output, predicted_output in zip(outputs, predicted_outputs):
        # shapes are usually plain tuples, which cannot be subtracted directly
        expected_shape = np.asarray(output.shape)
        predicted_shape = np.asarray(predicted_output.shape)
        if not np.array_equal(expected_shape, predicted_shape):
            # NOTE(review): the mean is taken before the absolute value, so
            # differences of opposite sign cancel (3x1 vs 1x3 scores -0.0).
            # Confirm whether the mean of absolute differences was intended.
            return -np.abs(np.mean(predicted_shape - expected_shape))/30
        accuracy.append(float(np.mean(output == predicted_output)))
    return np.mean(accuracy)


@log_execution_time
def finetuning(new_tasks, model, tokenizer, grid_encoder, prompt_version, learning_rate, lr_scheduler_type):
    """
    Fine-tune `model` for one epoch on the given tasks.

    Each task is rendered as a training prompt and the resulting texts are
    fed to an SFTTrainer with a batch size of 1. Checkpoint saving and
    reporting are disabled in the trainer configuration. When `new_tasks` is
    empty the function only logs a message. Returns None.
    """
    if not new_tasks:
        logging.info('No new tasks to train on')
        return

    train_dataset = Dataset.from_dict({
        'text': [
            create_prompt_from_task(
                new_task, prompt_version=prompt_version, grid_encoder=grid_encoder,
                tokenizer=tokenizer, is_train_prompt=True)
            for new_task in new_tasks
        ],
    })

    sft_config = SFTConfig(
        # optimization
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,  # constant_with_warmup, cosine, cosine_with_restarts
        warmup_ratio=0.1,
        optim="paged_adamw_8bit",
        max_grad_norm=1.0,
        gradient_checkpointing=False,
        use_liger_kernel=True,

        # data
        dataset_text_field="text",
        max_seq_length=4096,

        # checkpoints, evaluation and logging
        output_dir=None,
        save_strategy='no',
        do_eval=True,
        eval_strategy="no",  # TODO: previously it was steps
        logging_steps=10,
        log_level="info",
        report_to='none',

        # only used with accelerate, a warning said that True slows training down
        ddp_find_unused_parameters=False,
        # otherwise it takes too long to start training when resuming from checkpoint
        ignore_data_skip=True,
    )

    SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        data_collator=get_data_collator(tokenizer),
        args=sft_config,
    ).train()


def plot_best_prediction(task, best_prediction, accuracy):
    """
    Plot the expected outputs followed by the predicted ones, then show the code of the prediction.
    """
    grids = task.outputs + best_prediction.outputs
    title = f'Best prediction accuracy: {accuracy:.1%}'
    plot_grids_with_shape(grids, suptitle=title)
    code_block = f'```python\n{best_prediction.code}\n```'
    display(Markdown(code_block))


def plot_metrics_evolution(metrics, task, log_to_wandb=False):
    """
    Plot how the best prediction changes along the epochs.

    The accuracy histograms are plotted first. Then one figure is made per
    task output, with one panel per epoch showing the best prediction and its
    accuracy for that output, plus a final panel with the ground truth.
    Figures are also sent to wandb when `log_to_wandb` is True.
    """
    plot_score_histograms(metrics, log_to_wandb=log_to_wandb)
    n_outputs = len(metrics[0].best_prediction.outputs)
    n_plots = len(metrics) + 1  # one panel per epoch and one for the ground truth
    n_cols = min(9, n_plots)
    n_rows = -(-n_plots // n_cols)  # ceiling division
    for output_idx in range(n_outputs):
        expected = task.outputs[output_idx: output_idx + 1]
        plt.figure(figsize=(n_cols*2.25, n_rows * 2))
        for epoch, epoch_results in enumerate(metrics):
            predicted = epoch_results.best_prediction.outputs
            plt.subplot(n_rows, n_cols, epoch + 1)
            plot_grid(predicted[output_idx])
            accuracy = compute_pixel_accuracy(expected, predicted[output_idx: output_idx + 1])
            plt.title(f'Epoch {epoch} acc: {accuracy:.1%}')
        plt.subplot(n_rows, n_cols, n_plots)
        plot_grid(task.outputs[output_idx])
        plt.title('Ground truth')

        plt.suptitle(f'Evolution of best predictions for output {output_idx + 1}/{n_outputs}')
        plt.tight_layout()
        if log_to_wandb:
            wandb.log({f"best_predictions_evolution_{output_idx}": wandb.Image(plt.gcf())})
        plt.show()


def plot_score_histograms(metrics, offset_scale=0.9, log_to_wandb=False):
    """
    Plot one pixel accuracy histogram per epoch, stacked along the y axis.

    Counts are shown as log(1 + count) and every histogram starts where the
    previous one peaked, scaled by `offset_scale`. The figure is also sent to
    wandb when `log_to_wandb` is True.
    """
    colormap = mpl.colormaps['viridis']
    normalize = plt.Normalize(0, len(metrics) - 1)
    bin_edges = np.linspace(-1, 1, 200)
    centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    plt.figure(figsize=(10, 6))
    baseline = 0
    for epoch_idx, epoch_results in enumerate(metrics):
        log_counts = np.log1p(
            np.histogram(epoch_results.pixel_accuracies, bins=bin_edges)[0])
        plt.fill_between(
            centers, baseline, log_counts + baseline,
            color=colormap(normalize(epoch_idx)), label=f'Epoch {epoch_idx}', alpha=0.5)
        # the next histogram is drawn on top of this one
        baseline += np.max(log_counts) * offset_scale

    plt.xlabel("Pixel accuracy")
    plt.ylabel("Epoch ->")
    plt.title("Evolution of pixel accuracy")
    plt.yticks([])  # the y axis does not show absolute values
    plt.grid(axis='x')
    plt.tight_layout()
    if log_to_wandb:
        wandb.log({"pixel_accuracy_evolution": wandb.Image(plt.gcf())})
    plt.show()


def log_progress_to_wandb(metrics, epoch):
    """
    Log the results of the last epoch in `metrics` to wandb, using `epoch` as step.

    A figure with the outputs of the best prediction is logged together with
    accuracy statistics, the best code and counts of unique and novel tasks.
    """
    latest = metrics[-1]
    accuracies = latest.pixel_accuracies
    best = latest.best_prediction
    unique_tasks = latest.new_tasks

    n_grids = len(best.outputs)
    fig = plt.figure(figsize=(6*n_grids, 6))
    for plot_idx, img in enumerate(best.outputs, 1):
        plt.subplot(1, n_grids, plot_idx)
        plot_grid(img)
    plt.title(f'Epoch {epoch} max accuracy: {max(accuracies):.1%}')

    novel_ratio = compute_really_new_tasks_ratio(metrics)
    summary = {
        "epoch": epoch,
        "max_pixel_accuracy": max(accuracies),
        "mean_pixel_accuracy": np.mean(accuracies),
        "min_pixel_accuracy": min(accuracies),
        "best_prediction": wandb.Image(fig),
        'pixel_accuracy': wandb.Histogram(accuracies),
        'best_code': wandb.Html(f'<pre>{best.code}</pre>'),
        'unique_new_tasks': len(unique_tasks),
        'unique_new_tasks_ratio': len(unique_tasks)/len(accuracies),
        'best_code_lines': len(best.code.splitlines()),
        'function_lines': wandb.Histogram([len(task.code.splitlines()) for task in unique_tasks]),
        'novel_new_tasks_ratio': novel_ratio,
        'novel_new_tasks': novel_ratio * len(unique_tasks),
    }
    wandb.log(summary, step=epoch, commit=True)
    plt.show()


def compute_really_new_tasks_ratio(metrics):
    """
    Return the fraction of tasks from the last epoch whose outputs were not seen in any previous epoch.

    Tasks are compared through the string representation of their outputs.
    With fewer than two epochs every task counts as new and 1.0 is returned.
    If the last epoch has no tasks the ratio is 0.0.
    """
    if len(metrics) < 2:
        return 1.0
    new_tasks = metrics[-1].new_tasks
    if not new_tasks:
        # nothing was generated, so nothing can be new (also avoids dividing by zero)
        return 0.0
    # stringify the outputs of previous epochs just once
    seen_outputs = {
        str(previous_task.outputs)
        for epoch_results in metrics[:-1]
        for previous_task in epoch_results.new_tasks
    }
    really_new_tasks = sum(str(task.outputs) not in seen_outputs for task in new_tasks)
    return really_new_tasks / len(new_tasks)


def filter_new_tasks(new_tasks, unique_tasks):
    """
    Keep only the tasks whose outputs are not yet registered in `unique_tasks`.

    `unique_tasks` maps the string representation of the outputs to a task. It
    is updated in place with every task that is kept, and it is also returned.

    Returns:
        (filtered_new_tasks, unique_tasks)
    """
    logging.info(f'Filtering new tasks, {len(new_tasks)} tasks to filter')
    kept = []
    for candidate in new_tasks:
        outputs_key = str(candidate.outputs)
        if outputs_key in unique_tasks:
            continue
        unique_tasks[outputs_key] = candidate
        kept.append(candidate)
    logging.info(f'Found {len(kept)} unique tasks')
    return kept, unique_tasks

## First experiments

### Default config

In [None]:
@dataclass
class Config:
    """Settings of a hindsight experience replay run, passed as `cfg` to `hindsight_experience_replay`."""
    # wandb project where the run is logged
    wandb_project: str = 'HER_with_real_tasks'
    # folder with the base model weights
    base_model_path: str = '/home/gbarbadillo/models/Qwen2.5-Coder-0.5B-Instruct'
    #lora_path: str = '/mnt/hdd0/MEGA/TEMP/2025-06-13-first-real-trainings/2xA6000-Qwen2.5-Coder-0.5B-32000steps-1e-4lr/checkpoint-32000'
    # checkpoint with the LoRA adapter, load_model also reads the tokenizer from here
    lora_path: str = '/mnt/hdd0/MEGA/TEMP/2025-06-18-more-training-tasks/2xA6000-Qwen2.5-Coder-0.5B-4000steps-1e-4lr/checkpoint-4000'
    # forwarded to create_prompt_from_task
    prompt_version: str = 'code-from-examples-v3'
    # It has no type annotation, so it is a plain class attribute shared by all
    # the instances and not a dataclass field: it cannot be given to the constructor
    grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')
    # maximum number of inference epochs, the loop stops earlier if the task is solved
    max_epochs: int = 2
    # if False the tasks are shuffled before fine-tuning instead of keeping the order by accuracy
    use_accuracy_for_sorting: bool = True
    # if True fine-tuning only uses tasks whose outputs were not seen in previous epochs
    only_train_on_novel_tasks: bool = True
    # one generate() call is made per entry on every epoch
    inference_params: List[InferenceParams] = field(default_factory=lambda: [
        InferenceParams(num_return_sequences=8, temperature=0.1, top_p=0.95),
        InferenceParams(num_return_sequences=128, temperature=0.9, top_p=0.95),
    ])
    # learning rate and scheduler used on each fine-tuning round
    learning_rate: float = 1e-5
    lr_scheduler_type: str = 'constant_with_warmup' #constant_with_warmup, cosine, cosine_with_restarts

In [None]:
# These three values select the checkpoint: they are interpolated in the model and LoRA paths below
gpus = 2
steps = 16000
parameters = '0.5B'
# The task names in the trailing comment are the other ARC tasks used in the experiments
for task_name in ['1bfc4729']: #08ed6ac7, 0b148d64, 0ca9ddb6, 0d3d703e, 178fcbfb, 1bfc4729, 1c786137
    task = get_task(task_name)
    cfg = Config(
        base_model_path = f'/home/gbarbadillo/models/Qwen2.5-Coder-{parameters}-Instruct',
        lora_path = f'/mnt/hdd0/MEGA/TEMP/2025-06-18-more-training-tasks/{gpus}xA6000-Qwen2.5-Coder-{parameters}-{steps}steps-1e-4lr/checkpoint-{steps}',
        max_epochs = 5,
        inference_params = [
            # InferenceParams(num_return_sequences=8, temperature=0.1),
            InferenceParams(num_return_sequences=256, temperature=0.9, top_p=0.9),
        ],
    )
    # metrics and unique_generated_tasks are kept as globals, the cells below analyze the last run
    metrics, unique_generated_tasks = hindsight_experience_replay(task, cfg, run_name_prefix='16k-steps')

In [None]:
# Collect the code of every generated task without repetitions, keeping the
# order of first appearance. dict.fromkeys also removes repetitions inside a
# single epoch and avoids searching the whole list for every new entry.
code = list(dict.fromkeys(
    task.code for epoch_results in metrics for task in epoch_results.new_tasks))
analyze_dsl_usage(code)

In [None]:
# Print and plot every generated task whose code mentions draw_vertical_line
for task in unique_generated_tasks.values():
    if 'draw_vertical_line' not in task.code:
        continue
    print(task.code)
    plot_task(task)
    plt.show()

### Learnings

2025-06-18-more-training-tasks

- `0d3d703e` The 0.5B model does not understand that the task is about applying a colormap, this is worrying. However the 1.5B model understood the problem from the beginning and it just took 6 epochs to solve the task completely. This is the first ARC task solved using HER, 20-06-2025.
- `0ca9ddb6` The 0.5B model does not understand that the color of the object matters, understands that it needs to draw something. The 1.5B model understands that the color needs to change, but tries to do it using the area. Furthermore it is only able to draw two pixels, it is unable to draw more. Another generalization worrying sign.
- `178fcbfb` The 1.5B model makes just 2 unique predictions and never tries to combine horizontal and vertical lines; it only draws vertical lines. It is wrong, but so consistently wrong that it cannot improve. More exploration is needed. If I switch to the 8k steps model, it makes more diverse predictions and understands that it needs to draw vertical and horizontal lines, but it does not understand that there is a condition.
- The model trained for 8k steps collapsed to make a single prediction on task `1bfc4729`, increasing the temperature to 2 does not yield more diversity.

| task \ model | 0.5B@1k steps                                                                            | 1.5B@1k steps                                                                                                    | 1.5B@16k steps                                                                             | 3B@1k steps                                                                                          | 7B@1k steps                                                                             |
|--------------|------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|
| 08ed6ac7     |                                                                                          |                                                                                                                  |                                                                                            | does not understand that the task is about changing colors, sorting the objects by area              | does not understand that the task is about changing colors, sorting the objects by area |
| 0b148d64     |                                                                                          |                                                                                                                  |                                                                                            | the most successful approach is downscaling instead of selecting and cropping                        | OOM                                                                                     |
| 0ca9ddb6     | draws 3 points, tries to use area for color. Tried an attempt to use the color as input  | draws 2 points, tries to use area for color                                                                      | draws 2 points, tries to use area for color                                                | draws 4 points, but doesn't understand that color depends on the object color, tries to use the area | draws 3 points, then starts to draw lines                                               |
| 0d3d703e     | does not understand that it is about colormaps                                           | does not understand that it is about colormaps                                                                   | Solved at epoch 6                                                                          | Solved at epoch 2                                                                                    | Solved at epoch 3                                                                       |
| 178fcbfb     | draws vertical or horizontal lines, but not both                                         | draws vertical and horizontal lines, but does not understand there is a condition                                | only vertical lines, very low diversity                                                    | draws vertical and horizontal lines, but does not understand there is a condition                    | draws vertical and horizontal lines, but does not understand there is a condition       |
| 1bfc4729     | only horizontal lines                                                                    | only horizontal lines                                                                                            | does not understand the task, draws horizontal lines on the points and the rest is garbage | low diversity in predictions, does not improve over horizontal lines                                 | many different predictions, but not in the correct direction                            |
| 1c786137     |                                                                                          | chooses the object using height instead of area, maybe another property is needed. Probably color should be used |                                                                                            | does not understand the task                                                                         | OOM                                                                                     |

**2025-06-13-first-real-trainings**

- `08ed6ac7`, does not understand that the task is about sorting and changing colors
- `0b148d64`, understands that the task is about detecting objects and cropping, but does not know how to use the color
- `0ca9ddb6`, seems to understand that the task is about drawing pixels, but it does not use the center as a reference. Nor does it use the color to select certain objects
- `0d3d703e`, the model does not recognize that the task is about apply_colormap. Create more tasks showing how to change colors, not changing all the colors always.
- `178fcbfb` does understand that the task is about drawing horizontal and vertical lines, but does not know to use the center as a reference
- `1bfc4729`, understands that it needs to draw some pattern, but does not have a way to make a different drawing for each image
- `1c786137`, does not understand that the task is about selecting the object

It seems that the main problem is that the model does not have a good intuition of how to solve the tasks. I might introduce diversity in the generations by suggesting the use of some DSL primitive functions.

## Learnings

- The model can choose to solve just the first case and ignore the second one.

## TODO

- [x] Allow to use real ARC tasks
- [x] Visualize predictions on multiple samples
- [x] Plot evolution for each output
- [x] Allow to work with different images sizes
- [x] Show accuracy for each sample, not global accuracy in the evolution plot
- [ ] Need to analyze all the code, not just the best one