# Refine solutions

## Goal

Can we use the BARC induction model to refine its incorrect solutions?

## Imports

In [None]:
import os
import gc
import random
import glob
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

from IPython.display import Markdown, display

def display_python_code(code):
    """Show ``code`` in the notebook output as a fenced Python Markdown block."""
    fenced_block = f"```python\n{code}\n```"
    rendered = Markdown(fenced_block)
    display(rendered)

from arc25.plot import plot_task, plot_grid
from arc25.utils import get_timestamp, load_json, load_arc_dataset_with_solutions, write_json
from arc25.data_augmentation import apply_data_augmentation, get_random_data_augmentation_params
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics

## Code

In [None]:
def load_and_curate_predictions(folder):
    """Load ``results.json.gz`` from ``folder`` and normalise its predictions.

    The loaded object is treated as a mapping ``task_id -> list of prediction
    dicts``. Every prediction is updated in place:

    * when it has ``data_augmentation_params``, the keys of its ``color_map``
      are cast to ``int``;
    * ``is_correct`` is set to ``train_is_correct and test_is_correct``, or to
      ``0`` when ``train_is_correct`` is absent.

    Each task's predictions are then ordered best-first by
    ``(train_correct_grids, train_pixel_score)``; a missing metric counts
    as ``-1``.

    Args:
        folder: directory that contains ``results.json.gz``.

    Returns:
        The loaded mapping, with curated and sorted prediction lists.
    """
    def _train_score(candidate):
        # Ranking key: correct train grids first, pixel score as tie-breaker.
        return (candidate.get('train_correct_grids', -1),
                candidate.get('train_pixel_score', -1))

    results_path = os.path.join(folder, 'results.json.gz')
    predictions = load_json(results_path)
    for task_id, task_predictions in predictions.items():
        for prediction in task_predictions:
            if 'data_augmentation_params' in prediction:
                augmentation = prediction['data_augmentation_params']
                # presumably the keys became strings when serialised to JSON
                augmentation['color_map'] = {
                    int(color): mapped
                    for color, mapped in augmentation['color_map'].items()
                }
            if 'train_is_correct' not in prediction:
                prediction['is_correct'] = 0
                continue
            prediction['is_correct'] = (prediction['train_is_correct']
                                        and prediction['test_is_correct'])
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over items().
        predictions[task_id] = sorted(task_predictions, key=_train_score, reverse=True)
    return predictions

In [None]:
def analyze_task_performance(task_id, index=0):
    """Show a full report for one prediction of one task.

    Reads the notebook-level globals ``dataset``, ``predictions`` and ``df``.
    In order, it displays the row(s) of ``df`` whose index equals ``task_id``,
    the metric histograms over all predictions of the task, the metric summary
    of the selected prediction, the task together with the predicted output
    grids, and finally the prediction's code.

    Args:
        task_id: key of the task in ``dataset`` and ``predictions``.
        index: position of the prediction inside ``predictions[task_id]``.

    Raises:
        KeyError: if ``task_id`` is unknown, or the selected prediction has no
            ``'output_grids'`` or ``'code'`` entry.
        IndexError: if ``index`` is out of range for the task's predictions.
    """
    task = dataset[task_id]
    task_predictions = predictions[task_id]
    prediction = task_predictions[index]

    display(df[df.index == task_id])
    plot_task_metrics(task_predictions)
    print_prediction_summary(prediction)

    # 'data_augmentation_params' is optional (load_and_curate_predictions also
    # treats it as such). Unpacking a missing value with ** raises TypeError,
    # so the task is only augmented when params are actually present.
    augmentation_params = prediction.get('data_augmentation_params')
    if augmentation_params is not None:
        task = apply_data_augmentation(task, **augmentation_params)
    plot_task_and_predictions(task, prediction['output_grids'])
    display_python_code(prediction['code'])


def plot_task_metrics(task_predictions):
    """Plot histograms of the metrics found in a task's predictions.

    Two side-by-side panels are drawn: one for the grid-level metrics
    (``train_correct_grids``, ``test_correct_grids``, ``is_correct``) and one
    for the pixel scores. Predictions that lack a metric are left out of that
    metric's histogram. Every histogram is density-normalised and uses the
    same 20 bin edges spanning [0, 1].

    Args:
        task_predictions: iterable of prediction dicts.
    """
    panels = (
        ('train_correct_grids', 'test_correct_grids', 'is_correct'),
        ('train_pixel_score', 'test_pixel_score'),
    )
    n_panels = len(panels)
    bin_edges = np.linspace(0, 1, 20)

    plt.figure(figsize=(15, 3))
    for panel_idx, metric_names in enumerate(panels):
        # subplot positions are 1-based
        plt.subplot(1, n_panels, panel_idx + 1)
        for name in metric_names:
            observed = [pred[name] for pred in task_predictions if name in pred]
            plt.hist(observed, bins=bin_edges, alpha=0.5, label=name, density=True)
        plt.legend()
    plt.tight_layout()
    plt.show()


def print_prediction_summary(prediction):
    """Display the headline metrics of a single prediction.

    Args:
        prediction: prediction dict. Metrics that are absent from it are
            reported as ``None``.
    """
    # Every entry ends with a comma: adjacent string literals without one are
    # silently concatenated into a single (non-existent) metric name.
    metrics = [
        'train_correct_grids', 'test_correct_grids', 'is_correct',
        'train_pixel_score', 'test_pixel_score',
    ]
    relevant_metrics = {metric: prediction.get(metric) for metric in metrics}
    display_python_code(f"Metrics: {relevant_metrics}")


def plot_task_and_predictions(task, output_grids):
    """Draw ``task`` and, in the third subplot row, the predicted grids.

    ``plot_task`` is called with ``n_rows=3``; presumably it fills the first
    two rows with the task's inputs and outputs (confirm in ``arc25.plot``).
    The predicted grids are placed in the last row of a
    ``3 x len(output_grids)`` subplot layout.

    Args:
        task: task object accepted by ``plot_task``.
        output_grids: sequence of grids accepted by ``plot_grid``.
    """
    plt.figure(figsize=(15, 5))
    plot_task(task, n_rows=3)
    for column, grid in enumerate(output_grids):
        n_columns = len(output_grids)
        # Skip two full rows of n_columns cells; subplot indices are 1-based.
        slot = 2 * n_columns + column + 1
        plt.subplot(3, n_columns, slot)
        plot_grid(grid)
    plt.tight_layout()
    plt.show()


## Analysis

In [None]:
# Load the ARC-AGI 2024 evaluation challenges into the notebook-level `dataset`
# (read by analyze_task_performance). The loader name suggests the solutions
# are attached to each task as well - confirm in arc25.utils.
dataset = load_arc_dataset_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_evaluation_challenges.json')

In [None]:
# Folder of the run under analysis; it is expected to hold both metrics.csv
# and results.json.gz.
folder = '/mnt/hdd0/Kaggle/arc25/trainings/2025-10-08-generate-predictions-to-refine/128i'
# Metrics table; the first CSV column becomes the index (used as task ids below).
df = pd.read_csv(os.path.join(folder, 'metrics.csv'), index_col=0)
predictions = load_and_curate_predictions(folder)
# Show the last row on its own, then drop it from the table.
# NOTE(review): presumably that row is an aggregate over all tasks - confirm.
display(df.tail(1))
df = df.head(len(df) - 1)
# Best tasks first: by is_correct, ties broken by train_correct_grids.
df.sort_values(['is_correct', 'train_correct_grids'], ascending=False, inplace=True)

In [None]:
# Inspect the task at position 10 of the sorted metrics table; index=0 selects
# the first prediction in that task's list, which is sorted best-first by train score.
analyze_task_performance(df.index[10], index=0)