# Collect inference results

## Imports

In [None]:
import os
import glob
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib as mpl

from arc25.utils import load_json
from arc25.metrics import aggregate_metrics, error_analysis

# Create and immediately discard an empty figure before touching rcParams.
# NOTE(review): presumably this forces the notebook plotting backend to
# initialise so the settings below are not reset by the first real plot — confirm.
plt.plot()
plt.close('all')

# Notebook-wide plot defaults: wide, short figures with thick lines.
mpl.rcParams.update({
    "figure.figsize": (20, 3),
    'lines.linewidth': 3,
    'font.size': 12,
})

## Code

In [None]:
def load_predictions(filepaths, max_files=None):
    """Load several prediction files and merge them into a single dict.

    Each file is read with ``load_json`` and is expected to be a mapping from
    task id to a list of results (the values are merged with ``list.extend``).

    Args:
        filepaths: Sequence of paths to prediction files.
        max_files: If not None, only the first ``max_files`` paths are loaded.

    Returns:
        The merged mapping ``task_id -> list of results``, or ``None`` when
        ``filepaths`` is empty. The dict loaded from the first file is reused
        as the accumulator and is therefore mutated in place.
    """
    if max_files is not None:
        filepaths = filepaths[:max_files]
    data = None
    for filepath in tqdm(filepaths):
        file_data = load_json(filepath)
        if data is None:
            data = file_data
            continue
        for task_id, result in file_data.items():
            # A later file may contain a task id that the first file lacks;
            # start a new list instead of failing with KeyError.
            data.setdefault(task_id, []).extend(result)
    return data

## Results

### 2025-10-14-rl-barc

In [None]:
# Upper bound on the number of prediction files loaded per evaluation.
max_files = 4
# Maps a run alias to the metrics table returned by aggregate_metrics.
dfs = {}
alias = 'baseline'
# glob returns matches in a filesystem-dependent order; sort them so the
# `max_files` subset that gets loaded is the same on every run.
filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/arc25/predictions/2025-10-14-rl-barc/baseline/evaluation/*.json.gz'))
predictions = load_predictions(filepaths, max_files=max_files)
dfs[alias] = aggregate_metrics(predictions)

In [None]:
# Experiment alias -> directory holding that experiment's prediction subfolders.
# NOTE(review): the alias '1lora-16gen' points at a directory whose name starts
# with '8lora_lr2e-6', while the disabled '8lora-16gen' entry points at a
# '1lora_lr4e-6' directory — confirm the labels are not swapped.
folders = {
    # '8lora-16gen': '/mnt/hdd0/Kaggle/arc25/predictions/2025-10-14-rl-barc/1lora_lr4e-6_arc-v2-no-pixel-score_epochs1_16gen_2accum-steps_repetition-penalty-1.02_masked-truncate_unquantized_beta0.01',
    '1lora-16gen': '/mnt/hdd0/Kaggle/arc25/predictions/2025-10-14-rl-barc/8lora_lr2e-6_arc-v2-no-pixel-score_epochs1_16gen_2accum-steps_repetition-penalty-1.02_masked-truncate_unquantized_beta0.01',
    '1lora-32gen': '/mnt/hdd0/Kaggle/arc25/predictions/2025-10-14-rl-barc/1lora_lr4e-6_0.05max-grad-norm_arc-v2-no-pixel-score_32gen_4accum-steps_repetition-penalty-1.02_masked-truncate_unquantized_beta0.02',
    '1lora-64gen': '/mnt/hdd0/Kaggle/arc25/predictions/2025-10-14-rl-barc/1lora_lr4e-6_0.02max-grad-norm_arc-v2-no-pixel-score_64gen_8accum-steps_repetition-penalty-1.01_masked-truncate_unquantized_beta0.04',
    '1lora-128gen': '/mnt/hdd0/Kaggle/arc25/predictions/2025-10-14-rl-barc/1lora_lr4e-6_0.02max-grad-norm_arc-v2-no-pixel-score_128gen_16accum-steps_repetition-penalty-1.01_masked-truncate_unquantized_beta0.04',
}

for alias, folder in folders.items():
    # Order the subfolders by the integer that follows the last '-' in their
    # name (presumably the training step of a checkpoint — confirm).
    subfolders = sorted(glob.glob(os.path.join(folder, '*')), key=lambda x: int(os.path.basename(x).split('-')[-1]))
    for subfolder in subfolders:
        print(f"Processing folder: {subfolder}")
        # glob order is filesystem-dependent; sort so the `max_files` subset
        # taken by load_predictions is the same on every run.
        filepaths = sorted(glob.glob(os.path.join(subfolder, 'evaluation/*.json.gz')))
        if not filepaths:
            # No evaluation predictions in this subfolder; skip it.
            continue
        predictions = load_predictions(filepaths, max_files=max_files)
        # Key format: '<alias>_<subfolder name up to its first underscore>'.
        dfs[alias + '_' + os.path.basename(subfolder).split('_')[0]] = aggregate_metrics(predictions)

In [None]:
# Collect the 'MEAN' row of every metrics table into one summary table,
# with one row per run alias.
results_df = None
for alias in dfs:
    df = dfs[alias]
    # The summary table is created lazily so it can reuse the columns of the
    # first metrics table.
    results_df = pd.DataFrame(columns=df.columns) if results_df is None else results_df
    results_df.loc[alias] = df.loc['MEAN']

In [None]:
# For every experiment build a table holding the baseline row followed by the
# rows that belong to that experiment, plus a numeric 'training_steps' column.
experiment_results = {}
for key in folders:
    # Row labels have the form '<alias>_<subfolder name>'. Match on the alias
    # plus the separator so an alias that is merely a prefix of another alias
    # cannot pick up the other experiment's rows.
    prefix = key + '_'
    rows = ['baseline'] + [k for k in results_df.index if k.startswith(prefix)]
    # Explicit copy: the column added below must never touch results_df.
    experiment_results[key] = results_df.loc[rows].copy()
    # The baseline counts as step 0; otherwise the step is the integer after
    # the last '-' in the row label.
    experiment_results[key]['training_steps'] = [
        0 if x == 'baseline' else int(x.split('-')[-1])
        for x in experiment_results[key].index
    ]

In [None]:
# Show the results table (baseline row plus the experiment's own rows) for one experiment.
experiment_results['1lora-16gen']

In [None]:
# One figure per metric group: columns prefixed 'train_', columns prefixed
# 'test_', and a fixed list of summary columns.
train_columns = [column for column in results_df.columns if column.startswith('train_')]
test_columns = [column for column in results_df.columns if column.startswith('test_')]
summary_columns = ['valid code', 'valid outputs', 'unique outputs', 'is_correct']
metric_groups = [train_columns, test_columns, summary_columns]

for metrics in metric_groups:
    n_panels = len(metrics)
    # One subplot per metric, laid out on a single row.
    for plot_idx, column in enumerate(metrics, start=1):
        plt.subplot(1, n_panels, plot_idx)
        # One curve per experiment: metric value against training steps.
        for key in folders:
            frame = experiment_results[key]
            plt.plot(frame['training_steps'], frame[column], marker='o', label=key)
        plt.title(column)
        plt.xlabel('Training Steps')
        plt.ylabel(column)
        plt.legend()
        plt.grid()
    plt.tight_layout()
    plt.show()