# Model evaluation

## Goal

Has the model learned to draw?

## Imports

In [None]:
from tqdm.auto import tqdm
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
from transformers import AutoTokenizer, AutoConfig
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

from arc25.training_tasks import *
from arc25.encoders import create_grid_encoder
from arc25.prompting import create_prompt_from_task, pretty_print_prompt
from arc25.plot import plot_task
from arc25.code_execution import safe_code_execution
from arc25.utils import set_random_seed

# Create and immediately discard an empty figure before touching rcParams.
# NOTE(review): presumably this forces the notebook backend to initialise so
# the settings below are not reset afterwards - TODO confirm.
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide figures, thicker lines, larger font.
mpl.rcParams.update({
    "figure.figsize": (20, 5),
    'lines.linewidth': 3,
    'font.size': 12,
})

## Load model

In [None]:
# Model selection for this run: exactly one `base_model_path` and one
# `lora_path` should be active; the commented lines are alternatives.
# NOTE: the first assignment below is immediately overridden by the second
# one, so the model that is actually loaded is `model-6400`.
base_model_path = '/home/gbarbadillo/models/Qwen2.5-Coder-0.5B-Instruct'
base_model_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_6400/model-6400'
# base_model_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_3200/model-3200'
# base_model_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_1600/model-1600'
# base_model_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_800/model-800'
# base_model_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_400/model-400'
# base_model_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/random_seed_9/model-200'
# lora_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_6400/checkpoint-6400'
# lora_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/random_seed_5_no_dora/checkpoint-200'
# `lora_path` is used below for the LoRARequest and to load the tokenizer.
lora_path = '/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/random_seed_4_no_dora_rank16/checkpoint-50'

In [None]:
# Build the vLLM engine used by every generation call in this notebook.
llm = LLM(
    model=base_model_path,
    enable_lora=True,
    trust_remote_code=True,
    dtype='auto',
    tensor_parallel_size=1, # 1 = single GPU; set to 2 to shard the model across 2 GPUs
    max_model_len=10240,
    disable_log_stats=True,
    max_num_seqs=255, # default is supposed to be 256; lowered to 255 to work around a weird illegal memory error
    enforce_eager=True,
)

In [None]:
# Handle describing the LoRA adapter stored at `lora_path`.
# NOTE(review): none of the active `llm.generate(...)` calls visible in this
# notebook pass `lora_request=lora_request` (the only one that does is
# commented out), so the adapter is not applied unless it is passed explicitly.
lora_request = LoRARequest(lora_name='lora', lora_int_id=1, lora_path=lora_path)

In [None]:
# The tokenizer is loaded from the LoRA checkpoint directory (`lora_path`),
# not from `base_model_path`; it is only used here to build the prompts.
tokenizer = AutoTokenizer.from_pretrained(lora_path)

## Inference

In [None]:
# Prompt template version and grid-to-text encoder. Both are read as globals by
# `evaluate_model` and by the manual-task cells when building prompts.
# NOTE(review): presumably these must match the settings used at training time
# - TODO confirm.
prompt_version = 'code-from-examples-v3'
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')

In [None]:
def _score_prediction(expected, predicted_output, idx):
    """Compare one predicted grid against the expected grid.

    Args:
        expected: ground-truth grid for task output number ``idx``.
        predicted_output: result of executing one predicted program; it is
            indexed with ``idx`` to obtain the grid to compare.
        idx: position of ``expected`` within the task outputs.

    Returns:
        A ``(exact_match, pixel_accuracy)`` tuple. A prediction that is
        missing, cannot be converted to an array, or has a different shape
        than ``expected`` scores ``(False, 0.0)``.
    """
    try:
        predicted = np.asarray(predicted_output[idx])
    except (IndexError, KeyError, TypeError, ValueError):
        return False, 0.0
    expected = np.asarray(expected)
    # Compare shapes explicitly: `==` on mismatched shapes either raises
    # (recent NumPy) or silently broadcasts, which could score a grid of the
    # wrong size as partially or even fully correct.
    if predicted.shape != expected.shape:
        return False, 0.0
    matches = expected == predicted
    return bool(np.all(matches)), float(np.mean(matches))


def evaluate_model(n_tasks, task_generator, sampling_params, random_seed=42, verbose=False):
    """Sample tasks, generate code for them with the LLM and score the results.

    Relies on the notebook-level globals ``llm``, ``tokenizer``,
    ``prompt_version`` and ``grid_encoder``.

    Args:
        n_tasks: number of tasks drawn from ``task_generator``.
        task_generator: object whose ``sample()`` returns a task exposing
            ``inputs`` and ``outputs``.
        sampling_params: sampling parameters forwarded to ``llm.generate``;
            ``sampling_params.n`` is used to name the ``acc@n``/``pass@n`` keys.
        random_seed: seed set before sampling the tasks.
        verbose: when True, print the code and the exception of every
            prediction that fails to execute.

    Returns:
        dict of floats, averaged over tasks:
        - ``acc@n``: fraction of valid predictions that reproduce an output exactly.
        - ``pass@n``: fraction of outputs reproduced exactly by at least one
          valid prediction.
        - ``mean_correct_pixels`` / ``max_correct_pixels``: mean / best
          fraction of matching pixels among the valid predictions.
        - ``valid_predictions``: fraction of predictions whose code executed
          without raising.
        Predictions whose code raised are excluded from the first four
        metrics; a task without any valid prediction scores 0 on all of them.
        The metrics are also printed as percentages.
    """
    set_random_seed(random_seed)
    tasks = [task_generator.sample() for _ in range(n_tasks)]
    prompts = [
        create_prompt_from_task(
            task, prompt_version=prompt_version, grid_encoder=grid_encoder, tokenizer=tokenizer, is_train_prompt=False)
        for task in tasks
    ]
    request_output = llm.generate(prompts, sampling_params, use_tqdm=True)
    # Drop the closing markdown fence ('\n```') from every generated completion.
    predicted_codes = [[output.text.replace('\n```', '') for output in task_output.outputs] for task_output in request_output]

    pass_n, accuracy, mean_correct_pixels, max_correct_pixels, valid_predictions = [], [], [], [], []
    for task, task_predicted_codes in tqdm(zip(tasks, predicted_codes), total=len(tasks), desc='evaluating'):
        predicted_outputs = []
        for predicted_code in task_predicted_codes:
            try:
                predicted_output = safe_code_execution(predicted_code, task.inputs)
                predicted_outputs.append(predicted_output)
            except Exception as e:
                # Deliberately broad: model-generated code can fail in arbitrary
                # ways and a failure only invalidates that single prediction.
                if verbose:
                    print(f'Error executing code: {predicted_code}')
                    print(e)
        if not predicted_outputs:
            print(f'No valid outputs for task {task}')
            pass_n.append(0)
            accuracy.append(0)
            mean_correct_pixels.append(0)
            max_correct_pixels.append(0)
            valid_predictions.append(0)
            continue
        valid_predictions.append(len(predicted_outputs) / len(task_predicted_codes))
        # scores[i][j] = (exact match, pixel accuracy) of valid prediction j on output i.
        scores = [
            [_score_prediction(output, predicted_output, idx) for predicted_output in predicted_outputs]
            for idx, output in enumerate(task.outputs)
        ]
        exact_matches = [[is_exact for is_exact, _ in row] for row in scores]
        pixel_ratios = [[ratio for _, ratio in row] for row in scores]
        pass_n.append(np.mean([any(row) for row in exact_matches]))
        accuracy.append(np.mean([np.mean(row) for row in exact_matches]))
        mean_correct_pixels.append(np.mean([np.mean(row) for row in pixel_ratios]))
        max_correct_pixels.append(np.mean([np.max(row) for row in pixel_ratios]))
    metrics = {
        f'acc@{sampling_params.n}': np.mean(accuracy),
        f'pass@{sampling_params.n}': np.mean(pass_n),
        'mean_correct_pixels': np.mean(mean_correct_pixels),
        'max_correct_pixels': np.mean(max_correct_pixels),
        'valid_predictions': np.mean(valid_predictions),
    }
    # Convert NumPy scalars to plain floats so the dict prints and serialises cleanly.
    metrics = {key: float(value) for key, value in metrics.items()}
    for key, value in metrics.items():
        print(f'{key}: {value:.2%}')
    return metrics

In [None]:
# Sweep the number of draws per task; each run uses min_draws == max_draws == n.
n_draws = range(1, 10)
metrics = []
for n in n_draws:
    task_generator = RandomDrawingTaskOnEmptyImg(min_draws=n, max_draws=n)
    sampling_params = SamplingParams(
        n=8,
        temperature=0.5,
        top_p=0.95,
        max_tokens=1024,
        logprobs=0,
        skip_special_tokens=False,
    )
    print(f'Running {task_generator.__class__.__name__} with {n} draws')
    draws_metrics = evaluate_model(
        n_tasks=128,
        task_generator=task_generator,
        sampling_params=sampling_params,
    )
    metrics.append(draws_metrics)

In [None]:
# One subplot per metric, plotted against the number of draws per task.
for plot_idx, key in enumerate(metrics[0], 1):
    plt.subplot(1, len(metrics[0]), plot_idx)
    plt.title(key)
    values = [metric[key] for metric in metrics]
    plt.plot(n_draws, values, marker='o')
    # Shade the 1-5 and 5-9 draw ranges over the observed value range.
    # NOTE(review): presumably green marks the range seen during training and
    # orange the extrapolation range - TODO confirm.
    plt.fill_between([1, 5], np.min(values), np.max(values), color='green', alpha=0.2)
    plt.fill_between([5, 9], np.min(values), np.max(values), color='orange', alpha=0.2)
    # The x axis is the number of draws (it was mislabelled as 'temperature').
    plt.xlabel('number of draws')
    plt.ylabel(key)
    plt.grid()
plt.suptitle('Effect of the number of drawings on the metrics')
plt.tight_layout()

In [None]:
# Deliberate stop: a bare `raise` with no active exception fails with
# RuntimeError, presumably to halt "Run All" before the cells below.
raise

In [None]:
# Single, larger evaluation of the currently loaded model (512 tasks).
# NOTE(review): `task_generator` is not defined in this cell; it is whatever an
# earlier cell left behind (the last generator of the draws sweep if that cell
# ran) - confirm that is the intended task distribution.
sampling_params = SamplingParams(n=8, temperature=0.5, top_p=0.95, max_tokens=1024, logprobs=0, skip_special_tokens=False)
metrics = evaluate_model(n_tasks=512, task_generator=task_generator, sampling_params=sampling_params)
# Print the model path next to its metrics so the result can be copied as a pair.
print(base_model_path)
print(metrics)

In [None]:
# Deliberate stop: a bare `raise` with no active exception fails with
# RuntimeError, presumably to halt "Run All" before the cells below.
raise

In [None]:
# Sweep the sampling temperature while keeping everything else fixed.
# `task_generator` is taken from the notebook state left by earlier cells.
n_tasks = 512
temperatures = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.4]
metrics = []
for temperature in temperatures:
    sampling_params = SamplingParams(
        n=8,
        temperature=temperature,
        top_p=0.95,
        max_tokens=1024,
        logprobs=0,
        skip_special_tokens=False,
    )
    print(f"Evaluating with temperature {temperature}")
    temperature_metrics = evaluate_model(
        n_tasks=n_tasks,
        task_generator=task_generator,
        sampling_params=sampling_params,
    )
    metrics.append(temperature_metrics)

In [None]:
# One subplot per metric, plotted against the sampling temperature.
metric_names = list(metrics[0])
for plot_idx, key in enumerate(metric_names, 1):
    values = [run[key] for run in metrics]
    plt.subplot(1, len(metric_names), plot_idx)
    plt.plot(temperatures, values, marker='o')
    plt.title(key)
    plt.xlabel('temperature')
    plt.ylabel(key)
    plt.grid()
plt.suptitle('Effect of the temperature on the metrics')
plt.tight_layout()

In [None]:
# Sweep the number of predictions sampled per task (powers of two up to 64).
# `n_tasks` and `task_generator` come from earlier cells.
n_range = [1, 2, 4, 8, 16, 32, 64]
metrics = []
for n in n_range:
    sampling_params = SamplingParams(
        n=n,
        temperature=0.5,
        top_p=0.95,
        max_tokens=1024,
        logprobs=0,
        skip_special_tokens=False,
    )
    print(f"Evaluating with n={n}")
    n_metrics = evaluate_model(
        n_tasks=n_tasks,
        task_generator=task_generator,
        sampling_params=sampling_params,
    )
    metrics.append(n_metrics)

In [None]:
# pass@n for each run; the key name embeds the n used by that run.
pass_rates = [run[f'pass@{n}'] for n, run in zip(n_range, metrics)]
plt.figure(figsize=(10, 5))
plt.plot(n_range, pass_rates, marker='o')
# Log-scaled x axis with one tick per evaluated n.
plt.xscale('log')
plt.xticks(n_range, n_range)
plt.grid()
plt.title('Effect of the n on the pass rate')
plt.xlabel('n')
plt.ylabel('pass rate');

In [None]:
# One subplot per metric, plotted against the number of predictions n.
# The `acc@n` / `pass@n` keys embed the n of each run, so the keys of
# `metrics[0]` (e.g. 'acc@1') do not exist in the other runs. Strip the suffix
# and rebuild the key for every run instead of reusing the first run's keys.
metric_names = [key.split('@')[0] for key in metrics[0]]
for plot_idx, name in enumerate(metric_names, 1):
    values = [
        run[name] if name in run else run[f'{name}@{n}']
        for n, run in zip(n_range, metrics)
    ]
    # Label n-dependent metrics generically, e.g. 'acc@n'.
    label = name if name in metrics[0] else f'{name}@n'
    plt.subplot(1, len(metric_names), plot_idx)
    plt.title(label)
    plt.plot(n_range, values, marker='o')
    plt.xlabel('n')
    plt.ylabel(label)
    plt.grid()
plt.suptitle('Effect of the number of predictions on the metrics')
plt.tight_layout()

In [None]:
"""
/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/random_seed_9/model-200
{'acc@8': 0.278076171875, 'pass@8': 0.45703125, 'mean_correct_pixels': 0.8456284646562617, 'max_correct_pixels': 0.9308726824786628, 'valid_predictions': 0.999755859375}
/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_400/model-400
{'acc@8': 0.45166015625, 'pass@8': 0.658203125, 'mean_correct_pixels': 0.9149275320163149, 'max_correct_pixels': 0.9706963654602466, 'valid_predictions': 1.0}
/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_800/model-800
{'acc@8': 0.601318359375, 'pass@8': 0.7890625, 'mean_correct_pixels': 0.951842735600009, 'max_correct_pixels': 0.9852759201192052, 'valid_predictions': 1.0}
/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_1600/model-1600
{'acc@8': 0.7294921875, 'pass@8': 0.89453125, 'mean_correct_pixels': 0.9746637164726459, 'max_correct_pixels': 0.9936216837718884, 'valid_predictions': 1.0}
/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_3200/model-3200
{'acc@8': 0.826416015625, 'pass@8': 0.9453125, 'mean_correct_pixels': 0.9869367756823143, 'max_correct_pixels': 0.997600836413848, 'valid_predictions': 1.0}
/mnt/hdd0/Kaggle/arc25/trainings/20250430_first_trainings/steps_6400/model-6400
{'acc@8': 0.875732421875, 'pass@8': 0.966796875, 'mean_correct_pixels': 0.9911085811662086, 'max_correct_pixels': 0.9987733240721057, 'valid_predictions': 1.0}
"""
# Recorded results keyed by number of training steps (hard-coded, not recomputed here).
metrics = {
    200: {'acc@8': 0.278076171875, 'pass@8': 0.45703125, 'mean_correct_pixels': 0.8456284646562617, 'max_correct_pixels': 0.9308726824786628, 'valid_predictions': 0.999755859375},
    400: {'acc@8': 0.45166015625, 'pass@8': 0.658203125, 'mean_correct_pixels': 0.9149275320163149, 'max_correct_pixels': 0.9706963654602466, 'valid_predictions': 1.0},
    800: {'acc@8': 0.601318359375, 'pass@8': 0.7890625, 'mean_correct_pixels': 0.951842735600009, 'max_correct_pixels': 0.9852759201192052, 'valid_predictions': 1.0},
    1600: {'acc@8': 0.7294921875, 'pass@8': 0.89453125, 'mean_correct_pixels': 0.9746637164726459, 'max_correct_pixels': 0.9936216837718884, 'valid_predictions': 1.0},
    3200: {'acc@8': 0.826416015625, 'pass@8': 0.9453125, 'mean_correct_pixels': 0.9869367756823143, 'max_correct_pixels': 0.997600836413848, 'valid_predictions': 1.0},
    6400: {'acc@8': 0.875732421875, 'pass@8': 0.966796875, 'mean_correct_pixels': 0.9911085811662086, 'max_correct_pixels': 0.9987733240721057, 'valid_predictions': 1.0},
}
training_steps = sorted(metrics)
metric_names = list(metrics[training_steps[0]])
# One subplot per metric, log-scaled x axis over the training steps.
for plot_idx, key in enumerate(metric_names, 1):
    values = [metrics[steps][key] for steps in training_steps]
    plt.subplot(1, len(metric_names), plot_idx)
    plt.plot(training_steps, values, marker='o')
    plt.xscale('log')
    plt.title(key)
    plt.xlabel('training_steps')
    plt.ylabel(key)
    plt.grid()
plt.suptitle('Effect of the training_steps on the metrics')
plt.tight_layout()

In [None]:
# Same recorded results as the training-steps plot, keyed by training steps.
metrics = {
    200: {'acc@8': 0.278076171875, 'pass@8': 0.45703125, 'mean_correct_pixels': 0.8456284646562617, 'max_correct_pixels': 0.9308726824786628, 'valid_predictions': 0.999755859375},
    400: {'acc@8': 0.45166015625, 'pass@8': 0.658203125, 'mean_correct_pixels': 0.9149275320163149, 'max_correct_pixels': 0.9706963654602466, 'valid_predictions': 1.0},
    800: {'acc@8': 0.601318359375, 'pass@8': 0.7890625, 'mean_correct_pixels': 0.951842735600009, 'max_correct_pixels': 0.9852759201192052, 'valid_predictions': 1.0},
    1600: {'acc@8': 0.7294921875, 'pass@8': 0.89453125, 'mean_correct_pixels': 0.9746637164726459, 'max_correct_pixels': 0.9936216837718884, 'valid_predictions': 1.0},
    3200: {'acc@8': 0.826416015625, 'pass@8': 0.9453125, 'mean_correct_pixels': 0.9869367756823143, 'max_correct_pixels': 0.997600836413848, 'valid_predictions': 1.0},
    6400: {'acc@8': 0.875732421875, 'pass@8': 0.966796875, 'mean_correct_pixels': 0.9911085811662086, 'max_correct_pixels': 0.9987733240721057, 'valid_predictions': 1.0},
}
training_steps = sorted(metrics)
# Convert steps to samples seen during training.
training_samples = [16 * steps for steps in training_steps] # batch size is 16
metric_names = list(metrics[training_steps[0]])
for plot_idx, key in enumerate(metric_names, 1):
    values = [metrics[steps][key] for steps in training_steps]
    plt.subplot(1, len(metric_names), plot_idx)
    plt.plot(training_samples, values, marker='o')
    plt.xscale('log')
    plt.title(key)
    plt.xlabel('training samples')
    plt.ylabel(key)
    plt.grid()
plt.suptitle('Effect of the training samples on the metrics')
plt.tight_layout()

In [None]:
# Deliberate stop: a bare `raise` with no active exception fails with
# RuntimeError, presumably to halt "Run All" before the manual experiments.
raise

## Manually created tasks

In [None]:
#outputs = llm.generate([prompt], sampling_params, use_tqdm=True, lora_request=lora_request)

In [None]:
# Manual task: two rectangles of color 1 drawn on a copy of the input image,
# with corners (0, 0)-(4, 4) and (5, 5)-(9, 9).
# NOTE(review): presumably `create_img((10, 10), color=0)` is a blank 10x10 grid - TODO confirm.
input_img = create_img((10, 10), color=0)
output_img = draw_rectangle(input_img.copy(), (0, 0), (4, 4), color=1)
output_img = draw_rectangle(output_img, (5, 5), (9, 9), color=1)

# `code=''` because the task has no reference solution; the model must produce it.
task = Task(inputs=[input_img], outputs=[output_img], code='', name='manual')
plot_task(task)
prompt = create_prompt_from_task(
    task, prompt_version=prompt_version, grid_encoder=grid_encoder, tokenizer=tokenizer, is_train_prompt=False)
# `sampling_params` is whatever an earlier cell defined last; only the first
# generated sample is printed.
outputs = llm.generate([prompt], sampling_params, use_tqdm=True)
print(outputs[0].outputs[0].text)

In [None]:
# Manual task: a vertical line on every second column (x = 0, 2, 4, ...),
# each drawn with color equal to its column index.
input_img = create_img((10, 10), color=0)
output_img = input_img.copy()
for x in range(0, input_img.shape[1], 2):
    # Return value is ignored, so the helper is expected to draw in place.
    draw_vertical_line(output_img, x, color=x)

task = Task(inputs=[input_img], outputs=[output_img], code='', name='manual')
plot_task(task)
prompt = create_prompt_from_task(
    task, prompt_version=prompt_version, grid_encoder=grid_encoder, tokenizer=tokenizer, is_train_prompt=False)
# `sampling_params` is whatever an earlier cell defined last; only the first
# generated sample is printed.
outputs = llm.generate([prompt], sampling_params, use_tqdm=True)
print(outputs[0].outputs[0].text)

In [None]:
# Manual task: a vertical line on every column (step 1), each drawn with
# color equal to its column index.
input_img = create_img((10, 10), color=0)
output_img = input_img.copy()
for x in range(0, input_img.shape[1], 1):
    # Return value is ignored, so the helper is expected to draw in place.
    draw_vertical_line(output_img, x, color=x)

task = Task(inputs=[input_img], outputs=[output_img], code='', name='manual')
plot_task(task)
prompt = create_prompt_from_task(
    task, prompt_version=prompt_version, grid_encoder=grid_encoder, tokenizer=tokenizer, is_train_prompt=False)
# `sampling_params` is whatever an earlier cell defined last; only the first
# generated sample is printed.
outputs = llm.generate([prompt], sampling_params, use_tqdm=True)
print(outputs[0].outputs[0].text)

In [None]:
# Manual task with a non-blank input: the input already contains a horizontal
# line (position 4, color 1); the output adds vertical lines on every second
# column, each drawn with color equal to its column index.
input_img = create_img((10, 10), color=0)
draw_horizontal_line(input_img, 4, color=1)
output_img = input_img.copy()
for x in range(0, input_img.shape[1], 2):
    # Return value is ignored, so the helper is expected to draw in place.
    draw_vertical_line(output_img, x, color=x)

task = Task(inputs=[input_img], outputs=[output_img], code='', name='manual')
plot_task(task)
prompt = create_prompt_from_task(
    task, prompt_version=prompt_version, grid_encoder=grid_encoder, tokenizer=tokenizer, is_train_prompt=False)
# `sampling_params` is whatever an earlier cell defined last; only the first
# generated sample is printed.
outputs = llm.generate([prompt], sampling_params, use_tqdm=True)
print(outputs[0].outputs[0].text)

- The model is limited by the number of drawing functions in the training set.
- It has only been trained with blank input images, which does not require a good comparison between the input and output images.

TODO:

- I want to visualize the transformation of the code.
- Also compute some accuracy metrics.