# Refine solutions

## Goal

Can we use the BARC induction model to refine its incorrect solutions?

## Imports

In [None]:
import os
import logging
from arc25.utils import get_least_used_gpu_index
from arc25.logging import configure_logging, log_execution_time

# Project logging is configured first so later steps can emit log records.
configure_logging()

# Process environment:
#  - CUDA_VISIBLE_DEVICES: restrict this process to the GPU index returned by
#    get_least_used_gpu_index().
#  - VLLM_*: VLLM specific environment variables to avoid common issues.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': str(get_least_used_gpu_index()),
    'VLLM_USE_MODELSCOPE': 'False',
    'VLLM_WORKER_MULTIPROC_METHOD': 'spawn',
})

In [None]:
import gc
import random
import glob
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

from arc25.encoders import create_grid_encoder
from arc25.prompting import create_prompt_from_task, pretty_print_prompt, create_refine_prompt
from arc25.utils import get_timestamp, load_json, load_arc_dataset_with_solutions, write_json
from arc25.data_augmentation import apply_data_augmentation, get_random_data_augmentation_params
from arc25.parallel_code_execution import CodeRunner
from arc25.metrics import aggregate_metrics

## Code

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=True, tensor_parallel_size=1,
               max_model_len=14500, enable_lora=False, max_lora_rank=16,
               gpu_memory_utilization=0.92,
               max_num_seqs=128, # default is 256
            ):
    """Build a vLLM engine and the tokenizer that goes with ``model_path``.

    Args:
        model_path: Path of the model to load. When it ends with ``.gguf`` the
            tokenizer is read from a sibling ``tokenizer`` directory instead.
        use_4bit_quantization: If True, request ``bitsandbytes`` quantization;
            otherwise no quantization is requested.
        tensor_parallel_size: Forwarded to ``LLM(tensor_parallel_size=...)``.
        max_model_len: Forwarded to ``LLM(max_model_len=...)``.
        enable_lora: Forwarded to ``LLM(enable_lora=...)``.
        max_lora_rank: Forwarded to ``LLM(max_lora_rank=...)``.
        gpu_memory_utilization: Forwarded to ``LLM(gpu_memory_utilization=...)``.
        max_num_seqs: Forwarded to ``LLM(max_num_seqs=...)`` (vLLM default is 256).

    Returns:
        Tuple ``(llm, tokenizer)``.
    """
    logging.info(f"Loading model from {model_path}")
    # Free whatever this process still caches on the GPU before vLLM allocates.
    cleanup_gpu()

    quantization = None
    if use_4bit_quantization:
        quantization = "bitsandbytes"

    engine_kwargs = dict(
        model=model_path,
        gpu_memory_utilization=gpu_memory_utilization,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization=quantization,
        # Seems that it is true by default, but let's be explicit
        enable_prefix_caching=True,
        max_model_len=max_model_len,
        enable_lora=enable_lora,
        max_lora_rank=max_lora_rank,
        max_num_seqs=max_num_seqs,
    )
    llm = LLM(**engine_kwargs)

    # GGUF checkpoints are a single file: the tokenizer is looked up next to it.
    is_gguf = model_path.endswith('.gguf')
    tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer') if is_gguf else model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return llm, tokenizer


def cleanup_gpu():
    """Run garbage collection and release PyTorch's cached CUDA memory.

    Intended to be called right before loading VLLM. When CUDA is available
    the function also waits for pending GPU work with ``synchronize``.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

In [None]:
# A bare `raise` with no active exception raises RuntimeError, which halts
# "Run All" at this cell. NOTE(review): presumably an intentional barrier so
# the sections below are executed manually - confirm.
raise

## Do we have enough VRAM?

### VLLM

Check how much memory VLLM requires to make predictions with a sequence length of 14500.

In [None]:
# Load the unquantized model with 95% of the GPU memory; the vLLM start-up
# log reports how much KV cache is left for 14,500-token requests.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
model, tokenizer = load_model(
    model_path,
    use_4bit_quantization=False,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.95,
)

Results for 4-bit quantization:

```
# 0.5 memory
(EngineCore_DP0 pid=12555) INFO 10-02 17:21:14 [gpu_worker.py:298] Available KV cache memory: 4.88 GiB
(EngineCore_DP0 pid=12555) INFO 10-02 17:21:14 [kv_cache_utils.py:864] GPU KV cache size: 40,000 tokens
(EngineCore_DP0 pid=12555) INFO 10-02 17:21:14 [kv_cache_utils.py:868] Maximum concurrency for 14,500 tokens per request: 2.76x

# 0.75 memory
(EngineCore_DP0 pid=12029) INFO 10-02 17:19:16 [gpu_worker.py:298] Available KV cache memory: 10.78 GiB
(EngineCore_DP0 pid=12029) INFO 10-02 17:19:16 [kv_cache_utils.py:864] GPU KV cache size: 88,272 tokens
(EngineCore_DP0 pid=12029) INFO 10-02 17:19:16 [kv_cache_utils.py:868] Maximum concurrency for 14,500 tokens per request: 6.08x

# 0.92 memory
(EngineCore_DP0 pid=10241) INFO 10-02 17:11:05 [gpu_worker.py:298] Available KV cache memory: 14.78 GiB
(EngineCore_DP0 pid=10241) INFO 10-02 17:11:06 [kv_cache_utils.py:864] GPU KV cache size: 121,104 tokens
(EngineCore_DP0 pid=10241) INFO 10-02 17:11:06 [kv_cache_utils.py:868] Maximum concurrency for 14,500 tokens per request: 8.35x
```

Results for unquantized model:

```
# 0.5 memory -> OOM
# 0.7 memory -> OOM

# 0.8 memory
(EngineCore_DP0 pid=9446) INFO 10-08 14:53:33 [gpu_worker.py:298] Available KV cache memory: 2.61 GiB
(EngineCore_DP0 pid=9446) INFO 10-08 14:53:33 [kv_cache_utils.py:864] GPU KV cache size: 21,408 tokens
(EngineCore_DP0 pid=9446) INFO 10-08 14:53:33 [kv_cache_utils.py:868] Maximum concurrency for 14,500 tokens per request: 1.48x

# 0.9 memory
(EngineCore_DP0 pid=10147) INFO 10-08 14:55:46 [gpu_worker.py:298] Available KV cache memory: 4.97 GiB
(EngineCore_DP0 pid=10147) INFO 10-08 14:55:46 [kv_cache_utils.py:864] GPU KV cache size: 40,720 tokens
(EngineCore_DP0 pid=10147) INFO 10-08 14:55:46 [kv_cache_utils.py:868] Maximum concurrency for 14,500 tokens per request: 2.81x

# 0.95 memory
(EngineCore_DP0 pid=10628) INFO 10-08 14:57:29 [gpu_worker.py:298] Available KV cache memory: 6.15 GiB
(EngineCore_DP0 pid=10628) INFO 10-08 14:57:29 [kv_cache_utils.py:864] GPU KV cache size: 50,368 tokens
(EngineCore_DP0 pid=10628) INFO 10-08 14:57:29 [kv_cache_utils.py:868] Maximum concurrency for 14,500 tokens per request: 3.47x
```

VLLM seems to allow longer sequence lengths than unsloth.

In [None]:
# A bare `raise` with no active exception raises RuntimeError, which halts
# "Run All" at this cell. NOTE(review): presumably an intentional barrier
# between the VLLM and Unsloth memory checks - confirm.
raise

### Unsloth

In [None]:
from unsloth import FastLanguageModel

model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"

# Load the same checkpoint through Unsloth (4-bit, fast inference) to compare
# how much KV cache it reports for the same 14500 max sequence length.
llm, tokenizer = FastLanguageModel.from_pretrained(
    model_path,
    load_in_4bit=True,
    max_seq_length=14500,
    fast_inference=True,
    gpu_memory_utilization=0.90,
    float8_kv_cache=True,
)

In [None]:
# Interactive exploration aid: print the signature/docstring of
# FastLanguageModel.from_pretrained to see which loading options exist.
from unsloth import FastLanguageModel
help(FastLanguageModel.from_pretrained)

```
# gpu_memory_utilization=0.70
Unsloth: Your GPU cannot handle sequence lengths of 14500 due to limited GPU memory.
Unsloth: Your GPU can only handle approximately the maximum sequence length of 14500.
Unsloth: vLLM loading /home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B with actual GPU utilization = 69.08%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.57 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 7680. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 1.01 GB. Also swap space = 6 GB.

# gpu_memory_utilization=0.75
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.57 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 14500. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.18 GB. Also swap space = 6 GB.

# gpu_memory_utilization=0.90
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.57 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 14500. Num Sequences = 201.
Unsloth: vLLM's KV Cache can use up to 5.66 GB. Also swap space = 6 GB.
```

```
Kaggle gpu_memory_utilization=0.75, max_seq_length=9674
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 22.28 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 9674. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 1.27 GB. Also swap space = 6 GB.
```

## Refine predictions

### Model

In [None]:
# Unquantized model for the refinement run; at most 16 sequences are
# scheduled concurrently (max_num_seqs=16).
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(
    model_path,
    use_4bit_quantization=False,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.95,
    max_num_seqs=16,
)

In [None]:
# Grid encoder used when building the refinement prompts.
grid_encoder = create_grid_encoder('ColorNameEncoder()')

### Load data

In [None]:
# ARC-AGI 2024 evaluation challenges, indexed below by task_id
# (loaded together with their solutions, per the loader's name).
dataset = load_arc_dataset_with_solutions('/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_evaluation_challenges.json')

In [None]:
# Metrics of the run whose predictions will be refined ('64i' folder);
# only the last row of metrics.csv is displayed.
folder = '/mnt/hdd0/Kaggle/arc25/trainings/2025-10-08-generate-predictions-to-refine/64i'
df = pd.read_csv(os.path.join(folder, 'metrics.csv'))
df.tail(1)

Pass rate is below 2%, thus it should be easy to pick predictions that did not solve the task.

In [None]:
# Raw predictions of that run: a dict mapping task_id -> list of prediction
# records (this is how curate_predictions consumes it).
predictions = load_json(os.path.join(folder, 'results.json.gz'))

In [None]:
# Number of prediction records stored for task '00576224'.
len(predictions['00576224'])

In [None]:
# Inspect the first prediction record of that task to see its fields.
predictions['00576224'][0]

In [None]:
def curate_predictions(predictions):
    """Select, rank and de-duplicate the predictions worth refining.

    For every task:
      1. keep only predictions that generate valid outputs (they carry a
         'train_correct_grids' value) but do not solve the train set
         completely ('train_correct_grids' < 1.0),
      2. sort them from best to worst by
         ('train_correct_grids', 'train_pixel_score'),
      3. drop predictions whose 'fingerprint' was already seen, keeping the
         best-ranked occurrence.

    Args:
        predictions: dict mapping task_id -> list of prediction dicts.

    Returns:
        dict mapping task_id -> curated list of prediction dicts. Tasks
        without any candidate map to an empty list.
    """
    curated_predictions = dict()
    for task_id, task_predictions in tqdm(predictions.items(), total=len(predictions), desc="Curating predictions"):
        candidates = [
            prediction for prediction in task_predictions
            if 'train_correct_grids' in prediction and prediction['train_correct_grids'] < 1.0
        ]
        candidates.sort(key=lambda x: (x['train_correct_grids'], x['train_pixel_score']), reverse=True)

        # A running set makes the de-duplication linear instead of rebuilding
        # the set of kept fingerprints for every candidate.
        seen_fingerprints = set()
        unique_task_predictions = []
        for prediction in candidates:
            fingerprint = prediction['fingerprint']
            if fingerprint in seen_fingerprints:
                continue
            seen_fingerprints.add(fingerprint)
            unique_task_predictions.append(prediction)
        curated_predictions[task_id] = unique_task_predictions
    return curated_predictions

# Per task: unsolved, valid predictions, best first, one per fingerprint.
curated_predictions = curate_predictions(predictions)

In [None]:
# Number of predictions left for task '00576224' after curation.
len(curated_predictions['00576224'])

In [None]:
# Distribution of how many curated (refinable) predictions each task has.
curated_counts = [len(task_predictions) for task_predictions in curated_predictions.values()]
fig, ax = plt.subplots()
ax.hist(curated_counts)
ax.set(
    title='Curated predictions per task',
    xlabel='Number of curated predictions',
    ylabel='Number of tasks',
);

In [None]:
# Last curated prediction of the task, i.e. the lowest-ranked one under the
# descending (train_correct_grids, train_pixel_score) sort.
curated_predictions['00576224'][-1]

In [None]:
# Interactive exploration aid: print the signature/docstring of
# create_prompt_from_task.
help(create_prompt_from_task)

### Create prompts

In [None]:
# batch_size: completions sampled per prompt (used as SamplingParams(n=...)).
# prompts_per_task: refinement prompts built for every task.
batch_size = 8
prompts_per_task = 8
# `prompts` has one entry per prompt, while `inference_task_ids` and
# `data_augmentation_params` have one entry per expected completion
# (batch_size entries per prompt).
prompts, inference_task_ids, data_augmentation_params = [], [], []
for task_id, task_predictions in tqdm(curated_predictions.items(), total=len(curated_predictions), desc="Creating prompts"):
    if not task_predictions:
        print(f"No valid predictions for task {task_id}, skipping...")
        # Without this the modulo below divides by zero for tasks that have
        # no curated predictions.
        continue
    for prediction_idx in range(prompts_per_task):
        # Cycle through the curated predictions when a task has fewer than
        # prompts_per_task of them.
        prediction = task_predictions[prediction_idx % len(task_predictions)]

        # Work on a copy so the stored prediction is not modified in place.
        data_augmentation_kwargs = dict(prediction['data_augmentation_params'])
        # JSON object keys are always strings: restore the integer colors.
        data_augmentation_kwargs['color_map'] = {int(k): v for k, v in data_augmentation_kwargs['color_map'].items()}
        augmented_task = apply_data_augmentation(dataset[task_id], **data_augmentation_kwargs)

        prompt = create_refine_prompt(
            augmented_task, grid_encoder, tokenizer,
            prediction['text_prediction'],
            prediction['output_grids'])
        prompts.append(prompt)
        inference_task_ids.extend([task_id] * batch_size)
        data_augmentation_params.extend([data_augmentation_kwargs] * batch_size)
len(prompts), len(inference_task_ids), len(data_augmentation_params)

### Inference

In [None]:
# Sample `batch_size` completions per prompt (n=batch_size) with
# temperature 1.0 and top_p 0.95; max_tokens=1024 per completion.
sampling_params = SamplingParams(n=batch_size, temperature=1.0, top_p=0.95, max_tokens=1024)
generations = llm.generate(prompts, sampling_params)

In [None]:
# Flatten the completions in prompt order, then sample order within a prompt,
# so that they line up with inference_task_ids / data_augmentation_params.
text_predictions = [
    completion.text
    for request_output in generations
    for completion in request_output.outputs
]

In [None]:
# Executor (arc25.parallel_code_execution) used to run the generated code.
code_runner = CodeRunner()

In [None]:
# Run every generated program on its task; the task list, task ids, text
# predictions and augmentation params are parallel (one entry per completion).
results = code_runner.run(
    [dataset[task_id] for task_id in inference_task_ids],
    task_ids=inference_task_ids,
    text_predictions=text_predictions,
    data_augmentation_params=data_augmentation_params,
)

In [None]:
# Turn the grids into plain lists before saving the results as JSON.
# The hasattr guard keeps this cell re-runnable: on a second execution the
# grids are already lists and have no .tolist() method.
# NOTE(review): grids are presumably numpy arrays - confirm.
refinement_results_path = '/mnt/hdd0/Kaggle/arc25/predictions/2025-10-11-refinement/results.json.gz'
for task_id, task_results in results.items():
    for result in task_results:
        for key in ['input_grids', 'output_grids', 'test_output_grids']:
            if key in result:
                result[key] = [grid.tolist() if hasattr(grid, 'tolist') else grid for grid in result[key]]
# Make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(refinement_results_path), exist_ok=True)
write_json(results, refinement_results_path)

### Compare against the baseline

In [None]:
# Baseline to beat: last metrics row of the '128i' run.
folder = '/mnt/hdd0/Kaggle/arc25/trainings/2025-10-08-generate-predictions-to-refine/128i'
df = pd.read_csv(os.path.join(folder, 'metrics.csv'))
df.tail(1)

In [None]:
# Last metrics row of the '64i' run. `folder` is left pointing at the 64i run
# on purpose: the next cell loads its results.json.gz from there.
folder = '/mnt/hdd0/Kaggle/arc25/trainings/2025-10-08-generate-predictions-to-refine/64i'
df = pd.read_csv(os.path.join(folder, 'metrics.csv'))
df.tail(1)

In [None]:
# Merge, per task, the initial 64i predictions with the refinement ones.
# Tasks without refinement results keep only their initial predictions.
predictions_64i = load_json(os.path.join(folder, 'results.json.gz'))
predictions_refinement = load_json('/mnt/hdd0/Kaggle/arc25/predictions/2025-10-11-refinement/results.json.gz')
combined_predictions = {
    tid: initial_predictions + predictions_refinement.get(tid, [])
    for tid, initial_predictions in predictions_64i.items()
}

In [None]:
# Aggregate the metrics over initial + refinement predictions and show the
# last row, to be compared against the 128i baseline above.
metrics = aggregate_metrics(combined_predictions)
metrics.tail(1)

| initial predictions | refinement predictions | valid code | valid outputs | unique outputs | train_pixel_score | train_correct_grids | train_pass_rate | train_is_correct | test_pixel_score | test_correct_grids | test_pass_rate | test_is_correct | is_correct |
|---------------------|------------------------|------------|---------------|----------------|-------------------|---------------------|-----------------|------------------|------------------|--------------------|----------------|-----------------|------------|
| 128                 | 0                      | **99.9%**  | 71.7%         | **49.8%**      | 42.1%             | **2.4%**            | **1.6%**        | 16.3%            | 40.9%            | **2.0%**           | **2.0%**       | **23.0%**       | **16.3%**  |
| 64                  | 64                     | 99.7%      | **74.0%**     | 43.7%          | **45.8%**         | 2.1%                | 1.1%            | **16.5%**        | **44.4%**        | 1.7%               | 1.6%           | 21.5%           | 16.0%      |

### Predictions

## TODO

- [x] How to create a prompt for refinement. Start with the base prompt, then add the response, then the output and requirement to refine