# Pipeline: LLM-powered program generation for solving ARC-AGI

## Imports

In [40]:
import numpy as np
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import ast
import re
import importlib.util
import time

## Prompts

### Shared Variables

The system prompt contains basic information essential for every prompt sent to the LLM.

In [None]:
# System message sent with every request in call_gpt: it sets the model's role
# and gives the legend that maps the grid integers 0-9 to color names.
system_prompt = """
You are a visual reasoning and Python programming expert solving ARC-AGI (Abstraction and Reasoning Corpus - Artificial General Intelligence) tasks.

Each integer in the grid represents a color:
0 = black, 1 = blue, 2 = red, 3 = green, 4 = yellow,
5 = grey, 6 = pink, 7 = orange, 8 = light blue, 9 = brown.
"""

### Base Prompt

The following prompt tasks the LLM with program generation.

In [42]:
# Instructions for program generation. The pipeline passes this text to
# add_tasks, which appends the task's demonstration pairs before it is sent.
base_prompt = """
Write a Python function that correctly transforms each input grid into its corresponding output grid based on the given examples.

- ONLY return code. No explanations or anything other than code.
- The function must be named: `solve(grid: List[List[int]]) -> List[List[int]]`
- Use only pure Python — do not import or use libraries like NumPy
- Do not include comments, explanations, or print statements
- Do not hard-code values or specific grid sizes — the function must generalize based on the patterns in the examples
- The function must return a plain 2D list of integers with consistent row lengths (List[List[int]])
- Do not return arrays, nested arrays, floats, or 3D structures
- Ensure your solution works for all provided input-output pairs
"""

In [43]:
# Display the base prompt text for inspection.
print(base_prompt)


Write a Python function that correctly transforms each input grid into its corresponding output grid based on the given examples.

- ONLY return code. No explanations or anything other than code.
- The function must be named: `solve(grid: List[List[int]]) -> List[List[int]]`
- Use only pure Python — do not import or use libraries like NumPy
- Do not include comments, explanations, or print statements
- Do not hard-code values or specific grid sizes — the function must generalize based on the patterns in the examples
- The function must return a plain 2D list of integers with consistent row lengths (List[List[int]])
- Do not return arrays, nested arrays, floats, or 3D structures
- Ensure your solution works for all provided input-output pairs



## Functions

### Load Tasks

Loads the tasks from the specified folder.

In [44]:
def load_tasks(folder):
    """Read every ``.json`` file in ``folder`` and return the parsed tasks.

    Files are visited in sorted filename order; files with any other
    extension are ignored.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A list of dicts, one per file, with the keys ``"filename"`` (the
        file's name) and ``"data"`` (the parsed JSON content).
    """
    json_names = [name for name in sorted(os.listdir(folder)) if name.endswith(".json")]
    loaded = []
    for name in json_names:
        with open(os.path.join(folder, name), "r") as handle:
            loaded.append({"filename": name, "data": json.load(handle)})
    return loaded

### Load API Key

Loads the API key from the .env file.

In [45]:
def load_api_key(file_path="key.env"):
    """Load the OpenAI API key and create the module-level ``client``.

    ``file_path`` is handed to ``load_dotenv``; afterwards the
    ``OPENAI_API_KEY`` environment variable is read and stored on
    ``openai.api_key``. A warning is printed when the variable is empty or
    missing. In every case a new ``OpenAI()`` instance is bound to the global
    name ``client``, which ``call_gpt`` uses.

    Args:
        file_path: Path of the env file passed to ``load_dotenv``.
    """
    global client

    load_dotenv(file_path)
    import openai

    api_key = os.getenv("OPENAI_API_KEY")
    openai.api_key = api_key
    if not api_key:
        print("No API key found. Please set OPENAI_API_KEY in key.env.")

    client = OpenAI()

### Call GPT

Used to send requests to the LLM.

In [None]:
import time
import openai

def call_gpt(prompt, model="o4-mini", retries=10):
    """Send ``prompt`` to the chat model and return the stripped text reply.

    The request consists of the global ``system_prompt`` as system message and
    ``prompt`` as user message, sent through the global ``client`` (created by
    ``load_api_key``). Only rate-limit errors are retried; any other exception
    propagates to the caller immediately.

    Args:
        prompt: User message to send.
        model: Name of the model to query.
        retries: Maximum number of attempts.

    Returns:
        The content of the first choice with surrounding whitespace removed.

    Raises:
        Exception: If every attempt hit the rate limit. The last rate-limit
            error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                # Only for GPT-4o
                # temperature=0.0
            )
            return response.choices[0].message.content.strip()

        except openai.RateLimitError as e:
            last_error = e
            # There is nothing left to retry after the final attempt, so do
            # not waste a back-off sleep before raising.
            if attempt == retries - 1:
                break
            # Linear back-off: 5 s, 10 s, 15 s, ...
            wait_time = 5 + attempt * 5
            print(f"Rate limit hit. Waiting {wait_time} seconds before retrying...")
            time.sleep(wait_time)

    raise Exception("Rate limit retries exhausted.") from last_error

### Building and Combining Prompts

#### Add Tasks to the Prompt

Adds the task data to the prompt.

In [47]:
def add_tasks(prompt, task_data):
    """Append the task's demonstration pairs to ``prompt``.

    The prompt is stripped of surrounding whitespace, followed by a heading
    line and then one "Train Input k / Train Output k" section (numbered from
    1) for every entry of ``task_data['train']``.

    Args:
        prompt: Instruction text placed at the start of the result.
        task_data: Task dict whose ``'train'`` list holds pairs with the keys
            ``'input'`` and ``'output'``.

    Returns:
        The combined prompt string.
    """
    header = prompt.strip() + "\n\nHere are the demonstration pairs (JSON data):\n"
    sections = [
        f"\nTrain Input {i+1}: {pair['input']}\n" + f"Train Output {i+1}: {pair['output']}\n"
        for i, pair in enumerate(task_data['train'])
    ]
    return header + "".join(sections)

### Save Programs

The following function saves each generated program in the specified folder, in a sub-folder named after the task. Before saving, the LLM's response is cleaned (Markdown code fences such as ```python are removed) so that only the code itself is written to the file.

In [None]:
def save_program(program_text, actual_task_id, suffix="", base_folder="Candidate_programs_basic_prompts_4"):
    """Write an LLM response to ``<base_folder>/<task id>/solution_v<N><suffix>.py``.

    Markdown code fences (```` ```python ```` / ```` ``` ````) are removed from
    the response before it is written. ``N`` is one more than the highest
    version number found among the ``solution_v<N>...`` entries that already
    exist in the task folder (1 if there are none).

    Args:
        program_text: Raw response text returned by the LLM.
        actual_task_id: Task identifier, used as the sub-folder name.
        suffix: Optional text inserted between the version number and ``.py``.
        base_folder: Root folder for all candidate programs. The default is
            the folder the rest of the notebook reads from.
    """
    task_folder = os.path.join(base_folder, actual_task_id)
    os.makedirs(task_folder, exist_ok=True)

    # Clean the LLM's response (e.g. remove ```python or ```)
    cleaned_text = re.sub(r"^```(?:python)?\s*|```$", "", program_text.strip(), flags=re.MULTILINE)

    # Determine program version for naming (one regex pass per existing entry)
    version_matches = (re.match(r"solution_v(\d+)", fname) for fname in os.listdir(task_folder))
    version_numbers = [int(match.group(1)) for match in version_matches if match]
    next_version = max(version_numbers, default=0) + 1

    # Save program to file
    file_path = os.path.join(task_folder, f"solution_v{next_version}{suffix}.py")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text.strip())

    print(f"Saved program for task {actual_task_id} as version {next_version}{suffix}: {file_path}")

### Create Programs

The following function sends the program-generation prompt to the LLM `amount` times and saves each response.

In [None]:
def create_programs(tailored_prompt, actual_task_id, amount):
    """Request ``amount`` candidate programs and save each response.

    Every iteration sends ``tailored_prompt`` through ``call_gpt`` and hands
    the reply to ``save_program`` together with ``actual_task_id``.

    Args:
        tailored_prompt: Complete prompt text to send.
        actual_task_id: Task identifier forwarded to ``save_program``.
        amount: Number of responses to request.
    """
    remaining = amount
    while remaining > 0:
        generated = call_gpt(tailored_prompt)
        save_program(generated, actual_task_id)
        remaining -= 1

### Evaluate Programs

Loads and executes a Python program from a specified file path.

In [None]:
def load_program(file_path):
    """Execute the Python file at ``file_path`` and return its ``solve`` attribute.

    The file is loaded as a module named ``"program"`` via ``importlib``; its
    top-level code runs as part of loading.

    Args:
        file_path: Path of the Python source file.

    Returns:
        The object bound to the name ``solve`` in the executed module.
    """
    module_spec = importlib.util.spec_from_file_location("program", file_path)
    loaded_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded_module)
    solver = loaded_module.solve
    return solver

Checks if the program is valid Python code.

In [None]:
def is_valid_python_code(filepath):
    """Return True if the file at ``filepath`` parses as Python source.

    The file is read as UTF-8 and passed to ``ast.parse``; nothing is
    executed.

    Args:
        filepath: Path of the file to check.

    Returns:
        True if parsing succeeds, False if the content is not valid Python
        or cannot be decoded as UTF-8.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            source = f.read()
        ast.parse(source)
        return True
    except (SyntaxError, ValueError):
        # ValueError covers UnicodeDecodeError (file is not valid UTF-8) and
        # the "source contains null bytes" error older Python versions raise.
        # Either way the file is not usable code, so report it as invalid
        # instead of letting the exception abort the caller.
        return False

The following function runs the evaluation of the generated programs. This includes:

- Checking if the program is valid Python code and deleting it if it isn't. 
- Checking whether any valid (i.e., loadable) program exists among the generated ones. If none does, the function returns 0 and the pipeline creates more programs until at least one can be loaded (correctness of the programs doesn't matter for this check, only that they can be executed).
- Comparing the generated outputs with the correct outputs.
- Calculating a score for each program.

The score is the number of correctly transformed demonstration pairs divided by the total number of demonstration pairs of a task. This score is used to determine whether a program should be revised later on (score < 1).

In [None]:
def _program_sort_key(filename):
    """Order ``solution_v<N>...`` files by N; other names sort after them by name."""
    match = re.match(r"solution_v(\d+)", filename)
    if match:
        return (0, int(match.group(1)), filename)
    return (1, 0, filename)


def evaluate_programs(task_data, task_folder):
    """Run every ``.py`` file in ``task_folder`` against the training pairs.

    Files that fail ``is_valid_python_code`` or raise while being loaded with
    ``load_program`` are deleted from disk. Each remaining program is called
    once per training pair and its output is compared with the expected grid.

    Programs are processed in ascending version order (``solution_v1``,
    ``solution_v2``, ..., ``solution_v10``), so the order of the result list
    is deterministic and does not depend on the directory listing order.

    Args:
        task_data: Task dict whose ``'train'`` list holds pairs with the keys
            ``'input'`` and ``'output'``.
        task_folder: Directory containing the candidate program files.

    Returns:
        A list with one dict per loadable program (keys ``program_name``,
        ``score``, ``correct_pairs``, ``total_pairs``, ``details``), or the
        integer ``0`` if no program could be loaded.
    """
    programs = []
    program_files = sorted(
        (f for f in os.listdir(task_folder) if f.endswith(".py")),
        key=_program_sort_key,
    )

    # Track whether any valid programs exist
    any_valid = False

    for program_file in program_files:
        program_path = os.path.join(task_folder, program_file)

        # Check if the program is valid Python code; if not, delete it
        if not is_valid_python_code(program_path):
            print(f"Deleting invalid file: {program_file}")
            os.remove(program_path)
            continue

        # NOTE: loading executes LLM-generated code inside this process.
        try:
            solve_function = load_program(program_path)
        except Exception as e:
            print(f"Error loading program {program_file}: {e}")
            os.remove(program_path)
            continue

        # Set to true if at least one valid program is found
        any_valid = True

        details = []
        correct_count = 0
        total_pairs = len(task_data['train'])

        # Evaluate programs against the training pairs
        for pair in task_data['train']:
            input_grid = pair['input']
            expected_output = pair['output']
            try:
                # Hand the candidate its own copy of the grid: generated code
                # often edits the grid in place, which would otherwise corrupt
                # the task data seen by every program evaluated afterwards.
                candidate_output = solve_function([list(row) for row in input_grid])
                if np.array_equal(np.array(candidate_output), np.array(expected_output)):
                    correct_count += 1
            except Exception as e:
                candidate_output = f"Error: {e}"
            details.append({
                "input": input_grid,
                "candidate_output": candidate_output,
                "expected_output": expected_output
            })

        # Calculate the score and store the results
        score = correct_count / total_pairs if total_pairs > 0 else 0
        programs.append({
            "program_name": program_file,
            "score": score,
            "correct_pairs": correct_count,
            "total_pairs": total_pairs,
            "details": details
        })

    return programs if any_valid else 0

### Generation of Predictions on Test Inputs

#### Identification of Best Programs

Selects the two best-performing programs based on their score (= performance on the demonstration pairs). If scores are equal over multiple programs, the first few programs will be picked (e.g., if all are 0, then solution_v1 and solution_v2 will be picked).

In [None]:
def get_best_programs(evaluation_results, actual_task_id, n=2, base_folder="Candidate_programs_basic_prompts_4"):
    """Return the file paths of the ``n`` highest-scoring programs.

    Args:
        evaluation_results: List of result dicts as produced by
            ``evaluate_programs`` (each with ``'score'`` and
            ``'program_name'``). The ``0`` that ``evaluate_programs`` returns
            when nothing could be loaded is accepted and yields an empty list.
        actual_task_id: Task identifier, used as the sub-folder name.
        n: Maximum number of programs to select.
        base_folder: Root folder of the candidate programs.

    Returns:
        A list of at most ``n`` paths, best score first. Programs with equal
        scores keep their relative order from ``evaluation_results``.
    """
    # evaluate_programs signals "no loadable program" with 0, which cannot be sorted.
    if not evaluation_results:
        return []

    # sorted() is stable, so ties keep their original order.
    ranked = sorted(evaluation_results, key=lambda result: result['score'], reverse=True)
    task_folder = os.path.join(base_folder, actual_task_id)
    best_program_files = [os.path.join(task_folder, result['program_name']) for result in ranked[:n]]
    for path in best_program_files:
        print(f"Best program: {path}")
    return best_program_files

#### Creation of Predictions on Test Inputs

The following function loads the two best-performing programs and uses them to create predictions for the test inputs of the task. The resulting outputs are stored in a submission dictionary, which is later merged into the submission JSON file.

In [None]:
def generate_test_predictions(task_data, actual_task_id, best_program_files):
    """Run the selected programs on every test input of a task.

    Args:
        task_data: Task dict whose ``'test'`` list holds entries with the key
            ``'input'``.
        actual_task_id: Task identifier; used (as a string) as the key of the
            returned dict.
        best_program_files: Paths of the programs to run, loaded with
            ``load_program``. The k-th program produces ``attempt_k``.

    Returns:
        ``{task id: [ {"attempt_1": ..., "attempt_2": ...}, ... ]}`` with one
        dict per test input. If a program raises, its attempt holds the string
        ``"Error: <message>"`` instead of a grid.
    """
    # Load candidate programs
    best_solvers = [load_program(prog_file) for prog_file in best_program_files]

    predictions = []
    # Iterate over each test pair and generate predictions
    for pair in task_data["test"]:
        input_grid = pair["input"]
        attempt_predictions = {}
        for idx, solver in enumerate(best_solvers, start=1):
            try:
                # Each solver gets its own copy of the grid so that a program
                # editing the grid in place cannot change what the next
                # solver (or the task data itself) sees.
                output = solver([list(row) for row in input_grid])
            except Exception as e:
                output = f"Error: {e}"
            attempt_predictions[f"attempt_{idx}"] = output
        predictions.append(attempt_predictions)

    # Save the predictions in the submission format
    submission = {str(actual_task_id): predictions}

    return submission

## Pipeline

The following is the flow of the pipeline. All the above functions and prompts are used and work together to create predictions for test inputs. The predictions are saved in a submission.json file for calculating the accuracies.

In [None]:
# Load tasks and API key
tasks = load_tasks("evaluation_set")
load_api_key()


# Load existing submission file if it exists, so results of earlier runs are kept
submission_file = "submission_basic_prompts_4.json"
if os.path.exists(submission_file):
    with open(submission_file, "r") as f:
        final_submission = json.load(f)
else:
    final_submission = {}


# Loop through each task (adjustable range)
for task in tasks[50:100]:
    actual_task_id = task["filename"].split(".")[0]

    ### PROMPT CREATION ###
    full_prompt = add_tasks(base_prompt, task['data'])
    create_programs(full_prompt, actual_task_id, amount=2)  # Create first 2 programs


    ### INITIAL EVALUATION ###
    task_folder = os.path.join("Candidate_programs_basic_prompts_4", actual_task_id)
    evaluation_results = evaluate_programs(task['data'], task_folder)


    # evaluate_programs returns 0 when no program could be loaded: create more and retry.
    # NOTE(review): this loop has no upper bound on the number of API calls.
    while evaluation_results == 0:
        create_programs(full_prompt, actual_task_id, amount=2)
        evaluation_results = evaluate_programs(task['data'], task_folder)


    ### CONTINUED PROGRAM CREATION UNTIL 2 CORRECT OR 4 TOTAL PROGRAMS ###
    # "Correct" means score == 1.0, i.e. all demonstration pairs are solved.
    correct_programs = sum(1 for result in evaluation_results if result['score'] == 1.0)
    number_of_programs = len(evaluation_results)

    while (correct_programs < 2) and (number_of_programs < 4):
        create_programs(full_prompt, actual_task_id, amount=2)
        evaluation_results = evaluate_programs(task['data'], task_folder)

        if evaluation_results == 0:
            continue

        correct_programs = sum(1 for result in evaluation_results if result['score'] == 1.0)
        number_of_programs = len(evaluation_results)


    ### FINAL EVALUATION ###
    evaluation_results_2 = evaluate_programs(task['data'], task_folder)
    if evaluation_results_2 == 0:
        # Nothing can be iterated or ranked in this case; skip instead of crashing.
        print(f"Task {actual_task_id}: no loadable program available, skipping predictions.")
        continue

    print(f"Task {actual_task_id} evaluation results:")
    for result in evaluation_results_2:
        print(f"Program {result['program_name']} solved {result['correct_pairs']} out of {result['total_pairs']} pairs. Score: {result['score']:.2f}")
        print("="*50)


    ### PROGRAM SELECTION + PREDICTIONS ###
    best_program_files = get_best_programs(evaluation_results_2, actual_task_id, n=2)
    submission = generate_test_predictions(task['data'], actual_task_id, best_program_files)
    final_submission.update(submission)

    # Save after every task so an error or interruption later in the run does
    # not discard the predictions (and API spend) of the tasks already done.
    # Serialize first: if that fails, the existing file is left untouched.
    serialized_submission = json.dumps(final_submission)
    with open(submission_file, "w") as f:
        f.write(serialized_submission)


# Final output file (also written when the task range above is empty)
with open(submission_file, "w") as f:
    json.dump(final_submission, f)


Saved program for task 963f59bc as version 1: Candidate_programs_basic_prompts_4\963f59bc\solution_v1.py
Saved program for task 963f59bc as version 2: Candidate_programs_basic_prompts_4\963f59bc\solution_v2.py
Saved program for task 963f59bc as version 3: Candidate_programs_basic_prompts_4\963f59bc\solution_v3.py
Saved program for task 963f59bc as version 4: Candidate_programs_basic_prompts_4\963f59bc\solution_v4.py
Task 963f59bc evaluation results:
Program solution_v1.py solved 0 out of 4 pairs. Score: 0.00
Program solution_v2.py solved 1 out of 4 pairs. Score: 0.25
Program solution_v3.py solved 0 out of 4 pairs. Score: 0.00
Program solution_v4.py solved 0 out of 4 pairs. Score: 0.00
Best program: Candidate_programs_basic_prompts_4\963f59bc\solution_v2.py
Best program: Candidate_programs_basic_prompts_4\963f59bc\solution_v1.py
Saved program for task 96a8c0cd as version 1: Candidate_programs_basic_prompts_4\96a8c0cd\solution_v1.py
Saved program for task 96a8c0cd as version 2: Candidate