## Pipeline: LLM-powered program generation for solving ARC-AGI

### Imports

In [106]:
import numpy as np
import ollama
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import re
import importlib.util

### Shared Variables

In [107]:
# Shared system prompt for all tasks. call_gpt sends it as the "system"
# message of every chat request; it sets the model's role and gives the
# integer-to-color legend for grid cells.
system_prompt = """
You are a visual reasoning and Python programming expert solving ARC-AGI (Abstraction and Reasoning Corpus - Artificial General Intelligence) tasks.

Each integer in the grid represents a color:
0 = black, 1 = blue, 2 = red, 3 = green, 4 = yellow,
5 = grey, 6 = pink, 7 = orange, 8 = light blue, 9 = brown.
"""


### Prompts

#### Basic Prompt

In [108]:
# Base code-generation prompt: instructs the model to return a `solve`
# function only. build_prompts appends it after the model's implementation
# reflection to form the task-tailored prompt.
base_prompt = """
Write a Python function that correctly transforms each input grid into its corresponding output grid based on the given examples.

- The function must be named: `solve(grid: List[List[int]]) -> List[List[int]]`
- Include only the code and necessary imports (e.g., `import numpy as np`)
- Do not include comments, explanations, or print statements
- Do not hard-code values or specific grid sizes — the function must generalize based on the patterns in the examples
- Ensure your solution works for all provided input-output pairs
"""

In [109]:
print(base_prompt)


Write a Python function that correctly transforms each input grid into its corresponding output grid based on the given examples.

- The function must be named: `solve(grid: List[List[int]]) -> List[List[int]]`
- Include only the code and necessary imports (e.g., `import numpy as np`)
- Do not include comments, explanations, or print statements
- Do not hard-code values or specific grid sizes — the function must generalize based on the patterns in the examples
- Ensure your solution works for all provided input-output pairs



#### Prompt 1

In [110]:
# Secondary prompt 1: asks for a short bullet list of visual observations
# about the training pairs. It is the first request issued by build_prompts.
prompt_1 = """
List visual observations from the training pairs.

- Use bullet points (max 10).
- Focus on colors, shapes, object counts, positions, and differences.
- Avoid reasoning or explanations.
- Be concise. No full sentences, no extra formatting.
"""

In [111]:
print(prompt_1)


List visual observations from the training pairs.

- Use bullet points (max 10).
- Focus on colors, shapes, object counts, positions, and differences.
- Avoid reasoning or explanations.
- Be concise. No full sentences, no extra formatting.



#### Prompt 2

In [112]:
# Secondary prompt 2: asks for a prose description of the input-to-output
# transformation. build_prompts combines it with the response to prompt 1
# before sending it.
prompt_2 = """
Describe the transformation(s) from input to output grids.

- Use 3 to 5 short sentences.
- Focus on what changes: movement, color, shape, duplication, etc.
- Mention if the transformation is based on position, context, or rules.
- Avoid implementation hints or code.
"""

In [113]:
print(prompt_2)


Describe the transformation(s) from input to output grids.

- Use 3 to 5 short sentences.
- Focus on what changes: movement, color, shape, duplication, etc.
- Mention if the transformation is based on position, context, or rules.
- Avoid implementation hints or code.



#### Prompt 3

In [114]:
# Secondary prompt 3: asks the model to reflect on an implementation approach
# without writing code. build_prompts combines it with the responses to
# prompts 1 and 2 before sending it.
prompt_3 = """
Reflect on how you would solve the task in Python.

- Use 3 to 5 sentences.
- Mention your overall approach, logical steps, and possible uncertainties.
- Do not return code or pseudocode.
"""

In [115]:
print(prompt_3)


Reflect on how you would solve the task in Python.

- Use 3 to 5 sentences.
- Mention your overall approach, logical steps, and possible uncertainties.
- Do not return code or pseudocode.



#### Prompt 4

In [116]:
#TODO: Add outputs of secondary prompts to other secondary prompts (especially for prompt 3) Maybe "buildPrompt" function.
#TODO: Create Revision prompt.

In [117]:
# Placeholder for a fourth prompt; currently empty and not referenced by the
# pipeline code visible in this notebook.
prompt_4 = ""

### Functions

#### Load Tasks

In [118]:
def load_tasks(folder):
    """Load every ``.json`` task file found directly inside ``folder``.

    Args:
        folder: Path of the directory that holds the task files.

    Returns:
        A list of ``{"filename": <file name>, "data": <parsed JSON>}`` dicts,
        ordered by file name.

    Raises:
        OSError: If ``folder`` cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a task file does not contain valid JSON.
    """
    tasks = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".json"):
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding, which differs between operating systems.
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            data = json.load(f)
        tasks.append({"filename": filename, "data": data})
    return tasks

#### Load API-Key

In [119]:
def load_api_key(file_path="key.env"):
    """Read the OpenAI API key from a dotenv file and create the global client.

    Loads the variables in ``file_path`` via ``load_dotenv``, copies
    ``OPENAI_API_KEY`` onto the ``openai`` module, and binds a new ``OpenAI()``
    instance to the module-level name ``client`` (used by ``call_gpt``).

    Args:
        file_path: Path of the dotenv file to load. Defaults to ``"key.env"``.

    Note:
        A missing key only produces a printed message here; the client is
        still constructed afterwards.
    """
    import openai

    global client

    load_dotenv(file_path)
    key_from_env = os.getenv("OPENAI_API_KEY")
    openai.api_key = key_from_env
    if not key_from_env:
        print("No API key found. Please set OPENAI_API_KEY in key.env.")
    client = OpenAI()

#### Call GPT

In [120]:
def call_gpt(prompt):
    """Send ``prompt`` to the chat model and return the reply text.

    The request goes through the module-level ``client`` to the ``o3-mini``
    model, with the shared ``system_prompt`` as the system message and
    ``prompt`` as the single user message.

    Args:
        prompt: The user message to send.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model="o3-mini", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

#### Building and Combining Prompts

Adds the task's demonstration pairs to the prompt:

In [121]:
def add_tasks(prompt, task_data):
    """Append the task's training pairs to ``prompt``.

    Args:
        prompt: Instruction text; surrounding whitespace is stripped.
        task_data: Task dict whose ``'train'`` entry is a sequence of pairs,
            each with ``'input'`` and ``'output'`` keys.

    Returns:
        The stripped prompt, a header line, and one numbered
        "Train Input" / "Train Output" block per pair (numbering starts at 1).
    """
    pieces = [prompt.strip(), "\n\nHere are the demonstration pairs (JSON data):\n"]
    for number, example in enumerate(task_data['train'], start=1):
        pieces.append(f"\nTrain Input {number}: {example['input']}\n")
        pieces.append(f"Train Output {number}: {example['output']}\n")
    return "".join(pieces)

Combines the response to secondary prompt 1 with the template of secondary prompt 2:

In [122]:
def combine_prompts_1_and_2(prompt_1_response, prompt_2_template):
    """Build the prompt-2 request from its template and the prompt-1 response.

    Args:
        prompt_1_response: Model output for prompt 1 (visual observations).
        prompt_2_template: Instruction text of prompt 2.

    Returns:
        The stripped template, an introductory line, the stripped
        observations, and a closing instruction, separated by blank lines.
    """
    sections = (
        prompt_2_template.strip(),
        "Here are visual observations of the task at hand, that may assist you in identifying the transformation:",
        prompt_1_response.strip(),
        "Now provide your transformation analysis based on these observations.",
    )
    return "\n\n".join(sections)

Combines the responses to secondary prompts 1 and 2 with the template of secondary prompt 3:

In [123]:
def combine_prompts_1_2_and_3(prompt_1_response, prompt_2_response, prompt_3_template):
    """Build the prompt-3 request from its template and the two earlier responses.

    Args:
        prompt_1_response: Model output for prompt 1 (visual observations).
        prompt_2_response: Model output for prompt 2 (transformation rules).
        prompt_3_template: Instruction text of prompt 3.

    Returns:
        The stripped template followed by a labelled observations section, a
        labelled rules section, and a closing instruction. The result ends
        with a single newline.
    """
    instructions = prompt_3_template.strip()
    observations = prompt_1_response.strip()
    rules = prompt_2_response.strip()

    blocks = [
        instructions,
        "Here are visual observations of the task that may help inform your implementation:\n" + observations,
        "Here are the transformation rules that have been identified based on the task:\n" + rules,
        "Now reflect on how you would implement a solution to this task in Python, following the instructions above.\n",
    ]
    return "\n\n".join(blocks)


Combines the response to secondary prompt 3 with the base prompt:

In [124]:
def combine_prompts_3_and_base(prompt_3_response, prompt_base_template):
    """Prefix the base prompt with the model's implementation reflection.

    Args:
        prompt_3_response: Model output for prompt 3 (implementation reflection).
        prompt_base_template: Instruction text of the base prompt.

    Returns:
        "Implementation Reflection:" followed by the stripped reflection, a
        blank line, and the stripped base prompt; the whole text is stripped
        of surrounding whitespace.
    """
    reflection = prompt_3_response.strip()
    instructions = prompt_base_template.strip()
    merged = "Implementation Reflection:\n" + reflection + "\n\n" + instructions
    return merged.strip()

Combines the responses to the secondary prompts with the base prompt to create a task-tailored prompt.

In [125]:
def build_prompts(task_data):
    """Run the three secondary prompts in sequence and build the tailored prompt.

    Each stage appends the task's training pairs via ``add_tasks`` and feeds
    the previous responses forward: observations (prompt 1), then the
    transformation description (prompt 2), then the implementation reflection
    (prompt 3). A progress line is printed after each model call.

    Args:
        task_data: Task dict passed through to ``add_tasks``.

    Returns:
        The base prompt prefixed with the prompt-3 response and followed by
        the training pairs. This final prompt is returned, not sent.
    """
    # Stage 1: visual observations
    observations = call_gpt(add_tasks(prompt_1, task_data))
    print('Built prompt 1')

    # Stage 2: transformation description, informed by the observations
    transformation_request = add_tasks(
        combine_prompts_1_and_2(observations, prompt_2), task_data
    )
    transformation = call_gpt(transformation_request)
    print('Built prompt 2')

    # Stage 3: implementation reflection, informed by both earlier responses
    reflection_request = add_tasks(
        combine_prompts_1_2_and_3(observations, transformation, prompt_3), task_data
    )
    reflection = call_gpt(reflection_request)
    print('Built prompt 3')

    # Final: reflection + base prompt + training pairs
    return add_tasks(combine_prompts_3_and_base(reflection, base_prompt), task_data)

#### Save Programs

In [126]:
def save_program(program_text, task_id):
    """Write a generated program to the next free ``solution_v<N>.py`` file.

    The file is created under ``Candidate_programs/task_<task_id>/`` (the
    folder is created if needed). Markdown code fences are removed from the
    text first, and ``N`` is one more than the highest version number already
    present in that folder.

    Args:
        program_text: Raw model response containing the program.
        task_id: Identifier used in the task folder name.

    Returns:
        The path of the file that was written, so callers can refer to the
        saved program.
    """
    # Define the task-specific folder and create it if it doesn't exist
    task_folder = os.path.join("Candidate_programs", f"task_{task_id}")
    os.makedirs(task_folder, exist_ok=True)

    # Remove ```python or ``` fence markers if present
    cleaned_text = re.sub(r"^```(?:python)?\s*|```$", "", program_text.strip(), flags=re.MULTILINE)

    # Find the next available version number
    version_numbers = []
    for fname in os.listdir(task_folder):
        version_match = re.match(r"solution_v(\d+)\.py", fname)
        if version_match:
            version_numbers.append(int(version_match.group(1)))
    next_version = max(version_numbers, default=0) + 1

    # Save the cleaned program text to the new file
    file_path = os.path.join(task_folder, f"solution_v{next_version}.py")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text.strip())

    print(f"Saved program for task {task_id} as version {next_version}: {file_path}")
    return file_path

#### Create Programs

In [127]:
def create_programs(tailored_prompt, task_index, n_programs=2):
    """Generate candidate programs for one task and save each to disk.

    Sends ``tailored_prompt`` to the model ``n_programs`` times and passes
    every response to ``save_program``.

    Args:
        tailored_prompt: The task-tailored prompt to send.
        task_index: Task identifier forwarded to ``save_program``.
        n_programs: Number of candidate programs to request. Defaults to 2.
    """
    for _ in range(n_programs):
        response = call_gpt(tailored_prompt)
        save_program(response, task_index)

#### Evaluate Programs

In [128]:
def load_program(file_path):
    """Import the Python file at ``file_path`` and return its ``solve`` attribute.

    The file is executed as a module named ``"program"``, so any top-level
    code in it runs during loading.

    Args:
        file_path: Path of the Python source file to load.

    Returns:
        The object bound to the name ``solve`` in the loaded module.

    Raises:
        AttributeError: If the module does not define ``solve``.
    """
    module_spec = importlib.util.spec_from_file_location("program", file_path)
    candidate_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(candidate_module)
    return getattr(candidate_module, "solve")


In [129]:
def evaluate_programs(task_data, task_index):
    """Score every saved candidate program on the task's training pairs.

    All ``.py`` files in ``Candidate_programs/task_<task_index>/`` are
    evaluated in sorted file-name order. A candidate that cannot be loaded
    (syntax error, import failure, missing ``solve``) is reported and scored
    0 so that it does not abort the evaluation of the others.

    Args:
        task_data: Task dict whose ``'train'`` entry is a sequence of pairs
            with ``'input'`` and ``'output'`` grids.
        task_index: Task identifier used in the candidate folder name.

    Returns:
        A list with one dict per program, holding ``'program_name'``,
        ``'score'`` (fraction of pairs solved), ``'correct_pairs'`` and
        ``'total_pairs'``.

    Raises:
        OSError: If the task folder cannot be listed.
    """
    programs = []
    task_folder = os.path.join("Candidate_programs", f"task_{task_index}")
    train_pairs = task_data['train']
    total_pairs = len(train_pairs)

    # os.listdir gives no ordering guarantee; sort for reproducible results.
    program_files = sorted(f for f in os.listdir(task_folder) if f.endswith(".py"))

    for program_file in program_files:
        program_path = os.path.join(task_folder, program_file)
        correct_count = 0

        # Loading executes generated code, which may be malformed; treat a
        # failed load as zero solved pairs instead of crashing.
        try:
            solve_function = load_program(program_path)
        except Exception as e:
            print(f"Error while loading program {program_file}: {e}")
            solve_function = None

        if solve_function is not None:
            # Test each demonstration pair
            for i, pair in enumerate(train_pairs):
                input_grid = np.array(pair['input'])
                expected_output = np.array(pair['output'])

                try:
                    # tolist() hands the candidate a fresh nested list, so it
                    # cannot mutate the task data in place.
                    program_output = np.array(solve_function(input_grid.tolist()))

                    if np.array_equal(program_output, expected_output):
                        correct_count += 1
                except Exception as e:
                    print(f"Error while executing program {program_file} for pair {i+1}: {e}")

        # Calculate score
        score = correct_count / total_pairs if total_pairs > 0 else 0
        programs.append({
            'program_name': program_file,
            'score': score,
            'correct_pairs': correct_count,
            'total_pairs': total_pairs
        })

    return programs

### Pipeline

In [130]:
# End-to-end run: for each selected task, build the task-tailored prompt,
# generate candidate programs, and score them on the task's training pairs.
# NOTE: every iteration makes several model calls (three in build_prompts plus
# one per generated program), so re-running this cell repeats those requests.

# Step 1: Load tasks and API key
tasks = load_tasks("evaluation_set")
load_api_key()

# Step 2: Loop through each task and create programs, then evaluate them
# i+1 is used as the task id, so candidate folders are numbered by the task's
# position in the sorted file list rather than by its file name.
for i, task in enumerate(tasks[:11]):  # Adjust the range to process more tasks (index 82 = task 83)
    # Step 2.1: Build secondary prompts and create tailored prompt from their responses
    tailored_prompt = build_prompts(task['data'])
    
    # Step 2.2: Create two programs based on the tailored prompt
    create_programs(tailored_prompt, i+1)
    
    # Step 2.3: Evaluate the created programs
    # evaluate_programs scores every .py file in the task folder, including
    # versions saved by earlier runs of this cell.
    evaluation_results = evaluate_programs(task['data'], i+1)
    
    # Step 2.4: Print evaluation results for each program
    for result in evaluation_results:
        print(f"Program {result['program_name']} solved {result['correct_pairs']} out of {result['total_pairs']} pairs.")
        print(f"Score: {result['score']:.2f}")
        print("="*50)

Built prompt 1
Built prompt 2
Built prompt 3
Saved program for task 1 as version 1: Candidate_programs\task_1\solution_v1.py
Saved program for task 1 as version 2: Candidate_programs\task_1\solution_v2.py
Program solution_v1.py solved 0 out of 3 pairs.
Score: 0.00
Program solution_v2.py solved 0 out of 3 pairs.
Score: 0.00
Built prompt 1
Built prompt 2
Built prompt 3
Saved program for task 2 as version 1: Candidate_programs\task_2\solution_v1.py
Saved program for task 2 as version 2: Candidate_programs\task_2\solution_v2.py
Program solution_v1.py solved 0 out of 4 pairs.
Score: 0.00
Program solution_v2.py solved 0 out of 4 pairs.
Score: 0.00
Built prompt 1
Built prompt 2
Built prompt 3
Saved program for task 3 as version 1: Candidate_programs\task_3\solution_v1.py
Saved program for task 3 as version 2: Candidate_programs\task_3\solution_v2.py
Program solution_v1.py solved 0 out of 3 pairs.
Score: 0.00
Program solution_v2.py solved 0 out of 3 pairs.
Score: 0.00
Built prompt 1
Built prom