In [1]:
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
import time

def load_conceptarc_tasks(corpus_dir="corpus"):
    """Load every ``*.json`` task found one level below ``corpus_dir``.

    Each immediate subdirectory of ``corpus_dir`` is treated as a concept, and
    the subdirectory name is used as the concept label. Non-directory entries
    directly inside ``corpus_dir`` and files without a ``.json`` suffix are
    ignored.

    Args:
        corpus_dir: Path of the directory holding one subdirectory per concept.

    Returns:
        A list of dicts with keys ``"concept"`` (subdirectory name),
        ``"filename"`` (JSON file name) and ``"task"`` (the parsed JSON
        content), ordered by concept name and then by file name.

    Raises:
        OSError: If ``corpus_dir`` or a file inside it cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    tasks = []
    # os.listdir returns entries in arbitrary order; sorting keeps the task
    # order (and therefore the prediction list) reproducible across runs.
    for concept in sorted(os.listdir(corpus_dir)):
        concept_path = os.path.join(corpus_dir, concept)
        if not os.path.isdir(concept_path):
            continue
        for filename in sorted(os.listdir(concept_path)):
            if not filename.endswith(".json"):
                continue
            filepath = os.path.join(concept_path, filename)
            # Read as UTF-8 explicitly instead of relying on the platform default.
            with open(filepath, "r", encoding="utf-8") as f:
                task_data = json.load(f)
            tasks.append({
                "concept": concept,
                "filename": filename,
                "task": task_data
            })
    return tasks

In [2]:
# System message sent with every chat request made by call_gpt. It sets the
# model's role and gives the integer-to-color legend for the grid cells.
system_prompt = """
You are a visual reasoning and Python programming expert solving ARC-AGI (Abstraction and Reasoning Corpus - Artificial General Intelligence) tasks.

Each integer in the grid represents a color:
0 = black, 1 = blue, 2 = red, 3 = green, 4 = yellow,
5 = grey, 6 = pink, 7 = orange, 8 = light blue, 9 = brown.
"""


In [None]:
# Base instruction text for the concept-classification experiment.
# evaluate_concept_classification passes it to
# create_classification_prompt_with_task, which appends the task's grid pairs.
# NOTE: predictions are compared (case-insensitively) against the corpus
# subdirectory names, so the concept names listed here must match those
# directory names exactly.
classification_prompt = """
You will receive a set of demonstration pairs (input and output grids) from a visual reasoning task.

Your task is to classify the transformation into **one of the following 16 concepts**:

- AboveBelow - Objects or patterns are arranged vertically, with relationships defined by what's above or below something else.
- Center - Elements are moved to or arranged around the center of the grid.
- CleanUp - The task removes noise or extraneous elements to leave a cleaner or more regular structure.
- CompleteShape - A partial or broken shape is completed to form a full geometric object.
- Copy - A shape or pattern is duplicated, often to another location in the grid.
- Count - The number of certain elements is counted to determine placement, output quantity, or transformation.
- ExtendToBoundary - Shapes or lines are extended until they touch the edge of the grid.
- ExtractObjects - Specific objects are isolated and copied or transformed while others are ignored.
- FilledNotFilled - The task distinguishes between filled and hollow shapes or fills in uncolored areas.
- HorizontalVertical - Patterns follow or are transformed along horizontal or vertical axes, often involving symmetry or alignment.
- InsideOutside - A relationship is determined based on whether elements are inside or outside a defined boundary.
- MoveToBoundary - Objects are shifted to the nearest edge of the grid without rotation or change in shape.
- Order - Items are rearranged according to size, color, frequency, or another ordinal property.
- SameDifferent - Objects are retained or manipulated based on whether they match or differ in some attribute (e.g., color, shape).
- TopBottom2D - A flat 2D interpretation of objects where the top and bottom halves of the grid are compared or modified.
- TopBottom3D - The task simulates a 3D stacking or layering behavior, such as viewing objects from above or combining vertical slices.

Instructions:
- Respond ONLY with the exact name of the matching concept from the list above.
- Do not explain your answer, just return the concept.
- If uncertain, choose the concept that fits best based on the input-output transformations.
"""


In [4]:
def load_api_key(file_path="key.env"):
    """Read the OpenAI API key from a dotenv file and create the shared client.

    Loads the variables in ``file_path`` with ``load_dotenv``, copies
    ``OPENAI_API_KEY`` from the environment onto ``openai.api_key``, and binds
    the module-level name ``client`` to a fresh ``OpenAI()`` instance. When the
    variable is missing or empty, a notice is printed and client construction
    is still attempted.

    Args:
        file_path: Path of the dotenv file to load.
    """
    # Imported locally: at this point in the notebook only the OpenAI class
    # has been imported, not the openai module itself.
    import openai
    global client

    load_dotenv(file_path)
    openai.api_key = os.getenv("OPENAI_API_KEY")

    key_is_missing = not openai.api_key
    if key_is_missing:
        print("No API key found. Please set OPENAI_API_KEY in key.env.")

    client = OpenAI()

In [None]:
import time
import openai

def call_gpt(prompt, model="o4-mini", retries=3):
    """Send ``prompt`` to the chat completions API and return the reply text.

    The request carries the module-level ``system_prompt`` as the system
    message and ``prompt`` as the user message, and is issued through the
    module-level ``client``. Rate-limit errors are retried with a linearly
    growing pause (5s, 10s, 15s, ...); any other exception propagates
    immediately.

    Args:
        prompt: User message content.
        model: Model name passed to the API.
        retries: Maximum number of attempts.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the API returns no content.

    Raises:
        RuntimeError: If every attempt hit the rate limit (or ``retries`` is
            not positive). The last rate-limit error is attached as the cause.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                # temperature is intentionally not set; the original note
                # says temperature=0.0 is only applicable to GPT-4o.
            )
        except openai.RateLimitError as e:
            last_error = e
            # No further attempt follows the last one, so do not wait for it.
            if attempt == retries - 1:
                break
            wait_time = 5 + attempt * 5
            print(f"Rate limit hit. Waiting {wait_time} seconds before retrying...")
            time.sleep(wait_time)
        else:
            # content may be None; treat that as an empty answer rather than
            # crashing on .strip() partway through a long evaluation run.
            content = response.choices[0].message.content
            return (content or "").strip()

    raise RuntimeError("Rate limit retries exhausted.") from last_error

In [None]:
def create_classification_prompt_with_task(task_data, base_prompt):
    """Append a task's grid pairs to ``base_prompt`` and return the full text.

    Args:
        task_data: Mapping that may contain ``"train"`` and ``"test"`` lists.
            Each item must have an ``"input"`` entry; train items must also
            have ``"output"``, while test items include their ``"output"``
            line only when that key is present.
        base_prompt: Instruction text placed before the examples.

    Returns:
        The stripped ``base_prompt``, a blank line, then the stripped block of
        numbered ``Train``/``Test`` input and output lines.
    """
    pieces = ["\n\nHere are the input-output pairs for the task:\n\n"]

    for number, example in enumerate(task_data.get("train", []), start=1):
        pieces.append(f"Train Input {number}: {example['input']}\n")
        pieces.append(f"Train Output {number}: {example['output']}\n\n")

    for number, example in enumerate(task_data.get("test", []), start=1):
        pieces.append(f"Test Input {number}: {example['input']}\n")
        if "output" in example:
            pieces.append(f"Test Output {number}: {example['output']}\n\n")

    examples_text = "".join(pieces).strip()
    return base_prompt.strip() + "\n\n" + examples_text


In [None]:
def evaluate_concept_classification(corpus_path="corpus"):
    """Classify every task in the corpus and report overall accuracy.

    For each task returned by ``load_conceptarc_tasks``, builds a prompt from
    ``classification_prompt`` plus the task's grids, asks the model through
    ``call_gpt``, and compares the stripped answer case-insensitively with the
    task's concept label. The overall accuracy is printed.

    Args:
        corpus_path: Corpus directory forwarded to ``load_conceptarc_tasks``.

    Returns:
        A tuple ``(predictions, accuracy)`` where ``predictions`` is a list of
        dicts with keys ``"filename"``, ``"true_concept"`` and
        ``"predicted_concept"``, and ``accuracy`` is the fraction of matches
        (``0`` when there are no tasks).
    """
    predictions = []
    correct = 0

    for item in load_conceptarc_tasks(corpus_path):
        prompt_text = create_classification_prompt_with_task(item["task"], classification_prompt)
        answer = call_gpt(prompt_text).strip()

        record = {
            "filename": item["filename"],
            "true_concept": item["concept"],
            "predicted_concept": answer,
        }
        predictions.append(record)

        is_match = answer.lower() == item["concept"].lower()
        if is_match:
            correct += 1

    # Exactly one record is stored per task, so the list length is the total.
    total = len(predictions)
    if total > 0:
        accuracy = correct / total
    else:
        accuracy = 0
    print(f"\nAccuracy: {accuracy:.2%} ({correct} out of {total})")

    return predictions, accuracy


In [None]:
from collections import defaultdict

# Create the API client, then run the full classification pass over the corpus.
load_api_key()
predictions, accuracy = evaluate_concept_classification("corpus")

# Tally attempts and hits per true concept label.
total_per_concept = defaultdict(int)
correct_per_concept = defaultdict(int)

for entry in predictions:
    true, pred = entry["true_concept"], entry["predicted_concept"]
    total_per_concept[true] += 1
    if true.lower() == pred.lower():
        correct_per_concept[true] += 1

# Overall totals are the sums of the per-concept tallies.
total_all = sum(total_per_concept.values())
correct_all = sum(correct_per_concept.values())

# Per-concept breakdown in alphabetical order.
print("\nAccuracy per concept:")
for concept in sorted(total_per_concept):
    correct = correct_per_concept[concept]
    total = total_per_concept[concept]
    print(f"{concept}: {correct}/{total} correct ({(correct / total):.2%})")

if total_all > 0:
    overall_accuracy = correct_all / total_all
else:
    overall_accuracy = 0
print(f"\nTotal accuracy: {correct_all}/{total_all} correct ({overall_accuracy:.2%})")



Accuracy: 61.25% (98 out of 160)

Accuracy per concept:
AboveBelow: 5/10 correct (50.00%)
Center: 5/10 correct (50.00%)
CleanUp: 7/10 correct (70.00%)
CompleteShape: 8/10 correct (80.00%)
Copy: 9/10 correct (90.00%)
Count: 10/10 correct (100.00%)
ExtendToBoundary: 10/10 correct (100.00%)
ExtractObjects: 9/10 correct (90.00%)
FilledNotFilled: 6/10 correct (60.00%)
HorizontalVertical: 5/10 correct (50.00%)
InsideOutside: 6/10 correct (60.00%)
MoveToBoundary: 9/10 correct (90.00%)
Order: 6/10 correct (60.00%)
SameDifferent: 3/10 correct (30.00%)
TopBottom2D: 0/10 correct (0.00%)
TopBottom3D: 0/10 correct (0.00%)

Total accuracy: 98/160 correct (61.25%)
