In [1]:
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
import time

def load_conceptarc_tasks(corpus_dir="corpus"):
    """Load every task JSON stored one directory level below ``corpus_dir``.

    Each immediate subdirectory of ``corpus_dir`` is treated as a concept, and
    each ``*.json`` file directly inside it as one task of that concept.
    Non-directory entries in ``corpus_dir`` and non-``.json`` files are skipped.

    Entries are visited in sorted (lexicographic) order. ``os.listdir`` makes
    no ordering guarantee, so without sorting the task order -- and therefore
    the order of any saved predictions -- would vary between machines.

    Args:
        corpus_dir: Path of the directory that holds one folder per concept.

    Returns:
        A list of dicts with the keys ``"concept"`` (subdirectory name),
        ``"filename"`` (JSON file name) and ``"task"`` (the parsed JSON).

    Raises:
        FileNotFoundError: If ``corpus_dir`` does not exist.
        json.JSONDecodeError: If a task file is not valid JSON.
    """
    tasks = []
    for concept in sorted(os.listdir(corpus_dir)):
        concept_path = os.path.join(corpus_dir, concept)
        if not os.path.isdir(concept_path):
            continue
        for filename in sorted(os.listdir(concept_path)):
            if not filename.endswith(".json"):
                continue
            filepath = os.path.join(concept_path, filename)
            # Read as UTF-8 explicitly instead of relying on the platform
            # default encoding.
            with open(filepath, "r", encoding="utf-8") as f:
                task_data = json.load(f)
            tasks.append({
                "concept": concept,
                "filename": filename,
                "task": task_data
            })
    return tasks

In [2]:
# System message that call_gpt sends with every chat-completion request.
# It sets the ARC-AGI solver persona and gives the integer -> colour legend
# used for grid cells.
system_prompt = """
You are a visual reasoning and Python programming expert solving ARC-AGI (Abstraction and Reasoning Corpus - Artificial General Intelligence) tasks.

Each integer in the grid represents a color:
0 = black, 1 = blue, 2 = red, 3 = green, 4 = yellow,
5 = grey, 6 = pink, 7 = orange, 8 = light blue, 9 = brown.
"""


In [3]:
# Base user prompt for the concept-classification experiment. It lists the 16
# concept names with a one-line description each and instructs the model to
# reply with the bare concept name only. evaluate_concept_classification passes
# it to create_classification_prompt_with_task, which appends the task's grids.
# Original author note (open item): check manually entering one demonstration
# pair for each concept.
classification_prompt = """
You will receive a set of demonstration pairs (input and output grids) from a visual reasoning task.

Your task is to classify the transformation into **one of the following 16 concepts**:

- AboveBelow - Objects or patterns are arranged vertically, with relationships defined by what's above or below something else.
- Center - Elements are moved to or arranged around the center of the grid.
- CleanUp - The task removes noise or extraneous elements to leave a cleaner or more regular structure.
- CompleteShape - A partial or broken shape is completed to form a full geometric object.
- Copy - A shape or pattern is duplicated, often to another location in the grid.
- Count - The number of certain elements is counted to determine placement, output quantity, or transformation.
- ExtendToBoundary - Shapes or lines are extended until they touch the edge of the grid.
- ExtractObjects - Specific objects are isolated and copied or transformed while others are ignored.
- FilledNotFilled - The task distinguishes between filled and hollow shapes or fills in uncolored areas.
- HorizontalVertical - Patterns follow or are transformed along horizontal or vertical axes, often involving symmetry or alignment.
- InsideOutside - A relationship is determined based on whether elements are inside or outside a defined boundary.
- MoveToBoundary - Objects are shifted to the nearest edge of the grid without rotation or change in shape.
- Order - Items are rearranged according to size, color, frequency, or another ordinal property.
- SameDifferent - Objects are retained or manipulated based on whether they match or differ in some attribute (e.g., color, shape).
- TopBottom2D - A flat 2D interpretation of objects where the top and bottom halves of the grid are compared or modified.
- TopBottom3D - The task simulates a 3D stacking or layering behavior, such as viewing objects from above or combining vertical slices.

Instructions:
- Respond ONLY with the exact name of the matching concept from the list above.
- Do not explain your answer, just return the concept.
- If uncertain, choose the concept that fits best based on the input-output transformations.
"""


In [4]:
def load_api_key(file_path="key.env"):
    """Read ``OPENAI_API_KEY`` from a dotenv file and create the global client.

    The dotenv file is loaded into the process environment, the key is read
    back from ``OPENAI_API_KEY``, and a module-level ``client`` (an ``OpenAI``
    instance) is created for ``call_gpt`` to use. ``openai.api_key`` is also
    set, as the original code did.

    A missing key now fails immediately with a message naming the file that
    was consulted. Previously the function only printed a warning and went on
    to construct the client, which then failed with a less specific error.

    Args:
        file_path: Path of the dotenv file that defines ``OPENAI_API_KEY``.

    Raises:
        openai.OpenAIError: If no API key is available after loading the file.
    """
    # Imported locally so this cell does not rely on a later cell having
    # imported the package-level module.
    import openai

    global client

    load_dotenv(file_path)
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise openai.OpenAIError(
            f"No API key found. Please set OPENAI_API_KEY in {file_path}."
        )

    openai.api_key = api_key
    # Pass the key explicitly so the client uses exactly the value validated above.
    client = OpenAI(api_key=api_key)

In [5]:
import time
import openai

def call_gpt(prompt, model="o4-mini", retries=3):
    """Send ``prompt`` to the chat-completions API and return the reply text.

    The request carries the module-level ``system_prompt`` as the system
    message and ``prompt`` as the user message, and is issued through the
    module-level ``client`` created by ``load_api_key``.

    Rate-limit errors are retried with a linear backoff (5s, 10s, 15s, ...).
    No sleep is performed after the final failed attempt, since nothing is
    retried afterwards. Any other exception propagates unchanged.

    Args:
        prompt: User message content.
        model: Model name passed to the API.
        retries: Maximum number of attempts.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or an empty string if the API returned no content.

    Raises:
        Exception: If every attempt hit the rate limit. The last rate-limit
            error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                # temperature is deliberately not set here; the original note
                # marks `temperature=0.0` as "only for GPT-4o".
            )
        except openai.RateLimitError as e:
            last_error = e
            if attempt == retries - 1:
                # Out of attempts: skip the pointless final wait.
                break
            wait_time = 5 + attempt * 5  # linear backoff
            print(f"Rate limit hit. Waiting {wait_time} seconds before retrying...")
            time.sleep(wait_time)
            continue

        # `content` can be None (no text returned); treat that as an empty
        # answer rather than crashing on `.strip()`.
        content = response.choices[0].message.content
        return (content or "").strip()

    raise Exception("Rate limit retries exhausted.") from last_error

In [6]:
def create_classification_prompt_with_task(task_data, base_prompt):
    """Append a task's grids to ``base_prompt`` and return the combined text.

    Args:
        task_data: Task dict. Its optional ``"train"`` and ``"test"`` entries
            are lists of pairs; every pair has an ``"input"``, train pairs
            also have an ``"output"``, and test pairs may have one.
        base_prompt: Instruction text placed before the examples.

    Returns:
        ``base_prompt`` (stripped), a blank line, then the stripped examples
        section listing each train pair followed by each test pair.
    """
    chunks = ["\n\nHere are the input-output pairs for the task:\n\n"]

    # Train pairs always contribute both grids.
    for i, pair in enumerate(task_data.get("train", [])):
        chunks.append(f"Train Input {i+1}: {pair['input']}\n")
        chunks.append(f"Train Output {i+1}: {pair['output']}\n\n")

    # Test pairs contribute the output grid only when the pair carries one.
    for i, pair in enumerate(task_data.get("test", [])):
        chunks.append(f"Test Input {i+1}: {pair['input']}\n")
        if "output" not in pair:
            continue
        chunks.append(f"Test Output {i+1}: {pair['output']}\n\n")

    instructions = base_prompt.strip()
    examples_section = "".join(chunks).strip()
    return instructions + "\n\n" + examples_section


In [7]:
def evaluate_concept_classification(corpus_path="corpus"):
    """Classify every task in the corpus and report the overall accuracy.

    For each task returned by ``load_conceptarc_tasks`` a prompt is built from
    ``classification_prompt`` plus the task's grids, sent through ``call_gpt``,
    and the stripped reply is recorded as the predicted concept. A prediction
    counts as correct when it equals the true concept ignoring case.

    Args:
        corpus_path: Directory handed to ``load_conceptarc_tasks``.

    Returns:
        ``(predictions, accuracy)``. ``predictions`` is a list of dicts with
        the keys ``"filename"``, ``"true_concept"`` and ``"predicted_concept"``;
        ``accuracy`` is the fraction correct, or ``0`` when there are no tasks.
    """
    predictions = []

    for task in load_conceptarc_tasks(corpus_path):
        full_prompt = create_classification_prompt_with_task(task["task"], classification_prompt)
        predicted_concept = call_gpt(full_prompt).strip()
        record = {
            "filename": task["filename"],
            "true_concept": task["concept"],
            "predicted_concept": predicted_concept,
        }
        predictions.append(record)

    # Score once all predictions are in; the comparison is case-insensitive.
    total = len(predictions)
    correct = sum(
        1
        for record in predictions
        if record["predicted_concept"].lower() == record["true_concept"].lower()
    )

    accuracy = 0 if total <= 0 else correct / total
    print(f"\nAccuracy: {accuracy:.2%} ({correct} out of {total})")

    return predictions, accuracy


In [29]:
# Hard-coded prediction records, one dict per task file, using the same keys
# that evaluate_concept_classification emits ("filename", "true_concept",
# "predicted_concept").
# NOTE(review): this cell's execution count (29) is out of sequence with its
# neighbours, so it was presumably pasted from a separate run -- confirm.
# NOTE(review): the literal is bound to `prediction` (singular); the summary
# cell reads `predictions` from a fresh evaluation run, so nothing visible in
# this notebook uses this value.
prediction = ([
    {'filename': 'AboveBelow1.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'TopBottom2D'},
    {'filename': 'AboveBelow10.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'AboveBelow2.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'TopBottom2D'},
    {'filename': 'AboveBelow3.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'AboveBelow'},
    {'filename': 'AboveBelow4.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'AboveBelow'},
    {'filename': 'AboveBelow5.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'AboveBelow6.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'AboveBelow7.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'AboveBelow'},
    {'filename': 'AboveBelow8.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'AboveBelow9.json', 'true_concept': 'AboveBelow', 'predicted_concept': 'AboveBelow'},
    {'filename': 'Center1.json', 'true_concept': 'Center', 'predicted_concept': 'CompleteShape'},
    {'filename': 'Center10.json', 'true_concept': 'Center', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'Center2.json', 'true_concept': 'Center', 'predicted_concept': 'Center'},
    {'filename': 'Center3.json', 'true_concept': 'Center', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'Center4.json', 'true_concept': 'Center', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'Center5.json', 'true_concept': 'Center', 'predicted_concept': 'Center'},
    {'filename': 'Center6.json', 'true_concept': 'Center', 'predicted_concept': 'Center'},
    {'filename': 'Center7.json', 'true_concept': 'Center', 'predicted_concept': 'Count'},
    {'filename': 'Center8.json', 'true_concept': 'Center', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'Center9.json', 'true_concept': 'Center', 'predicted_concept': 'Center'},
    {'filename': 'CleanUp1.json', 'true_concept': 'CleanUp', 'predicted_concept': 'CleanUp'},
    {'filename': 'CleanUp10.json', 'true_concept': 'CleanUp', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'CleanUp2.json', 'true_concept': 'CleanUp', 'predicted_concept': 'CleanUp'},
    {'filename': 'CleanUp3.json', 'true_concept': 'CleanUp', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'CleanUp4.json', 'true_concept': 'CleanUp', 'predicted_concept': 'CleanUp'},
    {'filename': 'CleanUp5.json', 'true_concept': 'CleanUp', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'CleanUp6.json', 'true_concept': 'CleanUp', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'CleanUp7.json', 'true_concept': 'CleanUp', 'predicted_concept': 'Copy'},
    {'filename': 'CleanUp8.json', 'true_concept': 'CleanUp', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CleanUp9.json', 'true_concept': 'CleanUp', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape1.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape10.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape2.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'CompleteShape3.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape4.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'CompleteShape5.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape6.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape7.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape8.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'CompleteShape9.json', 'true_concept': 'CompleteShape', 'predicted_concept': 'CompleteShape'},
    {'filename': 'Copy1.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy10.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy2.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy3.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy4.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy5.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy6.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy7.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy8.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Copy9.json', 'true_concept': 'Copy', 'predicted_concept': 'Copy'},
    {'filename': 'Count1.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count10.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count2.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count3.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count4.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count5.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count6.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count7.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count8.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'Count9.json', 'true_concept': 'Count', 'predicted_concept': 'Count'},
    {'filename': 'ExtendToBoundary1.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary10.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary2.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary3.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'ExtendToBoundary4.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary5.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary6.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary7.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary8.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtendToBoundary9.json', 'true_concept': 'ExtendToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'ExtractObjects1.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects10.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects2.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects3.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects4.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'ExtractObjects5.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects6.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects7.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'ExtractObjects8.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'InsideOutside'},
    {'filename': 'ExtractObjects9.json', 'true_concept': 'ExtractObjects', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'FilledNotFilled1.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled10.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'CompleteShape'},
    {'filename': 'FilledNotFilled2.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled3.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'InsideOutside'},
    {'filename': 'FilledNotFilled4.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled5.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled6.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled7.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled8.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'FilledNotFilled9.json', 'true_concept': 'FilledNotFilled', 'predicted_concept': 'CompleteShape'},
    {'filename': 'HorizontalVertical1.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'HorizontalVertical10.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'HorizontalVertical2.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'HorizontalVertical3.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'HorizontalVertical4.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'HorizontalVertical5.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'HorizontalVertical6.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'HorizontalVertical7.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'HorizontalVertical8.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'CleanUp'},
    {'filename': 'HorizontalVertical9.json', 'true_concept': 'HorizontalVertical', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'InsideOutside1.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'Count'},
    {'filename': 'InsideOutside10.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'CleanUp'},
    {'filename': 'InsideOutside2.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'Order'},
    {'filename': 'InsideOutside3.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'InsideOutside'},
    {'filename': 'InsideOutside4.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'CleanUp'},
    {'filename': 'InsideOutside5.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'InsideOutside'},
    {'filename': 'InsideOutside6.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'InsideOutside7.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'Count'},
    {'filename': 'InsideOutside8.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'InsideOutside'},
    {'filename': 'InsideOutside9.json', 'true_concept': 'InsideOutside', 'predicted_concept': 'InsideOutside'},
    {'filename': 'MoveToBoundary1.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary10.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary2.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'MoveToBoundary3.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary4.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary5.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'Center'},
    {'filename': 'MoveToBoundary6.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary7.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary8.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'MoveToBoundary9.json', 'true_concept': 'MoveToBoundary', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'Order1.json', 'true_concept': 'Order', 'predicted_concept': 'Order'},
    {'filename': 'Order10.json', 'true_concept': 'Order', 'predicted_concept': 'Order'},
    {'filename': 'Order2.json', 'true_concept': 'Order', 'predicted_concept': 'CompleteShape'},
    {'filename': 'Order3.json', 'true_concept': 'Order', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'Order4.json', 'true_concept': 'Order', 'predicted_concept': 'Order'},
    {'filename': 'Order5.json', 'true_concept': 'Order', 'predicted_concept': 'Order'},
    {'filename': 'Order6.json', 'true_concept': 'Order', 'predicted_concept': 'Order'},
    {'filename': 'Order7.json', 'true_concept': 'Order', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'Order8.json', 'true_concept': 'Order', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'Order9.json', 'true_concept': 'Order', 'predicted_concept': 'CleanUp'},
    {'filename': 'SameDifferent1.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'SameDifferent10.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'SameDifferent'},
    {'filename': 'SameDifferent2.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'SameDifferent3.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'SameDifferent'},
    {'filename': 'SameDifferent4.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'HorizontalVertical'},
    {'filename': 'SameDifferent5.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'Count'},
    {'filename': 'SameDifferent6.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'SameDifferent'},
    {'filename': 'SameDifferent7.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'SameDifferent8.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'SameDifferent9.json', 'true_concept': 'SameDifferent', 'predicted_concept': 'SameDifferent'},
    {'filename': 'TopBottom2D1.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'AboveBelow'},
    {'filename': 'TopBottom2D10.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'AboveBelow'},
    {'filename': 'TopBottom2D2.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'TopBottom2D3.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'AboveBelow'},
    {'filename': 'TopBottom2D4.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'AboveBelow'},
    {'filename': 'TopBottom2D5.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'TopBottom2D6.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'TopBottom2D7.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'AboveBelow'},
    {'filename': 'TopBottom2D8.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'SameDifferent'},
    {'filename': 'TopBottom2D9.json', 'true_concept': 'TopBottom2D', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'TopBottom3D1.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'FilledNotFilled'},
    {'filename': 'TopBottom3D10.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'Copy'},
    {'filename': 'TopBottom3D2.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'TopBottom3D3.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'CompleteShape'},
    {'filename': 'TopBottom3D4.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'MoveToBoundary'},
    {'filename': 'TopBottom3D5.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'ExtractObjects'},
    {'filename': 'TopBottom3D6.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'CompleteShape'},
    {'filename': 'TopBottom3D7.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'ExtendToBoundary'},
    {'filename': 'TopBottom3D8.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'SameDifferent'},
    {'filename': 'TopBottom3D9.json', 'true_concept': 'TopBottom3D', 'predicted_concept': 'InsideOutside'}])

In [8]:
from collections import defaultdict

# Create the API client, then classify every task in the corpus.
load_api_key()
predictions, accuracy = evaluate_concept_classification("corpus")

# Per-concept tallies, keyed by the true concept name.
total_per_concept = defaultdict(int)
correct_per_concept = defaultdict(int)

for entry in predictions:
    true, pred = entry["true_concept"], entry["predicted_concept"]
    total_per_concept[true] += 1
    # A prediction is a hit when it matches the true concept ignoring case.
    if true.lower() == pred.lower():
        correct_per_concept[true] += 1

# The overall counts are the sums of the per-concept tallies.
total_all = sum(total_per_concept.values())
correct_all = sum(correct_per_concept.values())

# Per-concept report, in alphabetical order of concept name.
print("\nAccuracy per concept:")
for concept in sorted(total_per_concept):
    correct, total = correct_per_concept[concept], total_per_concept[concept]
    print(f"{concept}: {correct}/{total} correct ({(correct / total):.2%})")

# Overall report; falls back to 0 when there were no predictions.
overall_accuracy = 0 if total_all <= 0 else correct_all / total_all
print(f"\nTotal accuracy: {correct_all}/{total_all} correct ({overall_accuracy:.2%})")



Accuracy: 61.25% (98 out of 160)

Accuracy per concept:
AboveBelow: 5/10 correct (50.00%)
Center: 5/10 correct (50.00%)
CleanUp: 7/10 correct (70.00%)
CompleteShape: 8/10 correct (80.00%)
Copy: 9/10 correct (90.00%)
Count: 10/10 correct (100.00%)
ExtendToBoundary: 10/10 correct (100.00%)
ExtractObjects: 9/10 correct (90.00%)
FilledNotFilled: 6/10 correct (60.00%)
HorizontalVertical: 5/10 correct (50.00%)
InsideOutside: 6/10 correct (60.00%)
MoveToBoundary: 9/10 correct (90.00%)
Order: 6/10 correct (60.00%)
SameDifferent: 3/10 correct (30.00%)
TopBottom2D: 0/10 correct (0.00%)
TopBottom3D: 0/10 correct (0.00%)

Total accuracy: 98/160 correct (61.25%)
