### Imports

In [1]:
import json
import os

### Functions for Calculation and Loading

In [2]:
def load_tasks(folder):
    """Parse every ``.json`` file found directly inside ``folder``.

    Files are processed in sorted file-name order; entries whose name does
    not end in ``.json`` are ignored.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A list with one dict per file, each holding ``"filename"`` (the bare
        file name) and ``"data"`` (the object returned by ``json.load``).
    """
    def _read_entry(name):
        # Parse a single file and wrap it in the record layout described above.
        with open(os.path.join(folder, name), "r") as handle:
            return {"filename": name, "data": json.load(handle)}

    json_names = [name for name in sorted(os.listdir(folder)) if name.endswith(".json")]
    return [_read_entry(name) for name in json_names]

In [3]:
def are_equal(a, b):
    """Return True when ``a`` and ``b`` serialize to identical JSON text.

    The comparison is made on the strings produced by ``json.dumps``, not on
    the Python objects themselves. As a consequence a tuple and a list with
    the same items compare equal, ``1`` and ``1.0`` do not, and dictionaries
    are compared in their insertion order (keys are not sorted).
    """
    left_text = json.dumps(a)
    right_text = json.dumps(b)
    return left_text == right_text

def _is_task_solved(prediction_list, expected_outputs):
    """Return True when every expected output is matched by one of its attempts."""
    if not isinstance(prediction_list, (list, tuple)):
        return False

    for idx, expected_output in enumerate(expected_outputs):
        # A missing or malformed prediction counts as a failed test case
        # instead of aborting the whole evaluation with an exception.
        if idx >= len(prediction_list) or not isinstance(prediction_list[idx], dict):
            return False

        attempts = prediction_list[idx]
        pred1 = attempts.get("attempt_1")
        pred2 = attempts.get("attempt_2")  # may be None

        if are_equal(pred1, expected_output):
            continue
        if pred2 is not None and are_equal(pred2, expected_output):
            continue
        return False

    return True


def compute_accuracy(submission_path, tasks_dict, task_concepts):
    """Score a submission file against the reference tasks and print a report.

    A task counts as correct only if, for every one of its test cases, either
    ``attempt_1`` or ``attempt_2`` equals the expected output. Test cases with
    no prediction entry (or an entry that is not a dict) count as failed.

    Args:
        submission_path: Path to a JSON file mapping task id to a list with
            one ``{"attempt_1": ..., "attempt_2": ...}`` dict per test case.
        tasks_dict: Mapping of task id to task data; each task's ``"test"``
            list holds dicts with an ``"output"`` entry.
        task_concepts: Mapping of task id to concept name. Task ids missing
            from it are reported under ``"Unknown Concept"``.

    Returns:
        The fraction of submitted tasks that are fully correct, or ``0.0``
        when the submission contains no tasks.

    Raises:
        KeyError: If a submitted task id is not present in ``tasks_dict``.
    """
    with open(submission_path, 'r') as f:
        submission = json.load(f)

    total_tasks = len(submission)
    correct_tasks = 0

    concept_totals = {}
    concept_corrects = {}

    for task_id, prediction_list in submission.items():
        task_data = tasks_dict[task_id]
        test_outputs = [test['output'] for test in task_data['test']]

        all_test_cases_correct = _is_task_solved(prediction_list, test_outputs)

        concept = task_concepts.get(task_id, "Unknown Concept")
        concept_totals[concept] = concept_totals.get(concept, 0) + 1
        if all_test_cases_correct:
            correct_tasks += 1
            concept_corrects[concept] = concept_corrects.get(concept, 0) + 1

    # An empty submission has no tasks to score; report 0% rather than
    # dividing by zero.
    accuracy = correct_tasks / total_tasks if total_tasks else 0.0

    print(f"\nOverall Accuracy: {accuracy:.2%}\n")

    print("Accuracy per concept:")
    for concept, total in concept_totals.items():
        correct = concept_corrects.get(concept, 0)
        concept_accuracy = correct / total
        print(f"- {concept}: {correct}/{total} ({concept_accuracy:.2%})")

    return accuracy


### Accuracy and Algorithm Selection

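To compute the accuracy for a different pipeline or task group, change the following variables: `task_selection` (the folder containing the task JSON files) and `pipeline_submission` (the path to the submission JSON file to score).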

In [4]:
# Folder whose ".json" files are loaded as tasks (passed to load_tasks below).
task_selection = "evaluation_set"
# Path of the submission JSON file to score (passed to compute_accuracy below).
pipeline_submission = "submission_revision-loop_4o.json"

In [5]:
# Load every task from the selected folder and the task-id -> concept mapping.
tasks = load_tasks(task_selection)
with open("classified_concepts.json") as f:
    task_concepts = json.load(f)

# NOTE: compute_accuracy() reads the submission file itself from the path it
# is given, so this parsed copy is not passed to it. It is retained in case
# other cells rely on `submission` being defined.
with open(pipeline_submission) as f:
    submission = json.load(f)

# Key each task by its file name without the extension. os.path.splitext
# removes only the final ".json", so ids that contain dots are kept intact
# (splitting on the first "." would truncate them).
tasks_dict = {os.path.splitext(task["filename"])[0]: task["data"] for task in tasks}

accuracy = compute_accuracy(pipeline_submission, tasks_dict, task_concepts)


Overall Accuracy: 5.00%

Accuracy per concept:
- CleanUp: 0/2 (0.00%)
- ExtractObjects: 0/18 (0.00%)
- FilledNotFilled: 0/10 (0.00%)
- ExtendToBoundary: 0/13 (0.00%)
- HorizontalVertical: 1/6 (16.67%)
- Copy: 1/19 (5.26%)
- SameDifferent: 0/3 (0.00%)
- Order: 0/6 (0.00%)
- Count: 3/5 (60.00%)
- CompleteShape: 0/10 (0.00%)
- AboveBelow: 0/2 (0.00%)
- InsideOutside: 0/3 (0.00%)
- MoveToBoundary: 0/3 (0.00%)
