In [2]:
import os
import importlib.util
import json
from collections import Counter

# Directory names used by the evaluation loop: task JSON files and
# per-task folders of candidate programs.
TASK_DIR = "Evaluation_set"
CODE_DIR = "Candidate_programs_revision-loop"

# Outcome tally; every category starts at zero so it is always present
# when the counter is printed.
counter = Counter(dict.fromkeys(
    (
        'score_1_correct',
        'score_1_incorrect',
        'score_lt1_correct',
        'score_lt1_incorrect',
    ),
    0,
))

def load_task(path):
    """Read a task definition from a JSON file.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale-dependent default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def run_on_pairs(solution_path, task_data, key='train'):
    """Score a candidate program on one split of a task.

    The file at ``solution_path`` is executed as a module and its ``solve``
    function is called once per pair in ``task_data[key]``.

    Args:
        solution_path: Path of the Python file that defines ``solve``.
        task_data: Mapping from split name to a list of pairs; each pair is
            indexed with ``"input"`` and ``"output"``.
        key: Split to evaluate (defaults to ``'train'``).

    Returns:
        Fraction of pairs whose prediction equals the expected output;
        ``0.0`` when the split contains no pairs.

    Raises:
        ImportError: If no import spec/loader can be built for the file.
        Exception: Anything raised while executing the candidate file or its
            ``solve`` function propagates to the caller unchanged.
    """
    spec = importlib.util.spec_from_file_location("solution", solution_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load solution module from {solution_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    pairs = task_data[key]
    if not pairs:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    correct = 0
    for pair in pairs:
        predicted = module.solve(pair["input"])
        # Array-like results (e.g. NumPy arrays) compare element-wise with
        # ``==``, which cannot be used directly as a truth value. Convert
        # them to plain nested lists before comparing.
        if hasattr(predicted, "tolist"):
            predicted = predicted.tolist()
        if predicted == pair["output"]:
            correct += 1
    return correct / len(pairs)

def test_prediction(solution_path, task_data):
    spec = importlib.util.spec_from_file_location("solution", solution_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    for pair in task_data["test"]:
        predicted = module.solve(pair["input"])
        if predicted != pair["output"]:
            return False
    return True

# === Loop through all task JSONs ===
# Listings are sorted because os.listdir returns entries in arbitrary order;
# sorting keeps the printed log reproducible from run to run.
for filename in sorted(os.listdir(TASK_DIR)):
    if not filename.endswith(".json"):
        continue

    # Strip only the trailing extension; str.replace would also remove any
    # ".json" occurring earlier in the file name.
    task_id = filename[:-len(".json")]
    task_path = os.path.join(TASK_DIR, filename)
    code_folder = os.path.join(CODE_DIR, task_id)

    # isdir rather than exists: a plain file named like the task would make
    # the os.listdir call below raise and abort the whole run.
    if not os.path.isdir(code_folder):
        print(f"[!] No solution folder for task {task_id}")
        continue

    try:
        task = load_task(task_path)
    except Exception as e:
        print(f"[!] Failed to load task {task_id}: {e}")
        continue

    for code_file in sorted(os.listdir(code_folder)):
        # Only files named solution_*.py are treated as candidate programs.
        if not code_file.endswith(".py") or not code_file.startswith("solution_"):
            continue

        solution_path = os.path.join(code_folder, code_file)
        try:
            score = run_on_pairs(solution_path, task, key='train')
            correct = test_prediction(solution_path, task)

            # Bucket by (all train pairs solved?) x (all test pairs solved?).
            if score == 1.0 and correct:
                counter['score_1_correct'] += 1
            elif score == 1.0 and not correct:
                counter['score_1_incorrect'] += 1
            elif score < 1.0 and correct:
                counter['score_lt1_correct'] += 1
            else:
                counter['score_lt1_incorrect'] += 1

            print(f"{task_id}/{code_file} | Score: {score:.2f} | Correct: {correct}")

        except Exception as e:
            # Candidate programs are untrusted and may fail arbitrarily;
            # report the failure and move on. Failed candidates are not
            # added to any tally category.
            print(f"[!] Error in {task_id}/{code_file}: {e}")

# === Print Final Tally ===
print("\n=== FINAL SUMMARY ===")
for k, v in counter.items():
    print(f"{k}: {v}")


0607ce86/solution_v1.py | Score: 0.00 | Correct: False
0607ce86/solution_v2.py | Score: 0.00 | Correct: False
0607ce86/solution_v3_rev1.py | Score: 0.00 | Correct: False
0607ce86/solution_v4_rev1.py | Score: 0.00 | Correct: False
0607ce86/solution_v5_rev2.py | Score: 0.00 | Correct: False
0607ce86/solution_v6_rev2.py | Score: 0.00 | Correct: False
0934a4d8/solution_v1.py | Score: 0.00 | Correct: False
0934a4d8/solution_v2.py | Score: 0.00 | Correct: False
0934a4d8/solution_v3_rev1.py | Score: 0.00 | Correct: False
0934a4d8/solution_v4_rev1.py | Score: 0.00 | Correct: False
[!] Error in 0934a4d8/solution_v5_rev2.py: 'pop from an empty set'
0934a4d8/solution_v6_rev2.py | Score: 0.00 | Correct: False
09c534e7/solution_v1.py | Score: 0.00 | Correct: False
09c534e7/solution_v2.py | Score: 0.00 | Correct: False
09c534e7/solution_v3_rev1.py | Score: 0.00 | Correct: False
09c534e7/solution_v4_rev1.py | Score: 0.00 | Correct: False
09c534e7/solution_v5_rev2.py | Score: 0.00 | Correct: False
09c

In [3]:
def count_py_files_in_subfolders(base_dir):
    """Return the number of files ending in '.py' found under ``base_dir``.

    The tree is traversed with ``os.walk``, so files located directly in
    ``base_dir`` are counted together with those in nested folders.
    """
    return sum(
        name.endswith('.py')
        for _root, _dirs, names in os.walk(base_dir)
        for name in names
    )

# Count the .py files under each of the two candidate-program trees.
count_revision_loop, count_revision_loop_arc2 = (
    count_py_files_in_subfolders(folder)
    for folder in (
        "Candidate_programs_revision-loop",
        "Candidate_programs_revision-loop_ARC2",
    )
)

print(f"Total .py files in Candidate_programs_revision-loop: {count_revision_loop}")
print(f"Total .py files in Candidate_programs_revision-loop_ARC2: {count_revision_loop_arc2}")

Total .py files in Candidate_programs_revision-loop: 393
Total .py files in Candidate_programs_revision-loop_ARC2: 556
