In [None]:
!pip install -r requirements.txt

ALL_PROBLEM_CLASSES = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]

In [1]:
import logging
import json
from testing_our_method import test_accuracy
from constants import ALL_PROBLEM_CLASSES
import os

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this
# process (and any child processes that inherit its environment).
# NOTE(review): presumably this silences the Hugging Face `tokenizers`
# parallelism/fork warning — confirm against the libraries test_accuracy uses.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

LLMS_TO_TRY = [
    "llama3-70b-8192",
    "llama3-8b-8192",
    "mixtral-8x7b-32768",
    "gemma-7b-it",
]

## LLM = "llama3-70b-8192"

In [2]:
# RAG toggle; handed to test_accuracy as its `use_rag` argument.
use_rag = True

# Model identifiers for this run, one per role expected by test_accuracy
# (main_solver_llm, coding_llm, judging_llm).
MAIN_SOLVER_LLM = "llama3-70b-8192"
CODING_LLM = "deepseek-coder:6.7b-instruct-q8_0"
JUDGING_LLM = "gpt-3.5-turbo"

In [3]:
def test_class(problem_class_number):
    """Evaluate one problem class with ``test_accuracy`` and print the results.

    The dataset is read from ``merged_dataset/test/<problem_class>/final.json``
    under the current working directory and passed to ``test_accuracy`` along
    with the module-level ``CODING_LLM``, ``MAIN_SOLVER_LLM``, ``JUDGING_LLM``
    and ``use_rag`` settings. All five levels (1-5) are requested.

    Args:
        problem_class_number: Index into ``ALL_PROBLEM_CLASSES`` selecting the
            problem class to evaluate.

    Returns:
        None. The four values returned by ``test_accuracy`` are printed.

    Raises:
        IndexError: If ``problem_class_number`` is out of range for
            ``ALL_PROBLEM_CLASSES``.
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the dataset file is not valid JSON.
    """
    problem_class = ALL_PROBLEM_CLASSES[problem_class_number]
    dataset_path = os.path.join(
        os.getcwd(), "merged_dataset", "test", problem_class, "final.json"
    )
    # Use a context manager so the file handle is closed deterministically, and
    # read as UTF-8 explicitly instead of relying on the locale's default.
    with open(dataset_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)
    correct_num, prob_num, fail_coding_num, fail_judging_num = test_accuracy(
        data_class=problem_class,
        dataset=dataset,
        coding_llm=CODING_LLM,
        main_solver_llm=MAIN_SOLVER_LLM,
        judging_llm=JUDGING_LLM,
        levels=[1, 2, 3, 4, 5],
        use_rag=use_rag,
        # Only errors are logged, keeping the cell output to progress + results.
        logging_level=logging.ERROR,
    )
    print(f"Correctly Solved Problems: {correct_num}")
    print(f"Total Number of Problems: {prob_num}")
    print(f"Problems where Coding execution fails: {fail_coding_num}")
    print(f"Problems where Judging LLM fails: {fail_judging_num}")

In [4]:
# Evaluate ALL_PROBLEM_CLASSES[0] ('algebra').
test_class(0)

Testing on dataset 'algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 1, 2, 1, 1]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [5]:
# Evaluate ALL_PROBLEM_CLASSES[1] ('counting_and_probability').
test_class(1)

Testing on dataset 'counting_and_probability' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 1, 0, 1, 1]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 0, 0, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [4]:
# Evaluate ALL_PROBLEM_CLASSES[2] ('geometry').
test_class(2)

Testing on dataset 'geometry' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 2, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 1, 1, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [4]:
# Evaluate ALL_PROBLEM_CLASSES[3] ('intermediate_algebra').
test_class(3)

Testing on dataset 'intermediate_algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 2, 1, 0, 1]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 1, 1, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [5]:
# Evaluate ALL_PROBLEM_CLASSES[4] ('number_theory').
test_class(4)

Testing on dataset 'number_theory' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 0, 2, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 0, 2, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [6]:
# Evaluate ALL_PROBLEM_CLASSES[5] ('prealgebra').
test_class(5)

Testing on dataset 'prealgebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 2, 1, 1, 2]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 1, 0, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [7]:
# Evaluate ALL_PROBLEM_CLASSES[6] ('precalculus').
test_class(6)

Testing on dataset 'precalculus' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 2, 2, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [1, 1, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


## LLM = "llama3-8b-8192"

In [2]:
# RAG toggle; handed to test_accuracy as its `use_rag` argument.
use_rag = True

# Model identifiers for this run, one per role expected by test_accuracy
# (main_solver_llm, coding_llm, judging_llm).
MAIN_SOLVER_LLM = "llama3-8b-8192"
CODING_LLM = "deepseek-coder:6.7b-instruct-q8_0"
JUDGING_LLM = "gpt-3.5-turbo"

In [3]:
def test_class(problem_class_number):
    """Evaluate one problem class with ``test_accuracy`` and print the results.

    The dataset is read from ``merged_dataset/test/<problem_class>/final.json``
    under the current working directory and passed to ``test_accuracy`` along
    with the module-level ``CODING_LLM``, ``MAIN_SOLVER_LLM``, ``JUDGING_LLM``
    and ``use_rag`` settings. All five levels (1-5) are requested.

    Args:
        problem_class_number: Index into ``ALL_PROBLEM_CLASSES`` selecting the
            problem class to evaluate.

    Returns:
        None. The four values returned by ``test_accuracy`` are printed.

    Raises:
        IndexError: If ``problem_class_number`` is out of range for
            ``ALL_PROBLEM_CLASSES``.
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the dataset file is not valid JSON.
    """
    problem_class = ALL_PROBLEM_CLASSES[problem_class_number]
    dataset_path = os.path.join(
        os.getcwd(), "merged_dataset", "test", problem_class, "final.json"
    )
    # Use a context manager so the file handle is closed deterministically, and
    # read as UTF-8 explicitly instead of relying on the locale's default.
    with open(dataset_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)
    correct_num, prob_num, fail_coding_num, fail_judging_num = test_accuracy(
        data_class=problem_class,
        dataset=dataset,
        coding_llm=CODING_LLM,
        main_solver_llm=MAIN_SOLVER_LLM,
        judging_llm=JUDGING_LLM,
        levels=[1, 2, 3, 4, 5],
        use_rag=use_rag,
        # Only errors are logged, keeping the cell output to progress + results.
        logging_level=logging.ERROR,
    )
    print(f"Correctly Solved Problems: {correct_num}")
    print(f"Total Number of Problems: {prob_num}")
    print(f"Problems where Coding execution fails: {fail_coding_num}")
    print(f"Problems where Judging LLM fails: {fail_judging_num}")

In [4]:
# Evaluate ALL_PROBLEM_CLASSES[0] ('algebra').
test_class(0)

Testing on dataset 'algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 1, 1, 0, 1]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 0, 1, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [5]:
# Evaluate ALL_PROBLEM_CLASSES[1] ('counting_and_probability').
test_class(1)

Testing on dataset 'counting_and_probability' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 1, 0, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [1, 1, 1, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [6]:
# Evaluate ALL_PROBLEM_CLASSES[2] ('geometry').
test_class(2)

Testing on dataset 'geometry' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 0, 0, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 1, 0, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [7]:
# Evaluate ALL_PROBLEM_CLASSES[3] ('intermediate_algebra').
test_class(3)

Testing on dataset 'intermediate_algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%


  y *= step
  y += start


Correctly Solved Problems: [1, 0, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 0, 2, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [8]:
# Evaluate ALL_PROBLEM_CLASSES[4] ('number_theory').
test_class(4)

Testing on dataset 'number_theory' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 0, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 2, 0, 0, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [9]:
# Evaluate ALL_PROBLEM_CLASSES[5] ('prealgebra').
test_class(5)

Testing on dataset 'prealgebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 2, 0, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [10]:
# Evaluate ALL_PROBLEM_CLASSES[6] ('precalculus').
test_class(6)

Testing on dataset 'precalculus' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 1, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


## LLM = "mixtral-8x7b-32768"

In [2]:
# RAG toggle; handed to test_accuracy as its `use_rag` argument.
use_rag = True

# Model identifiers for this run, one per role expected by test_accuracy
# (main_solver_llm, coding_llm, judging_llm).
MAIN_SOLVER_LLM = "mixtral-8x7b-32768"
CODING_LLM = "deepseek-coder:6.7b-instruct-q8_0"
JUDGING_LLM = "gpt-3.5-turbo"

In [3]:
def test_class(problem_class_number):
    """Evaluate one problem class with ``test_accuracy`` and print the results.

    The dataset is read from ``merged_dataset/test/<problem_class>/final.json``
    under the current working directory and passed to ``test_accuracy`` along
    with the module-level ``CODING_LLM``, ``MAIN_SOLVER_LLM``, ``JUDGING_LLM``
    and ``use_rag`` settings. All five levels (1-5) are requested.

    Args:
        problem_class_number: Index into ``ALL_PROBLEM_CLASSES`` selecting the
            problem class to evaluate.

    Returns:
        None. The four values returned by ``test_accuracy`` are printed.

    Raises:
        IndexError: If ``problem_class_number`` is out of range for
            ``ALL_PROBLEM_CLASSES``.
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the dataset file is not valid JSON.
    """
    problem_class = ALL_PROBLEM_CLASSES[problem_class_number]
    dataset_path = os.path.join(
        os.getcwd(), "merged_dataset", "test", problem_class, "final.json"
    )
    # Use a context manager so the file handle is closed deterministically, and
    # read as UTF-8 explicitly instead of relying on the locale's default.
    with open(dataset_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)
    correct_num, prob_num, fail_coding_num, fail_judging_num = test_accuracy(
        data_class=problem_class,
        dataset=dataset,
        coding_llm=CODING_LLM,
        main_solver_llm=MAIN_SOLVER_LLM,
        judging_llm=JUDGING_LLM,
        levels=[1, 2, 3, 4, 5],
        use_rag=use_rag,
        # Only errors are logged, keeping the cell output to progress + results.
        logging_level=logging.ERROR,
    )
    print(f"Correctly Solved Problems: {correct_num}")
    print(f"Total Number of Problems: {prob_num}")
    print(f"Problems where Coding execution fails: {fail_coding_num}")
    print(f"Problems where Judging LLM fails: {fail_judging_num}")

In [13]:
# Evaluate ALL_PROBLEM_CLASSES[0] ('algebra').
test_class(0)

Testing on dataset 'algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 1, 0, 0, 1]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 0, 0, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [14]:
# Evaluate ALL_PROBLEM_CLASSES[1] ('counting_and_probability').
test_class(1)

Testing on dataset 'counting_and_probability' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 0, 0, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 2, 0, 1, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [15]:
# Evaluate ALL_PROBLEM_CLASSES[2] ('geometry').
test_class(2)

Testing on dataset 'geometry' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 1, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 1, 1, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [16]:
# Evaluate ALL_PROBLEM_CLASSES[3] ('intermediate_algebra').
test_class(3)

Testing on dataset 'intermediate_algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 0, 0, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [1, 0, 1, 2, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [17]:
# Evaluate ALL_PROBLEM_CLASSES[4] ('number_theory').
test_class(4)

Testing on dataset 'number_theory' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 0, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 2, 0, 2, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [18]:
# Evaluate ALL_PROBLEM_CLASSES[5] ('prealgebra').
test_class(5)

Testing on dataset 'prealgebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 0, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [4]:
# Evaluate ALL_PROBLEM_CLASSES[6] ('precalculus').
test_class(6)

Testing on dataset 'precalculus' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 1, 1, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [1, 1, 0, 1, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


## LLM = "gemma-7b-it"

In [2]:
# RAG toggle; handed to test_accuracy as its `use_rag` argument.
use_rag = True

# Model identifiers for this run, one per role expected by test_accuracy
# (main_solver_llm, coding_llm, judging_llm).
MAIN_SOLVER_LLM = "gemma-7b-it"
CODING_LLM = "deepseek-coder:6.7b-instruct-q8_0"
JUDGING_LLM = "gpt-3.5-turbo"

In [3]:
def test_class(problem_class_number):
    """Evaluate one problem class with ``test_accuracy`` and print the results.

    The dataset is read from ``merged_dataset/test/<problem_class>/final.json``
    under the current working directory and passed to ``test_accuracy`` along
    with the module-level ``CODING_LLM``, ``MAIN_SOLVER_LLM``, ``JUDGING_LLM``
    and ``use_rag`` settings. All five levels (1-5) are requested.

    Args:
        problem_class_number: Index into ``ALL_PROBLEM_CLASSES`` selecting the
            problem class to evaluate.

    Returns:
        None. The four values returned by ``test_accuracy`` are printed.

    Raises:
        IndexError: If ``problem_class_number`` is out of range for
            ``ALL_PROBLEM_CLASSES``.
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the dataset file is not valid JSON.
    """
    problem_class = ALL_PROBLEM_CLASSES[problem_class_number]
    dataset_path = os.path.join(
        os.getcwd(), "merged_dataset", "test", problem_class, "final.json"
    )
    # Use a context manager so the file handle is closed deterministically, and
    # read as UTF-8 explicitly instead of relying on the locale's default.
    with open(dataset_path, encoding="utf-8") as dataset_file:
        dataset = json.load(dataset_file)
    correct_num, prob_num, fail_coding_num, fail_judging_num = test_accuracy(
        data_class=problem_class,
        dataset=dataset,
        coding_llm=CODING_LLM,
        main_solver_llm=MAIN_SOLVER_LLM,
        judging_llm=JUDGING_LLM,
        levels=[1, 2, 3, 4, 5],
        use_rag=use_rag,
        # Only errors are logged, keeping the cell output to progress + results.
        logging_level=logging.ERROR,
    )
    print(f"Correctly Solved Problems: {correct_num}")
    print(f"Total Number of Problems: {prob_num}")
    print(f"Problems where Coding execution fails: {fail_coding_num}")
    print(f"Problems where Judging LLM fails: {fail_judging_num}")

In [4]:
# Evaluate ALL_PROBLEM_CLASSES[0] ('algebra').
test_class(0)

Testing on dataset 'algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [2, 1, 1, 0, 1]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 1, 0, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [5]:
# Evaluate ALL_PROBLEM_CLASSES[1] ('counting_and_probability').
test_class(1)

Testing on dataset 'counting_and_probability' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 1, 0, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 2, 1, 0, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [4]:
# Evaluate ALL_PROBLEM_CLASSES[2] ('geometry').
test_class(2)

Testing on dataset 'geometry' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 2, 0, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 0, 1, 1, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [4]:
# Evaluate ALL_PROBLEM_CLASSES[3] ('intermediate_algebra').
test_class(3)

Testing on dataset 'intermediate_algebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 0, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [1, 1, 1, 2, 1]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [None]:
# Evaluate ALL_PROBLEM_CLASSES[4] ('number_theory').
test_class(4)

Testing on dataset 'number_theory' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 0, 1, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 2, 1, 1, 2]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [5]:
# Evaluate ALL_PROBLEM_CLASSES[5] ('prealgebra').
test_class(5)

Testing on dataset 'prealgebra' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [1, 1, 0, 0, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [0, 1, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]


In [6]:
# Evaluate ALL_PROBLEM_CLASSES[6] ('precalculus').
test_class(6)

Testing on dataset 'precalculus' with levels [1, 2, 3, 4, 5].
Percentage complete: 0.0%
Percentage complete: 10.0%
Percentage complete: 20.0%
Percentage complete: 30.0%
Percentage complete: 40.0%
Percentage complete: 50.0%
Percentage complete: 60.0%
Percentage complete: 70.0%
Percentage complete: 80.0%
Percentage complete: 90.0%
Correctly Solved Problems: [0, 0, 2, 1, 0]
Total Number of Problems: [2, 2, 2, 2, 2]
Problems where Coding execution fails: [1, 1, 0, 0, 0]
Problems where Judging LLM fails: [0, 0, 0, 0, 0]
