# Grade Solutions to Math Problems

In [None]:
import json
import os
import re
import subprocess
import sys

## Run Qwen2.5-Math

#### Clone the Repo

In [None]:
# Fetch the Qwen2.5-Math repository; the cells below install from and run
# scripts inside its "evaluation" directory.
! git clone https://github.com/QwenLM/Qwen2.5-Math

In [None]:
def run_command(command, cwd=None):
    """Echo a command line, then execute it and fail on a non-zero exit.

    Args:
        command: The program and its arguments as a list of strings.
        cwd: Directory to run the command in. When omitted (or falsy), the
            command runs in the current working directory.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    working_dir = cwd if cwd else os.getcwd()
    rendered = " ".join(command)
    print(f"Running: {rendered} in {working_dir}")
    subprocess.check_call(command, cwd=cwd)

#### Install Dependencies

In [None]:
# Resolve the directories before any install step runs, so that
# `evalution_dir` (referenced by later cells) is defined even when an install
# fails. The variable keeps its original spelling because later cells use it.
evalution_dir = os.path.join("Qwen2.5-Math", "evaluation")
latex2sympy_dir = os.path.join(evalution_dir, "latex2sympy")

try:
    # Editable install of the latex2sympy package shipped inside the repo.
    run_command(["pip", "install", "-e", "."], cwd=latex2sympy_dir)

    # Install the evaluation harness requirements.
    run_command(["pip", "install", "-r", "requirements.txt"], cwd=evalution_dir)

    # vllm is pinned to 0.5.1; transformers is installed without a version pin.
    run_command(["pip", "install", "vllm==0.5.1", "--no-build-isolation"], cwd=evalution_dir)
    run_command(["pip", "install", "transformers"], cwd=evalution_dir)

except subprocess.CalledProcessError as e:
    # The first failing step aborts the remaining installs; report it here.
    print(f"An error occurred: {e}")

#### Set up

In [None]:
# Settings for the evaluation run; most are forwarded to math_eval.py as
# command-line arguments, so they are kept as strings.
PROMPT_TYPE = "qwen25-math-cot"
MODEL_NAME_OR_PATH = "Qwen/Qwen2.5-Math-1.5B-Instruct"
DATA_NAME = "aime24"
OUTPUT_DIR = "./results"
SPLIT = "test"
NUM_TEST_SAMPLE = "-1"  # presumably "use every sample" -- confirm in math_eval.py

# A Python variable alone is invisible to the evaluation subprocess; export
# the setting so child processes inherit it through the environment.
TOKENIZERS_PARALLELISM = False
os.environ["TOKENIZERS_PARALLELISM"] = str(TOKENIZERS_PARALLELISM).lower()

In [None]:
# Select GPU "0" for this process and any subprocess it launches, and keep
# the chosen value in a Python variable as well.
CUDA_VISIBLE_DEVICES = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES

#### Run Eval Pipeline

In [None]:
# Valued options for math_eval.py, in the order they appear on the command line.
eval_options = {
    "--model_name_or_path": MODEL_NAME_OR_PATH,
    "--data_name": DATA_NAME,
    "--output_dir": OUTPUT_DIR,
    "--split": SPLIT,
    "--prompt_type": PROMPT_TYPE,
    "--num_test_sample": NUM_TEST_SAMPLE,
    "--seed": "0",
    "--temperature": "0",
    "--n_sampling": "1",
    "--top_p": "1",
    "--start": "0",
    "--end": "-1",
}
# Boolean switches that take no value.
eval_flags = ["--use_vllm", "--save_outputs", "--overwrite"]

eval_command = ["python3", "-u", "math_eval.py"]
for option, value in eval_options.items():
    eval_command += [option, value]
eval_command += eval_flags

# Run the evaluation script from inside the repo's evaluation directory.
run_command(eval_command, cwd=evalution_dir)

#### Results

In [None]:
# Directory holding the evaluation outputs for the configured dataset.
# Built from DATA_NAME (not a hard-coded dataset name) so that changing the
# dataset in the set-up cell keeps this path in sync.
output_path = os.path.join(evalution_dir, "outputs", OUTPUT_DIR, DATA_NAME)
print(output_path)

#### Statistics

In [None]:
# Pick the statistics file deterministically (os.listdir order is arbitrary)
# and fail with a clear message when the evaluation produced no .json output.
json_files = sorted(name for name in os.listdir(output_path) if name.endswith('.json'))
if not json_files:
    raise FileNotFoundError(f"No .json statistics file found in {output_path}")
stats_file = os.path.join(output_path, json_files[0])  # Build the complete file path

with open(stats_file, 'r', encoding='utf-8') as f:
    stats = json.load(f)
stats


#### Generated Responses

In [None]:
# Pick the generations file deterministically (os.listdir order is arbitrary)
# and fail with a clear message when the evaluation produced no .jsonl output.
jsonl_files = sorted(name for name in os.listdir(output_path) if name.endswith('.jsonl'))
if not jsonl_files:
    raise FileNotFoundError(f"No .jsonl results file found in {output_path}")
results_file = os.path.join(output_path, jsonl_files[0])  # Build the complete file path

# One JSON record per line; blank lines (e.g. a trailing newline) are skipped
# instead of crashing json.loads.
with open(results_file, "r", encoding="utf-8") as g:
    eval_results = [json.loads(line) for line in g if line.strip()]

In [None]:
# Display the first loaded record to inspect its fields.
eval_results[0]

In [None]:
# Number of records loaded from the results file.
len(eval_results)

## Detailed Evaluation: o1 Grader

#### Your OpenAI API Key

In [None]:
# Paste your key between the quotes, or leave it blank to keep using an
# OPENAI_API_KEY that is already exported in the environment. A blank value
# must not overwrite an existing key.
api_key = ""
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
elif not os.environ.get("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; the grading requests below will fail.")

In [None]:
# Create the API client used by the grading loop. No arguments are passed;
# presumably the library reads OPENAI_API_KEY from the environment -- confirm
# against the openai package documentation.
from openai import OpenAI
client = OpenAI()

#### System Instructions

In [None]:
# Grading instructions sent to the grader model. The <reasoning>/<score> tags
# requested here are what the post-processing step parses, so keep the tag
# names in sync with that parser when editing this text.
SYSTEM_PROMPT = '''
You are a math teacher tasked to grade a student's solution to a math problem. You will be provided the following data:
1. Problem statement.
2. The student's solution to the problem.
3. Correct answer to the problem.

## Instructions: You will grade the student's solution on following aspects:
1. Problem Understanding:
2. Setup and Strategy:
3. Mathematical Execution:
4. Correctness

For each of the aspects, produce a score between 0 to 4. Provide reasoning for your grading. Provide your grading as a JSON object with the following format: 
{"problem_understanding": <reasoning>..</reasoning><score>4</score>, "setup_and_strategy": <reasoning>..</reasoning><score>2</score>, "mathematical_execution": <reasoning>..</reasoning><score>1</score>, "correctness": <reasoning>..</reasoning><score>0</score>}
'''

#### Add data to the prompt

In [None]:
def build_prompt(prompt, test_example):
    """Append one example's problem, solution and reference answer to a prompt.

    Args:
        prompt: Instruction text that the example data is appended to.
        test_example: Mapping with a 'question', a 'code' sequence (only its
            first element is used as the solution) and an 'answer'.

    Returns:
        The prompt followed by the "Problem", "Solution" and "Correct Answer"
        sections, each separated by a blank line.
    """
    question = test_example['question']
    solution = test_example['code'][0]
    answer = test_example['answer']
    sections = (
        prompt,
        f"Problem: {question}",
        f"Solution: {solution}",
        f"Correct Answer: {answer}",
        "",  # yields the trailing blank line after the last section
    )
    return "\n\n".join(sections)

In [None]:
# Preview the full grading prompt built for the first example.
print(build_prompt(SYSTEM_PROMPT, eval_results[0]))

In [None]:
# Preview the full grading prompt built for the second example.
print(build_prompt(SYSTEM_PROMPT, eval_results[1]))

#### Perform Grading

In [None]:
# Grade every example: the instructions and the example data travel together
# in a single user message, and the raw reply text is stored under "grade".
for idx, record in enumerate(eval_results):
    print(f"Processing {idx}-th example")
    grading_messages = [
        {"role": "user", "content": build_prompt(SYSTEM_PROMPT, record)},
    ]
    completion = client.chat.completions.create(
        model="o1-preview",
        messages=grading_messages,
    )
    record["grade"] = completion.choices[0].message.content

In [None]:
# Number of records after the grading pass.
len(eval_results)

In [None]:
# Display the first record, now including its "grade" field.
eval_results[0]

In [None]:
# Display the second record, now including its "grade" field.
eval_results[1]

#### Write to file

In [None]:
# Save all records (including the grader replies) as a single JSON document.
with open("eval_results_with_grades.json", "w") as out_file:
    out_file.write(json.dumps(eval_results))

#### Post Processing

In [None]:
# Aspects the grader scores, in the order the prompt asks for them.
FEATURES = [
    'problem_understanding',
    'setup_and_strategy',
    'mathematical_execution',
    'correctness',
]

In [None]:
def process_content(example, features=None):
    """Extract per-aspect scores and reasoning from a grader reply.

    The reply is expected to contain, for each aspect, the aspect name
    followed by ``<reasoning>...</reasoning><score>N</score>``. When every
    aspect can be located by name, values are attached to the aspect they
    were written for, so a reply that lists the aspects in a different order
    is still labelled correctly. Otherwise the function falls back to pairing
    the tags with the aspects purely by position.

    Args:
        example: Mapping whose 'grade' entry holds the grader's reply text.
            A ``None`` reply is treated as an empty string.
        features: Aspect names to extract. Defaults to the module-level
            ``FEATURES`` list.

    Returns:
        A ``(score_dict, reasoning_dict)`` tuple mapping aspect names to the
        integer score and to the reasoning text respectively. Aspects for
        which nothing was found are absent from the dictionaries.

    Raises:
        KeyError: If ``example`` has no 'grade' entry.
        ValueError: In the positional fallback, if a ``<score>`` tag does not
            contain an integer.
    """
    if features is None:
        features = FEATURES
    grade = example['grade'] or ""

    # First choice: locate each aspect by its name in the reply.
    score_dict = dict()
    reasoning_dict = dict()
    for feature in features:
        keyed = re.search(
            re.escape(feature)
            + r"\W*?<reasoning>(.*?)</reasoning>\s*<score>\s*(\d+)\s*</score>",
            grade,
            flags=re.DOTALL,
        )
        # Reject a match that ran across into another aspect's reasoning tag.
        if keyed and "<reasoning>" not in keyed.group(1):
            reasoning_dict[feature] = keyed.group(1)
            score_dict[feature] = int(keyed.group(2))
    if len(score_dict) == len(features):
        return score_dict, reasoning_dict

    # Fallback: pair tags with aspects by position in the reply.
    reasonings = [x.split("</reasoning>")[0] for x in grade.split("<reasoning>") if "</reasoning>" in x]
    scores = [int(x.split("</score>")[0]) for x in grade.split("<score>") if "</score>" in x]
    return dict(zip(features, scores)), dict(zip(features, reasonings))

In [None]:
# Try the parser on the first graded example; `x` receives the first returned
# dictionary (scores) and is displayed here.
x,y = process_content(eval_results[0])
x

In [None]:
# Second dictionary returned by process_content (reasoning text) for the first example.
y

#### Compute Statistics

In [None]:
# Sum the per-aspect scores over all examples, then average them.
aggregate_scores = {f: 0 for f in FEATURES}
for i, test_example in enumerate(eval_results):
    try:
        x, y = process_content(test_example)
        # Build the example's full contribution before touching the totals,
        # so an example that is missing one aspect cannot leave partial sums
        # behind. "correctness" is binarised: only a top score of 4 counts.
        contribution = {
            f: (1 if x[f] == 4 else 0) if f == "correctness" else x[f]
            for f in FEATURES
        }
    except (KeyError, ValueError, AttributeError, TypeError) as err:
        # Missing/unparseable grade: report it and leave the totals untouched.
        print(f"Error processing {i}-th example: {err!r}")
        continue
    for f in FEATURES:
        aggregate_scores[f] += contribution[f]

# Failed examples still count in the denominator (they contribute zero).
# An empty result list yields 0.0 instead of a ZeroDivisionError.
num_examples = len(eval_results)
aggregate_scores = {
    f: (aggregate_scores[f] / num_examples if num_examples else 0.0)
    for f in FEATURES
}
aggregate_scores

In [None]:
# Print one "aspect: value" line per aggregated metric.
for metric in aggregate_scores:
    print(f"{metric}: {aggregate_scores[metric]}")