## 3. Measuring Zero-Shot MATH Performance

In [None]:
from vllm import LLM, SamplingParams

# Hugging Face identifier of the checkpoint handed to vLLM.
MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B"
llm = LLM(model=MODEL_NAME)

In [None]:
# A handful of open-ended prompts used to try out generation.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Sampling configuration; generation halts at the first newline.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["\n"],
)

# Run generation for every prompt. The result is a list of RequestOutput
# objects carrying the prompt, the generated text, and other information.
outputs = llm.generate(prompts, sampling_params=sampling_params)

In [None]:
# Show each prompt next to the first completion returned for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

In [None]:
# https://huggingface.co/datasets/EleutherAI/hendrycks_math

import pandas as pd

# All MATH dataset subjects
subjects = [
    "algebra", "counting_and_probability", "geometry",
    "intermediate_algebra", "number_theory", "prealgebra", "precalculus"
]


def load_subject_frames(split):
    """Read one parquet shard per subject for the given split.

    Args:
        split: Split name used in the shard file name, e.g. "train" or "test".

    Returns:
        A list with one DataFrame per entry in `subjects`, each carrying an
        added `subject` column set to the subject it was read from.
    """
    return [
        pd.read_parquet(
            f"hf://datasets/EleutherAI/hendrycks_math/{subject}/{split}-00000-of-00001.parquet"
        ).assign(subject=subject)
        for subject in subjects
    ]


# Load train data from all subjects
train_dfs = load_subject_frames("train")
df_train = pd.concat(train_dfs, ignore_index=True)
print(f"Total train examples: {len(df_train)}")

# Load test data from all subjects
test_dfs = load_subject_frames("test")
df_test = pd.concat(test_dfs, ignore_index=True)
print(f"Total test examples: {len(df_test)}")
# print(f"Columns: {df_test.columns.tolist()}")

In [None]:
from vllm import LLM, SamplingParams


# An engine is already created earlier in this notebook. Constructing another
# one while `llm` still references the first would load the model a second
# time, so only build it when no engine exists in this kernel yet.
if "llm" not in globals():
    llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")


In [None]:
import os

# Location of the r1_zero prompt template. The default is the path used on the
# original machine; set R1_ZERO_PROMPT_FILE to point at the template elsewhere.
prompt_r1_zero_file = os.environ.get(
    "R1_ZERO_PROMPT_FILE",
    "/home/azureuser/localfiles/cs336-assignment5-alignment-mine/cs336_alignment/prompts/r1_zero.prompt",
)

# Explicit encoding so the template text does not depend on the locale default.
with open(prompt_r1_zero_file, encoding="utf-8") as f:
    prompt_r1_zero = f.read()

def get_prompts_and_solutions(df):
    """Build one prompt per row of `df` and return them with the solutions.

    Each prompt is the module-level `prompt_r1_zero` template with its
    "{question}" placeholder replaced by the row's `problem` text.

    Args:
        df: DataFrame with `problem` and `solution` columns.

    Returns:
        A `(prompts, solutions)` tuple of two lists in row order.
    """
    questions = df["problem"].tolist()
    reference_solutions = df["solution"].tolist()

    filled_prompts = []
    for question in questions:
        filled_prompts.append(prompt_r1_zero.replace("{question}", question))

    return filled_prompts, reference_solutions

In [None]:
from typing import Callable, List
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
import json

def evaluate_vllm(
    vllm_model: "LLM",
    eval_sampling_params: "SamplingParams",
    prompts: List[str],
    reward_fn: Callable[[str, str], dict[str, float]],
    solutions: List[str],
    output_file: str = "eval_results.jsonl"
) -> tuple[List[dict[str, float]], List[str]]:
    """
    Evaluate a language model on a list of prompts,
    compute evaluation metrics, and serialize results to disk.

    Args:
        vllm_model: Object whose `generate(prompts, sampling_params)` returns
            one result per prompt; the text is read from `.outputs[0].text`.
        eval_sampling_params: Sampling parameters passed through to `generate`.
        prompts: Prompts to generate from.
        reward_fn: Called as `reward_fn(generated_text, solution)`; its return
            value is stored under "eval" in the output file, so it must be
            JSON-serializable.
        solutions: Ground-truth solutions, one per prompt, in the same order.
        output_file: Path of the JSON-lines file to write (overwritten).

    Returns:
        A `(evals, solutions_generated)` tuple: the per-example reward_fn
        results and the generated texts, both in prompt order.

    Raises:
        ValueError: If `prompts` and `solutions` differ in length.
    """
    # Check up front: zip() below would otherwise silently drop the unmatched
    # tail, and the check is cheaper before generation than after it.
    if len(prompts) != len(solutions):
        raise ValueError(
            f"prompts and solutions must have the same length, "
            f"got {len(prompts)} and {len(solutions)}"
        )

    responses = vllm_model.generate(prompts, eval_sampling_params)
    solutions_generated = [opt.outputs[0].text for opt in responses]

    evals = [reward_fn(sol_gen, sol) for sol_gen, sol in zip(solutions_generated, solutions)]

    # Serialize the prompts, solutions, solutions generated, and corresponding evals to disk
    with open(output_file, 'w', encoding="utf-8") as f:
        for prompt, solution, sol_gen, eval_dict in zip(prompts, solutions, solutions_generated, evals):
            result = {
                "prompt": prompt,
                "ground_truth": solution,
                "generated": sol_gen,
                "eval": eval_dict
            }
            f.write(json.dumps(result) + '\n')

    return evals, solutions_generated

In [None]:
# Sampling configuration for evaluation: generation stops at the closing
# "</answer>" tag, and that tag is kept in the returned text.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["</answer>"],
    include_stop_str_in_output=True,
)

# Build prompts from the test split, then generate, score, and save results.
prompts, solutions = get_prompts_and_solutions(df_test)
evals, solutions_generated = evaluate_vllm(
    vllm_model=llm,
    eval_sampling_params=sampling_params,
    prompts=prompts,
    reward_fn=r1_zero_reward_fn,
    solutions=solutions,
)

## 3.1 Using vLLM for offline language model inference
### math_baseline
1. Done
1. Commentary on model and reward function performance
    1. See cell below for distribution. 
    1. For cases with zero format reward: most occur because the model either failed to generate the answer tags or did not produce them in the required format. For cases with non-zero format reward but zero answer reward: roughly half are wrong answers and half are parser failures.
1. Less than 3% of responses receive both the format and answer rewards.

In [None]:
# math_baseline.2
df_eval = pd.DataFrame(evals)

# Build the masks outside the f-strings: reusing double quotes inside a
# replacement field is a SyntaxError on Python versions before 3.12.
format_is_one = df_eval["format_reward"] == 1
format_is_zero = df_eval["format_reward"] == 0
answer_is_one = df_eval["answer_reward"] == 1
answer_is_zero = df_eval["answer_reward"] == 0

# Count the examples in each (format_reward, answer_reward) combination.
print(f"Format reward 1, Answer reward 1: {(format_is_one & answer_is_one).sum()}")
print(f"Format reward 1, Answer reward 0: {(format_is_one & answer_is_zero).sum()}")
print(f"Format reward 0, Answer reward 0: {(format_is_zero & answer_is_zero).sum()}")

In [None]:
# Pick up to 10 examples that earned the format reward but not the answer
# reward, for manual inspection.
format_only = df_eval[(df_eval.format_reward == 1) & (df_eval.answer_reward == 0)]

# Cap the sample size so fewer than 10 matches does not raise, and fix the
# seed so the same examples are selected on every run.
sample_ids = format_only.sample(min(10, len(format_only)), random_state=0).index

# Materialize lists (not one-shot generators) so later cells can iterate them
# more than once; the ids are used as row positions, as in a plain list lookup.
sample_problems = df_test["problem"].iloc[sample_ids].tolist()
sample_solutions = df_test["solution"].iloc[sample_ids].tolist()
sample_solutions_generated = [solutions_generated[i] for i in sample_ids]

In [None]:
# Number of test problems per category.
counts_by_category = df_test.groupby("type")["problem"].count()

# Number of problems per category whose reward is 1. Categories with no such
# problem are missing from the groupby result, so reindex against the full
# category list and fill them with 0; otherwise the division yields NaN.
accurate_by_category = (
    df_test[df_eval.reward == 1]
    .groupby("type")["problem"]
    .count()
    .reindex(counts_by_category.index, fill_value=0)
)

accuracy_percentage = (accurate_by_category / counts_by_category) * 100
accuracy_percentage