## 3. Measuring Zero-Shot MATH Performance

In [None]:
from vllm import LLM, SamplingParams

# Build a vLLM engine for the Qwen2.5-Math-1.5B checkpoint; the generation
# cell below calls `llm.generate` on it.
llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")

In [None]:
# A handful of open-ended prompts used to smoke-test generation.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# temperature=1.0 and top_p=1.0, at most 1024 new tokens, and generation is
# cut off at the first newline.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["\n"],
)

# `outputs` is a list of RequestOutput objects; each one carries the prompt,
# the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

In [None]:
# Show every prompt next to the first completion returned for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

In [None]:
# https://huggingface.co/datasets/EleutherAI/hendrycks_math

import pandas as pd

# All MATH dataset subjects
subjects = [
    "algebra", "counting_and_probability", "geometry",
    "intermediate_algebra", "number_theory", "prealgebra", "precalculus"
]


def _load_math_split(split: str) -> list[pd.DataFrame]:
    """Read one split ("train" or "test") of every subject in `subjects`.

    Returns one DataFrame per subject, in `subjects` order, each with a
    `subject` column holding the subject name.
    """
    frames = []
    for subject in subjects:
        frame = pd.read_parquet(
            f"hf://datasets/EleutherAI/hendrycks_math/{subject}/{split}-00000-of-00001.parquet"
        )
        # `assign` returns a new frame instead of mutating the one just read.
        frames.append(frame.assign(subject=subject))
    return frames


# Load train data from all subjects
train_dfs = _load_math_split("train")
df_train = pd.concat(train_dfs, ignore_index=True)
print(f"Total train examples: {len(df_train)}")

# Load test data from all subjects
test_dfs = _load_math_split("test")
df_test = pd.concat(test_dfs, ignore_index=True)
print(f"Total test examples: {len(df_test)}")

In [None]:
from vllm import LLM, SamplingParams
from typing import Callable, List
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
import json
import pandas as pd

def get_prompts_and_answers(prompt_template, df):
    """Fill `prompt_template` with each problem and pair it with its answer.

    Every literal "{question}" in the template is swapped for the problem text
    using plain string replacement, so any other braces in the template are
    left untouched.

    Returns:
        (prompts, answers): two lists aligned with the rows of `df`, built
        from its `problem` and `answer` columns respectively.
    """
    filled_prompts = []
    for question in df.problem.tolist():
        filled_prompts.append(prompt_template.replace("{question}", question))
    gold_answers = df.answer.tolist()
    return filled_prompts, gold_answers

def evaluate_vllm(
    vllm_model: LLM,
    eval_sampling_params: SamplingParams,
    prompts: List[str],
    reward_fn: Callable[[str, str], dict[str, float]],
    solutions: List[str],
    output_file: str = "eval_results.jsonl"
) -> tuple[List[dict[str, float]], List[str]]:
    """
    Evaluate a language model on a list of prompts,
    compute evaluation metrics, and serialize results to disk.

    Args:
        vllm_model: Object whose `generate(prompts, sampling_params)` returns one
            result per prompt, with the text at `result.outputs[0].text`.
        eval_sampling_params: Passed through to `generate` unchanged.
        prompts: Fully formatted prompts.
        reward_fn: Called as `reward_fn(generated_text, solution)`; its return
            value is stored as-is and must be JSON-serializable.
        solutions: Ground-truth answers, aligned with `prompts`.
        output_file: Path of the JSONL file to (over)write, one record per prompt
            with keys "prompt", "ground_truth", "generated" and "eval".

    Returns:
        (evals, solutions_generated): the reward dict and the generated text for
        every prompt, in prompt order.

    Raises:
        ValueError: if `prompts` and `solutions` differ in length.
    """
    # Fail before the expensive generation step: zip() would otherwise silently
    # drop the unmatched tail and report metrics on a truncated set.
    if len(prompts) != len(solutions):
        raise ValueError(
            f"prompts and solutions must have the same length, "
            f"got {len(prompts)} and {len(solutions)}"
        )

    # Assumes `generate` returns its results in the same order as `prompts`.
    responses = vllm_model.generate(prompts, eval_sampling_params)
    solutions_generated = [response.outputs[0].text for response in responses]

    evals = [
        reward_fn(generated, solution)
        for generated, solution in zip(solutions_generated, solutions)
    ]

    # Serialize the prompts, solutions, solutions generated, and corresponding evals to disk
    with open(output_file, "w") as f:
        for prompt, solution, generated, eval_dict in zip(
            prompts, solutions, solutions_generated, evals
        ):
            record = {
                "prompt": prompt,
                "ground_truth": solution,
                "generated": generated,
                "eval": eval_dict,
            }
            f.write(json.dumps(record) + "\n")

    return evals, solutions_generated

In [None]:
import os

# Reuse the engine when an earlier cell already built one: constructing a second
# vLLM engine in the same process makes both compete for the same GPU memory.
# Run `del llm` first to force a rebuild.
if "llm" not in globals():
    llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")

# Single place to repoint the notebook at another checkout; the default is the
# original location, and CS336_A5_ROOT overrides it.
PROJECT_ROOT = os.environ.get(
    "CS336_A5_ROOT", "/home/azureuser/localfiles/cs336-assignment5-alignment-mine"
)

# Prompt template text; later cells substitute each problem into it.
prompt_r1_zero_file = f"{PROJECT_ROOT}/cs336_alignment/prompts/r1_zero.prompt"
with open(prompt_r1_zero_file) as f:
    prompt_r1_zero = f.read()

# Validation problems, one JSON object per line.
df = pd.read_json(f"{PROJECT_ROOT}/data/validation.jsonl", lines=True)

In [None]:
# Preview the first rows of the validation frame read in the previous cell.
df.head()

In [None]:
# Sampling setup for the baseline run: temperature=1.0 and top_p=1.0, at most
# 1024 new tokens. Generation stops at the closing "</answer>" tag, and the tag
# itself is kept in the returned text.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["</answer>"],
    include_stop_str_in_output=True,
)

# Format every validation problem with the r1_zero template, then generate and
# grade the completions.
prompts, answers = get_prompts_and_answers(prompt_r1_zero, df)
evals, answers_generated = evaluate_vllm(
    vllm_model=llm,
    eval_sampling_params=sampling_params,
    prompts=prompts,
    reward_fn=r1_zero_reward_fn,
    solutions=answers,
)

## 3.1 Using vLLM for offline language model inference
### math_baseline
1. Done
1. Commentary on model and reward function performance
    1. See cell below for distribution. 
    1. For cases with zero format reward: most occur because the model failed to generate the answer tags or did not produce them in the right format. For cases with non-zero format reward but zero answer reward: roughly a 50/50 split between wrong answers and parser failures.
1. Less than 3% of responses get both the format and answer rewards.

In [None]:
# math_baseline.2
df_eval = pd.DataFrame(evals)

# Build the masks up front. This keeps quoted column names out of the f-strings:
# reusing the enclosing quote character inside an f-string expression is a
# SyntaxError on Python versions before 3.12.
format_ok = df_eval["format_reward"] == 1
format_bad = df_eval["format_reward"] == 0
answer_ok = df_eval["answer_reward"] == 1
answer_bad = df_eval["answer_reward"] == 0

# Count responses in each (format, answer) reward combination.
print(f"Format reward 1, Answer reward 1: {(format_ok & answer_ok).sum()}")
print(f"Format reward 1, Answer reward 0: {(format_ok & answer_bad).sum()}")
print(f"Format reward 0, Answer reward 0: {(format_bad & answer_bad).sum()}")

In [None]:
# Number of problems per subject.
counts_by_category = df.groupby("subject")["problem"].count()

# Number of problems per subject whose response earned reward == 1.
# Relies on `df` and `df_eval` sharing the same row index.
accurate_by_category = (
    df[df_eval.reward == 1]
    .groupby("subject")["problem"]
    .count()
    # A subject with no correct response is missing from the filtered group-by;
    # put it back as 0 so its accuracy reads 0% rather than NaN.
    .reindex(counts_by_category.index, fill_value=0)
)

accuracy_percentage = (accurate_by_category / counts_by_category) * 100
accuracy_percentage

In [None]:
# Responses that earned the format reward but not the answer reward.
format_only = df_eval[(df_eval.format_reward == 1) & (df_eval.answer_reward == 0)]

# `sample` raises when asked for more rows than exist, so cap the request.
sample_ids = format_only.sample(min(10, len(format_only))).index

# Materialize each column once instead of rebuilding the list per sampled row.
problems = df["problem"].tolist()
reference_answers = df["answer"].tolist()

# Lazy iterators: the next cell pulls one example from each per run.
sample_problems = (problems[i] for i in sample_ids)
sample_answers = (reference_answers[i] for i in sample_ids)
sample_answers_generated = (answers_generated[i] for i in sample_ids)

In [None]:
# Pull the next sampled example and print problem, reference answer and model
# output, separated by blank lines. Raises StopIteration once the iterators
# are exhausted.
print(
    next(sample_problems),
    next(sample_answers),
    next(sample_answers_generated),
    sep="\n\n",
)


## 4. Supervised Finetuning for MATH

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
import torch
# Tokenizer and causal-LM weights for the same Qwen2.5-Math-1.5B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B")

# Weights are loaded in bfloat16, with the FlashAttention-2 attention
# implementation and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Math-1.5B",
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

### 4.2 SFT helper methods

In [None]:
import torch
from transformers import AutoTokenizer, PreTrainedTokenizer
import torch.nn.functional as F

def tokenize_prompt_and_output(
        prompt_strs: list[str],
        output_strs: list[str],
        tokenizer: PreTrainedTokenizer,
) -> dict[str, torch.Tensor]:
    """Tokenize prompt/output pairs into next-token-prediction tensors.

    Each prompt and output is encoded separately with `tokenizer.encode`, the
    two id lists are concatenated, and every sequence is right-padded with
    `tokenizer.pad_token_id` to the longest concatenation in the batch.

    Returns:
        A dict of tensors with one row per pair and (longest - 1) columns:
          "input_ids": padded ids without the final position,
          "labels": padded ids without the first position,
          "response_mask": True where the label is an output token, False for
            prompt and padding positions.
    """
    encoded_pairs = [
        (tokenizer.encode(prompt_str), tokenizer.encode(output_str))
        for prompt_str, output_str in zip(prompt_strs, output_strs)
    ]
    longest = max(
        (len(prompt_ids) + len(output_ids) for prompt_ids, output_ids in encoded_pairs),
        default=0,
    )

    id_rows = []
    mask_rows = []
    for prompt_ids, output_ids in encoded_pairs:
        n_pad = longest - len(prompt_ids) - len(output_ids)
        id_rows.append(prompt_ids + output_ids + [tokenizer.pad_token_id] * n_pad)
        mask_rows.append(
            [False] * len(prompt_ids) + [True] * len(output_ids) + [False] * n_pad
        )

    # Inputs drop the last position and labels drop the first, so labels[t] is
    # the token that follows input_ids[t]; the mask is shifted like the labels.
    return {
        "input_ids": torch.tensor(id_rows)[:, :-1],
        "labels": torch.tensor(id_rows)[:, 1:],
        "response_mask": torch.tensor(mask_rows)[:, 1:],
    }

In [None]:
def compute_entropy(logits: torch.Tensor) -> torch.Tensor:
    """Entropy of the softmax distribution over the last dimension of `logits`.

    Returns a tensor with the last dimension reduced away, i.e.
    -sum_i p_i * log p_i with p = softmax(logits).
    """
    # log p = logits - logsumexp(logits): normalizes in log space.
    log_normalizer = torch.logsumexp(logits, dim=-1, keepdim=True)
    log_probs = logits - log_normalizer
    weighted = torch.exp(log_probs) * log_probs
    return -torch.sum(weighted, axis=-1)

In [None]:
from transformers import PreTrainedModel

def get_response_log_probs(
    model: PreTrainedModel,
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    return_token_entropy: bool = False,
) -> dict[str, torch.Tensor]:
    # Move to same device as model
    input_ids = input_ids.to(model.device)
    
    with torch.no_grad():
        logits = model(input_ids).logits
        logp = F.log_softmax(logits, dim=-1).to(labels.device)

        # Learn: advanced indexing; notice the `unsqueeze`
        batch_idx = torch.arange(labels.shape[0]).unsqueeze(1)
        seq_idx = torch.arange(labels.shape[1]).unsqueeze(0)
        logp = logp[batch_idx, seq_idx, labels]
        if return_token_entropy:
            return logp, compute_entropy(logits.to(logp.device))
        return logp



In [None]:
def mask_normalize(
    tensor: torch.Tensor,
    mask: torch.Tensor,
    normalize_constant: float,
    dim: int | None = None,
) -> torch.Tensor:
    return torch.sum(tensor * mask, dim=dim) / normalize_constant

In [None]:
def sft_microbatch_train_step(
    policy_log_probs: torch.Tensor,
    response_mask: torch.Tensor,
    gradient_accumulation_steps: int,
    normalize_constant: float = 1.0,
) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
    """Compute the SFT loss for one microbatch and backpropagate it.

    The loss is the negative log-likelihood of the response tokens. For each
    example the masked log-probs are summed over the last dimension and divided
    by `normalize_constant`; the per-example values are then averaged and
    divided by `gradient_accumulation_steps`, so gradients accumulated over
    that many microbatches add up to one full-batch step.

    Args:
        policy_log_probs: Log-probability of each target token; assumed to be
            (batch, seq) and attached to the autograd graph (TODO confirm).
        response_mask: Same shape; nonzero/True marks response tokens.
        gradient_accumulation_steps: Number of microbatches per optimizer step.
        normalize_constant: Divisor applied to each example's summed loss.

    Returns:
        (loss, metadata): `loss` is the scalar that was backpropagated.
        `metadata` holds that loss under "loss" and `policy_log_probs.grad`
        under "policy_log_probs_grad" (None unless the tensor retains grad).
    """
    # Cross-entropy uses log p directly. Weighting each token's log-prob by its
    # own probability (p * log p) is a different quantity and not the SFT loss.
    per_example_nll = -mask_normalize(
        policy_log_probs, response_mask, normalize_constant, dim=-1
    )
    # The mean over examples supplies the 1/batch_size factor; summing over the
    # whole batch instead inflates the loss by batch_size.
    loss = per_example_nll.mean() / gradient_accumulation_steps
    loss.backward()

    metadata = {
        "loss": loss,
        "policy_log_probs_grad": policy_log_probs.grad,
    }

    return loss, metadata

In [None]:
from os import PathLike

def log_generations(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    prompts: list[str],
    labels: list[str],
    reward_fn,
):
    """Work-in-progress helper intended to compute entropy and reward statistics.

    As written it tokenizes `prompts` together with `labels`, runs `model` on the
    resulting input ids, and then tries to derive an average token entropy and
    three reward values. It has no `return` statement, so it returns None.

    NOTE(review): unfinished; see the inline notes before relying on it.
    """
    tokenized = tokenize_prompt_and_output(
        prompts, labels, tokenizer
    )
    # NOTE(review): get_response_log_probs reads the logits tensor from
    # `model(...).logits`; here the raw call result is passed on instead, which
    # presumably is not a tensor for a Hugging Face model. Confirm.
    logits = model(tokenized["input_ids"])
    # Mean entropy over every position; the response mask is not applied here.
    average_token_entropy = torch.mean(compute_entropy(logits)).item()
    # NOTE(review): evaluate_vllm calls reward functions as
    # reward_fn(generated_text, solution) and stores the result as a dict; this
    # call passes logits and unpacks three values. Verify the intended contract.
    reward_format, reward_answer, reward_total = reward_fn(logits)

    # Leftover reference snippet (the return dict of tokenize_prompt_and_output):
    # result = {
    #     "input_ids": torch.tensor(tokens_padded)[:, :-1],
    #     "labels": torch.tensor(tokens_padded)[:, 1:],
    #     "response_mask": torch.tensor(masks_padded)[:, 1:],
    # }



In [None]:
# Scratch check: `.item()` on a one-element tensor yields a plain Python number.
torch.tensor([1.0]).item()

### 4.3 SFT Experiment

In [None]:
from unittest.mock import patch
from vllm.model_executor import set_random_seed as vllm_set_random_seed

def init_vllm(model_id: str, device: str, seed: int, gpu_memory_utilization: float=0.85):
    """
    Seed vLLM and build an `LLM` engine for `model_id` on `device`, so that
    inference can run on a GPU separate from the policy.

    While the engine is constructed, two things are patched:
      * `torch.distributed.get_world_size` returns 1;
      * the worker's `_assert_memory_footprint_increased_during_profiling`
        check returns None.
    NOTE(review): presumably both are needed to start vLLM next to a training
    process; confirm against the vLLM version in use.

    Returns the `LLM` instance, created with bfloat16 dtype, prefix caching
    enabled, and the given `gpu_memory_utilization`.
    """
    vllm_set_random_seed(seed)

    engine_kwargs = dict(
        model=model_id,
        device=device,
        dtype=torch.bfloat16,
        enable_prefix_caching=True,
        gpu_memory_utilization=gpu_memory_utilization,
    )

    with patch("torch.distributed.get_world_size", return_value=1):
        with patch(
            "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
            return_value=None,
        ):
            return LLM(**engine_kwargs)

def load_policy_into_vllm_instance(policy: PreTrainedModel, llm: LLM):
    """Copy the policy's current weights into the model held by a vLLM engine.

    The policy's `state_dict()` items are handed to `load_weights` on the model
    object reached through the engine's executor, driver worker and model runner.
    Returns None.
    """
    named_weights = policy.state_dict().items()
    engine_model = llm.llm_engine.model_executor.driver_worker.model_runner.model
    engine_model.load_weights(named_weights)



In [None]:
# Scratch check of mask_normalize: random 2x2 integers in [0, 10), a mask that
# keeps the anti-diagonal, and an arbitrary normalization constant.
x = torch.randint(low=0, high=10, size=(2, 2))
msk = torch.tensor([[False, True], [True, False]])
nc = 1.08

mask_normalize(x, msk, nc)

In [None]:
# Scratch: display the mask defined in the cell above.
msk

In [None]:
# Scratch: display the random tensor the mask is applied to.
x

In [None]:
# Scratch: element-wise product; entries where the mask is False become 0.
torch.mul(x, msk)

In [None]:
# Scratch: dim=None reduces over every element and yields a scalar tensor.
x.sum(dim=None)

In [None]:
# Scratch: boolean indexing returns the kept entries as a flat 1-D tensor.
x[msk]

In [None]:
# Scratch: reduce only the last dimension, giving one sum per row.
x.sum(dim=-1)