## 3. Measuring Zero-Shot MATH Performance

In [None]:
from vllm import LLM, SamplingParams

# Instantiate a vLLM engine for the Qwen/Qwen2.5-Math-1.5B checkpoint.
llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")

In [None]:
# A handful of open-ended prompts used to smoke-test generation.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Sampling configuration: temperature 1.0 and top-p 1.0, at most 1024 new
# tokens, and generation stops at the first newline.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["\n"],
)

# One RequestOutput comes back per prompt; each carries the prompt, the
# generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

In [None]:
# Show every prompt next to its first sampled completion.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(f"Prompt: {request_output.prompt!r}, Generated text: {first_completion.text!r}")

In [None]:
from vllm import LLM, SamplingParams
from typing import Callable, List
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
import json
import pandas as pd
from os import PathLike

def get_prompts(prompt_template, problems):
    """Fill the ``{question}`` placeholder of ``prompt_template`` for each problem.

    Returns a list with one prompt string per entry of ``problems``, in the
    same order. Every occurrence of ``{question}`` in the template is replaced.
    """
    filled_prompts = []
    for problem in problems:
        filled_prompts.append(prompt_template.replace("{question}", problem))
    return filled_prompts

def evaluate_vllm(
    vllm_model: LLM,
    eval_sampling_params: SamplingParams,
    prompts: List[str],
    solutions: List[str],
    reward_fn: Callable[[str, str], dict[str, float]],
    output_file: str | PathLike | None = None
) -> tuple[list, list]:
    """
    Evaluatea languagemodelon a listof prompts,
    compute evaluation metrics, and serialize results to disk.
    """
    responses = vllm_model.generate(prompts, eval_sampling_params, use_tqdm=False)
    solutions_generated = [opt.outputs[0].text for opt in responses]

    evals = [reward_fn(sol_gen, sol) for sol_gen, sol in zip(solutions_generated, solutions)]

    # Serialize the prompts, solutions, solutions generated, and corresponding evals to disk
    if output_file:
        with open(output_file, 'w') as f:
            for prompt, solution, sol_gen, eval_dict in zip(prompts, solutions, solutions_generated, evals):
                result = {
                    "prompt": prompt,
                    "ground_truth": solution,
                    "generated": sol_gen,
                    "eval": eval_dict
                }
                f.write(json.dumps(result) + '\n')

    return evals, solutions_generated

In [None]:
# Reuse the engine built at the top of this section when it is still alive.
# Constructing a second LLM for the same checkpoint in one kernel asks the GPU
# for another full engine while the first is still resident, which typically
# fails or wastes memory.
if "llm" not in globals():
    llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")

# Single root for the assignment checkout so relocating it touches one line.
REPO_ROOT = "/home/azureuser/localfiles/cs336-assignment5-alignment-mine"

prompt_r1_zero_file = f"{REPO_ROOT}/cs336_alignment/prompts/r1_zero.prompt"
with open(prompt_r1_zero_file) as f:
    prompt_r1_zero = f.read()

# Validation problems, one JSON object per line.
df = pd.read_json(f"{REPO_ROOT}/data/validation.jsonl", lines=True)
# df.head()

In [None]:
# Create a sampling params object: stop at the first closing answer tag and
# keep that tag in the returned text.
sampling_params = SamplingParams(
    temperature=1.0, top_p=1.0, max_tokens=1024, stop=["</answer>"],
    include_stop_str_in_output=True,
)

# get_prompts returns one list of prompts (not a pair), so the reference
# answers are taken from the dataframe directly.
prompts = get_prompts(prompt_r1_zero, df.problem.tolist())
answers = df.answer.tolist()
evals, answers_generated = evaluate_vllm(
    llm, sampling_params, prompts, answers, r1_zero_reward_fn, output_file="eval_results.jsonl")

## 3.1 Using vLLM for offline language model inference
### math_baseline
1. Done
1. Commentary on model and reward func perf
    1. See cell below for distribution. 
    1. For cases with zero format reward: most occur because the model either failed to generate the answer tags or produced them in the wrong format. For cases with non-zero format reward but zero answer reward: roughly half are genuinely wrong answers and half are parser failures.
1. Fewer than 3% of responses receive both the format and answer rewards.

In [None]:
# math_baseline.2
df_eval = pd.DataFrame(evals)

# Build each mask once. Keeping the column lookups out of the f-strings also
# avoids nesting double quotes inside a double-quoted f-string, which is a
# SyntaxError on Python versions before 3.12.
format_ok = df_eval["format_reward"] == 1
format_bad = df_eval["format_reward"] == 0
answer_ok = df_eval["answer_reward"] == 1
answer_bad = df_eval["answer_reward"] == 0

# Count the rows in each (format_reward, answer_reward) combination.
print(f"Format reward 1, Answer reward 1: {(format_ok & answer_ok).sum()}")
print(f"Format reward 1, Answer reward 0: {(format_ok & answer_bad).sum()}")
print(f"Format reward 0, Answer reward 0: {(format_bad & answer_bad).sum()}")

In [None]:
# Number of problems per subject.
counts_by_category = df.groupby("subject")["problem"].count()
# Number of fully rewarded problems per subject. Subjects with no correct
# answer do not appear in the filtered groupby, so reindex against the full
# subject list and fill with 0; otherwise the division below yields NaN
# instead of 0% for them.
accurate_by_category = (
    df[df_eval.reward == 1]
    .groupby("subject")["problem"]
    .count()
    .reindex(counts_by_category.index, fill_value=0)
)
accuracy_percentage = (accurate_by_category / counts_by_category) * 100
accuracy_percentage

In [None]:
# Rows that earned the format reward but not the answer reward.
format_only = df_eval[(df_eval.format_reward == 1) & (df_eval.answer_reward == 0)]
# DataFrame.sample raises when asked for more rows than exist, so cap the
# request at the number of available rows.
sample_ids = format_only.sample(min(10, len(format_only))).index
# Generators, so each run of the next cell reveals one more example.
# Positional lookups replace rebuilding the full column list per element.
sample_problems = (df["problem"].iloc[i] for i in sample_ids)
sample_answers = (df["answer"].iloc[i] for i in sample_ids)
sample_answers_generated = (answers_generated[i] for i in sample_ids)

In [None]:
# Advance all three generators by one and print problem, reference answer and
# model output separated by blank lines.
print(
    next(sample_problems),
    "",
    next(sample_answers),
    "",
    next(sample_answers_generated),
    sep="\n",
)


## 4. Supervised Finetuning for MATH

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Tokenizer and model weights both come from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B")

# bfloat16 weights, FlashAttention-2 attention, automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Math-1.5B",
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

### 4.2 SFT helper methods

In [None]:
import torch
from transformers import AutoTokenizer, PreTrainedTokenizer
import torch.nn.functional as F
from sft_helper_methods import *
from os import PathLike

def log_generations(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    prompts: list[str],
    labels: list[str],
    reward_fn,
    log_file: str | PathLike,
):
    """Placeholder for logging model generations; currently does nothing.

    The body is only ``pass``: every argument is ignored, nothing is written
    to ``log_file``, and the function returns ``None``. The commented-out
    lines below are an unfinished draft kept for reference.
    """
    pass
    # TODO(review): unfinished draft. Before enabling it, note that
    # `tokens_padded` and `masks_padded` are not defined in this function, and
    # that `reward_fn` is invoked with (generated_text, solution) strings in
    # `evaluate_vllm`, whereas the draft passes logits.
    # tokenized = tokenize_prompt_and_output(
    #     prompts, labels, tokenizer
    # )
    # logits = model(tokenized["input_ids"]).logits
    # average_token_entropy = torch.mean(compute_entropy(logits)).item()
    # reward_format, reward_answer, reward_total = reward_fn(logits)

    # result = {
    #     "input_ids": torch.tensor(tokens_padded)[:, :-1],
    #     "labels": torch.tensor(tokens_padded)[:, 1:],
    #     "response_mask": torch.tensor(masks_padded)[:, 1:],
    # }



### 4.3 SFT Experiment

In [None]:
from dataclasses import dataclass
@dataclass
class TrainingConfig:
    model_id = "Qwen/Qwen2.5-Math-1.5B"
    prompt_r1_zero = "/home/azureuser/localfiles/cs336-assignment5-alignment-mine/cs336_alignment/prompts/r1_zero.prompt"
    file_train = "/home/azureuser/localfiles/cs336-assignment5-alignment-mine/data/sft.jsonl"
    file_eval = "/home/azureuser/localfiles/cs336-assignment5-alignment-mine/data/validation.jsonl"
    seed: int = 0
    dtype: str = "bfloat16"
    train_batch_size: int = 4
    eval_batch_size: int = 4
    train_steps: int = 64
    gradient_accumulation_steps: int = 2
    compile: bool = True
    eval_iterations: int = 1_000
    eval_interval: int = 16
    max_grad_norm: float | None = 1.0
    device_train: str = "cuda:0"
    device_eval: str = "cuda:1"
    lr: float = 1e-3
    warmup_ratio: float = 0.01
    weight_decay: float = 0.1
    adam_beta1: float = 0.9
    adam_beta2: float = 0.98
    adam_eps: float = 1e-9
    wandb_project: str | None = None
    wandb_entity: str | None = None
    log_interval: int = 20
    save_checkpoints: bool = False

cfg = TrainingConfig

In [None]:
import torch
from vllm import LLM
from unittest.mock import patch
from vllm.model_executor import set_random_seed as vllm_set_random_seed
from transformers import PreTrainedModel

def init_vllm(model_id: str, device: str, seed: int, gpu_memory_utilization: float=0.85):
    """
    Build the vLLM engine used for inference, held on a GPU separate from
    the policy.

    Seeds vLLM with ``seed``, then constructs the ``LLM`` in bfloat16 with
    prefix caching enabled on ``device``. During construction
    ``torch.distributed.get_world_size`` is patched to return 1 and vLLM's
    ``Worker._assert_memory_footprint_increased_during_profiling`` is patched
    to return ``None``.
    """
    vllm_set_random_seed(seed)

    # NOTE(review): presumably these patches let vLLM start on one device next
    # to a training process; confirm against the vLLM version in use.
    with patch("torch.distributed.get_world_size", return_value=1), patch(
        "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
        return_value=None,
    ):
        engine = LLM(
            model=model_id,
            device=device,
            dtype=torch.bfloat16,
            enable_prefix_caching=True,
            gpu_memory_utilization=gpu_memory_utilization,
        )
    return engine

def load_policy_into_vllm_instance(policy: PreTrainedModel, llm: LLM):
    """Push the policy's current weights into the model held by ``llm``.

    Takes ``policy.state_dict()`` and passes its ``(name, tensor)`` items to
    ``load_weights`` on the model found under the engine's driver worker.
    """
    named_weights = policy.state_dict().items()
    model_runner = llm.llm_engine.model_executor.driver_worker.model_runner
    model_runner.model.load_weights(named_weights)



In [None]:
# Evaluation engine on the eval device.
# NOTE(review): the seed is the literal 42 while cfg.seed is 0; confirm which is intended.
vllm_model = init_vllm(
    model_id=cfg.model_id,
    device=cfg.device_eval,
    seed=42,
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tokenizer and policy weights both come from cfg.model_id.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)

# bfloat16 weights with FlashAttention-2, placed on the training device.
model = AutoModelForCausalLM.from_pretrained(
    cfg.model_id,
    device_map=cfg.device_train,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

In [None]:
from pathlib import Path
import pandas as pd

data_dir = Path("/home/azureuser/localfiles/cs336-assignment5-alignment-mine/data/")

# Read the SFT examples and report their shape before and after removing
# exact duplicate rows.
sft_examples_raw = pd.read_json(data_dir / "sft.jsonl", lines=True)
print(sft_examples_raw.shape)

df_train = sft_examples_raw.drop_duplicates().reset_index(drop=True)
print(df_train.shape)

In [None]:
# AdamW with selective weight decay: tensors with two or more dimensions are
# decayed, while 1D parameters (e.g., biases and RMSNorms) are not.
# Parameters that do not require grad are left out entirely.
param_dict = dict(
    (name, param) for name, param in model.named_parameters() if param.requires_grad
)
params_to_decay = []
params_to_not_decay = []
for param in param_dict.values():
    bucket = params_to_decay if param.dim() >= 2 else params_to_not_decay
    bucket.append(param)

optim_groups = [
    dict(params=params_to_decay, weight_decay=cfg.weight_decay),
    dict(params=params_to_not_decay, weight_decay=0.0),
]

# fused=True requests the fused AdamW implementation.
optimizer = torch.optim.AdamW(
    optim_groups,
    betas=(cfg.adam_beta1, cfg.adam_beta2),
    eps=cfg.adam_eps,
    lr=cfg.lr,
    fused=True,
)

In [None]:
from sft_helper_methods import *

# `train_steps` counts microbatches, so both the run length and the evaluation
# interval must fall on an optimizer-step boundary.
assert cfg.train_steps % cfg.gradient_accumulation_steps == 0
assert cfg.eval_interval % cfg.gradient_accumulation_steps == 0

with open(cfg.prompt_r1_zero) as f:
    prompt_r1_zero = f.read()

df_eval = pd.read_json(cfg.file_eval, lines=True).iloc[:100]

# The evaluation subset never changes, so format its prompts once up front
# instead of on every evaluation.
eval_prompts = get_prompts(prompt_r1_zero, df_eval.problem.tolist())
eval_answers = df_eval.answer.tolist()

# Create a sampling params object: stop at the first closing answer tag and
# keep that tag in the returned text.
sampling_params = SamplingParams(
    temperature=1.0, top_p=1.0, max_tokens=1024, stop=["</answer>"],
    include_stop_str_in_output=True,
)

for step in range(cfg.train_steps):
    batch = df_train.sample(cfg.train_batch_size)
    tokenized_dict = tokenize_prompt_and_output(
        batch.prompt.tolist(),
        batch.response.tolist(),
        tokenizer,
    )
    input_ids = tokenized_dict["input_ids"].to(cfg.device_train)
    labels = tokenized_dict["labels"].to(cfg.device_train)
    # NOTE(review): a commented-out draft elsewhere in this notebook spells
    # this key "response_mask"; confirm which one tokenize_prompt_and_output
    # actually returns.
    response_masks = tokenized_dict["response_masks"].to(cfg.device_train)

    # The log-probs must stay attached to the model's autograd graph. Forcing
    # requires_grad_(True) on them is unnecessary when they are attached, and
    # when they are not it only creates a detached leaf, so backward() would
    # run without ever updating the model. That call is therefore removed.
    policy_log_probs = get_response_log_probs(
        model, input_ids, labels, return_token_entropy=False
    )["log_probs"]

    # loss.backward() is inside `sft_microbatch_train_step`
    loss, metadata = sft_microbatch_train_step(
        policy_log_probs, response_masks, cfg.gradient_accumulation_steps
    )
    # log loss and metadata...

    if (step + 1) % cfg.gradient_accumulation_steps == 0:
        # Apply the configured gradient-norm clip before the update.
        if cfg.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

    if (step + 1) % cfg.eval_interval == 0:
        load_policy_into_vllm_instance(model, vllm_model)

        # Evaluate with the engine that just received the policy weights
        # (`vllm_model`), not the separate baseline engine `llm`.
        evals, _ = evaluate_vllm(
            vllm_model, sampling_params, eval_prompts, eval_answers, r1_zero_reward_fn)

        accuracy = sum(eval_result["reward"] for eval_result in evals) / len(evals)
        print(f"Step {step}: accuracy {accuracy}")

In [None]:
# Display the loop variable left over from the training cell (the index of the last step that started).
step

In [None]:
# Reuse the baseline engine when it already exists in this kernel; building
# another LLM for the same checkpoint would request a second full engine on
# the GPU.
if "llm" not in globals():
    llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")

prompt_r1_zero_file = "/home/azureuser/localfiles/cs336-assignment5-alignment-mine/cs336_alignment/prompts/r1_zero.prompt"
with open(prompt_r1_zero_file) as f:
    prompt_r1_zero = f.read()

df = pd.read_json("/home/azureuser/localfiles/cs336-assignment5-alignment-mine/data/validation.jsonl", lines=True)
# df.head()
# Create a sampling params object: stop at the first closing answer tag and
# keep that tag in the returned text.
sampling_params = SamplingParams(
    temperature=1.0, top_p=1.0, max_tokens=1024, stop=["</answer>"],
    include_stop_str_in_output=True,
)

# `get_prompts` is the helper defined in this notebook and returns only the
# prompts; the reference answers come straight from the dataframe.
prompts = get_prompts(prompt_r1_zero, df.problem.tolist())
answers = df.answer.tolist()
# evaluate_vllm's positional order is (model, sampling params, prompts,
# solutions, reward_fn).
evals, answers_generated = evaluate_vllm(
    llm, sampling_params, prompts, answers, r1_zero_reward_fn)