# HW 3

(public notebook OK)

**Model:** GPT-2 pretrained weights, loaded through the API provided in NanoGPT's `model.py`.

In [None]:
%load_ext autoreload
%autoreload 2

import re
import numpy as np
import torch
import torch.nn.functional as F
import tiktoken
from tqdm import tqdm
import json
import random
import scipy.stats as sp
from model import GPT

# Device on which every tensor and model in this notebook is placed.
device = 'cpu'

# GPT-2 tokenizer obtained from tiktoken.
enc = tiktoken.get_encoding("gpt2")


def encode(s):
    """Return the GPT-2 token ids for string ``s``.

    The literal "<|endoftext|>" is permitted in ``s`` as a special token.
    """
    return enc.encode(s, allowed_special={"<|endoftext|>"})


def decode(l):
    """Return the string decoded from the list of GPT-2 token ids ``l``."""
    return enc.decode(l)

In [None]:
# Following the sample.py demo: load the pretrained 'gpt2' checkpoint with
# dropout overridden to 0.0 and move it onto `device` in one expression.
# (To experiment with Apple 'mps', change `device` where it is defined,
# guarded by torch.backends.mps.is_available().)
model = GPT.from_pretrained('gpt2', dict(dropout=0.0)).to(device)

## Q1 Alignment

## 1.1 Heuristic

I'd like my model to craft **longer sentences** (by word count). I feel like everyone is always asking chat bots to be more concise, so let's play the devil's advocate for a moment.

In [None]:
def avg_sent_len(text: str) -> float:
    """Return the mean number of words per sentence in ``text``.

    Sentences are the fragments obtained by splitting ``text`` on ``.``,
    ``!`` and ``?``; fragments that are empty or whitespace-only are ignored.
    Words are whitespace-separated tokens.

    Returns:
        The average word count as a float, or 0.0 when ``text`` contains no
        non-blank sentence (empty string, only punctuation, only whitespace).
        Averaging an empty list would otherwise give NaN plus a
        RuntimeWarning, and a NaN score used as a reward makes the loss NaN.
    """
    word_counts = [len(s.split()) for s in re.split(r'[.!?]', text) if s.strip()]
    if not word_counts:
        return 0.0
    return float(np.mean(word_counts))

## 1.2 Train reward model

My reward is a deterministic heuristic, `avg_sent_len`, which takes in text and returns a scalar (the average number of words per sentence).


## 1.3 Test on NanoGPT (and simultaneously generate training data)

In [None]:
# Prompt string that the 1.3 / 1.4 samples are generated from (a single newline).
start = "\n"
# Number of samples drawn per call to generate_and_score.
num_samples = 100
max_new_tokens_1_3 = 100  # subwords, not chars

# Sampling settings forwarded to model.generate.
temperature = 0.8 
# Passed to torch.manual_seed at the start of generate_and_score.
seed = 1337
top_k = 200

# When True, the 1.2 results file is not (re)written.
SKIP_TRAINING = False

In [None]:
def generate_and_score(start, num_samples, max_new_tokens, temperature=temperature, top_k=top_k):
    """Sample completions of ``start`` from the global ``model`` and score them.

    The torch RNG is re-seeded with the module-level ``seed`` on every call.
    Each of the ``num_samples`` generations is decoded back to text and
    scored with ``avg_sent_len``.

    Returns:
        A list of ``(text, score)`` pairs, keeping only pairs whose score is
        strictly greater than 0 (a NaN score also fails that comparison).
    """
    torch.manual_seed(seed)
    # Add a leading batch dimension of size 1 to the encoded prompt.
    prompt_ids = torch.tensor(encode(start), dtype=torch.long, device=device)[None, ...]

    scored = []
    with torch.no_grad():
        print("generating outputs")
        for _ in tqdm(range(num_samples)):
            generated = model.generate(
                prompt_ids,
                max_new_tokens,
                temperature=temperature,
                top_k=top_k,
            )
            sample_text = decode(generated[0].tolist())
            scored.append((sample_text, avg_sent_len(sample_text)))

    return [pair for pair in scored if pair[1] > 0]

# Pre-RL samples and their scores. The positive-score filter is applied once
# more on the returned pairs, exactly as the original two-statement form did.
output_scores1_2 = [
    pair
    for pair in generate_and_score(start, num_samples, max_new_tokens_1_3)
    if pair[1] > 0
]

In [None]:
def write_scores_json(output_scores, filename, sort=True):
    """Write scored samples and summary statistics to ``filename`` as JSON.

    Args:
        output_scores: iterable of ``(text, score)`` pairs.
        filename: path of the UTF-8 JSON file to (over)write.
        sort: when true, the pairs are ordered by descending score first.

    The written object has the keys ``'mean score'``, ``'median score'``,
    ``'std scores'`` and ``'outputs'`` (a list of ``{'score', 'text'}``
    entries in the chosen order).
    """
    ordered = (
        sorted(output_scores, key=lambda pair: pair[1], reverse=True)
        if sort
        else output_scores
    )
    score_values = [score for _text, score in ordered]

    report = {
        'mean score': float(np.mean(score_values)),
        'median score': float(np.median(score_values)),
        'std scores': float(np.std(score_values)),
        'outputs': [{'score': score, 'text': text} for text, score in ordered],
    }

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(report, ensure_ascii=False, indent=4))


# Persist the pre-RL samples (sorted by score) unless SKIP_TRAINING is set.
if not SKIP_TRAINING:
    write_scores_json(output_scores1_2, filename="hw3_1.2_output.json")

## 1.4  RLHF - vanilla policy gradient

This implements a sequence-level reward instead of a token-level reward.

In [None]:
steps = 25
max_new_tokens_rlhf = 50

# torch.mps.empty_cache()
rl_optim = torch.optim.AdamW(model.parameters())
# Every rollout starts from a one-token prompt holding token id 1.
# NOTE(review): the evaluation cells prompt with "\n" instead; confirm that
# the different training and evaluation prompts are intended.
start_tokens = torch.tensor([[1]], dtype=torch.long).to(device)


# One {'reward', 'loss'} record per optimisation step.
step_stats = []
for step in tqdm(range(steps)):
    # NOTE(review): assumes generate(..., return_log_probs=True) returns the
    # sampled ids together with log-probabilities that still carry gradients;
    # confirm against model.py.
    y, log_probs = model.generate(
        start_tokens,
        max_new_tokens=max_new_tokens_rlhf,
        return_log_probs=True
    )

    text = decode(y[0].tolist())

    reward_scalar = avg_sent_len(text)
    # Guard against a non-finite reward (e.g. NaN from averaging zero
    # sentences): it would make the loss NaN and the optimizer step would
    # then write NaN into every model weight. Treat it as zero reward.
    if not np.isfinite(reward_scalar):
        reward_scalar = 0.0
    reward_tensor = torch.tensor(reward_scalar, dtype=torch.float32).to(log_probs.device)
    # Sequence-level policy-gradient loss: the summed log-probability of the
    # whole sample, weighted by that sample's scalar reward.
    policy_loss = -(log_probs.sum() * reward_tensor)
    rl_optim.zero_grad()
    policy_loss.backward()
    rl_optim.step()

    step_stats.append({'reward': reward_scalar, 'loss': policy_loss.item()})
    

In [None]:
# Report the reward and loss recorded for each RL step.
for i, stat in enumerate(step_stats):
    reward, loss = stat['reward'], stat['loss']
    print(f"Step {i}: Reward = {reward}, Loss = {loss}")

Now look at the output generated after RL:

In [None]:
# Sample again from `model` after the RL updates, using the same prompt, sample
# count and token budget as the pre-RL run (generate_and_score re-seeds torch),
# then save the scored outputs.
output_scores1_4 = generate_and_score(start=start, num_samples=num_samples, max_new_tokens=max_new_tokens_1_3)
write_scores_json(output_scores1_4, filename="hw3_1.4_output.json")

In [None]:
# Is the change in scores statistically significant?

scores_1_2 = [pair[1] for pair in output_scores1_2]
scores_1_4 = [pair[1] for pair in output_scores1_4]


# Shapiro-Wilk normality check on the pre-RL scores.
stat, p = sp.shapiro(scores_1_2)
verdict = 'yes' if p > 0.05 else 'no'
print(f"Before RL:  p-value: {p}.   normal? {verdict}.")

# Same check on the post-RL scores.
stat, p = sp.shapiro(scores_1_4)
verdict = 'yes' if p > 0.05 else 'no'
print(f"After RL:  p-value: {p}.   normal? {verdict}.")


# One-sided Mann-Whitney U test: are post-RL scores greater than pre-RL ones?
stat, p = sp.mannwhitneyu(scores_1_4, scores_1_2, alternative='greater')
verdict = 'yes' if p < 0.05 else 'no'
print(f"Mann-Whitney: p-value: {p}.   significant? {verdict}.")

## Q2 - RLVR

## 2.1 verifier scoring function

I use a function very similar to `avg_sent_len` as defined above, but with a cap added. Let $S$ be the set of non-empty sentences (strings) obtained by splitting the output $y$ on the characters `.`, `!`, and `?`. Let $\text{numWords}(s)$ be the number of whitespace-separated words in a string $s$.
Let $R_{\text{max}}$ be a cap (50, the default `rmax` of `verifier` below, which every call in this notebook uses).

$v(y) = \min\left(\frac{1}{|S|} \sum_{s \in S} \text{numWords}(s),\ R_{\text{max}}\right)$

In [None]:
def verifier(y, rmax=50):
    """Return the capped average sentence length of the text ``y``.

    Args:
        y: generated text to score.
        rmax: upper bound applied to the score.

    Returns:
        ``min(avg_sent_len(y), rmax)``, or 0.0 when the underlying score is
        not finite. ``min`` would otherwise pass a NaN straight through, and
        one NaN reward makes the whole group's mean and std NaN.
    """
    score = avg_sent_len(y)
    if not np.isfinite(score):
        return 0.0
    return min(score, rmax)

## 2.2 Prompts and Baseline

Read in 100 prompts from `hw3_prompts.txt`, sample 10, and observe the outputs.

In [None]:
# A second copy of the pretrained 'gpt2' weights (dropout overridden to 0.0),
# loaded and moved onto `device` in one expression.
model2 = GPT.from_pretrained('gpt2', dict(dropout=0.0)).to(device)

In [None]:
# Load the prompt pool, one prompt per line. Surrounding whitespace is
# stripped and blank lines are dropped so an empty prompt can never be sampled.
with open('hw3_prompts.txt', 'r', encoding='utf-8') as f:
    prompts = [line.strip() for line in f if line.strip()]

# Draw 10 prompts without replacement for the baseline / evaluation set.
# NOTE: `random` is not seeded here, so the sample differs between runs.
prompt_samp = random.sample(prompts, 10)
print(len(prompt_samp))

In [None]:
max_new_tokens_2_2 = 200

# One {'prompt', 'output', 'score'} record per sampled prompt.
res = []
# Evaluation only: no gradients are needed, so generation runs under
# torch.no_grad(), as generate_and_score does for the same kind of sampling.
with torch.no_grad():
    for raw_prompt in tqdm(prompt_samp):
        prompt = raw_prompt.strip()
        # Add a leading batch dimension of size 1 to the encoded prompt.
        x = torch.tensor(encode(prompt), dtype=torch.long, device=device)[None, ...]
        y = model2.generate(x, max_new_tokens=max_new_tokens_2_2, temperature=temperature, top_k=top_k)
        text = decode(y[0].tolist())
        score = verifier(text)
        res.append({'prompt': prompt, 'output': text, 'score': score})

In [None]:
# Summarise the baseline verifier scores, then persist the full records.
res_scores = [entry['score'] for entry in res]
print("mean score: ", np.mean(res_scores))


with open('hw3_2.2_outputs.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(res, ensure_ascii=False, indent=4))

## 2.3 GRPO / RLVR

In [None]:
group_size = 5
steps = 5
max_new_tokens_rlvr = 100
temperature = 0.8 # same as before
top_k = 200


rl_optim2 = torch.optim.AdamW(model2.parameters())
# One summary record per optimisation step.
rlvr_stats = []

for step in range(steps):
    print("step ", step)
    rewards = []
    log_probs = []
    group_outputs = []

    # A single prompt is drawn per step; all samples in the group share it.
    prompt = random.choice(prompt_samp).strip()
    x = torch.tensor(encode(prompt), dtype=torch.long, device=device)[None, ...]

    print("generating outputs for promtpt: ", prompt)
    for _ in tqdm(range(group_size)):
        # NOTE(review): assumes return_log_probs=True yields log-probabilities
        # that still carry gradients; confirm against model.py.
        y, log_prob = model2.generate(
            x,
            max_new_tokens=max_new_tokens_rlvr,
            temperature=temperature,
            top_k=top_k,
            return_log_probs=True)
        log_probs.append(log_prob)

        text = decode(y[0].tolist())
        score = verifier(text)
        # One non-finite reward would make the group mean/std, hence every
        # advantage and the loss, NaN; the optimizer step would then write NaN
        # into the weights. Treat such a sample as zero reward.
        if not np.isfinite(score):
            score = 0.0
        rewards.append(score)
        group_outputs.append({"score": score, "output": text})

    rewards = np.array(rewards, dtype=float)
    # Group-relative advantage: standardise rewards within the group. The
    # floor on the std avoids division by zero when all rewards are equal.
    advantage = (rewards - rewards.mean()) / max(rewards.std(), .00001)
    advantage = torch.tensor(advantage, dtype=torch.float32, device=device)

    # Mean over the group of -(summed sample log-probability * its advantage).
    policy_loss = sum(
        -(sample_log_prob.sum() * sample_adv)
        for sample_log_prob, sample_adv in zip(log_probs, advantage)
    ) / group_size

    rl_optim2.zero_grad()
    policy_loss.backward()
    rl_optim2.step()

    rlvr_stats.append({
        'step': step,
        'mean_reward': float(rewards.mean()),
        'std_reward': float(rewards.std()),
        'loss / group_size': float(policy_loss.item()),
        'individual outputs': group_outputs
    })

In [None]:
# Persist the per-step RLVR statistics as pretty-printed UTF-8 JSON.
with open('hw3_2.3_outputs.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(rlvr_stats, ensure_ascii=False, indent=4))

In [None]:
# Per-step mean rewards, followed by their overall average.
mean_rewards = [entry['mean_reward'] for entry in rlvr_stats]
print("means: ", mean_rewards)
overall_mean = float(np.mean(mean_rewards))
print("mean of means: ", overall_mean)

Finally, test on previous sample.

In [None]:
# Recover the prompts used for the 2.2 baseline from its saved output file so
# the post-training evaluation runs on the same prompts.
with open('hw3_2.2_outputs.json', 'r', encoding='utf-8') as f:
    samp_promps = json.loads(f.read())

prompts = [record['prompt'] for record in samp_promps]
len(prompts)

In [None]:
max_new_tokens_2_3 = 200

# One {'prompt', 'output', 'score'} record per evaluation prompt.
res_2_3 = []
# Evaluation only: no gradients are needed, so generation runs under
# torch.no_grad(), as generate_and_score does for the same kind of sampling.
with torch.no_grad():
    for raw_prompt in tqdm(prompts):
        prompt = raw_prompt.strip()
        # Add a leading batch dimension of size 1 to the encoded prompt.
        x = torch.tensor(encode(prompt), dtype=torch.long, device=device)[None, ...]
        y = model2.generate(x, max_new_tokens=max_new_tokens_2_3, temperature=temperature, top_k=top_k)
        text = decode(y[0].tolist())
        score = verifier(text)
        res_2_3.append({'prompt': prompt, 'output': text, 'score': score})

In [None]:
# Show the post-training evaluation records (prompt, output, score) as the cell output.
res_2_3