# Week 8 Homework - Reward Model Evaluation

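This notebook compares the chosen and rejected summaries in the evaluation dataset using ROUGE and BERTScore, then scores both sets with our trained reward model and reports how often the model prefers the chosen summary.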

In [1]:
# Import required libraries
import json
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
W0915 17:10:10.964000 32560 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
# Load evaluation data
# Resolve eval_data.jsonl next to this file when executed as a script; a notebook
# kernel defines no __file__, so fall back to the current working directory.
# (The name must be passed unquoted: abspath('__file__') would resolve the literal
# string against the cwd instead of using the script's real location.)
script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
eval_file = os.path.join(script_dir, "eval_data.jsonl")

# Fail early with the resolved path so a wrong working directory is easy to diagnose
if not os.path.isfile(eval_file):
    raise FileNotFoundError(f"Evaluation data not found: {eval_file}")

# Load dataset (the records are requested under the "train" split name)
eval_dataset = load_dataset("json", data_files=eval_file, split="train")
print(f"Loaded {len(eval_dataset)} evaluation examples")

Generating train split: 10 examples [00:00, 1666.13 examples/s]

Loaded 10 evaluation examples





In [3]:
# Pull the two summary columns out of the evaluation dataset
chosen_summaries, rejected_summaries = eval_dataset["chosen"], eval_dataset["rejected"]

print(f"Number of chosen summaries: {len(chosen_summaries)}")
print(f"Number of rejected summaries: {len(rejected_summaries)}")

# Preview the first pair, clipped to the first 200 characters of each summary
for heading, summaries in (
    ("\nFirst chosen summary:", chosen_summaries),
    ("\nFirst rejected summary:", rejected_summaries),
):
    print(heading)
    print(summaries[0][:200] + "...")

Number of chosen summaries: 10
Number of rejected summaries: 10

First chosen summary:
### Summary

**Problem Addressed:**
The paper addresses the evolving role of pragmatics in the context of large language models (LLMs), particularly their communicative capabilities. It challenges the...

First rejected summary:
### Summary

**Problem Addressed:**
The paper addresses how the advent of large language models (LLMs) necessitates a reconsideration of pragmatic theory, particularly in terms of its traditional posi...


In [6]:
# Fetch both metrics through evaluate's load(), ROUGE first and then BERTScore
# (following professor's code)
rouge, bertscore = (load(metric_name) for metric_name in ("rouge", "bertscore"))

print("Loaded ROUGE and BERTScore metrics")

Downloading builder script: 7.95kB [00:00, ?B/s]


Loaded ROUGE and BERTScore metrics


In [7]:
# ROUGE between the two summary sets: chosen summaries act as predictions,
# rejected summaries as references
print("Computing ROUGE scores...")
results_rouge = rouge.compute(
    predictions=chosen_summaries,
    references=rejected_summaries,
)

print("ROUGE Results:")
# One line per metric returned, in the order the result dict provides them
for metric_name in results_rouge:
    print(f"{metric_name}: {results_rouge[metric_name]:.4f}")

Computing ROUGE scores...
ROUGE Results:
rouge1: 0.6258
rouge2: 0.3906
rougeL: 0.5007
rougeLsum: 0.5701


In [8]:
# BERTScore between the two summary sets: chosen summaries act as predictions,
# rejected summaries as references
print("Computing BERTScore...")
results_bertscore = bertscore.compute(
    predictions=chosen_summaries,
    references=rejected_summaries,
    lang="en",
)

print("BERTScore Results:")
# The result holds one value per example under each key; report the arithmetic mean
per_example = results_bertscore['precision']
print(f"Precision: {sum(per_example)/len(per_example):.4f}")
per_example = results_bertscore['recall']
print(f"Recall: {sum(per_example)/len(per_example):.4f}")
per_example = results_bertscore['f1']
print(f"F1: {sum(per_example)/len(per_example):.4f}")

Computing BERTScore...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Results:
Precision: 0.9203
Recall: 0.9205
F1: 0.9204


In [9]:
# Restore the trained reward model and its tokenizer from model_path
print("Loading trained reward model...")
model_path = "./reward_model"

# .eval() returns the module itself, so loading and switching to
# evaluation mode can be chained into one statement
reward_model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()
reward_tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Reward model loaded successfully")

Loading trained reward model...
Reward model loaded successfully


In [10]:
# Function to get reward scores
def get_reward_score(text):
    """Return the reward model's scalar score for one piece of text.

    Uses the module-level ``reward_tokenizer`` and ``reward_model``.

    Args:
        text: The text to score. It is tokenized with truncation at 512 tokens,
            so content beyond that limit does not influence the score.

    Returns:
        The model's single output logit as a Python float.

    Raises:
        An error from ``Tensor.item()`` if the logits contain more than one
        element (e.g. several texts were passed, or the model has several labels).
    """
    inputs = reward_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    # Keep the inputs on the same device as the model so scoring also works
    # when the model has been moved to an accelerator; a no-op on CPU.
    inputs = inputs.to(reward_model.device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = reward_model(**inputs)
        reward_score = outputs.logits.item()

    return reward_score

In [11]:
# Score every chosen/rejected pair with the reward model
print("Computing reward model scores...")

chosen_scores, rejected_scores = [], []
total_examples = len(chosen_summaries)

for idx, (chosen_text, rejected_text) in enumerate(zip(chosen_summaries, rejected_summaries)):
    # Progress indicator: report on every fifth example, starting with the first
    if idx % 5 == 0:
        print(f"Processing example {idx+1}/{total_examples}")

    # Chosen is scored before rejected; the two lists stay index-aligned
    chosen_scores.append(get_reward_score(chosen_text))
    rejected_scores.append(get_reward_score(rejected_text))

print("Reward scoring completed")

Computing reward model scores...
Processing example 1/10
Processing example 6/10
Reward scoring completed


In [12]:
# Analyze reward model results
# (numpy is imported as np in the imports cell at the top of the notebook)
chosen_mean = np.mean(chosen_scores)
rejected_mean = np.mean(rejected_scores)

print("Reward Model Results:")
print(f"Average score for chosen summaries: {chosen_mean:.4f}")
print(f"Average score for rejected summaries: {rejected_mean:.4f}")
print(f"Score difference (chosen - rejected): {chosen_mean - rejected_mean:.4f}")

# Count how many times chosen scored strictly higher than rejected (ties count as misses)
correct_preferences = sum(1 for c, r in zip(chosen_scores, rejected_scores) if c > r)
# With no scored examples the accuracy is undefined; report NaN instead of
# raising ZeroDivisionError
accuracy = correct_preferences / len(chosen_scores) if len(chosen_scores) > 0 else float("nan")
print(f"Reward model accuracy (chosen > rejected): {accuracy:.2%}")

Reward Model Results:
Average score for chosen summaries: 0.1990
Average score for rejected summaries: 0.1747
Score difference (chosen - rejected): 0.0243
Reward model accuracy (chosen > rejected): 60.00%


In [13]:
# Collect every headline number as (label, value) rows, then split the rows
# into the two columns of the summary table
summary_rows = [
    ("ROUGE-1", results_rouge["rouge1"]),
    ("ROUGE-2", results_rouge["rouge2"]),
    ("ROUGE-L", results_rouge["rougeL"]),
    # BERTScore values are per-example lists, so each is averaged here
    ("BERTScore-Precision", sum(results_bertscore['precision'])/len(results_bertscore['precision'])),
    ("BERTScore-Recall", sum(results_bertscore['recall'])/len(results_bertscore['recall'])),
    ("BERTScore-F1", sum(results_bertscore['f1'])/len(results_bertscore['f1'])),
    ("Reward-Chosen", chosen_mean),
    ("Reward-Rejected", rejected_mean),
    ("Reward-Accuracy", accuracy),
]

results_summary = {
    "Metric": [label for label, _ in summary_rows],
    "Score": [value for _, value in summary_rows],
}

results_df = pd.DataFrame(results_summary)
print("\nEvaluation Results Summary:")
print(results_df.to_string(index=False, float_format='%.4f'))


Evaluation Results Summary:
             Metric  Score
            ROUGE-1 0.6258
            ROUGE-2 0.3906
            ROUGE-L 0.5007
BERTScore-Precision 0.9203
   BERTScore-Recall 0.9205
       BERTScore-F1 0.9204
      Reward-Chosen 0.1990
    Reward-Rejected 0.1747
    Reward-Accuracy 0.6000


In [14]:
# Print up to three chosen/rejected pairs side by side with their reward scores
print("\nExample Comparisons:")
print("=" * 80)

examples_to_show = min(3, len(chosen_summaries))
idx = 0
while idx < examples_to_show:
    print(f"\nExample {idx+1}:")
    # Each summary is clipped to its first 200 characters for readability
    print(f"Chosen Summary (Score: {chosen_scores[idx]:.4f}):")
    print(chosen_summaries[idx][:200] + "...")
    print(f"\nRejected Summary (Score: {rejected_scores[idx]:.4f}):")
    print(rejected_summaries[idx][:200] + "...")
    print("-" * 80)
    idx += 1


Example Comparisons:

Example 1:
Chosen Summary (Score: 0.1656):
### Summary

**Problem Addressed:**
The paper addresses the evolving role of pragmatics in the context of large language models (LLMs), particularly their communicative capabilities. It challenges the...

Rejected Summary (Score: 0.0711):
### Summary

**Problem Addressed:**
The paper addresses how the advent of large language models (LLMs) necessitates a reconsideration of pragmatic theory, particularly in terms of its traditional posi...
--------------------------------------------------------------------------------

Example 2:
Chosen Summary (Score: 0.2702):
### Summary

**Problem Addressed:** Robust content moderation requires systems that can adapt to evolving policies without costly retraining.

**Core Method:** The paper introduces the **Contextual Po...

Rejected Summary (Score: 0.0722):
**Summary:**

**Problem Addressed:** Robust content moderation requires systems that can adapt to evolving policies without cos