# 🧠 Reward Modeling with TRL (GPT2 Example)

Converted from the original Hugging Face script for Jupyter Notebook use.

In [None]:
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License").
#     http://www.apache.org/licenses/LICENSE-2.0


### 🧩 1. Install dependencies

In [None]:
!pip install -q trl trackio kernels accelerate datasets transformers

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/423.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m423.1/423.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/872.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m872.5/872.5 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/40.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   

### 📚 2. Initialize

In [None]:
# Name of the base checkpoint for the reward model.
model_name = 'gpt2'
# Path where the reward model should be saved.
# This must be a non-empty path: an empty string is not a valid directory
# (os.makedirs('') raises FileNotFoundError), so a blank value would only
# surface as a failure when the model is written out after training.
output_dir = 'gpt2-reward-model'

### ⚙️ 2. Imports and logging setup

In [None]:
import os
import torch
from accelerate import logging
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, HfArgumentParser

from trl import (
    ModelConfig,
    RewardConfig,
    RewardTrainer,
    ScriptArguments,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)

# Module-level logger created through the `logging` helper imported from accelerate.
logger = logging.get_logger(__name__)
# Default the TRACKIO_SPACE_ID environment variable to "trl-trackio" without
# overriding a value that is already set. As the last expression of the cell,
# the resulting value is echoed as the cell output.
os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")

'trl-trackio'

### 📦 3. Define helper configs for notebook runs

In [None]:
from dataclasses import dataclass

# Dataset to train on; it exposes paired `chosen` / `rejected` text columns.
script_args = ScriptArguments(dataset_name="Anthropic/hh-rlhf")

# Training hyperparameters for the reward-model run.
training_args = RewardConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    gradient_checkpointing=True,
    # Passed at construction time instead of being patched onto the config
    # object afterwards, so the whole configuration lives in one place.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=1e-5,
    eval_strategy="steps",
    eval_steps=50,
    max_length=1024,
)

# Reuse the checkpoint name from the "Initialize" cell so that changing
# `model_name` there actually changes the model that gets loaded.
model_args = ModelConfig(
    model_name_or_path=model_name,
    # trust_remote_code=True,
)

### 🧠 4. Load model and tokenizer

In [None]:
# Resolve the requested dtype: "auto" / None are passed through unchanged,
# any other value is treated as the name of a torch attribute (e.g. "bfloat16").
dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
model_kwargs = dict(
    revision=model_args.model_revision,
    # The generation cache is switched off whenever gradient checkpointing is on.
    use_cache=not training_args.gradient_checkpointing,
    dtype=dtype,
)

# A device map and quantization config are only added for quantized loads.
quantization_config = get_quantization_config(model_args)
if quantization_config is not None:
    model_kwargs["device_map"] = get_kbit_device_map()
    model_kwargs["quantization_config"] = quantization_config

# num_labels=1 gives the classification head a single output, which the
# inference cell reads as the scalar reward.
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    num_labels=1,
    trust_remote_code=model_args.trust_remote_code,
    **model_kwargs,
)

# Warn when PEFT is enabled with a task type other than sequence classification.
if model_args.use_peft and model_args.lora_task_type != "SEQ_CLS":
    logger.warning(
        "⚠️ PEFT `task_type` differs from SEQ_CLS — this can cause silent bugs."
    )

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 📚 5. Load dataset

In [None]:
# Train on the first 10% of the training split to keep the notebook run short.
dataset = load_dataset(script_args.dataset_name, split="train[:10%]")

# Evaluate on held-out examples from the test split. Selecting rows from
# `dataset` itself would score the model on examples it was trained on and
# overstate the reported accuracy.
eval_source = load_dataset(script_args.dataset_name, split="test")
eval_dataset = eval_source.select(range(min(512, len(eval_source))))
print(dataset)

README.md: 0.00B [00:00, ?B/s]

harmless-base/train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

helpful-base/train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

helpful-online/train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

helpful-rejection-sampled/train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

harmless-base/test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

helpful-base/test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

helpful-online/test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

helpful-rejection-sampled/test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 16080
})


### 🚀 6. Train Reward Model

In [None]:
# Build the optional PEFT configuration from the model arguments up front,
# then hand everything to the trainer in a single call.
reward_peft_config = get_peft_config(model_args)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=reward_peft_config,
)

# Run the training loop; the returned value is shown as the cell output.
trainer.train()

### 📊 7. Evaluate and save model

In [None]:
# Write the trained reward model to the configured output directory.
trainer.save_model(training_args.output_dir)

# Final evaluation pass, skipped when evaluation is disabled in the config.
evaluation_enabled = training_args.eval_strategy != "no"
if evaluation_enabled:
    metrics = trainer.evaluate()
    # Log the metrics, then save them, under the "eval" prefix.
    for report in (trainer.log_metrics, trainer.save_metrics):
        report("eval", metrics)

# Optionally publish the result, tagging it with the dataset it was trained on.
if training_args.push_to_hub:
    trainer.push_to_hub(dataset_name=script_args.dataset_name)

***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.7227
  eval_loss               =     0.6538
  eval_margin             =     1.7722
  eval_max_reward         =     0.8689
  eval_mean_reward        =    -4.7103
  eval_min_reward         =    -10.481
  eval_num_tokens         =  5286003.0
  eval_runtime            = 0:00:07.38
  eval_samples_per_second =       69.3
  eval_steps_per_second   =      8.662


### ✅ 8. Quick inference sanity check

In [None]:
from transformers import AutoTokenizer

# Tokenize a single prompt and move the tensors to the model's device.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
prompt = "The assistant is polite and helpful."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Explicitly switch to eval mode: the final evaluation pass is skipped when
# eval_strategy is "no", and a model left in training mode keeps dropout
# active, which makes the score non-deterministic.
model.eval()
with torch.no_grad():
    # The model was built with a single label, so the logits hold one scalar.
    reward = model(**inputs).logits.item()

print(f"Reward score: {reward:.4f}")