In [7]:
import torch
import torch.nn as nn
from trl import RLOOConfig, RLOOTrainer, apply_chat_template
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the Hugging Face access token from the environment; os.getenv returns
# None when HF_TOKEN is not defined.
secret_value_0 = os.getenv('HF_TOKEN')

# Authenticate against the Hugging Face Hub with that token.
# NOTE(review): the recorded output of this cell reports that HF_TOKEN is
# already the active token, so this explicit login may be redundant — confirm
# before removing it.
from huggingface_hub import login
login(token=secret_value_0)

# Tokenizer of the Qwen2.5-0.5B-Instruct checkpoint; the rest of this cell uses
# it for chat templating, prompt tokenization, and as the trainer's
# processing_class.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Create a custom reward model that always returns a constant reward
class ConstantRewardModel(nn.Module):
    """Reward model stub that assigns the same reward (1.0) to every sample.

    The module has no parameters; it only inspects the batch dimension of its
    input to decide how many rewards to emit.
    """

    def __init__(self):
        super().__init__()

    def forward(self, outputs):
        """Return one constant reward per row of ``outputs``.

        Args:
            outputs: Tensor whose first dimension is the batch size.

        Returns:
            Tensor of shape ``(batch_size, 1)`` filled with 1.0, allocated on
            the same device as ``outputs``.
        """
        # Allocate on the input's device: a bare torch.ones(...) always lands on
        # the default (CPU) device, which breaks as soon as the caller combines
        # the rewards with tensors living on an accelerator.
        return torch.ones(outputs.size(0), 1, device=outputs.device)

# Build the constant-reward stand-in that is handed to the trainer below.
reward_model = ConstantRewardModel()

# Attach `config` and `base_model_prefix` by hand, since the custom module does
# not define them itself.
# NOTE(review): the recorded run of this cell ends with
# "AttributeError: 'ConstantRewardModel' object has no attribute 'constant_reward_model'".
# Presumably the trainer looks up the attribute named by `base_model_prefix` on
# the reward model — confirm against the TRL source before relying on this stub.
reward_model.config = {"num_labels": 1}  # plain dict standing in for a model config
reward_model.base_model_prefix = "constant_reward_model"

# Two independent copies of the same checkpoint: the reference policy first,
# then the policy passed to the trainer as `policy`.
ref_policy = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
policy = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Prompt dataset: apply the chat template, then tokenize the "prompt" column
# and drop it from the result.
dataset = (
    load_dataset("trl-lib/ultrafeedback-prompt")
    .map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
    .map(lambda x: tokenizer(x["prompt"]), remove_columns="prompt")
)

# Trainer configuration; experiment reporting is switched off.
training_args = RLOOConfig(output_dir="Qwen2.5-0.5B-RL", report_to="none")

# Collect the trainer arguments in one place before constructing it.
trainer_kwargs = dict(
    config=training_args,
    processing_class=tokenizer,
    policy=policy,
    ref_policy=ref_policy,
    reward_model=reward_model,  # the custom constant reward model
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer = RLOOTrainer(**trainer_kwargs)
trainer.train()


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


===training policy===


AttributeError: 'ConstantRewardModel' object has no attribute 'constant_reward_model'

In [1]:
from trl import RLOOConfig, RLOOTrainer, apply_chat_template, PPOConfig, PPOTrainer
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import torch
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the Hugging Face access token from the environment; os.getenv returns
# None when HF_TOKEN is not defined.
secret_value_0 = os.getenv('HF_TOKEN')

# Authenticate against the Hugging Face Hub with that token.
# NOTE(review): the recorded output of this cell reports that HF_TOKEN is
# already the active token, so this explicit login may be redundant — confirm
# before removing it.
from huggingface_hub import login
login(token = secret_value_0)

# Tokenizer and reward model both come from the Qwen2.5-0.5B-Instruct
# checkpoint; the reward model is loaded as a sequence classifier with a single
# output label.
# NOTE(review): the recorded output warns that 'score.weight' is newly
# initialized for this checkpoint, so the rewards come from an untrained head
# until it is trained on a downstream task.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct", num_labels=1
)

def custom_reward(output_tensors):
    """Print the decoded text of every output and give each a reward of 1.

    Args:
        output_tensors: Iterable of token-id sequences; each one is decoded
            with the notebook-level ``tokenizer`` and printed.

    Returns:
        A tensor built from a list holding one ``1`` per element of
        ``output_tensors``.
    """
    n_outputs = 0
    for token_ids in output_tensors:
        # Decoding is done purely for inspection; it does not affect the reward.
        print(tokenizer.decode(token_ids))
        n_outputs += 1

    return torch.tensor([1] * n_outputs)

# Two independent copies of the same checkpoint: the reference policy first,
# then the policy passed to the trainer as `policy`.
ref_policy = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
policy = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Prompt dataset: apply the chat template, then tokenize the "prompt" column
# and drop it from the result.
dataset = (
    load_dataset("trl-lib/ultrafeedback-prompt")
    .map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
    .map(lambda x: tokenizer(x["prompt"]), remove_columns="prompt")
)

# Trainer configuration; experiment reporting is switched off.
training_args = RLOOConfig(output_dir="Qwen2.5-0.5B-RL", report_to="none")

# Collect the trainer arguments in one place before constructing it.
trainer_kwargs = dict(
    config=training_args,
    processing_class=tokenizer,
    policy=policy,
    ref_policy=ref_policy,
    reward_model=reward_model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer = RLOOTrainer(**trainer_kwargs)
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


===training policy===


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Step,Training Loss


KeyboardInterrupt: 

In [23]:
import torch

# Sanity check: run the reward model on the first tokenized training prompt.
first_input_ids = dataset["train"][0]["input_ids"]

# Decoded prompt text; not used further in this cell.
tyt = tokenizer.decode(first_input_ids)

# Wrap the ids in a batch of one and move them to the reward model's device.
inp = torch.tensor([first_input_ids]).to(reward_model.device)
res = reward_model(input_ids=inp)

# Last expression of the cell, so the logits are displayed as its output.
res.logits

tensor([[3.0505]], device='cuda:0', grad_fn=<IndexBackward0>)