# TRL

<a target="_blank" href="https://colab.research.google.com/github/life-efficient/Aligning-LLMs-using-RLHF/blob/main/RLHF/Notebook.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<br/>

> Aims: Show how to use TRL to align language models using RLHF

In [None]:
!pip install trl
!pip install torch

In [None]:
import torch

In [None]:
from reward_model import train_and_save_reward_model
from SFT_model import train_and_save_SFT_model

# Run both training routines up front; the cells below load the weights
# from 'reward_model.pt' and 'sft_model_params.pt'
# (presumably written by these two calls - confirm in the helper modules).
train_and_save_reward_model()
train_and_save_SFT_model()

Now that we have our reward model and our supervised fine-tuning (SFT) model, we can load them:

In [None]:
# Location of the saved reward-model weights.
REWARD_WEIGHTS_PATH = 'reward_model.pt'

# Build a fresh reward model, read the saved state dict from disk,
# then copy those parameters into the model.
reward_model = RewardModel()
reward_state_dict = torch.load(REWARD_WEIGHTS_PATH)
reward_model.load_state_dict(reward_state_dict)

In [None]:
# Location of the saved supervised fine-tuning (SFT) weights.
SFT_WEIGHTS_PATH = 'sft_model_params.pt'

# Build a fresh SFT model, read the saved state dict from disk,
# then copy those parameters into the model.
sft_model = SFTModel()
sft_state_dict = torch.load(SFT_WEIGHTS_PATH)
sft_model.load_state_dict(sft_state_dict)

In [1]:
import trl

# imports
import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

ModuleNotFoundError: No module named 'trl'

In [None]:

# Checkpoint shared by the policy, its reference copy and the tokenizer.
BASE_CHECKPOINT = 'gpt2'

# Policy to be trained: a causal LM wrapped with a value head.
model = AutoModelForCausalLMWithValueHead.from_pretrained(BASE_CHECKPOINT)
# Reference model derived from the policy, handed to the PPO trainer below.
model_ref = create_reference_model(model)

# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

In [None]:
# initialize trainer
# Each PPO step below is fed exactly one query/response pair, so both the
# batch and the mini-batch hold a single sample. mini_batch_size is set
# explicitly because its default differs between TRL releases and
# batch_size must be divisible by it.
ppo_config = PPOConfig(
    batch_size=1,
    mini_batch_size=1,
)


# create a ppo trainer
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

def train_and_save_RLHF_model(epochs=10):
    """Run PPO on a single fixed prompt, then save the trained model.

    Every epoch samples one response to the prompt from ``model``, gives it
    a constant reward of 1.0 and performs one ``ppo_trainer.step``. The
    function relies on the notebook-level ``tokenizer``, ``model`` and
    ``ppo_trainer`` objects.

    Args:
        epochs: Number of PPO steps to run.
    """
    # The prompt is the same every epoch, so encode it once up front.
    query_txt = "This morning I went to the "
    query_tensor = tokenizer.encode(query_txt, return_tensors="pt")

    for i in range(epochs):
        # get model response
        response_tensor = respond_to_batch(model, query_tensor)

        # define a reward for response
        # (this could be any reward such as human feedback or output from another model)
        reward = [torch.tensor(1.0)]

        # train model for one step with ppo
        train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

        # TRL reports the policy loss under "ppo/loss/policy"; fall back to
        # the key this notebook originally used in case a release differs.
        policy_loss = train_stats.get("ppo/loss/policy", train_stats.get("policy_loss"))
        print(f'epoch {i} training loss: {policy_loss}')

    # The raw state dict gets its own file name: save_pretrained needs
    # 'RLHF_model' to be a directory, so a file at that exact path would
    # collide with it.
    torch.save(ppo_trainer.model.state_dict(), 'RLHF_model.pt')
    ppo_trainer.model.save_pretrained('RLHF_model')