# TRL

<a target="_blank" href="https://colab.research.google.com/github/life-efficient/Aligning-LLMs-using-RLHF/blob/main/RLHF/Notebook.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<br/>

> Aim: Show how to use TRL to align language models using RLHF

In [None]:
!pip install trl
!pip install torch

In [None]:
import torch

The cell below defines the supervised fine-tuning (SFT) model, its dataset, and a function that trains the model and saves its weights to `sft_model_params.pt`.

In [None]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
import requests


class SFTModel(GPT2LMHeadModel):
    """GPT-2 language model used for supervised fine-tuning (SFT).

    The model owns its tokenizer, so ``forward`` accepts raw strings rather
    than token ids.
    """

    def __init__(self):
        """Build the model from pretrained GPT-2 and move it to the GPU if available."""
        configuration = GPT2Config.from_pretrained(
            'gpt2', output_hidden_states=False)
        super().__init__(config=configuration)

        # Constructing a model from a config alone leaves its weights randomly
        # initialised. Fine-tuning has to start from the pretrained checkpoint,
        # so copy the pretrained GPT-2 weights into this instance.
        pretrained = GPT2LMHeadModel.from_pretrained(
            'gpt2', config=configuration)
        self.load_state_dict(pretrained.state_dict())
        del pretrained  # release the temporary copy

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Load the tokenizer

        # Move the model to the GPU when one is available
        self.to(torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu"))

    def forward(self, prompt, response):
        """Compute the language-modelling loss for one prompt/response pair.

        Args:
            prompt (str): The prompt text.
            response (str): The response text; it is concatenated directly
                onto the prompt.

        Returns:
            The output object of ``GPT2LMHeadModel.forward``. Because labels
            are supplied, it carries the loss in ``.loss``.
        """
        # Encode the data
        # NOTE(review): the stock GPT-2 vocabulary appears to define only
        # '<|endoftext|>' as a special token, so '<|startoftext|>' is
        # presumably split into ordinary sub-word tokens - confirm that this
        # is intended.
        entire_text = prompt + response
        context_dict = self.tokenizer(
            '<|startoftext|>' + entire_text + '<|endoftext|>')

        # A single string is tokenized, so these are flat (1-D) tensors with no
        # batch dimension. They are created directly on the model's device.
        input_ids = torch.tensor(context_dict.input_ids, device=self.device)
        attention_mask = torch.tensor(
            context_dict.attention_mask, device=self.device)

        # Run the model. The labels are the inputs themselves (causal LM
        # objective).
        return super().forward(
            input_ids=input_ids,
            labels=input_ids,
            attention_mask=attention_mask,
        )


class SFTDataset(torch.utils.data.Dataset):
    """Supervised Fine-Tuning Dataset

    Downloads a JSON list of records, each with a "prompt" and a "response"
    key, and serves them as ``(prompt, response)`` string pairs.

    Returns:
        prompt: str
        response: str
    """

    DATA_URL = "https://raw.githubusercontent.com/life-efficient/Aligning-LLMs-using-RLHF/main/RLHF/sft_dataset.json"

    def __init__(self, url=DATA_URL, timeout=30):
        """Download and parse the dataset.

        Args:
            url (str): Location of the JSON dataset.
            timeout (float): Seconds to wait for the server before giving up.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        r = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        r.raise_for_status()
        self.data = json.loads(r.content)

    def __len__(self):
        """Defines the length of the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Defines how to get a sample from the dataset by indexing it.

        Returns:
            prompt: str
            response: str
        """
        record = self.data[idx]
        return record["prompt"], record["response"]


def train_and_save_SFT_model(epochs=10):
    """Fine-tune an ``SFTModel`` on ``SFTDataset`` and save its weights.

    The per-batch loss is logged to TensorBoard and the final state dict is
    written to "sft_model_params.pt" in the working directory.

    Args:
        epochs (int): Number of passes over the dataset.
    """

    # Create the model
    model = SFTModel()  # Load the model
    model.train()  # make the training mode explicit

    # Create the dataset and dataloader
    dataset = SFTDataset()
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # Create the optimizer
    # as used in the InstructGPT paper
    optimizer = torch.optim.Adam(
        model.parameters(), lr=1e-5, betas=(0.9, 0.95))

    # for setting the x-axis of our TensorBoard plots (loss vs. batch index)
    batch_idx = 0

    # The context manager closes the writer (flushing pending events) even if
    # training raises.
    with SummaryWriter() as writer:  # for logging our loss to TensorBoard
        # Train the model
        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}")
            for prompt, response in tqdm(dataloader):
                # The dataloader wraps each string in a length-1 batch; unwrap
                # it because the model takes plain strings.
                outputs = model(prompt[0], response[0])
                loss = outputs.loss

                # Backward pass
                loss.backward()
                optimizer.step()

                # Zero the gradients
                optimizer.zero_grad()

                # Log the loss
                writer.add_scalar("SFT Model Loss/train",
                                  loss.item(), batch_idx)
                batch_idx += 1

    torch.save(model.state_dict(), "sft_model_params.pt")


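The cell below defines the reward model, its dataset of ranked response pairs, and a function that trains the model and saves its weights to `reward_model.pt`.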

In [None]:
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import GPT2Model, GPT2Tokenizer
import torch
import pandas as pd


def loss_function(preferred_response_reward, alternate_response_reward):
    """Pairwise ranking loss for the reward model.

    Computes ``-mean(log(sigmoid(preferred - alternate)))``, which is small
    when the preferred response is scored higher than the alternate one.

    Args:
        preferred_response_reward (torch.Tensor): Reward of the preferred response.
        alternate_response_reward (torch.Tensor): Reward of the other response.

    Returns:
        torch.Tensor: Scalar loss.
    """
    margin = preferred_response_reward - alternate_response_reward
    # logsigmoid is the numerically stable form of log(sigmoid(x)). The naive
    # composition underflows to log(0) = -inf for large negative margins.
    return -torch.mean(torch.nn.functional.logsigmoid(margin))


def create_response_pairs():
    """Download the ranked-response CSV and expand it into preference pairs.

    Every CSV row holds a prompt plus three responses ranked from most to
    least preferable. Each row contributes three tuples, one per
    (better, worse) combination of its responses.

    Returns:
        list[tuple]: ``(prompt, preferred_response, alternate_response)`` tuples.
    """
    rows = pd.read_csv(
        'https://raw.githubusercontent.com/life-efficient/Aligning-LLMs-using-RLHF/main/RLHF/reward_dataset.csv',
        sep="|",
    ).to_dict(orient="records")

    # (better column, worse column), in the order the pairs are emitted per row
    ranked_columns = (
        ("Most preferable response", "Somewhat preferable response"),
        ("Most preferable response", "Least preferable response"),
        ("Somewhat preferable response", "Least preferable response"),
    )

    return [
        (row["Prompt"], row[better], row[worse])
        for row in rows
        for better, worse in ranked_columns
    ]


class RewardDataset(torch.utils.data.Dataset):
    """Dataset of ``(prompt, preferred_response, alternate_response)`` tuples."""

    def __init__(self):
        """Build the response pairs and report how many there are."""
        pairs = create_response_pairs()
        self.response_pairs = pairs
        print("Number of response pairs:", len(pairs))

    def __len__(self):
        """Number of response pairs held by the dataset."""
        return len(self.response_pairs)

    def __getitem__(self, idx):
        """Return the pair stored at ``idx``.

        Returns:
            tuple: ``(prompt, preferred_response, alternate_response)``.
        """
        # Unpacking checks that the stored entry really has three fields.
        prompt, preferred, alternate = self.response_pairs[idx]
        return prompt, preferred, alternate


class RewardModel(torch.nn.Module):
    """GPT-2 backbone with a linear head that scores a (context, response) pair."""

    def __init__(self):
        """Load pretrained GPT-2, attach the regression head and pick a device."""
        super().__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.backbone = GPT2Model.from_pretrained('gpt2')
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # Size the head from the backbone's config instead of hard-coding 768.
        self.regression_head = torch.nn.Linear(
            self.backbone.config.hidden_size, 1)
        self.to(self.device)

    def forward(self, context, response):
        """
        Returns the reward for this response, given the context.
        Args:
            context (str): The context. aka. the prompt.
            response (str): The response. aka. the response to the prompt.
        Returns:
            torch.Tensor: Tensor of shape (1, 1) holding the reward for
                generating this response given the context.
        """

        # NOTE(review): the stock GPT-2 vocabulary appears to define only
        # '<|endoftext|>' as a special token, so '<|startoftext|>' is
        # presumably split into ordinary sub-word tokens - confirm that this
        # is intended.
        entire_text = context + response
        encoded = self.tokenizer(
            '<|startoftext|>' + entire_text + '<|endoftext|>',
            return_tensors="pt",  # tensors with an explicit batch dim of 1
        )

        # Move to GPU
        input_ids = encoded.input_ids.to(self.device)
        attention_mask = encoded.attention_mask.to(self.device)

        # Forward pass
        gpt2_outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # last_hidden_state is (batch, sequence, hidden). Keep the batch
        # dimension and take the vector of the final token, which has attended
        # to the whole text.
        last_output_vector = gpt2_outputs.last_hidden_state[:, -1, :]

        return self.regression_head(last_output_vector)


def train_and_save_reward_model(epochs=10):
    """Train a ``RewardModel`` on ``RewardDataset`` and save its weights.

    For every response pair the model scores both responses and is optimised
    with the pairwise ``loss_function``. The per-batch loss is logged to
    TensorBoard and the final state dict is written to "reward_model.pt".

    Args:
        epochs (int): Number of passes over the dataset.
    """

    model = RewardModel()

    # Create the dataset and dataloader. Shuffling matters here because the
    # dataset stores the three pairs of each prompt back to back.
    dataset = RewardDataset()
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # Create the optimizer
    optimizer = torch.optim.Adam(
        model.parameters(), lr=1e-5, betas=(0.9, 0.95))  # as used in the InstructGPT paper

    # for setting the x-axis of our TensorBoard plots (loss vs. batch index)
    batch_idx = 0

    # The context manager closes the writer (flushing pending events) even if
    # training raises.
    with SummaryWriter() as writer:  # for logging our loss to TensorBoard
        # Train the model
        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}")
            for prompt, preferred_response, alternate_response in tqdm(dataloader):
                # The dataloader wraps each string in a length-1 batch; unwrap
                # it because the model takes plain strings.
                prompt = prompt[0]
                preferred_response_reward = model(
                    prompt, preferred_response[0])
                alternate_response_reward = model(
                    prompt, alternate_response[0])

                loss = loss_function(preferred_response_reward,
                                     alternate_response_reward)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                writer.add_scalar("Reward Model Loss/Train",
                                  loss.item(), batch_idx)
                batch_idx += 1

    torch.save(model.state_dict(), "reward_model.pt")


**TODO:** Move the cells above into separate Python files.

In [None]:
# Train each model for a single epoch. Each call saves its weights to the
# working directory ("reward_model.pt" and "sft_model_params.pt").
train_and_save_reward_model(epochs=1)
train_and_save_SFT_model(epochs=1)


Now that we have trained our reward model and our supervised fine-tuning model, we can load them:

In [None]:
reward_model = RewardModel()  # create model
# map_location keeps loading working when the weights were saved on a
# different device (e.g. trained on a GPU, loaded on a CPU-only machine).
reward_state_dict = torch.load(
    'reward_model.pt', map_location=reward_model.device)  # load model weights
reward_model.load_state_dict(reward_state_dict)  # set model weights

In [None]:
sft_model = SFTModel()  # create model
# map_location keeps loading working when the weights were saved on a
# different device (e.g. trained on a GPU, loaded on a CPU-only machine).
sft_state_dict = torch.load(
    'sft_model_params.pt', map_location=sft_model.device)  # load model weights
sft_model.load_state_dict(sft_state_dict)  # set model weights

Now we have all of the ingredients to train our final model.

In [1]:
import trl

# imports
import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

ModuleNotFoundError: No module named 'trl'

In [1]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'


NameError: name 'torch' is not defined

In [None]:

# get models
# Policy to optimise: the 'gpt2' checkpoint loaded through TRL's value-head wrapper.
# NOTE(review): this loads the stock 'gpt2' weights rather than the SFT weights
# trained earlier in the notebook - confirm that this is intended.
model = AutoModelForCausalLMWithValueHead.from_pretrained('gpt2')
# Reference model derived from the policy; presumably a frozen copy used by PPO
# to measure drift from the starting policy - confirm against the TRL docs.
model_ref = create_reference_model(model)

# Tokenizer for the same 'gpt2' checkpoint
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# initialize trainer
# Each PPO step below is fed exactly one query/response pair, so both the batch
# and the mini-batch hold a single sample. mini_batch_size is set explicitly so
# that batch_size stays divisible by it whatever the library default is.
ppo_config = PPOConfig(
    batch_size=1,
    mini_batch_size=1,
)


# create a ppo trainer
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

def train_and_save_RLHF_model(epochs=10):
    """Run PPO steps on a fixed query with a constant reward, then save the model.

    Uses the notebook-level ``model``, ``tokenizer`` and ``ppo_trainer``.
    The state dict is written to the file "RLHF_model.pt" and the model is
    also exported with ``save_pretrained`` to the directory "RLHF_model".

    Args:
        epochs (int): Number of PPO steps to run.
    """
    # run one PPO step per epoch
    for i in range(epochs):

        # encode a query
        query_txt = "This morning I went to the "
        query_tensor = tokenizer.encode(query_txt, return_tensors="pt")

        # get model response
        response_tensor = respond_to_batch(model, query_tensor)

        # define a reward for response
        # (this could be any reward such as human feedback or output from another model)
        reward = [torch.tensor(1.0)]

        # train model for one step with ppo
        train_stats = ppo_trainer.step(
            [query_tensor[0]], [response_tensor[0]], reward)

        # print training stats
        # The trainer is expected to report the policy loss under
        # "ppo/loss/policy"; fall back to the key used previously so that a
        # missing key prints None instead of raising KeyError.
        policy_loss = train_stats.get(
            "ppo/loss/policy", train_stats.get("policy_loss"))
        print(f'epoch {i} training loss: {policy_loss}')

    # save model
    # The raw state dict and the save_pretrained export need distinct paths:
    # save_pretrained writes into a directory, which cannot share its name
    # with the state-dict file.
    torch.save(ppo_trainer.model.state_dict(), 'RLHF_model.pt')
    ppo_trainer.model.save_pretrained('RLHF_model')