# Reward Model Training

## Installing and importing necessry packages

In [1]:
%pip install -U pandas trl plotly -qqq transformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
argilla 1.9.0 requires pandas<2.0.0,>=1.0.0, but you have pandas 2.0.3 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import random
from operator import itemgetter

import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
)
from trl import RewardTrainer

## Data Creation

In [12]:
import pandas as pd
df = pd.read_csv('./../input/feedback.csv',encoding='latin-1')
df.head()

Unnamed: 0,question,answer,feedback
0,What is your name?,Answer : Amit is my name,2
1,What is your name?,Answer : My name is Amit,5
2,Do you know why turkeys became the official fo...,"To be honest, I donÕt know anything about that...",5
3,Do you know why turkeys became the official fo...,I know that the American Indians were the firs...,1
4,How do I float on my back in the water?Ê,ÊYou should tuck your arms to your sides and a...,5


In [13]:
df['tup'] = list(zip(df['answer'], df['feedback']))
df_g = df.groupby('question')['tup'].apply(list).reset_index()

In [14]:
df_g["sorted_tup"] = df_g["tup"].apply(lambda x :sorted(x,key=itemgetter(1)) )
df_g["chosen"] = df_g["sorted_tup"].apply(lambda x: x[-1][0])
df_g["chosen_score"] = df_g["sorted_tup"].apply(lambda x: x[-1][1])
df_g["rejected"] = df_g["sorted_tup"].apply(lambda x: x[0][0])
df_g["rejected_score"] = df_g["sorted_tup"].apply(lambda x: x[0][1])
df_g = df_g.dropna()
df_g = df_g[(df_g['chosen_score']>=4.0) & (df_g['rejected_score']<4.0)]
df_g.to_csv("./../output/feedback_comparison_dataset.csv")

In [15]:
# build a dataset with chosen and rejected responses
rows = []
for record in df_g.itertuples(index=True, name='Pandas'):
    if record is None or len(record) == 0:
        continue
    rows.append({
        "instruction": record.question,
        "chosen_response": record.chosen,
        "rejected_response": record.rejected
    })

# build dataset for training
prepared_dataset = Dataset.from_list(rows)
prepared_dataset.to_pandas()

Unnamed: 0,instruction,chosen_response,rejected_response
0,Do you know why turkeys became the official fo...,"To be honest, I donÕt know anything about that...",I know that the American Indians were the firs...
1,How do I float on my back in the water?Ê,ÊYou should tuck your arms to your sides and a...,ou want me to tell you the answer to a physics...
2,What is your name?,Answer : My name is Amit,Answer : Amit is my name


## Train the reward model with TRL

In [16]:
#To train Reward Model we need to choose a base model to fine-tune.
model_name = "distilroberta-base"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
#This function combines instructions with chosen and rejected responses, creating two new strings.
#These strings are tokenized, becoming input for a reward model that learns to distinguish between good and bad responses based on these examples.
#The model will be optimized to assign higher values to preferred responses and lower values to rejected responses.
def formatting_func(examples):
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}

    # Prepend the prompt and a line break to the original_response and response-1 fields.
    prompt_plus_chosen_response = examples["instruction"] + "\n" + examples["chosen_response"]
    prompt_plus_rejected_response = examples["instruction"] + "\n" + examples["rejected_response"]

    # Then tokenize these modified fields.
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)

    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }

formatted_dataset = prepared_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()

training_args = TrainingArguments(
    output_dir="./../output/reward_model",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    logging_steps=1,
    num_train_epochs = 10,
    report_to=None,

)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],

)

trainer.train()


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weigh

Map:   0%|          | 0/3 [00:00<?, ? examples/s]



  0%|          | 0/10 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 0.6833, 'learning_rate': 4.5e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6987985372543335, 'eval_accuracy': 0.0, 'eval_runtime': 0.5053, 'eval_samples_per_second': 1.979, 'eval_steps_per_second': 1.979, 'epoch': 1.0}
{'loss': 0.6636, 'learning_rate': 4e-05, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7014700174331665, 'eval_accuracy': 0.0, 'eval_runtime': 0.545, 'eval_samples_per_second': 1.835, 'eval_steps_per_second': 1.835, 'epoch': 2.0}
{'loss': 0.6073, 'learning_rate': 3.5e-05, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7044491171836853, 'eval_accuracy': 0.0, 'eval_runtime': 0.6424, 'eval_samples_per_second': 1.557, 'eval_steps_per_second': 1.557, 'epoch': 3.0}
{'loss': 0.6745, 'learning_rate': 3e-05, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7082501649856567, 'eval_accuracy': 0.0, 'eval_runtime': 0.5617, 'eval_samples_per_second': 1.78, 'eval_steps_per_second': 1.78, 'epoch': 4.0}
{'loss': 0.5675, 'learning_rate': 2.5e-05, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7113704681396484, 'eval_accuracy': 0.0, 'eval_runtime': 0.6031, 'eval_samples_per_second': 1.658, 'eval_steps_per_second': 1.658, 'epoch': 5.0}
{'loss': 0.6095, 'learning_rate': 2e-05, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7151515483856201, 'eval_accuracy': 0.0, 'eval_runtime': 0.5369, 'eval_samples_per_second': 1.863, 'eval_steps_per_second': 1.863, 'epoch': 6.0}
{'loss': 0.5406, 'learning_rate': 1.5e-05, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7190009355545044, 'eval_accuracy': 0.0, 'eval_runtime': 0.5575, 'eval_samples_per_second': 1.794, 'eval_steps_per_second': 1.794, 'epoch': 7.0}
{'loss': 0.5423, 'learning_rate': 1e-05, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7224991917610168, 'eval_accuracy': 0.0, 'eval_runtime': 0.5498, 'eval_samples_per_second': 1.819, 'eval_steps_per_second': 1.819, 'epoch': 8.0}
{'loss': 0.5068, 'learning_rate': 5e-06, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7252007722854614, 'eval_accuracy': 0.0, 'eval_runtime': 0.5493, 'eval_samples_per_second': 1.82, 'eval_steps_per_second': 1.82, 'epoch': 9.0}
{'loss': 0.5053, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7267659306526184, 'eval_accuracy': 0.0, 'eval_runtime': 0.5657, 'eval_samples_per_second': 1.768, 'eval_steps_per_second': 1.768, 'epoch': 10.0}
{'train_runtime': 52.0156, 'train_samples_per_second': 0.384, 'train_steps_per_second': 0.192, 'train_loss': 0.59006068110466, 'epoch': 10.0}


TrainOutput(global_step=10, training_loss=0.59006068110466, metrics={'train_runtime': 52.0156, 'train_samples_per_second': 0.384, 'train_steps_per_second': 0.192, 'train_loss': 0.59006068110466, 'epoch': 10.0})

In [17]:
trainer.save_model() 