# Fine tune the model with Reinforcement Learning

In [None]:
import os
from getpass import getpass

# Never commit a real token to the notebook. Reuse the token already present in
# the environment, and only prompt (without echoing) when it is missing.
if not os.environ.get("HUGGING_FACE_HUB_TOKEN"):
    os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass("Hugging Face API token: ")

In [None]:
!pip install datasets

In [None]:
from datasets import DatasetDict

In [None]:
# Directory that holds the DatasetDict previously saved to disk.
DATASET_DIR = 'Your dataset directory'

# Restore the saved splits.
loaded_dataset_dict = DatasetDict.load_from_disk(DATASET_DIR)



In [None]:
# Pull out the two splits this notebook works with ('validation' is not loaded).
train_dataset, test_dataset = (
    loaded_dataset_dict[split_name] for split_name in ('train', 'test')
)

In [None]:
# Display the test split summary (feature names and row count).
test_dataset

Dataset({
    features: ['Question', 'Answer'],
    num_rows: 183
})

In [None]:
!pip install torch --upgrade



In [None]:
!pip install "transformers==4.31.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" --upgrade


In [None]:
import time
import os
import torch
# from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# from datasets import Dataset, DatasetDict, load_dataset

# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Place the entire model on GPU 0.
device_map = {"": 0}
# device_map="auto"

# Hugging Face model id (gated repository).
model_id = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit NF4 quantization, float16 compute, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
)


In [None]:
# Load the tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token


Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA hyperparameters following the QLoRA paper: rank 64, alpha 16,
# 10% dropout, no bias training, causal language modelling task.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)


In [None]:
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler
from trl.core import respond_to_batch
from trl.core import LengthSampler


import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

# Prepare a function to pull out the number of model parameters

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``
            and a ``requires_grad`` flag.

    Returns:
        A multi-line string with the trainable count, the total count and the
        trainable percentage. Despite the name, nothing is printed here; the
        caller decides what to do with the text.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # A model without parameters would otherwise raise ZeroDivisionError.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return (
        f"\ntrainable model parameters: {trainable_model_params}"
        f"\nall model parameters: {all_model_params}"
        f"\npercentage of trainable model parameters: {percentage:.2f}%"
    )

Add the adapter to the original Llama-2-7b-chat model. In the previous section you added the fully trained adapter only for inference, so there was no need to pass the LoRA configuration. Now you need to pass it to the constructed PEFT model and also set is_trainable=True.

In [None]:

# Load the base model with the quantization settings from bnb_config and with
# the generation cache disabled.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map=device_map,
)
base_model.config.pretraining_tp = 1

# Prepare the quantized model for k-bit training.
base_model = prepare_model_for_kbit_training(base_model)

# Attach the previously fine-tuned adapter checkpoint and keep it trainable.
# NOTE(review): `lora_config` may not be a recognised keyword of
# PeftModel.from_pretrained (its config keyword appears to be `config`), in
# which case the adapter settings come from the checkpoint itself - confirm.
adapter_checkpoint = './model/checkpoint-4620'
peft_model = PeftModel.from_pretrained(
    base_model,
    adapter_checkpoint,
    lora_config=peft_config,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    is_trainable=True,
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
# Wrap the trainable PEFT model with the value head that PPO optimizes.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    peft_model,
    is_trainable=True,
    torch_dtype=torch.bfloat16,
)

# Report the parameter counts, then show the value head module itself.
ppo_parameter_report = print_number_of_trainable_model_parameters(ppo_model)
print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{ppo_parameter_report}\n')
print(ppo_model.v_head)



PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 33558529
all model parameters: 3533971457
percentage of trainable model parameters: 0.95%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=4096, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [None]:
# Reference copy of the policy, used by PPO for the KL penalty.
ref_model = create_reference_model(ppo_model)

# The report below is expected to show zero trainable parameters.
ref_parameter_report = print_number_of_trainable_model_parameters(ref_model)
print(f'Reference model parameters to be updated:\n{ref_parameter_report}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 3533971457
percentage of trainable model parameters: 0.00%



# 2.2 - Prepare Reward Model
Reinforcement Learning (RL) is a type of machine learning in which agents take actions in an environment with the aim of maximizing their cumulative reward. The agent's behavior is defined by the policy, and the goal of reinforcement learning is for the agent to learn an optimal, or nearly optimal, policy that maximizes the reward function. Having human labelers score outputs throughout the entire fine-tuning process can be expensive; a practical way to avoid that is to use a reward model. In our case, gpt-3.5-turbo is used as the reward model.



In [None]:
!pip install openai
# Installing the Reinforcement Learning library directly from github.
# %pip install git+https://github.com/lvwerra/trl.git@25fa1bd


In [None]:
import os
import openai

In [None]:

from getpass import getpass

# Never commit a real key to the notebook. Reuse OPENAI_API_KEY when it is
# already set, and only prompt (without echoing) when it is missing.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Expose the key both through the environment and on the openai module.
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the chat request.
        model: Name of the chat model to query.

    Returns:
        The "content" of the first choice in the response.
    """
    chat_response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # request the least random output
    )
    first_choice = chat_response.choices[0]
    return first_choice.message["content"]

In [None]:

def construct_prompt(predicted_output, target_output):
  """Build the grading prompt that is sent to the reward model.

  The prompt holds the rating rubric, the "constitution" the grader must
  follow, the model's prediction, and the reference answer. It ends with an
  instruction to reply with a single rating from 1 to 10.

  Args:
      predicted_output: Text generated by the policy model.
      target_output: Reference answer the prediction is compared against.

  Returns:
      The full prompt as one string.
  """
  # The original text carried stray literal quote characters (one of them
  # unbalanced) around every line; they are removed so the grader sees clean
  # instructions.
  # NOTE(review): rule 5 was cut off after "immigration advisor" in the
  # original; it is completed here as "would provide." - confirm the wording.
  instructions = (
      "Please rate the predicted output on the following scale:\n\n"
      "If the predicted output is completely accurate and detailed compared to "
      "the target output, assign a score between 8-10.\n\n"
      "If the predicted output is partially accurate but missing some key "
      "details, assign a score between 5-7.\n\n"
      "If the predicted output is mostly inaccurate or insufficient, assign a "
      "score between 1-4.\n\n"
      "Your rating should be based on factors like:\n\n"
      "- How accurate and relevant the predicted output is based on the target output\n\n"
      "- How completely the key information is covered compared to the target output\n\n"
      "- Whether factual errors or critical omissions are present\n\n"
      "Please make your decision based on the following constitution:\n"
      "1) The reward rating shall be as similar as possible to "
      "how an experienced immigration advisor would rate the response.\n"
      "2) The answer shall align with up-to-date immigration laws, regulations, "
      "and official procedures.\n"
      "3) The answer is supposed to be from a real-life immigration expert "
      "providing advice in a professional setting, not from unofficial sources.\n"
      "4) The answer shall be accurate, helpful, and empathetic.\n"
      "5) The answer shall be similar to what a real-life immigration advisor "
      "would provide.\n"
  )
  return (
      f"\n{instructions}\n"
      f"Predicted output:\n{predicted_output}\n\n"
      f"Target output:\n{target_output}\n\n"
      "Please output only scalar rating from 1 to 10 based on the above guidelines.\n"
  )

### Initialize `PPOTrainer`

For the `PPOTrainer` initialization, you will need a collator. Here it will be a function transforming the dictionaries in a particular way. You can define and test it:

Set up the configuration parameters. Load the ppo_model and the tokenizer. You will also load a frozen version of the model ref_model. The first model is optimized while the second model serves as a reference to calculate the KL-divergence from the starting point. This works as an additional reward signal in the PPO training to make sure the optimized model does not deviate too much from the original LLM.

In [None]:
# Optimizer step size used for PPO.
learning_rate = 1.41e-5

# PPO settings: eight (query, response, reward) triplets per optimization step.
ppo_config = PPOConfig(
    learning_rate=learning_rate,
    batch_size=8,
)

# The trainer updates ppo_model and uses ref_model as the reference policy.
ppo_trainer = PPOTrainer(
    ppo_config,
    model=ppo_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
)



# Fine-Tune the Model
The fine-tuning loop consists of the following main steps:

Get the query responses from the policy LLM (PEFT model).
Get the reward score from the reward model (GPT-3.5 in our case).
Optimize policy with PPO using the (query, response, reward) triplet.

In [None]:
# Split the loaded test set without shuffling: 55 rows are held out as 'test'
# and the remaining rows form the 'train' portion used for PPO.
dataset = test_dataset.train_test_split(
    shuffle=False,
    seed=42,
    test_size=55,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer'],
        num_rows: 128
    })
    test: Dataset({
        features: ['Question', 'Answer'],
        num_rows: 55
    })
})

In [None]:
import re

def remove_incomplete_last_sentence(text):
    """Drop a trailing sentence fragment from generated text.

    The text is split after every '.', '!' or '?' that is followed by
    whitespace. If the final piece does not itself end with one of those
    terminators it is treated as cut off and removed.

    Args:
        text: The text to trim.

    Returns:
        The remaining sentences joined by single spaces; an empty string when
        no complete sentence is left.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Accept the same terminators the split pattern uses. Checking only for
    # '.' would discard complete sentences that end with '?' or '!'.
    if sentences and not sentences[-1].endswith(('.', '!', '?')):
        sentences.pop()

    return ' '.join(sentences)

16

In [None]:
def _parse_reward_score(raw_reply, low=1.0, high=10.0):
    """Extract the numeric rating from the reward model's reply.

    The reply is free text, so the first number found is used and clamped to
    the rubric's range instead of calling float() on the whole reply.

    Raises:
        ValueError: If the reply contains no number at all.
    """
    match = re.search(r'\d+(?:\.\d+)?', raw_reply)
    if match is None:
        raise ValueError(f"Reward model reply contains no numeric rating: {raw_reply!r}")
    return min(max(float(match.group()), low), high)


# Generation settings shared by every rollout; max_new_tokens is added per sample.
# NOTE(review): `do_sample` is not set here, so whether top_k / top_p /
# temperature take effect depends on the model's own generation config - confirm.
generation_kwargs = {
    "top_k": 200,
    "top_p": 0.6,
    "temperature": 1.0,
}

# ppo_trainer.step() is fed one batch at a time, so take the size from the
# PPO config instead of repeating the number here.
batch_size = ppo_config.batch_size
train_split = dataset['train']
total_samples = len(train_split)

# Only full batches are trained on; a trailing partial batch is skipped.
total_batches = total_samples // batch_size
leftover_samples = total_samples - total_batches * batch_size
if leftover_samples:
    print(f"Skipping the last {leftover_samples} samples: they do not fill a batch of {batch_size}.")

for batch_index in range(total_batches):
    print(batch_index)
    start_index = batch_index * batch_size
    batch_data = train_split[start_index:start_index + batch_size]

    query_tensors = []
    prediction_tensors = []
    reward_tensors = []
    for question, target_output in zip(batch_data["Question"], batch_data["Answer"]):
        # 1. Rollout - generate a response for the query.
        query_txt = "[INST]" + question.strip() + "[/INST]"
        query_ids = tokenizer.encode(query_txt, return_tensors="pt").to(peft_model.device)[0]
        query_tensors.append(query_ids)

        prompt_length = len(query_ids)
        output_length_sampler = LengthSampler(prompt_length + 50, prompt_length + 100)
        max_new_tokens = output_length_sampler()

        # Pass max_new_tokens per call rather than mutating the shared dict.
        generated_ids = ppo_trainer.generate(
            query_ids, max_new_tokens=max_new_tokens, **generation_kwargs
        ).squeeze()

        # The generated sequence starts with the prompt, so the response is
        # everything after it. Slicing the last `max_new_tokens` tokens would
        # pull prompt tokens into the response whenever generation stops early.
        prediction_tensors.append(generated_ids[prompt_length:])

        prediction_output = tokenizer.decode(generated_ids, skip_special_tokens=True)
        prediction_output = prediction_output.split('[/INST]', 1)[-1]
        prediction_output = remove_incomplete_last_sentence(prediction_output)
        print('-' * 99)

        # 2. Evaluation - ask the reward model (GPT-3.5) to rate the response.
        prompt = construct_prompt(prediction_output.strip(), target_output.strip())
        reward_score = _parse_reward_score(get_completion(prompt))
        reward_tensors.append(torch.tensor(reward_score))

    # 3. Optimization - one PPO step on the (query, response, reward) triplets.
    stats = ppo_trainer.step(query_tensors, prediction_tensors, reward_tensors)
    print(stats['objective/kl'])



0
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


164.1136474609375
1
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
165.4672393798828
2
---------------------------------------------------------------------------------------------------
------------------------------------------------------------



128.29782104492188
8
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
150.2366943359375
9
---------------------------------------------------------------------------------------------------
-----------------------------------------------------------



150.23577880859375
10
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------
154.952880859375
11
---------------------------------------------------------------------------------------------------
----------------------------------------------------------

# Evaluate the Model Quantitatively (with ROUGE Metric)

In [None]:
import pandas as pd

# Evaluate on the held-out portion of the split. The PPO loop trains on
# dataset['train'], which was carved from test_dataset without shuffling, so
# test_dataset[0:10] presumably overlaps the rows used for training.
eval_rows = dataset['test'][0:10]
questions = eval_rows['Question']
human_baseline_answers = eval_rows['Answer']

# Same generation settings for both models so the comparison is like for like.
# NOTE(review): `do_sample` is not set, so whether these sampling parameters
# take effect depends on the model's own generation config - confirm.
eval_generation_kwargs = {"top_k": 200, "top_p": 0.6, "temperature": 1.0}

original_model_summaries = []
peft_model_summaries = []

for question in questions:
    prompt = f"[INST] {question.strip()} [/INST]"

    input_ids = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).input_ids.cuda()

    prompt_length = len(input_ids[0])
    output_length_sampler = LengthSampler(prompt_length + 50, prompt_length + 100)
    max_new_tokens = output_length_sampler()

    # ref_model is the policy before PPO; ppo_model is the policy after it.
    for model_under_test, collected_outputs in (
        (ref_model, original_model_summaries),
        (ppo_model, peft_model_summaries),
    ):
        output_ids = model_under_test.generate(
            input_ids=input_ids, max_new_tokens=max_new_tokens, **eval_generation_kwargs
        )
        decoded_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        collected_outputs.append(remove_incomplete_last_sentence(decoded_text))



In [None]:
# One row per evaluated question: reference answer, pre-PPO output, post-PPO output.
summary_columns = ['human_baseline_answers', 'original_model_summaries', 'peft_model_summaries']
zipped_summaries = list(
    zip(human_baseline_answers, original_model_summaries, peft_model_summaries)
)
df = pd.DataFrame(zipped_summaries, columns=summary_columns)

# Save the comparison table as CSV without the index column.
df.to_csv(
    '/content/drive/MyDrive/llama-2-chat--output_dir/llam2_prediction_result_RHLF.csv',
    index=False,
)

In [None]:
# Preview the first rows of the comparison table.
df.head()

Unnamed: 0,human_baseline_answers,original_model_summaries,peft_model_summaries
0,we have seen people wait 3 to 5 months. in the...,[INST] hola. i just have a quick question. how...,[INST] hola. i just have a quick question. how...
1,sometimes there is a second officer in the roo...,[INST] dear i filed my asylum case in june 201...,[INST] dear i filed my asylum case in june 201...
2,addressing the problem of inconsistent decisio...,[INST] what can be done to address the problem...,[INST] what can be done to address the problem...
3,the study found that refugees contributed an e...,[INST] what were the results of the study on t...,[INST] what were the results of the study on t...
4,answer the author does not seem to have a favo...,[INST] what is the authors opinion on clients ...,[INST] what is the authors opinion on clients ...


In [None]:
import locale

# Force the preferred encoding reported by `locale` to UTF-8.
# NOTE(review): presumably a workaround for shell commands failing with a
# "UTF-8 locale is required" error in hosted notebooks - confirm.
# The replacement keeps the optional `do_setlocale` argument of the real
# function, so callers using `locale.getpreferredencoding(False)` keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
!pip install evaluate==0.4.0 rouge_score==0.1.2

Collecting evaluate==0.4.0
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score==0.1.2
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting responses<0.19 (from evaluate==0.4.0)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=91582dbfd3bde2bcb6ebafea80cf2d04ff5eebeef04f450a22266bb43bbb57e4
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0 rouge_score-0.1.2


In [None]:
import evaluate


In [None]:
# Keep only the generated answer, i.e. the text after the first '[/INST]'.
# Using `[-1]` on a single split makes the cell safe to re-run: text that has
# already been stripped (no marker left) is returned unchanged instead of
# raising IndexError.
original_model_summaries = [text.split('[/INST]', 1)[-1] for text in original_model_summaries]
peft_model_summaries = [text.split('[/INST]', 1)[-1] for text in peft_model_summaries]

In [None]:
rouge = evaluate.load('rouge')

# Options shared by both ROUGE computations.
rouge_options = dict(use_aggregator=True, use_stemmer=True)

# Score each model's answers against the same human reference answers.
original_model_results, peft_model_results = (
    rouge.compute(
        predictions=model_answers,
        references=human_baseline_answers[0:len(model_answers)],
        **rouge_options,
    )
    for model_answers in (original_model_summaries, peft_model_summaries)
)

for heading, scores in (
    ('baseline MODEL:', original_model_results),
    ('PEFT MODEL:', peft_model_results),
):
    print(heading)
    print(scores)

baseline MODEL:
{'rouge1': 0.2605104825604272, 'rouge2': 0.04816490579921047, 'rougeL': 0.15414783989627007, 'rougeLsum': 0.1561260595553497}
PEFT MODEL:
{'rouge1': 0.2419125389812292, 'rouge2': 0.04722786499506178, 'rougeL': 0.15795727594252065, 'rougeLsum': 0.15825225434681628}
