# Datasets For Reinforcement Learning Training

### Loading and exploring the datasets

"Reinforcement Learning from Human Feedback" **(RLHF)** requires the following datasets:
- Preference dataset
  - Input prompt, candidate response 0, candidate response 1, choice (candidate 0 or 1)
- Prompt dataset
  - Input prompt only, no response

#### Preference dataset

In [None]:
preference_dataset_path = 'data/sample_preference.jsonl'

In [None]:
import json

In [None]:
preference_data = []

In [None]:
# Read the JSONL file: one JSON object per line, appended to preference_data.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(preference_dataset_path, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
        if line.strip():
            preference_data.append(json.loads(line))

- Print out to explore the preference dataset

In [None]:
sample_1 = preference_data[0]

In [None]:
print(type(sample_1))

In [None]:
# This dictionary has four keys
print(sample_1.keys())

- Key: 'input_text' is a prompt.

In [None]:
sample_1['input_text']

In [None]:
# Try another example from the list, and discover that every prompt ends the same way
preference_data[2]['input_text'][-50:]

- Print 'candidate_0' and 'candidate_1', these are the completions for the same prompt.

In [None]:
print(f"candidate_0:\n{sample_1.get('candidate_0')}\n")
print(f"candidate_1:\n{sample_1.get('candidate_1')}\n")

- Print 'choice'; this is the human labeler's preference between the two completions (candidate_0 and candidate_1)

In [None]:
print(f"choice: {sample_1.get('choice')}")

#### Prompt dataset

In [None]:
prompt_dataset_path = 'data/sample_prompt.jsonl'

In [None]:
prompt_data = []

In [None]:
# Read the JSONL file: one JSON object per line, appended to prompt_data.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(prompt_dataset_path, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
        if line.strip():
            prompt_data.append(json.loads(line))

In [None]:
# Check how many prompts there are in this dataset
len(prompt_data)

**Note**: It is important that the prompts in both datasets, the preference and the prompt, come from the same distribution. 

For this lesson, all the prompts come from the same dataset of [Reddit posts](https://github.com/openai/summarize-from-feedback).

In [None]:
def print_d(d):
    """Print every key/value pair of ``d`` in a readable layout.

    Each pair is written as a ``key:`` line and a ``val:`` line, followed by
    an empty line that separates it from the next pair.

    Args:
        d: Mapping to display; only its ``items()`` method is used.
    """
    for name, content in d.items():
        # Three newline-separated pieces plus print's own trailing newline
        # give the "key / val / blank line" layout.
        print(f"key:{name}", f"val:{content}", "", sep="\n")

In [None]:
print_d(prompt_data[0])

In [None]:
# Try with another prompt from the list 
print_d(prompt_data[1])

# Tune an LLM with RLHF

#### Project environment setup

The RLHF training process has been implemented in a machine learning pipeline as part of the Google Cloud Pipeline Components library. This can be run on any platform that supports KubeFlow Pipelines (an open source framework), and can also run on Google Cloud's Vertex AI Pipelines.

To run it locally, install the following:
```Python
!pip3 install google-cloud-pipeline-components
!pip3 install kfp
```

In [None]:
! gcloud auth login

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
PROJECT_ID = os.environ['PROJECT_ID']
REGION = os.environ['REGION']
print(f"PROJECT_ID: {PROJECT_ID}")
print(f"REGION: {REGION}")

In [None]:
# Authenticate in utils
from utils import authenticate
credentials, PROJECT_ID, STAGING_BUCKET = authenticate()

# RLHF pipeline is available in this region
REGION = "europe-west4"

### Download data

In [None]:
from google.cloud import storage
import os

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a single object from a bucket to a local file.

    A storage client is created from the notebook-level ``credentials`` and a
    confirmation line is printed after the download call returns.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name of the object inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    client = storage.Client(credentials=credentials)
    source = client.bucket(bucket_name).blob(source_blob_name)
    source.download_to_filename(destination_file_name)
    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")


def download_files_from_prefix(bucket_name, prefix, destination_dir):
    """Downloads all blobs with a prefix from the bucket.

    The part of each object name that follows ``prefix`` is used as the path
    relative to ``destination_dir``; missing local directories are created.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Object-name prefix used to filter the listing.
        destination_dir: Local directory that receives the downloaded files.
    """
    storage_client = storage.Client(credentials=credentials)
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix)

    for blob in blobs:
        # Listed names start with the prefix, so slice it off the front only;
        # str.replace would also delete later occurrences inside the name.
        # A leftover leading "/" (prefix given without a trailing slash) would
        # make os.path.join discard destination_dir, so strip it as well.
        relative_name = blob.name[len(prefix):].lstrip("/")

        # Skip "directory placeholder" objects: they have no file name to
        # write to and would make the download target a directory.
        if not relative_name or relative_name.endswith("/"):
            continue

        # Ensure the local directory structure mirrors the GCS structure
        local_path = os.path.join(destination_dir, relative_name)
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)

        # The listed blob is already bound to storage_client, so download it
        # directly instead of building a new client for every object.
        blob.download_to_filename(local_path)
        print(f"Blob {blob.name} downloaded to {local_path}.")


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a single local file to a bucket.

    A storage client is created from the notebook-level ``credentials`` and a
    confirmation line is printed after the upload call returns.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Local path of the file to upload.
        destination_blob_name: Object name to create inside the bucket.
    """
    client = storage.Client(credentials=credentials)
    target = client.bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

    
def upload_directory(bucket_name, source_dir, destination_blob_prefix):
    """Uploads a directory to the bucket.

    Walks ``source_dir`` recursively and uploads every file with
    ``upload_blob``. Each object name is ``destination_blob_prefix`` joined
    with the file's path relative to ``source_dir``.

    Args:
        bucket_name: Name of the destination bucket.
        source_dir: Local directory whose files are uploaded.
        destination_blob_prefix: Prefix prepended to every object name.
    """
    # Local import keeps this cell self-contained.
    import posixpath

    for local_root, _dirs, files in os.walk(source_dir):
        for filename in files:
            local_path = os.path.join(local_root, filename)

            # Path of the file relative to the directory being uploaded
            relative_path = os.path.relpath(local_path, source_dir)

            # Join the object name with "/" rather than the local OS
            # separator, so a Windows "\" never ends up in the blob name.
            blob_path = posixpath.join(
                destination_blob_prefix, *relative_path.split(os.sep)
            )

            # Upload the file
            upload_blob(bucket_name, local_path, blob_path)
    

# Download the sample datasets from the source bucket, then upload them to
# the staging bucket under the same relative paths.
bucket_name = "vertex-ai"
prefixes = [
    "generative-ai/rlhf/text_small/reddit_tfds/train/",
    "generative-ai/rlhf/text_small/reddit_tfds/val/",
    "generative-ai/rlhf/text_small/summarize_from_feedback_tfds/comparisons/train/",
]
# Local base directory; download_files_from_prefix creates it on demand
destination_dir = "data/rlhf/"
# One local destination per prefix: the first two use the last path component
# of the prefix ("train", "val"), the comparisons set keeps its nested path.
destinations = (
    [f"{destination_dir}{prefix.split('/')[-2]}" for prefix in prefixes[:-1]]
    + [f"{destination_dir}summarize_from_feedback_tfds/comparisons/train/"]
)

# Pair each prefix with its destination; a separate loop variable keeps the
# base path in destination_dir from being overwritten.
for prefix, destination in zip(prefixes, destinations):
    download_files_from_prefix(bucket_name, prefix, destination)

upload_bucket_name = STAGING_BUCKET.replace("gs://", "")  # Remove gs://
for dst in destinations:
    print(f"Uploading {dst} to {upload_bucket_name}")
    # The local path doubles as the object-name prefix in the staging bucket
    upload_directory(upload_bucket_name, dst, dst)
destinations

### Compile the pipeline

In [None]:
# Import (RLHF is currently in preview)
from google_cloud_pipeline_components.preview.llm \
import rlhf_pipeline

In [None]:
# Import from KubeFlow pipelines
from kfp import compiler

In [None]:
# Define a path to the yaml file
RLHF_PIPELINE_PKG_PATH = "rlhf_pipeline.yaml"

In [None]:
# Execute the compile function
compiler.Compiler().compile(
    pipeline_func=rlhf_pipeline,
    package_path=RLHF_PIPELINE_PKG_PATH
)

In [None]:
# Print the first lines of the YAML file
!head rlhf_pipeline.yaml

**Note**: to print the whole YAML file, use the following:
```Python
!cat rlhf_pipeline.yaml
```

## Define the Vertex AI pipeline job

### Define the location of the training and evaluation data
Previously, the datasets were loaded from small JSONL files, but for typical training jobs, the datasets are much larger, and are usually stored in cloud storage (in this case, Google Cloud Storage).

**Note:** Make sure that the three datasets are stored in the same Google Cloud Storage bucket.
```Python
parameter_values={
        "preference_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text_small/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
        "prompt_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/train/*.jsonl",
        "eval_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/val/*.jsonl",
    ...
```

### Choose the foundation model to be tuned

In this case, we are tuning the [Llama-2](https://ai.meta.com/llama/) foundational model. The pipeline parameter that specifies which LLM to tune is called **large_model_reference**.

In this course, we're tuning llama-2-7b, but you can also run an RLHF pipeline on Vertex AI to tune other models, such as T5x or text-bison@001.

```Python
parameter_values={
        "large_model_reference": "llama-2-7b",
        ...
```

### Calculate the number of reward model training steps

**reward_model_train_steps** is the number of steps to use when training the reward model.  This depends on the size of your preference dataset. We recommend the model should train over the preference dataset for 20-30 epochs for best results.

$$ stepsPerEpoch = \left\lceil \frac{datasetSize}{batchSize} \right\rceil$$
$$ trainSteps = stepsPerEpoch \times numEpochs$$

The RLHF pipeline parameters are asking for the number of training steps and not number of epochs. Here's an example of how to go from epochs to training steps, given that the batch size for this pipeline is fixed at 64 examples per batch.



In [None]:
# Preference dataset size
PREF_DATASET_SIZE = 3000

In [None]:
# Batch size is fixed at 64
BATCH_SIZE = 64

In [None]:
import math

In [None]:
REWARD_STEPS_PER_EPOCH = math.ceil(PREF_DATASET_SIZE / BATCH_SIZE)
print(REWARD_STEPS_PER_EPOCH)

In [None]:
REWARD_NUM_EPOCHS = 30

In [None]:
# Calculate number of steps in the reward model training
reward_model_train_steps = REWARD_STEPS_PER_EPOCH * REWARD_NUM_EPOCHS

In [None]:
print(reward_model_train_steps)

### Calculate the number of reinforcement learning training steps
The **reinforcement_learning_train_steps** parameter is the number of reinforcement learning steps to perform when tuning the base model. 
- The number of training steps depends on the size of your prompt dataset. Usually, this model should train over the prompt dataset for roughly 10-20 epochs.
- Reward hacking: if given too many training steps, the policy model may figure out a way to exploit the reward and exhibit undesired behavior.

In [None]:
# Prompt dataset size
PROMPT_DATASET_SIZE = 2000

In [None]:
# Batch size is fixed at 64
BATCH_SIZE = 64

In [None]:
import math

In [None]:
RL_STEPS_PER_EPOCH = math.ceil(PROMPT_DATASET_SIZE / BATCH_SIZE)
print(RL_STEPS_PER_EPOCH)

In [None]:
RL_NUM_EPOCHS = 10

In [None]:
# Calculate the number of steps in the RL training
reinforcement_learning_train_steps = RL_STEPS_PER_EPOCH * RL_NUM_EPOCHS

In [None]:
print(reinforcement_learning_train_steps)

### Define the instruction

- Choose the task-specific instruction that you want to use to tune the foundational model.  For this example, the instruction is "Summarize in less than 50 words."
- You can choose different instructions, for example, "Write a reply to the following question or comment." Note that you would also need to collect your preference dataset with the same instruction added to the prompt, so that both the responses and the human preferences are based on that instruction.

In [None]:
# Completed values for the dictionary
parameter_values = {
    # Dataset locations
    "preference_dataset": (
        "gs://rlhf-staging/data/rlhf/summarize_from_feedback_tfds/comparisons/train/*.jsonl"
    ),
    "prompt_dataset": "gs://rlhf-staging/data/rlhf/train/*.jsonl",
    "eval_dataset": "gs://rlhf-staging/data/rlhf/val/*.jsonl",
    # Foundation model to tune
    "large_model_reference": "llama-2-7b",
    # Training steps: results from the calculations above
    "reward_model_train_steps": 1410,
    "reinforcement_learning_train_steps": 320,
    "reward_model_learning_rate_multiplier": 1.0,
    "reinforcement_learning_rate_multiplier": 1.0,
    "kl_coeff": 0.1,  # increased to reduce reward hacking
    # Task-specific instruction added to the prompts
    "instruction": "Summarize in less than 50 words",
}

### Train with full dataset: dictionary 'parameter_values' 

- Adjust the settings for training with the full dataset to achieve optimal results in the evaluation (next lesson). Take a look at the new values; these results are from various training experiments in the pipeline, and the best parameter values are displayed here.

```python
parameter_values={
        "preference_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
        "prompt_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/reddit_tfds/train/*.jsonl",
        "eval_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/reddit_tfds/val/*.jsonl",
        "large_model_reference": "llama-2-7b",
        "reward_model_train_steps": 10000,
        "reinforcement_learning_train_steps": 10000, 
        "reward_model_learning_rate_multiplier": 1.0,
        "reinforcement_learning_rate_multiplier": 0.2,
        "kl_coeff": 0.1,
        "instruction":\
    "Summarize in less than 50 words"}
```

### Set up Google Cloud to run the Vertex AI pipeline

Vertex AI is already installed in this classroom environment.  If you were running this on your own project, you would install Vertex AI SDK like this:
```Python
!pip3 install google-cloud-aiplatform
```

## Run the pipeline job on Vertex AI

Now that we have created our dictionary of values, we can create a PipelineJob. This just means that the RLHF pipeline will execute on Vertex AI. So it's not running locally here in the notebook, but on some server on Google Cloud.

In [None]:
import google.cloud.aiplatform as aiplatform

In [None]:
aiplatform.init(project = PROJECT_ID,
                location = REGION,
                credentials = credentials)

In [None]:
# Look at the path for the YAML file
RLHF_PIPELINE_PKG_PATH

### Create and run the pipeline job
- Here is how you would create the pipeline job and run it if you were working on your own project.
- This job takes about a full day to run with multiple accelerators (TPUs/GPUs), and so we're not going to run it in this classroom.

- To create the pipeline job:

```Python
job = aiplatform.PipelineJob(
    display_name="tutorial-rlhf-tuning",
    pipeline_root=STAGING_BUCKET,
    template_path=RLHF_PIPELINE_PKG_PATH,
    parameter_values=parameter_values)
```
- To run the pipeline job:

```Python
job.run()
```

- The content team has run this RLHF training pipeline to tune the Llama-2 model, and in the next lesson, you'll get to evaluate the log data to compare the performance of the tuned model with the original foundational model.

In [None]:
# Create the pipeline job from the compiled RLHF pipeline definition.
job = aiplatform.PipelineJob(
    display_name="tutorial-rlhf-tuning",
    # Write pipeline artifacts to the staging bucket explicitly;
    # aiplatform.init() was called without a staging bucket.
    pipeline_root=STAGING_BUCKET,
    template_path=RLHF_PIPELINE_PKG_PATH,
    parameter_values=parameter_values,
    credentials=credentials,
)
job

In [None]:
job.submit(service_account=os.getenv("SERVICE_ACCOUNT"))

# Evaluate the Tuned Model

#### Project environment setup

- Install Tensorboard (if running locally)
```Python
!pip install tensorboard
```

### Explore results with Tensorboard

In [None]:
%load_ext tensorboard

In [None]:
port = %env PORT1
%tensorboard --logdir data/reward-logs --port $port --bind_all 

In [None]:
# Look at what this directory has
%ls data/reward-logs

In [None]:
port = %env PORT2
%tensorboard --logdir data/reinforcer-logs --port $port --bind_all

In [None]:
port = %env PORT3
%tensorboard --logdir data/reinforcer-fulldata-logs --port $port --bind_all

- The dictionary of 'parameter_values' defined in the previous lesson

In [None]:
# Pipeline parameters for the small ("text_small") sample datasets
parameter_values = {
    "preference_dataset": (
        "gs://vertex-ai/generative-ai/rlhf/text_small/"
        "summarize_from_feedback_tfds/comparisons/train/*.jsonl"
    ),
    "prompt_dataset": (
        "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/train/*.jsonl"
    ),
    "eval_dataset": (
        "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/val/*.jsonl"
    ),
    "large_model_reference": "llama-2-7b",
    "reward_model_train_steps": 1410,
    "reinforcement_learning_train_steps": 320,
    "reward_model_learning_rate_multiplier": 1.0,
    "reinforcement_learning_rate_multiplier": 1.0,
    "kl_coeff": 0.1,
    "instruction": "Summarize in less than 50 words",
}

**Note:** Here, we are using "text_small" for our datasets for learning purposes. However for the results that we're evaluating in this lesson, the team used the full dataset with the following hyperparameters:

```Python
parameter_values={
        "preference_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
        "prompt_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/reddit_tfds/train/*.jsonl",
        "eval_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/reddit_tfds/val/*.jsonl",
        "large_model_reference": "llama-2-7b",
        "reward_model_train_steps": 10000,
        "reinforcement_learning_train_steps": 10000, 
        "reward_model_learning_rate_multiplier": 1.0,
        "reinforcement_learning_rate_multiplier": 0.2,
        "kl_coeff": 0.1,
        "instruction":\
    "Summarize in less than 50 words"}
```

### Evaluate using the tuned and untuned model

In [None]:
import json

In [None]:
eval_tuned_path = 'data/eval_results_tuned.jsonl'

In [None]:
eval_data_tuned = []

In [None]:
# Read the JSONL file: one JSON object per line, appended to eval_data_tuned.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(eval_tuned_path, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
        if line.strip():
            eval_data_tuned.append(json.loads(line))

In [None]:
# Import for printing purposes
from utils import print_d

In [None]:
# Look at the result produced by the tuned model
print_d(eval_data_tuned[0])

In [None]:
eval_untuned_path = 'data/eval_results_untuned.jsonl'

In [None]:
eval_data_untuned = []

In [None]:
# Read the JSONL file: one JSON object per line, appended to eval_data_untuned.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(eval_untuned_path, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
        if line.strip():
            eval_data_untuned.append(json.loads(line))

In [None]:
# Look at the result produced by the untuned model
print_d(eval_data_untuned[0])

### Explore the results side by side in a dataframe

In [None]:
# Extract all the prompts
prompts = [sample['inputs']['inputs_pretokenized']
           for sample in eval_data_tuned]

In [None]:
# Completions from the untuned model
untuned_completions = [sample['prediction']
                       for sample in eval_data_untuned]

In [None]:
# Completions from the tuned model
tuned_completions = [sample['prediction']
                     for sample in eval_data_tuned]

- Now put everything together in one big dataframe

In [None]:
import pandas as pd

In [None]:
results = pd.DataFrame(
    data={'prompt': prompts,
          'base_model':untuned_completions,
          'tuned_model': tuned_completions})

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
# Print the results
results