# Instruction tunning GPT-2 on Alpaca dataset

In [None]:
# Install dependencies
%pip install accelerate transformers datasets trl

In [9]:
import os
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from trl import SFTTrainer

## Training and dataset configuration

In [10]:
batch_size = 16
num_workers = os.cpu_count()
max_steps = 3000
bf16 = False
fp16 = False # Turn on if supported
fp16_full_eval=False # Turn on if supported
gradient_accumulation_steps = 2
context_length = 256
logging_steps = 500
save_steps = 500
learning_rate = 0.0001
model_name = 'gpt2'
out_dir = '../models/gpt2_alpaca_preprocess_fn'

## Loading Alpaca instructions dataset

In [4]:
dataset = load_dataset('tatsu-lab/alpaca')
dataset

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})

In [5]:
# print one sample from the text column
dataset['train']['text'][0]


'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'

In [6]:
# Split the dataset into training and validation sets
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 49401
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2601
})


## Preprocessing the dataset to be used for training GPT-2 model

We will use a pre-processing function to prepare the dataset for training the GPT-2 model. The function will concatenate the instruction, input and output columns and formats it into a string with headers 'Instruction', 'Input', and 'Response' for each corresponding value. It returns this structured string.

In [8]:
def preprocess_function(example):
    """
    This function formats the dictionary values into a single string with specific section headers and returns this string.
    The returned string is structured as follows:
    - Starts with "### Instruction:" followed by the instruction value from the dictionary.
    - Then "### Input:" followed by the input value from the dictionary.
    - Finally "### Response:" followed by the output value from the dictionary.
    Each section is separated by two newline characters for clear demarcation.
    """
    text = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    return text

In [9]:
# Example of the preprocess_function
preprocess_function(dataset_train[0])

'### Instruction:\nEdit the given text so that it uses formal language.\n\n### Input:\nhey everyone, we need to finish up the project before the weekend\n\n### Response:\nGreetings everyone, we need to complete the project before the weekend.'

## Initializing the GPT-2 model for instruction tunning 

In [14]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")

total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

124,439,808 total parameters.
124,439,808 training parameters.


## Initializing the tokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

## Training the GPT-2 model on the Alpaca dataset

In [23]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    evaluation_strategy='steps',
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    save_strategy='steps',
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    fp16_full_eval=fp16_full_eval,
    report_to='tensorboard',
    max_steps=max_steps,
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)



## Initializing the SFTTrainer
We initialize the SFTTrainer with the GPT-2 model, the tokenizer, the training dataset, and the evaluation dataset and the pre-processing function. We also specify `packing=True` which concatenates different samples of similar lengths into one batch to speed up training.

In [24]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    packing=True
)

Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


We can call the `get_train_dataloader` function of the trainer instance to visualize the concatenated samples.

In [25]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

s * (s-a) * (s-b) * (s-c)) ** 0.5
    return area 
  
# Driver Code 
# a, b, c are lengths of the sides of triangle
a = 4
b = 5
c = 6
print("Area of triangle is : %.2f" % area_triangle(a, b, c))<|endoftext|>### Instruction:
Generate a headline for an article discussing the importance of backup systems.

### Input:


### Response:
"Don't Wait Until it's Too Late: The Importance of Having a Backup System in Place"<|endoftext|>### Instruction:
Create a speech to inspire people to help the environment.

### Input:


### Response:
Good evening everyone. We are all here tonight to recognize the importance of taking care of the environment. We are all so fortunate to live on this beautiful planet and yet, many of us fail to recognize and preserve its beauty. Too often, we take for granted the clean air we breathe, the clean water we drink, and the abundant wildlife that populate our landscapes. It is our moral responsibility to take care of this planet and to ensure that our
#################

In [26]:
# Start training
history = trainer.train()

  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 2.0502, 'grad_norm': 0.8242108225822449, 'learning_rate': 0.0001, 'epoch': 0.9}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.874091386795044, 'eval_runtime': 120.1069, 'eval_samples_per_second': 7.627, 'eval_steps_per_second': 0.483, 'epoch': 0.9}
{'loss': 1.8613, 'grad_norm': 0.7886101603507996, 'learning_rate': 0.0001, 'epoch': 1.81}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.8360038995742798, 'eval_runtime': 121.6248, 'eval_samples_per_second': 7.531, 'eval_steps_per_second': 0.477, 'epoch': 1.81}
{'loss': 1.7669, 'grad_norm': 0.7994106411933899, 'learning_rate': 0.0001, 'epoch': 2.71}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.8226869106292725, 'eval_runtime': 117.9597, 'eval_samples_per_second': 7.765, 'eval_steps_per_second': 0.492, 'epoch': 2.71}
{'loss': 1.6949, 'grad_norm': 0.7560693621635437, 'learning_rate': 0.0001, 'epoch': 3.62}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.8207974433898926, 'eval_runtime': 118.9676, 'eval_samples_per_second': 7.7, 'eval_steps_per_second': 0.488, 'epoch': 3.62}
{'loss': 1.6329, 'grad_norm': 0.7565377950668335, 'learning_rate': 0.0001, 'epoch': 4.52}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.8262083530426025, 'eval_runtime': 120.393, 'eval_samples_per_second': 7.608, 'eval_steps_per_second': 0.482, 'epoch': 4.52}
{'loss': 1.5768, 'grad_norm': 0.7407647967338562, 'learning_rate': 0.0001, 'epoch': 5.42}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.8371939659118652, 'eval_runtime': 119.4518, 'eval_samples_per_second': 7.668, 'eval_steps_per_second': 0.486, 'epoch': 5.42}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 8208.0386, 'train_samples_per_second': 11.696, 'train_steps_per_second': 0.365, 'train_loss': 1.7638329671223958, 'epoch': 5.42}


In [31]:
import pandas as pd

history_dict = history._asdict()
history_df = pd.DataFrame(history_dict)
print(history_df)


                          global_step  training_loss       metrics
train_runtime                    3000       1.763833  8.208039e+03
train_samples_per_second         3000       1.763833  1.169600e+01
train_steps_per_second           3000       1.763833  3.650000e-01
total_flos                       3000       1.763833  1.253483e+16
train_loss                       3000       1.763833  1.763833e+00
epoch                            3000       1.763833  5.424955e+00


In [32]:
# Save the model and tokenizer
model.save_pretrained(f"{out_dir}/best_model")
tokenizer.save_pretrained(f"{out_dir}/best_model")

('../models/gpt2_alpaca_preprocess_fn/best_model/tokenizer_config.json',
 '../models/gpt2_alpaca_preprocess_fn/best_model/special_tokens_map.json',
 '../models/gpt2_alpaca_preprocess_fn/best_model/vocab.json',
 '../models/gpt2_alpaca_preprocess_fn/best_model/merges.txt',
 '../models/gpt2_alpaca_preprocess_fn/best_model/added_tokens.json')

## Testing the instruction tuned GPT-2 model

In [6]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)
import torch

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(f'{out_dir}/best_model/')
tokenizer = AutoTokenizer.from_pretrained(f'{out_dir}/best_model/')
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Create a pipeline to generate text
pipe = pipeline(
    task='text-generation', 
    model=model, 
    tokenizer=tokenizer, 
    max_length=256, # Prompt + new tokens to generate.
    device_map=device
)

In [46]:
# Define the prompt using the same format in which the model was trained
template = """### Instruction:
{}
### Input:
{}
### Response:
{}"""

# Write the prompt
instructions = 'Write a sentence using the given context'
inputs = 'dog, beach, sun'
response = ''
prompt = template.format(instructions, inputs, response)

# Generate text
outputs = pipe(
    prompt, 
    do_sample=True, 
    temperature=0.7, 
    top_k=50, 
    top_p=0.95,
    repetition_penalty=1.1,
)
print(outputs[0]['generated_text'])

### Instruction:
Write a sentence using the given context
### Input:
dog, beach, sun
### Response:
The sun shone down on the dog's face as it watched the sand below.


That is not bad! The model is able to generate the correct output for the given instruction.

## Conclusion

In this notebook, we have trained the GPT-2 model on the Alpaca dataset using the SFTTrainer. We have also tested the model on a sample instruction to check if it is able to generate the correct output.