In [None]:
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer, pipeline, logging
from datasets import load_dataset
from trl import SFTTrainer
import torch
import os

#### Training and Dataset Configurations

In [None]:
# --- Model checkpoint and output location ---
model_name = "openai-community/gpt2"
out_dir = "outputs/gpt_alpaca_preprocess_fn"

# --- Data loading ---
context_length = 256          # passed to the trainer as the maximum sequence length
batch_size = 16               # used for both the train and eval per-device batch size
num_workers = os.cpu_count()  # one dataloader worker per CPU reported by the OS

# --- Optimisation schedule ---
learning_rate = 1e-4
gradient_accumulation_steps = 2
max_steps = 3000

# --- Mixed precision (fp16 enabled, bf16 disabled) ---
fp16 = True
bf16 = False

# --- Logging / checkpoint cadence, in optimizer steps ---
logging_steps = 500
save_steps = 500

#### Loading the Alpaca Instruction Tuning Dataset

In [None]:
# Fetch the Alpaca dataset by its Hub identifier and show the resulting object.
alpaca_repo_id = "tatsu-lab/alpaca"
dataset = load_dataset(alpaca_repo_id)
print(dataset)

In [None]:
# Split the 'train' split into training and validation subsets.
# A fixed seed makes the shuffle (and therefore the split) reproducible
# across kernel restarts.
# NOTE(review): test_size=0.5 holds out half of the data for validation —
# confirm this fraction is intended.
full_dataset = dataset['train'].train_test_split(
    test_size=0.5,
    shuffle=True,
    seed=42,
)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

In [None]:
def preprocess_function(examples):
    """Render Alpaca record(s) into the instruction-tuning prompt layout.

    Args:
        examples: A mapping with 'instruction', 'input' and 'output' keys.
            Values may be single strings (one record) or lists/tuples of
            strings (a batch in column layout).

    Returns:
        A single prompt string for one record, or a list of prompt strings
        (one per row) when the values are lists/tuples.

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    def _render(instruction, model_input, output):
        # Sections are separated by a blank line; the response is appended last.
        return (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{model_input}\n\n"
            f"### Response:\n{output}"
        )

    columns = (examples['instruction'], examples['input'], examples['output'])
    if isinstance(columns[0], (list, tuple)):
        # Batched input: build one prompt per row.
        return [_render(*row) for row in zip(*columns)]
    return _render(*columns)

#### Initializing the GPT2 Base Model for Instruction Tuning

In [None]:
# Load the pretrained causal language model. The original bf16 branch
# referenced a misspelled class name and raised NameError when bf16 was True.
model = AutoModelForCausalLM.from_pretrained(model_name)
if bf16:
    # Cast the weights to bfloat16 only when bf16 training is requested.
    model = model.to(dtype=torch.bfloat16)

print(model)

In [None]:
# Collect (element count, requires_grad) once per parameter tensor, then
# report the overall total and the subset that will receive gradients.
param_stats = [(p.numel(), p.requires_grad) for p in model.parameters()]

total_params = sum(count for count, _ in param_stats)
print(f"{total_params:,} total parameters.")

total_trainable_params = sum(count for count, trainable in param_stats if trainable)
print(f"{total_trainable_params:,} training parameters.")

#### Initializing the Tokenizer

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    trust_remote_code=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

#### Training the GPT2 Model on the Alpaca Dataset

In [None]:
# Trainer hyper-parameters, all driven by the configuration cell.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=f"{out_dir}/logs",
    report_to='tensorboard',
    # Batch sizes and data loading.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    dataloader_num_workers=num_workers,
    # Optimisation.
    max_steps=max_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
    weight_decay=0.01,
    # Mixed precision flags.
    fp16=fp16,
    bf16=bf16,
    # Step-based evaluation, logging and checkpointing.
    evaluation_strategy='steps',
    logging_strategy='steps',
    logging_steps=logging_steps,
    save_strategy='steps',
    save_steps=save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
# Build the supervised fine-tuning trainer.
# The prompt formatter must be passed as `formatting_func`; the previous
# keyword (`formating_function`) is not an SFTTrainer parameter.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    packing=True,
)

In [None]:
# Sanity check: decode the first sequence of the first six training batches
# and print each one followed by a separator line.
dataloader = trainer.get_train_dataloader()

num_preview_batches = 6
# range() comes first in zip so iteration stops without pulling a seventh batch.
for _, batch in zip(range(num_preview_batches), dataloader):
    first_sequence = batch['input_ids'][0]
    print(tokenizer.decode(first_sequence))
    print('#' * 50)

In [None]:
# Run training with the trainer built above; the object returned by
# train() is kept in `history`.
history = trainer.train()

In [None]:
# Persist the model and the tokenizer into sibling sub-directories of out_dir.
for artifact, subdir in ((model, "model"), (tokenizer, "tokenizer")):
    artifact.save_pretrained(f"{out_dir}/{subdir}")

#### Inference

In [None]:
# Prefer the GPU for inference when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload from the directories written by the save cell
# (f"{out_dir}/model" and f"{out_dir}/tokenizer"). The previous hard-coded
# 'outputs/gpt2_alpaca_preprocess_fn/best_model/' path is never created by
# this notebook, so a fresh Restart & Run All failed here.
model = AutoModelForCausalLM.from_pretrained(f"{out_dir}/model")
tokenizer = AutoTokenizer.from_pretrained(f"{out_dir}/tokenizer")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Wrap the reloaded model and tokenizer in a text-generation pipeline on the
# selected device; generated sequences are capped at 256 tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=256,
)

In [None]:
# Inference prompt scaffold with three positional slots: instruction, input,
# response. Sections are separated by a blank line so the prompt matches the
# layout produced by preprocess_function for the training data; the previous
# template omitted those blank lines.
template = (
    "### Instruction:\n{}\n\n"
    "### Input:\n{}\n\n"
    "### Response:\n{}"
)

In [None]:
# Fill the template: an instruction, no additional input, and an empty
# response slot for the model to complete.
instructions, inputs, response = "Write three tips for staying healthy.", "", ""
prompt = template.format(instructions, inputs, response)

In [None]:
# Sampling settings forwarded to the pipeline call.
sampling_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)
outputs = pipe(prompt, **sampling_kwargs)

# Show the text of the first returned candidate.
first_candidate = outputs[0]
print(first_candidate['generated_text'])