In [None]:
! pip install datasets wandb trl

In [6]:
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer, pipeline, logging
from datasets import load_dataset
from trl import SFTTrainer
import torch
import wandb
import os

#### Training and Dataset Configurations

In [4]:
# --- Batching / data loading -------------------------------------------------
# Used as both the per-device train and eval batch size.
batch_size = 16
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to a single worker so the dataloader always receives an integer.
num_workers = os.cpu_count() or 1
gradient_accumulation_steps = 2

# --- Optimisation schedule ---------------------------------------------------
max_steps = 3000
learning_rate = 0.0001

# --- Mixed precision (enable at most one of the two flags) -------------------
bf16 = False
fp16 = True

# --- Sequence length, logging and checkpoint cadence -------------------------
# Passed to the trainer as the maximum sequence length.
context_length = 256
logging_steps = 500
save_steps = 500

# --- Model checkpoint and output location ------------------------------------
model_name = "openai-community/gpt2"
out_dir = "outputs/gpt_alpaca_preprocess_fn"

#### W&B logging configurations

In [None]:
# Authenticate this session with Weights & Biases before a run is created.
wandb.login()

In [None]:
# Static metadata attached to the W&B run.
wandb_config = {
    "architecture": "gpt2",
    "dataset": "tatsu-lab/alpaca",
}

# Open the W&B run that the training job reports to.
run = wandb.init(
    config=wandb_config,
    job_type="training",
    project='gpt2-instruct-tune-SFT',
)

#### Loading the Alpaca Instruction Tuning Dataset

In [9]:
# Load the Alpaca instruction-tuning dataset and print its splits and columns.
dataset = load_dataset("tatsu-lab/alpaca")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


In [10]:
# Hold out half of the records for validation. The fixed seed makes the
# shuffled split identical on every run, so results are reproducible after a
# kernel restart.
full_dataset = dataset['train'].train_test_split(
    test_size=0.5,
    shuffle=True,
    seed=42,
)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 26001
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 26001
})


In [11]:
def preprocess_function(examples):
    """Render a record as one instruction / input / response prompt string.

    Args:
        examples: Mapping indexed by the keys ``'instruction'``, ``'input'``
            and ``'output'``. Each value is interpolated into the text as-is.

    Returns:
        A single string with three ``###``-headed sections (Instruction,
        Input, Response) separated by blank lines.
    """
    sections = (
        f"### Instruction:\n{examples['instruction']}",
        f"### Input:\n{examples['input']}",
        f"### Response:\n{examples['output']}",
    )
    # A blank line ("\n\n") separates consecutive sections.
    return "\n\n".join(sections)

#### Initializing the GPT2 Base Model for Instruction Tuning

In [12]:
# Load the pretrained causal LM; cast its weights to bfloat16 only when the
# bf16 flag is enabled, otherwise keep the dtype it was loaded with.
model = AutoModelForCausalLM.from_pretrained(model_name)
if bf16:
    model = model.to(dtype=torch.bfloat16)

print(model)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [13]:
# Tally every parameter element, and separately those that require gradients,
# in a single pass over the model's parameters.
total_params = 0
total_trainable_params = 0
for param in model.parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
        total_trainable_params += count

print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

124,439,808 total parameters.
124,439,808 training parameters.


#### Initializing the Tokenizer

In [14]:
# The GPT-2 tokenizer ships with transformers, so there is no need to opt in to
# executing repository-hosted code via trust_remote_code.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

#### Training the GPT2 Model on the Alpaca Dataset

In [None]:
# Trainer configuration. Evaluation, logging and checkpointing all run on a
# step-based schedule.
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    # Training length is set by max_steps alone; num_train_epochs is omitted
    # because max_steps overrides it whenever it is given.
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
    weight_decay=0.01,
    eval_strategy='steps',
    # Evaluate on the same cadence as checkpoints are saved so that
    # load_best_model_at_end always has a checkpoint matching each evaluation,
    # independent of the logging cadence.
    eval_steps=save_steps,
    logging_strategy='steps',
    logging_steps=logging_steps,
    save_strategy='steps',
    save_steps=save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    bf16=bf16,
    fp16=fp16,
    dataloader_num_workers=num_workers,
    report_to='wandb',
    run_name="gpt2-instruct-tune-SFT-v1",
)

In [18]:
# Build the supervised fine-tuning trainer.
trainer = SFTTrainer(
    # Model, tokenizer and trainer arguments.
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # Train / validation splits.
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    # Each record is turned into text by preprocess_function; packing is
    # enabled with sequences of at most context_length tokens.
    formatting_func=preprocess_function,
    packing=True,
    max_seq_length=context_length,
)



Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1495 > 1024). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [19]:
dataloader = trainer.get_train_dataloader()

# Decode and print the first sequence of each of the first six batches so the
# text the model will actually see can be inspected.
for _, batch in zip(range(6), dataloader):
    first_sequence = batch['input_ids'][0]
    print(tokenizer.decode(first_sequence))
    print('#'*50)

  self.pid = os.fork()


 and any reported incidents will result in disciplinary action, up to and including termination. In addition, we strive to provide our employees with equal opportunities for growth and success, and all decisions regarding hiring, promotion and other equitable treatment will be made fairly and without bias.<|endoftext|>### Instruction:
Name five common ingredients used in French cuisine.

### Input:


### Response:
Common ingredients in French cuisine include butter, white wine, egg yolks, garlic, and herbs such as thyme, rosemary, oregano, and tarragon. Onions, mushrooms, and tomatoes are also used commonly in French cooking. Other ingredients often used in French cooking include beef and lamb, potatoes, milk, beef stock, and butter. Cheese and crusty breads are also frequently featured in French dishes.<|endoftext|>### Instruction:
Analyze the pros and cons of using a machine learning model for text summarization.

### Input:


### Response:
The advantages of using a machine learning 

In [None]:
# Run the fine-tuning loop and keep the value returned by train() in `history`.
history = trainer.train()

In [None]:
# Close the active Weights & Biases run now that training is done.
wandb.finish()

In [None]:
# Persist the model and tokenizer into sibling sub-directories of out_dir.
model_dir = f"{out_dir}/model"
tokenizer_dir = f"{out_dir}/tokenizer"

model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

#### Inference

In [None]:
# Prefer the GPU when one is available; otherwise run inference on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload from the directories this notebook writes after training
# ({out_dir}/model and {out_dir}/tokenizer). The previous hard-coded
# 'outputs/gpt2_alpaca_preprocess_fn/best_model/' path is never created here.
model = AutoModelForCausalLM.from_pretrained(f"{out_dir}/model")
tokenizer = AutoTokenizer.from_pretrained(f"{out_dir}/tokenizer")
# Reuse the end-of-sequence token as the padding token, as during training.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Text-generation pipeline around the reloaded model and tokenizer, placed on
# the selected device, with a max_length of 256.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=256,
)

In [None]:
# Prompt layout for inference. It mirrors the training-time format produced by
# preprocess_function, including the blank line between sections, so the model
# is prompted with the same structure it was fine-tuned on.
template = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
# Fill the prompt template: an instruction, with the input and response slots
# left empty.
instructions = 'Write three tips for staying healthy.'
inputs = response = ''
prompt_fields = (instructions, inputs, response)
prompt = template.format(*prompt_fields)

In [None]:
# Sampling settings for generation.
generation_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)

outputs = pipe(prompt, **generation_kwargs)

# Show the text of the first returned generation.
print(outputs[0]['generated_text'])