In [None]:
! pip install datasets wandb trl

In [2]:
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer, pipeline, logging
from datasets import load_dataset
from trl import SFTTrainer
import torch
import wandb
import os

#### Training and Dataset Configurations

In [3]:
batch_size = 16
num_workers = os.cpu_count()
max_steps = 3000
bf16 = False
fp16 = True
gradient_accumulation_steps = 2
learning_rate = 0.0001
context_length = 256
logging_steps = 500
save_steps = 500
model_name = "openai-community/gpt2"
out_dir = "outputs/gpt_alpaca_preprocess_fn"

#### W&B logging configurations

In [None]:
wandb.login()

In [None]:
run = wandb.init(
    project='gpt2-instruct-tune-SFT',
    job_type="training",
    config={
               "architecture": "gpt2",
               "dataset": "tatsu-lab/alpaca",
    }
)

#### Loading the Alpaca Instruction Tuning Dataset

In [6]:
dataset = load_dataset("tatsu-lab/alpaca")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


In [7]:
full_dataset = dataset['train'].train_test_split(test_size=0.5, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 26001
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 26001
})


In [8]:
def preprocess_function(examples):
    text = f"### Instruction:\n{examples['instruction']}\n\n### Input:\n{examples['input']}\n\n### Response:\n{examples['output']}"
    return text

#### Initializing the GPT2 Base Model for Instruction Tuning

In [9]:
if bf16:
  model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
  model = AutoModelForCausalLM.from_pretrained(model_name)

print(model)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [10]:
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

124,439,808 total parameters.
124,439,808 training parameters.


#### Initializing the Tokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)

tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

#### Training the GPT2 Model on the Alpaca Dataset

In [12]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    # num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='steps',
    logging_strategy='steps',
    save_strategy='steps',
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='wandb',
    run_name="gpt2-instruct-tune-SFT-v1",
    max_steps=max_steps,
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

In [13]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    packing=True,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1495 > 1024). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [14]:
dataloader = trainer.get_train_dataloader()

for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

  self.pid = os.fork()
  self.pid = os.fork()


 worth the money.<|endoftext|>### Instruction:
Rewrite the story so that it takes place in another country.

### Input:
The story takes place in the United States and follows two friends, Sarah and David, on a road trip.

### Response:
The story takes place in Mexico and follows two friends, Sarah and David, on a road trip.<|endoftext|>### Instruction:
What is the next step needed to make a cake?

### Input:


### Response:
The next step needed to make a cake is to mix the dry ingredients together.<|endoftext|>### Instruction:
Describe the impact of Alexander Graham Bell's invention

### Input:


### Response:
Alexander Graham Bell's invention of the telephone in 1876 revolutionized the way people communicate with each other. It was the first reliable way to communicate over long distances and made it possible for people to stay in touch in ways that weren't possible before. It also enabled businesses to expand with the help of long-distance communication and allowed individuals to get

In [15]:
history = trainer.train()

Step,Training Loss,Validation Loss
500,2.0159,1.896439
1000,1.762,1.889916
1500,1.6253,1.913958
2000,1.5008,1.9326
2500,1.389,1.988752
3000,1.2926,2.064393


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [None]:
wandb.finish()

In [17]:
model.save_pretrained(f"{out_dir}/model")
tokenizer.save_pretrained(f"{out_dir}/tokenizer")

('outputs/gpt_alpaca_preprocess_fn/tokenizer/tokenizer_config.json',
 'outputs/gpt_alpaca_preprocess_fn/tokenizer/special_tokens_map.json',
 'outputs/gpt_alpaca_preprocess_fn/tokenizer/vocab.json',
 'outputs/gpt_alpaca_preprocess_fn/tokenizer/merges.txt',
 'outputs/gpt_alpaca_preprocess_fn/tokenizer/added_tokens.json',
 'outputs/gpt_alpaca_preprocess_fn/tokenizer/tokenizer.json')

#### Inference

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained('/content/outputs/gpt_alpaca_preprocess_fn/model')
tokenizer = AutoTokenizer.from_pretrained('/content/outputs/gpt_alpaca_preprocess_fn/tokenizer')
tokenizer.pad_token = tokenizer.eos_token

In [20]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,
    device=device,
)

In [21]:
template = """### Instruction:
{}
### Input:
{}
### Response:
{}"""

In [22]:
instructions = 'Write three tips for staying healthy.'
inputs = ''
response = ''
prompt = template.format(instructions, inputs, response)

In [23]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)

print(outputs[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Instruction:
Write three tips for staying healthy.
### Input:

### Response:
1) Eat a balanced diet and plan your meals accordingly - this includes eating a variety of fruits, vegetables or lean proteins.  2) Practice mindful eating - practice eating small meals and focusing on one thing at a time. 3) Exercise regularly to stay motivated and improve your mental health


#### Upload model to huggingface hub

In [24]:
! sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [27]:
from huggingface_hub import HfApi, login, logout

In [None]:
login()

In [29]:
api = HfApi()
api.create_repo(repo_id="gpt2-instruct-tune-alpaca-SFT")

RepoUrl('https://huggingface.co/Chamath/gpt2-instruct-tune-alpaca-SFT', endpoint='https://huggingface.co', repo_type='model', repo_id='Chamath/gpt2-instruct-tune-alpaca-SFT')

In [31]:
api.upload_folder(
    folder_path="/content/outputs/gpt_alpaca_preprocess_fn",
    repo_type="model",
    repo_id="Chamath/gpt2-instruct-tune-alpaca-SFT",
)

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 11 LFS files:   0%|          | 0/11 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Chamath/gpt2-instruct-tune-alpaca-SFT/commit/5e1ed491ea030eae363cb7b5c5e2e0fe307e71eb', commit_message='Upload folder using huggingface_hub', commit_description='', oid='5e1ed491ea030eae363cb7b5c5e2e0fe307e71eb', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
logout()

Successfully logged out.
