# Install Dependencies

In [1]:
!pip install transformers
!pip install lora_adapters
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


# Import Modules

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, default_data_collator
from torch.nn import functional as F
from torch import nn
from torch.optim import AdamW
from datasets import load_dataset
from lora_adapters import LoraMergedLinear, LoraLinear, LoraEmbedding, apply_adapter, mark_only_lora_as_trainable, lora_state_dict, undo_lora
import os

In [3]:
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")
model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b", device_map="auto", torch_dtype=torch.bfloat16)


In [4]:
USE_LORA = True
if USE_LORA:
    model = apply_adapter(model, LoraEmbedding, rank=16, regex_pattern=".*embed_in")
    model = apply_adapter(model, LoraMergedLinear, rank=16, regex_pattern=".*[0-5].*query_key_value")
    model = mark_only_lora_as_trainable(model, bias='lora_only')
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): LoraEmbedding(
      50280, 2560, rank=16
      (lora_dropout): Identity()
    )
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): LoraMergedLinear(
            in_features=2560, out_features=7680, bias=True, rank=16, enable_lora=[True, False, True]
            (lora_dropout): Identity()
          )
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
      (6-9): 4 x

In [5]:
dataset = load_dataset("databricks/databricks-dolly-15k")
train_dataset = dataset['train']
train_dataset

Found cached dataset json (/home/kian/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15011
})

In [6]:
def tokenize(batch):
    outputs = tokenizer(batch['instruction'], padding='max_length', truncation=True, max_length=128)
    outputs['label_ids'] = tokenizer(batch['instruction'], padding='max_length', truncation=True, max_length=128).input_ids
    return outputs
    

In [7]:
train_dataset = train_dataset.map(tokenize)
train_dataset = train_dataset.select_columns(['input_ids', 'attention_mask', 'label_ids'])

Loading cached processed dataset at /home/kian/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-07875e2139150a01.arrow


In [8]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding=True,  max_length=2048)
batch = collate_fn(train_dataset[:512])
for k,v in batch.items():
    print(k, v.shape)

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids torch.Size([512, 128])
attention_mask torch.Size([512, 128])
labels torch.Size([512, 128])




In [9]:
training_args = TrainingArguments(output_dir="test_trainer", per_device_train_batch_size=48, per_device_eval_batch_size=64, report_to="none", bf16=True)
optimizer = AdamW((param for param in model.parameters() if param.requires_grad), lr=1e-3)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    optimizers=(optimizer, None)
)

In [11]:
trainer.train()

  0%|          | 0/939 [00:00<?, ?it/s]

{'loss': 6.9478, 'learning_rate': 0.00046751863684771036, 'epoch': 1.6}




{'train_runtime': 1893.7981, 'train_samples_per_second': 23.779, 'train_steps_per_second': 0.496, 'train_loss': 6.7459608896598775, 'epoch': 3.0}


TrainOutput(global_step=939, training_loss=6.7459608896598775, metrics={'train_runtime': 1893.7981, 'train_samples_per_second': 23.779, 'train_steps_per_second': 0.496, 'train_loss': 6.7459608896598775, 'epoch': 3.0})

In [12]:
if os.environ.get("COLAB_BACKEND_VERSION", None):
    from google.colab import runtime
    runtime.unassign()