In [5]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


## Import model

We import the model the usual way with 🤗 `transformers`, then load the components for Parameter-Efficient Fine-Tuning using the 🤗 `PEFT` library; `bitsandbytes` is installed for loading large models in 8-bit (note that the loading cell below does not enable 8-bit loading, so the weights stay in full precision). The fine-tuning relies on a recent method called "Low-Rank Adaptation" (LoRA).

In [8]:
import os
# Restrict CUDA to device 0. This is deliberately set before `import torch`:
# the variable has to be in place before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [10]:
# The model and its tokenizer are pulled from the same Hub checkpoint.
checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# device_map='auto' leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [13]:
# Sanity check: which dtype and device did the weights end up on?
model.dtype, model.device

(torch.float32, device(type='cpu'))

## Post Process the Model

To train only the adapters and a small number of parameters, we need to disable gradients on all of the existing model parameters and upcast the small one-dimensional parameters (such as the `LayerNorm` weights) to `float32` for stability.

`model.enable_input_require_grads()` enables gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping the model weights fixed.

In [16]:
# Freeze the whole base model: only the adapter added later will be trained.
for param in model.parameters():
    param.requires_grad_(False)
    # One-dimensional tensors (e.g. the norm weights) are small, so keep them
    # in float32.
    if param.ndim == 1:
        param.data = param.data.to(torch.float32)

# Gradient checkpointing: store fewer activations during the forward pass.
model.gradient_checkpointing_enable()
# Enable gradients for the input embeddings.
model.enable_input_require_grads()

## LoRA

Wrap the model as a `PeftModel` with the `get_peft_model` utility from `peft`, which is where we specify that we are going to use LoRA. The adapters themselves are configured through `LoraConfig`.

In [17]:
# Print the module tree to look up the layer names that LoRA will target.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [19]:
from peft import get_peft_model, LoraConfig

# LoRA adapter settings.
config = LoraConfig(
    r = 16,                                          # rank of the low-rank update matrices
    lora_alpha = 32,                                 # scaling applied to the LoRA update
    target_modules = ['q_proj','k_proj','v_proj'],   # attention projections of each decoder layer
    lora_dropout = 0.01,
    bias = "none",                                   # do not train any bias terms
    # Fixed: this was misspelled "CASUAL_LM", which is not a valid PEFT task
    # type, so the causal-LM specific wrapper was not selected.
    task_type = "CAUSAL_LM"
)

# Wrap the frozen base model with the LoRA adapters.
# NOTE: `model` is rebound here, so re-running this cell would wrap an already
# wrapped model; reload the base model first if you need to re-run it.
model = get_peft_model(model, config)

## Load Data and Train

In [20]:
import transformers
from datasets import load_dataset

# Download the quotes dataset and tokenize the 'quote' column batch by batch.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch['quote']),
    batched=True,
)

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [21]:
# Optimisation settings: 4 samples per device x 4 accumulation steps per
# optimizer update, 200 updates in total, the first 100 of them warm-up.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    max_steps=200,
    warmup_steps=100,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=False,
    logging_steps=1,
)

# mlm=False selects the causal (next-token) objective instead of masked LM.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    data_collator=collator,
)

# Turn the generation cache off to silence the warnings during training;
# remember to switch it back on for inference.
model.config.use_cache = False

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop (bounded by max_steps=200 in the training arguments).
trainer.train()