<a href="https://colab.research.google.com/github/lhwong/fine-tune/blob/main/lora_udemy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
!pip install transformers datasets evaluate peft trl bitsandbytes


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.11.4-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Down

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
guanaco_dataset = "mlabonne/guanaco-llama2-1k"
new_model = "llama-1.1B-chat-guanaco"

#device = torch.device("mps" if torch.backends.mps.is_built() else "cpu")
#print(f"Using device: {device}")


dataset = load_dataset(guanaco_dataset, split="train")
model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto')
model.config.use_cache = False
model.config.pretraining_tp = 1
#model.to(device)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # pad sequences
tokenizer.padding_side = 'right'





README.md:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

(…)-00000-of-00001-9ad84bb9cf65a42f.parquet:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [3]:
# run inference
logging.set_verbosity(logging.CRITICAL)
prompt = "Who is Napoleon Bonaparte?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST] Who is Napoleon Bonaparte? [/INST]
[INST]


In [4]:
peft_params = LoraConfig(lora_alpha=16, # multiplier of Lora output when its added to the full forward output
                         lora_dropout=0.1, # with a probability of 10% it will set random Lora output to 0
                         r=64, # rank of Lora so matrices will have either LHS or RHS dimension of 64
                         bias="none", # no bias term
                         task_type="CAUSAL_LM"
)
training_params = SFTConfig(output_dir='./results',
                                    num_train_epochs=2, # two passs over the dataset
                                    per_device_train_batch_size=2, #mbs=2
                                    gradient_accumulation_steps=16, # effective batch size 16*2
                                    optim="adamw_torch",
                                    save_steps=25, # checkpoint every 25 steps
                                    logging_steps=1,
                                    learning_rate=2e-4, # step size in the optimizer update
                                    weight_decay=0.001,
                                    fp16=True, # 16 bit
                                    bf16=False, # not supported on V100
                                    max_grad_norm=0.3, #gradient clipping improves convergence
                                    max_steps=-1,
                                    warmup_ratio=0.03, # learning rate warmup
                                    group_by_length=True,
                                    lr_scheduler_type="cosine", # cosine lr scheduler
                                    dataset_text_field="text",
                                    max_seq_length=None
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params, # parameter efficient fine tuning AKA Lora

    tokenizer=tokenizer,
    args=training_params,
    packing=False
)
import gc # garbage collection
gc.collect()
torch.cuda.empty_cache() # clean cache

trainer.train() # train the model
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

prompt = "Who is Napoleon Bonaparte?"
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f'<s>[INST] {prompt} [/INST]')
print(result[0]['generated_text'])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'loss': 1.4805, 'grad_norm': 0.06516026705503464, 'learning_rate': 0.0001, 'epoch': 0.032}
{'loss': 1.6206, 'grad_norm': 0.08122136443853378, 'learning_rate': 0.0002, 'epoch': 0.064}
{'loss': 1.7324, 'grad_norm': 0.07007559388875961, 'learning_rate': 0.0001998629534754574, 'epoch': 0.096}
{'loss': 1.7359, 'grad_norm': 0.09540508687496185, 'learning_rate': 0.00019945218953682734, 'epoch': 0.128}
{'loss': 2.0137, 'grad_norm': 0.10741205513477325, 'learning_rate': 0.00019876883405951377, 'epoch': 0.16}
{'loss': 2.2133, 'grad_norm': 0.14766599237918854, 'learning_rate': 0.00019781476007338058, 'epoch': 0.192}
{'loss': 2.9608, 'grad_norm': 0.3634866774082184, 'learning_rate': 0.00019659258262890683, 'epoch': 0.224}
{'loss': 1.6189, 'grad_norm': 0.09260421991348267, 'learning_rate': 0.00019510565162951537, 'epoch': 0.256}
{'loss': 1.6708, 'grad_norm': 0.1071651503443718, 'learning_rate': 0.00019335804264972018, 'epoch': 0.288}
{'loss': 1.7679, 'grad_norm': 0.10034343600273132, 'learning_rat

In [5]:
#model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
#model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")