# Finetune Gemma-2 2B with LLaMA Factory

Please use a **free** Tesla T4 Colab GPU to run this!

Project homepage: https://github.com/hiyouga/LLaMA-Factory

In [1]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget for this session.
# NOTE(review): presumably needed for downloading model weights and for the
# optional hub upload in the export step — confirm it is required here.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Install Dependencies

In [2]:
%cd /content/
%rm -rf LLaMA-Factory
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
!pip uninstall -y jax
!pip install -e .[torch,bitsandbytes,liger-kernel]

/content
Cloning into 'LLaMA-Factory'...
remote: Enumerating objects: 315, done.[K
remote: Counting objects: 100% (315/315), done.[K
remote: Compressing objects: 100% (245/245), done.[K
remote: Total 315 (delta 80), reused 160 (delta 57), pack-reused 0 (from 0)[K
Receiving objects: 100% (315/315), 8.94 MiB | 16.61 MiB/s, done.
Resolving deltas: 100% (80/80), done.
/content/LLaMA-Factory
[0m[01;34massets[0m/       [01;34mdocker[0m/      LICENSE      pyproject.toml  requirements.txt  [01;34msrc[0m/
CITATION.cff  [01;34mevaluation[0m/  Makefile     README.md       [01;34mscripts[0m/          [01;34mtests[0m/
[01;34mdata[0m/         [01;34mexamples[0m/    MANIFEST.in  README_zh.md    setup.py
[0mObtaining file:///content/LLaMA-Factory
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ..

### Check GPU environment

In [3]:
import torch

# Warn (without aborting) when no CUDA device is visible. A plain `if` is used
# rather than `assert`, because assertions are skipped when Python runs with -O.
if not torch.cuda.is_available():
  print("Please set up a GPU before using LLaMA Factory: https://medium.com/mlearning-ai/training-yolov4-on-google-colab-316f8fff99c6")

## Update Identity Dataset

In [4]:
import json

%cd /content/LLaMA-Factory/

# Update model name and author to reflect the new model being used
NAME = "Gemma-2B"
AUTHOR = "LLaMA Factory"

# Open and read the dataset file
with open("data/identity.json", "r", encoding="utf-8") as f:
  dataset = json.load(f)

# Replace placeholders with the updated name and author
for sample in dataset:
  sample["output"] = sample["output"].replace("{{"+ "name" + "}}", NAME).replace("{{"+ "author" + "}}", AUTHOR)

# Write the updated dataset back to the file
with open("data/identity.json", "w", encoding="utf-8") as f:
  json.dump(dataset, f, indent=2, ensure_ascii=False)


/content/LLaMA-Factory


## Fine-tune model via LLaMA Board

In [None]:
# Launch the LLaMA Board web UI from the repository root.
# GRADIO_SHARE=1 is set only for this one command; presumably it makes Gradio
# expose a public share link so the UI is reachable from outside Colab — confirm.
%cd /content/LLaMA-Factory/
!GRADIO_SHARE=1 llamafactory-cli webui

/content/LLaMA-Factory
2024-09-18 04:01:40.437128: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-18 04:01:40.476626: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-18 04:01:40.488704: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-18 04:01:40.516941: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Running on local URL:  http://

## Fine-tune model via Command Line

It takes ~30min for training.

In [None]:
# Fine-tune model via command line
import json

args = dict(
  stage="sft",                        # do supervised fine-tuning
  do_train=True,
  model_name_or_path="neuralmagic/gemma-2-2b-it-quantized.w8a16", # use Gemma 2B quantized model
  dataset="identity,alpaca_en_demo",  # use alpaca and identity datasets
  template="gemma2b",                 # appropriate prompt template
  finetuning_type="lora",             # use LoRA adapters to save memory
  lora_target="all",                  # attach LoRA adapters to all linear layers
  output_dir="gemma2b_lora",          # path to save LoRA adapters
  per_device_train_batch_size=1,      # reduced batch size
  gradient_accumulation_steps=2,      # reduced gradient accumulation steps
  lr_scheduler_type="cosine",         # use cosine learning rate scheduler
  logging_steps=10,                   # log every 10 steps
  warmup_ratio=0.1,                   # use warmup scheduler
  save_steps=500,                     # save checkpoint every 500 steps
  learning_rate=3e-5,                 # lower learning rate
  num_train_epochs=1.0,               # reduce number of epochs
  max_samples=100,                    # use fewer examples to fit within memory
  max_grad_norm=1.0,                  # clip gradient norm to 1.0
  loraplus_lr_ratio=8.0,              # lower lambda to reduce computation
  fp16=True,                          # use float16 mixed precision training
  use_liger_kernel=True,              # use liger kernel for efficient training
)

json.dump(args, open("train_gemma2b.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/
!llamafactory-cli train train_gemma2b.json


## Infer the fine-tuned model

In [None]:
# Infer the fine-tuned model
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc

%cd /content/LLaMA-Factory/

args = dict(
  model_name_or_path="neuralmagic/gemma-2-2b-it-quantized.w8a16", # use Gemma 2B quantized model
  adapter_name_or_path="gemma2b_lora",   # load the saved LoRA adapters
  template="gemma2b",                    # same as in training
  finetuning_type="lora",                # same as in training
  quantization_bit=4,                    # use 4-bit quantized model
)

chat_model = ChatModel(args)

messages = []
print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
while True:
  query = input("\nUser: ")
  if query.strip() == "exit":
    break
  if query.strip() == "clear":
    messages = []
    torch_gc()
    print("History has been removed.")
    continue

  messages.append({"role": "user", "content": query})
  print("Assistant: ", end="", flush=True)

  response = ""
  for new_text in chat_model.stream_chat(messages):
    print(new_text, end="", flush=True)
    response += new_text
  print()
  messages.append({"role": "assistant", "content": response})

torch_gc()


## Merge the LoRA adapter and optionally upload model

NOTE: the free Colab tier has only 12GB of RAM. Merging the LoRA adapter of an 8B model needs at least 18GB of RAM, so a model of that size **cannot** be merged on the free tier.

In [None]:
# Log in to the Hugging Face Hub from the shell (prompts for an access token).
# NOTE(review): the first cell already calls notebook_login(); this second
# login is presumably only needed when uploading the merged model — confirm.
!huggingface-cli login

In [None]:
import json

# Update the model and adapter paths for Gemma 2B
args = dict(
  model_name_or_path="neuralmagic/gemma-2-2b-it-quantized.w8a16",  # use Gemma 2B quantized model
  adapter_name_or_path="gemma2b_lora",  # load the saved LoRA adapters from the earlier training
  template="gemma2b",  # use the same template as in training
  finetuning_type="lora",  # same as in training
  export_dir="gemma2b_lora_merged",  # the path to save the merged model
  export_size=2,  # the file shard size (in GB) of the merged model
  export_device="cpu",  # choose 'cpu' or 'cuda' based on available resources
  #export_hub_model_id="your_id/your_model",  # your Hugging Face hub ID to upload model, uncomment if needed
)

# Save the updated configuration to a JSON file
json.dump(args, open("merge_gemma2b.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

# Export the model using the updated configuration
!llamafactory-cli export merge_gemma2b.json
