# Fine-tuning Llama 3.2 3B Instruct using DUKE and rsLoRA

__Fine-tuning Llama 3.2 3B Instruct using DUKE (Document Understanding and Knowledge Extraction) and rsLoRA__

In this notebook, we will fine-tune Meta Llama 3.2 3B Instruct, a lightweight, 3-billion-parameter, instruction-tuned generative model, on a completely new domain: NVIDIA's newest Blackwell architecture. To do so, we will combine a novel technique I call DUKE (Document Understanding and Knowledge Extraction) with Rank-Stabilized LoRA (rsLoRA), a variant of LoRA (Low-Rank Adaptation). LoRA is a supervised fine-tuning technique and a method within PEFT (Parameter-Efficient Fine-Tuning).

Author: Gary A. Stafford

Date: 2025-05-17

## Installation

In [None]:
%pip install bitsandbytes accelerate peft trl cut_cross_entropy ipywidgets sentencepiece protobuf datasets huggingface_hub hf_transfer transformers packaging ninja matplotlib -Uqqq

In [None]:
# Windows
# %pip install triton-windows -Uqqq

# Linux, MacOS
%pip install triton -Uqqq

In [None]:
%pip install --extra-index-url https://download.pytorch.org/whl/cu128 torch torchvision torchaudio xformers -Uqqq

In [None]:
# Restart kernel (works on Linux, MacOS) so the packages installed above are
# loaded by a fresh interpreter.
# NOTE(review): os._exit() terminates the process immediately, without the
# normal interpreter cleanup; this relies on Jupyter restarting the dead
# kernel automatically - confirm on your platform.
import os

os._exit(00)

In [None]:
# Works on Linux
%pip list | grep "torch\|trl\|peft\|accelerate\|transformers\|torchvision\|tensorflow\|bitsandbytes\|xformers" | sort

# Works on Windows
# %pip list | findstr "torch trl peft accelerate transformers torchvision tensorflow bitsandbytes xformers" | sort

## Base model

### Download base model

In [None]:
import os

from huggingface_hub import snapshot_download

# Prefer a token supplied through the HF_TOKEN environment variable so the
# secret never has to be stored in (or committed with) the notebook. If the
# variable is not set, fall back to the placeholder, which must then be
# replaced by hand with a real Hugging Face access token.
HF_TOKEN = os.environ.get("HF_TOKEN", "<YOUR_HUGGING_FACE_TOKEN>")

# Llama-3.2-3B-Instruct
llama_base_model = "meta-llama/Llama-3.2-3B-Instruct"

# Fetch a snapshot of the model repository from the Hugging Face Hub.
snapshot_download(repo_id=llama_base_model, repo_type="model", token=HF_TOKEN)

### Load base model

In [None]:
import torch

# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

print(f"Using device: {device}")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Optional quantization settings. To use one, uncomment it together with the
# `quantization_config=` argument of from_pretrained() below.
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
# )

# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     llm_int8_enable_fp32_cpu_offload=True,
# )

# Load the weights directly onto `device`. The previous combination of
# device_map="auto" followed by .to(device) was redundant when the model fit
# on one GPU and an error otherwise: a model that Accelerate has dispatched
# with offloading, or a bitsandbytes-quantized model, cannot be moved with
# .to(). Placing the model through device_map keeps it on the same device the
# prompts are later moved to.
model = AutoModelForCausalLM.from_pretrained(
    llama_base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype="auto",  # keep the dtype stored in the checkpoint
    device_map=device,
    # NOTE(review): trust_remote_code allows code shipped in the model
    # repository to run locally; Llama is supported natively by transformers,
    # so this flag can probably be dropped - confirm.
    trust_remote_code=True,
    use_safetensors=True,
    # quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(llama_base_model)

# Set padding token id to eos token id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Reuse the tokenizer's single pad id for the model config. The config's own
# eos_token_id may be a list of several ids, which is not a valid pad id.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
print(f"Base model memory (GB): {model.get_memory_footprint() / 1024**3:0,.2f}")

In [None]:
print(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Prints the trainable parameter count, the total parameter count, and the
    trainable share of the total as a percentage. Returns nothing.
    """
    sizes = [
        (weight.numel(), weight.requires_grad)
        for _, weight in model.named_parameters()
    ]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    print(
        f"trainable params: {trainable:0,.0f} || all params: {total:0,.0f} || trainable %: {share:0,.3f}"
    )


print_trainable_parameters(model)

### Inference with base model

In [None]:
# Questions about NVIDIA Blackwell used for every inference run in this
# notebook. Each entry is a one-turn conversation in chat-message format:
# a list holding a single "user" message.
test_prompts = [
    [{"role": "user", "content": question}]
    for question in (
        "Describe the NVIDIA Blackwell architecture.",
        "What challenges is the NVIDIA Blackwell architecture designed to address?",
        "Please describe NVIDIA's second-generation Transformer Engine.",
        "How many Blackwell GPUs does the NVIDIA GB200 NVL72 have?",
        "How does the NVIDIA Blackwell architecture support generative AI?",
    )
]

In [None]:
%%time

# Ask the base model every test prompt and print only its answer.
for i, prompt in enumerate(test_prompts, start=1):
    # return_dict=True makes the tokenizer return the attention mask together
    # with the input ids. Because the pad token equals the eos token,
    # generate() cannot infer the mask on its own.
    inputs = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    # The generated tensor is already on the model's device.
    output = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.1,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generate() returns prompt + completion. Slice the prompt tokens off
    # rather than splitting the decoded text on "assistant\n\n", which breaks
    # if the answer itself contains that text. This also keeps the backslash
    # out of the f-string expression, a SyntaxError before Python 3.12.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    print(f"Response {i}: {response.strip()}\n")

## Training Data Preparation
We now use the `Llama-3.1` format for conversation-style fine-tuning, converting the data to Hugging Face's standard multi-turn format of `("role", "content")` messages. Llama 3 renders multi-turn conversations as shown below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

In [None]:
def formatting_prompts_func(examples):
    """
    Render a batch of chat conversations as prompt strings.

    Each conversation found under examples["messages"] is run through the
    tokenizer's chat template, producing a string with no trailing generation
    prompt. The rendered strings are returned under the "text" key, in the
    same order as the input conversations.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )
    return {"text": rendered}

In [None]:
from datasets import load_dataset

# Load the local JSON Lines file as a single "train" split.
dataset_raw = load_dataset(
    "json", data_files="data/blackwell_architecture.jsonl", split="train"
)

# Add a "text" column holding each conversation rendered by the chat template.
# batched=True passes formatting_prompts_func a batch of rows per call, which
# is why that function reads a list of conversations from examples["messages"].
dataset_llama_format = dataset_raw.map(
    formatting_prompts_func,
    batched=True,
)

Let's look at how the conversation is structured for one row of the fine-tuning dataset.

In [None]:
import json

# Pretty-print the chat messages of one sample row.
# NOTE(review): index 49 is presumably an arbitrary pick; it requires the
# dataset to contain at least 50 rows.
messages = dataset_llama_format[49]["messages"]
print(json.dumps(messages, indent=2))

And we see how the chat template transformed these conversations.

**[Notice]** Llama 3.2 Instruct's default chat template adds `"Cutting Knowledge Date: December 2023\nToday Date: 26 July 2024"` by default, so do not be alarmed!

In [None]:
print(dataset_llama_format[49]["text"])

Here is a complete row from the fine-tuning dataset.

In [None]:
# Pretty-print the whole sample row: the original "messages" plus the "text"
# column added by formatting_prompts_func. Relies on `json` having been
# imported by an earlier cell.
record = dataset_llama_format[49]
print(json.dumps(record, indent=2))

In [None]:
# LoRA adapter trained on the Meta base 3B model:
# local directory first, Hugging Face Hub repository id second.
local_base_meta_lora, hf_base_meta_lora = (
    f"{prefix}/Llama-3.2-3B-Instruct-lora-nvidia-blackwell"
    for prefix in ("models", "garystafford")
)

# Meta base 3B model with the LoRA adapter merged in:
# local directory first, Hugging Face Hub repository id second.
local_base_meta_merged_lora, hf_base_meta_merged_lora = (
    f"{prefix}/Llama-3.2-3B-Instruct-merged-lora-nvidia-blackwell"
    for prefix in ("models", "garystafford")
)

## Fine-tuning
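Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The configuration below trains for three full epochs (`num_train_epochs=3`). To speed things up, you can instead uncomment `max_steps=60` to stop after 60 steps, or set `num_train_epochs=1` for a single full pass over the data.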

### Fine-tune base model

In [None]:
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig


# Training hyperparameters for supervised fine-tuning.
# https://huggingface.co/docs/trl/main/en/sft_trainer#quickstart
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per device per optimizer step
    warmup_steps=5,
    num_train_epochs=3,
    # max_steps=60,  # uncomment to cap the run at 60 optimizer steps instead
    learning_rate=2e-5,
    bf16=True,  # NOTE(review): requires hardware with bfloat16 support - confirm
    logging_steps=1,  # log the loss at every step
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    hub_model_id=hf_base_meta_lora,  # repository targeted by trainer.push_to_hub()
    # NOTE(review): presumably set explicitly because the label argument names
    # cannot be detected automatically on a PEFT-wrapped model - confirm.
    label_names=["labels"],
)

# LoRA adapter configuration.
# https://huggingface.co/docs/trl/main/en/sft_trainer#training-adapters
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",  # attach adapters to the model's linear layers
    bias="none",
    # Rank-stabilized LoRA: per the PEFT documentation the adapter is scaled
    # by lora_alpha / sqrt(r) rather than lora_alpha / r.
    use_rslora=True,
    task_type="CAUSAL_LM",
)

# Passing peft_config lets the trainer apply the LoRA adapters to `model`;
# training itself is started in a later cell with trainer.train().
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_llama_format,
    args=training_args,
    peft_config=peft_config,
)

In [None]:
# @title Show current memory stats
# Baseline taken before training. The final-stats cell subtracts
# start_gpu_memory from the post-training peak to estimate the memory used by
# training itself.
gpu_stats = torch.cuda.get_device_properties(0)  # GPU index 0; needs a CUDA device
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # bytes / 1024**3
max_memory = round(gpu_stats.total_memory / 1024**3, 3)  # total device memory, same unit
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
print_trainable_parameters(model)

In [None]:
trainer_stats = trainer.train()

In [None]:
trainer_stats

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory after training, in absolute terms and relative to
# both the pre-training baseline (start_gpu_memory) and the device total
# (max_memory), followed by the step count and the training duration.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
steps = trainer_stats.global_step

# One print call, one report line per argument.
print(
    f"Training steps = {steps}",
    f"Training time (seconds): {trainer_stats.metrics['train_runtime']}",
    f"Training time (minutes): {round(trainer_stats.metrics['train_runtime']/60, 2)}",
    f"Peak reserved memory = {used_memory} GB",
    f"Peak reserved memory for training = {used_memory_for_lora} GB",
    f"Peak reserved memory % of max memory = {used_percentage} %",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %",
    sep="\n",
)

<a name="Inference"></a>
### Inference with LoRA adapter
Let's run the fine-tuned model! You can change the prompts in `test_prompts` to ask your own questions.

The cells below sample with a low `temperature = 0.1`. A commonly suggested alternative is `min_p = 0.1` with `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [None]:
%%time

# Training leaves the model in train mode, where the adapter's LoRA dropout
# would still be applied; switch to eval mode before generating.
model.eval()

# Ask the fine-tuned model every test prompt and print only its answer.
for i, prompt in enumerate(test_prompts, start=1):
    # return_dict=True makes the tokenizer return the attention mask together
    # with the input ids. Because the pad token equals the eos token,
    # generate() cannot infer the mask on its own.
    inputs = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    # The generated tensor is already on the model's device.
    output = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.1,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generate() returns prompt + completion. Slice the prompt tokens off
    # rather than splitting the decoded text on "assistant\n\n", which breaks
    # if the answer itself contains that text. This also keeps the backslash
    # out of the f-string expression, a SyntaxError before Python 3.12.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    print(f"Response {i}: {response.strip()}\n")

You can also use a `TextStreamer` for continuous inference, so you can see the generation token by token instead of waiting for the whole response!

In [None]:
%%time

from transformers import TextStreamer

print(model.name_or_path)

# Make sure dropout layers (including LoRA dropout) are inactive while
# generating; a no-op when the model is already in eval mode.
model.eval()

# Stream the answer to every test prompt to stdout as it is generated.
for prompt in test_prompts:
    # return_dict=True makes the tokenizer return the attention mask together
    # with the input ids. Because the pad token equals the eos token,
    # generate() cannot infer the mask on its own.
    inputs = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    # skip_prompt=True prints only newly generated text, not the prompt.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # The return value is discarded: the streamer already printed the text.
    _ = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.1,
        streamer=text_streamer,
        pad_token_id=tokenizer.pad_token_id,
    )

<a name="Save"></a>
### Push LoRA adapter to HF
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model.

In [None]:
# Upload the trainer's model (the LoRA adapter, per the note above) to the Hub
# repository configured through hub_model_id in the training arguments.
# blocking=True makes the call return only once the upload has finished.
trainer.push_to_hub(
    token=HF_TOKEN,
    commit_message="Pushing LoRA adapter to Hugging Face Hub",
    blocking=True,
)

## Merge base model with LoRA adapter

This code is not used in the blog post. It is optional: run it only if you want to merge the LoRA adapter into the base model.

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Reload an unmodified copy of the base model and attach the adapter that was
# pushed to the Hub.
model = AutoModelForCausalLM.from_pretrained(llama_base_model)
model = PeftModel.from_pretrained(model, hf_base_meta_lora)

# Fold the LoRA weights into the base weights and drop the adapter wrappers,
# leaving a plain transformers model.
merged_model = model.merge_and_unload()

# Save locally, then publish the merged model and its tokenizer to the same
# Hub repository.
merged_model.save_pretrained(local_base_meta_merged_lora)

# `token` replaces the deprecated `use_auth_token` argument. `blocking` is a
# Trainer.push_to_hub option, not a parameter of the model/tokenizer
# push_to_hub, so it is no longer passed.
merged_model.push_to_hub(
    hf_base_meta_merged_lora,
    token=HF_TOKEN,
    commit_message="Pushing merged model to Hugging Face Hub",
)
tokenizer.push_to_hub(
    hf_base_meta_merged_lora,
    token=HF_TOKEN,
    commit_message="Pushing tokenizer to Hugging Face Hub",
)

## Load LoRA adapter with base model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load a fresh base model, attach the LoRA adapter published on the Hub, and
# move the combined model to the inference device.
base_model = AutoModelForCausalLM.from_pretrained(llama_base_model)
peft_model_id = hf_base_meta_lora
model = PeftModel.from_pretrained(base_model, peft_model_id).to(device)

tokenizer = AutoTokenizer.from_pretrained(llama_base_model)

# Set padding token id to eos token id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Reuse the tokenizer's single pad id for the model config. The config's own
# eos_token_id may be a list of several ids, which is not a valid pad id.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
print(f"Model memory (GB): {model.get_memory_footprint() / 1024**3:0,.2f}")

### Inference with LoRA adapter and base model

In [None]:
%%time

# Ask the base model + LoRA adapter every test prompt and print only its answer.
for i, prompt in enumerate(test_prompts, start=1):
    # return_dict=True makes the tokenizer return the attention mask together
    # with the input ids. Because the pad token equals the eos token,
    # generate() cannot infer the mask on its own.
    inputs = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    # The generated tensor is already on the model's device.
    output = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.1,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generate() returns prompt + completion. Slice the prompt tokens off
    # rather than splitting the decoded text on "assistant\n\n", which breaks
    # if the answer itself contains that text. This also keeps the backslash
    # out of the f-string expression, a SyntaxError before Python 3.12.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    print(f"Response {i}: {response.strip()}\n")