In [6]:
import os
from pathlib import Path

HF_HOME = Path("../hf_cache")
assert HF_HOME.exists()

In [7]:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["HF_HOME"] = str(HF_HOME)
os.environ["WANDB_PROJECT"] = "qlora-fsdp2"

In [42]:
import datasets.utils.logging as ds_logging
import torch
import transformers.utils.logging as tf_logging
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

USE_QLORA = False
RUN_NAME = "hf-qlora-ref" if USE_QLORA else "hf-lora-ref"

tf_logging.set_verbosity_debug()
ds_logging.set_verbosity_debug()

max_seq_length = 2048
torch.set_default_dtype(torch.bfloat16)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
dtype = torch.bfloat16

if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=dtype,
    )
else:
    bnb_config = None

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    attn_implementation="sdpa",
    quantization_config=bnb_config,
)

loading configuration file config.json from cache at ../hf_cache/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embe

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at ../hf_cache/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}



In [43]:
model.model.layers = model.model.layers[:1]
model.config.num_hidden_layers = 1

In [44]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
# add pad token
added_vocab = tokenizer.get_added_vocab()
pad_token = [w for w in added_vocab if "pad" in w]
assert len(pad_token) == 1
tokenizer.pad_token = pad_token[0]

loading file tokenizer.json from cache at ../hf_cache/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
loading file tokenizer.model from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at ../hf_cache/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
loading file tokenizer_config.json from cache at ../hf_cache/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
loading file chat_template.jinja from cache at None


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [45]:
# print(tokenizer.chat_template)
messages = [{"role": "user", "content": "Hello, how are you?"}]
print(
    tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
        return_dict=False,
        continue_final_message=True,
    )
)
tokenizer.vocab_files_names

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 20 Feb 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Hello, how are you?


{'tokenizer_file': 'tokenizer.json', 'vocab_file': 'tokenizer.model'}

In [46]:
for name, param in model.named_parameters():
    print(f"{name} {type(param)} {param.requires_grad} {param.dtype}")

model.embed_tokens.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.self_attn.q_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.self_attn.k_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.self_attn.v_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.self_attn.o_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.mlp.gate_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.mlp.up_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.mlp.down_proj.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.input_layernorm.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.layers.0.post_attention_layernorm.weight <class 'torch.nn.parameter.Parameter'> True torch.bfloat16
model.norm.weight <class 'torch.nn.pa

In [47]:
model = prepare_model_for_kbit_training(model)
for name, param in model.named_parameters():
    print(f"{name} {type(param)} {param.requires_grad} {param.dtype}")

model.embed_tokens.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.self_attn.q_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.self_attn.k_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.self_attn.v_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.self_attn.o_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.mlp.gate_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.mlp.up_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.mlp.down_proj.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.input_layernorm.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.layers.0.post_attention_layernorm.weight <class 'torch.nn.parameter.Parameter'> False torch.float32
model.norm.weight <class 'torch.nn.pa

In [30]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        # "o_proj",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
    ],
    lora_dropout=0,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

In [32]:
# Get LoRA and setup model
model = get_peft_model(model, lora_config)

for name, param in model.named_parameters():
    print(f"{name} {type(param)} {param.requires_grad}")

base_model.model.base_model.model.base_model.model.model.embed_tokens.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight <class 'torch.nn.parameter.Parameter'> True
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight <class 'torch.nn.parameter.Parameter'> True
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight <class 'torch.nn.parameter.Parameter'> True
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight <class 'torch.nn.parameter.Paramet

In [34]:
embeddings = model.get_input_embeddings()
output_embeddings = model.get_output_embeddings()

In [37]:
embeddings.weight.dtype

torch.float32

In [40]:
model = prepare_model_for_kbit_training(model)

In [41]:
for name, param in model.named_parameters():
    print(f"{name} {type(param)} {param.requires_grad}")

base_model.model.base_model.model.base_model.model.model.embed_tokens.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight <class 'torch.nn.parameter.Parameter'> False
base_model.model.base_model.model.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight <class 'torch.nn.parameter.Para

In [None]:
model = prepare_model_for_kbit_training(model)

with torch.no_grad():
    for name, param in model.named_parameters():
        if ".lora_A." in name or ".lora_B." in name:
            param.requires_grad_(True)
        else:
            param.requires_grad_(False)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

print(model)
for name, param in model.named_parameters():
    print(f"{name} {type(param)} {param.shape}")
# Get dataset
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files={"train": url}, split="train[:10%]")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=1,
        max_steps=10,
        logging_steps=1,
        output_dir="outputs",
        seed=3407,
        max_seq_length=max_seq_length,
        fp16=model.get_input_embeddings().weight.dtype == torch.float16,
        bf16=model.get_input_embeddings().weight.dtype == torch.bfloat16,
        report_to="wandb",  # For W&B
        dataset_num_proc=4,
        run_name=RUN_NAME,
    ),
)

# trainer.train()

In [None]:
to