# Fine-tuning Llama 3.2 3B Instruct

In [1]:
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

Log in to Hugging Face CLI

In [3]:
!pip install huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `newTK` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate

In [4]:
# The model that you want to train from the Hugging Face hub
base_model = "Bllossom/llama-3.2-Korean-Bllossom-3B"

# The instruction dataset to use
#dataset_name = "rssaem/btsdata_kor"
dataset_name = "rssaem/btsdata_resp"

# Fine-tuned model name
new_model = "llama-3.2-3b-bts"


# 2. Loading the model and tokenizer

In [5]:
print(torch.cuda.get_device_capability()[0])

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

7


Load the model and tokenizer by providing the local model directory. Even though our model is small, loading the full model and fine-tuning it will take some time. Instead, we will load the model in 4-bit quantization.

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

# 3. Loading and processing the dataset

In [7]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

dataset[7]


README.md:   0%|          | 0.00/435 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/25.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/146 [00:00<?, ? examples/s]

{'instruction': 'BTS의 대표 인사말은 무엇인가요? ',
 'response': '방! 탄! 안녕하세요, 방탄소년단입니다',
 '__index_level_0__': 7,
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n        You are a helpful assistant<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n        BTS의 대표 인사말은 무엇인가요? <|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n        방! 탄! 안녕하세요, 방탄소년단입니다<|eot_id|>'}

In [8]:
dataset

Dataset({
    features: ['instruction', 'response', '__index_level_0__', 'text'],
    num_rows: 146
})

In [9]:
instruction = """you are a assistant please answer in korean lanauage.
    """
def format_chat_template(row):

    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]

    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

c = dataset.map(
    format_chat_template,
    num_proc= 4,
)

Map (num_proc=4):   0%|          | 0/146 [00:00<?, ? examples/s]

In [10]:
print(dataset)

Dataset({
    features: ['instruction', 'response', '__index_level_0__', 'text'],
    num_rows: 146
})


In [11]:
dataset['text'][7]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n        You are a helpful assistant<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n        BTS의 대표 인사말은 무엇인가요? <|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n        방! 탄! 안녕하세요, 방탄소년단입니다<|eot_id|>'

In [12]:
datasetDict = dataset.train_test_split(test_size=0.2)

In [13]:
datasetDict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response', '__index_level_0__', 'text'],
        num_rows: 116
    })
    test: Dataset({
        features: ['instruction', 'response', '__index_level_0__', 'text'],
        num_rows: 30
    })
})

In [14]:
print(datasetDict["train"])

Dataset({
    features: ['instruction', 'response', '__index_level_0__', 'text'],
    num_rows: 116
})


In [15]:
print(datasetDict["test"])

Dataset({
    features: ['instruction', 'response', '__index_level_0__', 'text'],
    num_rows: 30
})


# 4. Setting up the model

Extract the linear model name from the model.

In [16]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

Use the linear module name to create the LoRA adopter. We will only fine-tune the LoRA adopter and leave the rest of the model to save memory and for faster training time.

In [17]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    num_train_epochs=50,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=10,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    #report_to="wandb"
)

We will now set up a supervised fine-tuning (SFT) trainer and provide a train and evaluation dataset, LoRA configuration, training argument, tokenizer, and model.

In [18]:
# LoRA config
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
#model, chat_format_tokenizer = setup_chat_format(model, tokenizer)
#model = get_peft_model(model, peft_config)

In [19]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [20]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=datasetDict["train"],
    eval_dataset=datasetDict["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [21]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
290,0.1121,0.182448
580,0.1011,0.189057
870,0.097,0.207905
1160,0.0889,0.213078
1450,0.0822,0.22435


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=1450, training_loss=0.1904371112790601, metrics={'train_runtime': 3599.0042, 'train_samples_per_second': 1.612, 'train_steps_per_second': 0.403, 'total_flos': 9605313275904000.0, 'train_loss': 0.1904371112790601, 'epoch': 50.0})

Model Inference

In [27]:
def generate_response(messages, model):

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

    terminators = [
        tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    #decoded_output = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    return decoded_output

instruction = "BTS 음악의 컨셉은 무엇입니까?"
messages = [
    {"role": "user", "content": f"{instruction}"},
]

generate_response(messages, model)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"system\n\nCutting Knowledge Date: December 2023\nToday Date: 06 Dec 2024\n\nYou are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.user\n\nBTS 음악의 컨셉은 무엇입니까?assistant\n\n방탄소년단(BTS)은 유명한 방탄복(이름: Combat Box)의 컨셉을 신인으로 만든 것처럼, 자신들만의 세계관을 구축하고 있으며, 그 세계관은 '생명과death'의cycle을 당 10개를 만든 것처럼, 삶의 의미를 찾고 있으며, 자신들만의 음악 스타일을 신인으로 만든 것처럼, 음악 10개를 만든 것처럼, 자신들의 음악의 컨셉을 구축하고 있으며, 그 컨셉은 '10의 개념'으로 만든 것처럼, 10을 의미하는 '원ان'의 개념을 만든 것처럼, 자신들의 음악의 컨셉을 구축하고 있습니다."

7. Saving the tokenizer and model

In [29]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [30]:
%cd /content/gdrive/MyDrive/LLMStudy/

/content/gdrive/MyDrive/LLMStudy


In [32]:
savePath = "finetuning/llama3_2_bts-50"
trainer.save_model(savePath)