# Fine-tuning Llama 3.2 3B Instruct

### In Case of Using CoLab : Connect to google drive

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
%cd /content/gdrive/MyDrive/LLMStudy/

### Install Packages

#### In Case of Using CoLab : Run install cells everytime
#### In Case of Using GPU Server : Run install cells at only the first time

In [None]:
%pip install -U transformers==4.46.3
%pip install -U datasets==3.2.0
%pip install -U accelerate==1.2.1
%pip install -U peft==0.14.0
%pip install -U trl==0.12.2
%pip install -U bitsandbytes==0.45.0

In [None]:
%pip install huggingface_hub

### import packages

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format



### Log in to Hugging Face API

In [2]:
API_TOKEN = [본인 허깅페이스 토큰으로 직접 수정하세요]

from huggingface_hub.hf_api import HfFolder
HfFolder.save_token(API_TOKEN)

### model id

In [3]:
# The model that you want to train from the Hugging Face hub
base_model = "Bllossom/llama-3.2-Korean-Bllossom-3B"

# The instruction dataset to use
#dataset_name = "rssaem/btsdata_kor"
dataset_name = "rssaem/btsdata_resp"

# Fine-tuned model name
new_model = "llama-3.2-3b-bts"


# 2. Loading the model and tokenizer

In [4]:
print(torch.cuda.get_device_capability()[0])

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

8


Load the model and tokenizer by providing the local model directory. Even though our model is small, loading the full model and fine-tuning it will take some time. Instead, we will load the model in 4-bit quantization.

In [5]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# 3. Loading and processing the dataset

In [6]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

dataset[7]


{'instruction': 'BTS의 대표 인사말은 무엇인가요? ',
 'response': '방! 탄! 안녕하세요, 방탄소년단입니다',
 '__index_level_0__': 7,
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n        You are a helpful assistant<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n        BTS의 대표 인사말은 무엇인가요? <|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n        방! 탄! 안녕하세요, 방탄소년단입니다<|eot_id|>'}

In [None]:
#dataset

In [7]:
instruction = """you are a assistant please answer in korean lanauage.
    """
def format_chat_template(row):

    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]

    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

c = dataset.map(
    format_chat_template,
    num_proc= 4,
)

In [9]:
print(dataset)

Dataset({
    features: ['instruction', 'response', '__index_level_0__', 'text'],
    num_rows: 146
})


In [None]:
#dataset['text'][7]

In [10]:
datasetDict = dataset.train_test_split(test_size=0.2)

In [11]:
datasetDict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response', '__index_level_0__', 'text'],
        num_rows: 116
    })
    test: Dataset({
        features: ['instruction', 'response', '__index_level_0__', 'text'],
        num_rows: 30
    })
})

In [None]:
#print(datasetDict["train"])

In [None]:
#print(datasetDict["test"])

# 4. Setting up the model

Extract the linear model name from the model.

In [12]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

Use the linear module name to create the LoRA adopter. We will only fine-tune the LoRA adopter and leave the rest of the model to save memory and for faster training time.

In [13]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=10,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    #report_to="wandb"
)

We will now set up a supervised fine-tuning (SFT) trainer and provide a train and evaluation dataset, LoRA configuration, training argument, tokenizer, and model.

In [14]:
# LoRA config
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
#model, chat_format_tokenizer = setup_chat_format(model, tokenizer)
#model = get_peft_model(model, peft_config)

In [15]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    mean_resizing=False
    model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


###  train


In [16]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=datasetDict["train"],
    eval_dataset=datasetDict["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

a60c92ba46fa539106cab1359d31862e634beebd

In [17]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Step,Training Loss,Validation Loss
12,4.5534,2.857838
24,2.3446,1.981139
36,1.8436,1.79132
48,1.4442,1.650374


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=58, training_loss=2.1916829142077217, metrics={'train_runtime': 584.4727, 'train_samples_per_second': 0.397, 'train_steps_per_second': 0.099, 'total_flos': 383862610698240.0, 'train_loss': 2.1916829142077217, 'epoch': 2.0})

Model Inference

In [18]:
def generate_response(messages, model):

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

    terminators = [
        tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    #decoded_output = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    return decoded_output

instruction = "BTS 음악의 컨셉은 무엇입니까?"
messages = [
    {"role": "user", "content": f"{instruction}"},
]

generate_response(messages, model)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"system\n\nCutting Knowledge Date: December 2023\nToday Date: 16 Dec 2024\n\nYou are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.user\n\nBTS 음악의 컨셉은 무엇입니까?assistant\n\n 방탄소년단의 음악 컨셉은 자신들의 삶과 생각, 사회에 대한 생각을 담아내고, 음악으로 표현하는 것이다."

7. Saving the tokenizer and model

In [19]:
!ls

1_Run_llama3_2_ko_3b.ipynb		    llama-3.2-3b-bts
2_FineTuning_LoraPeft_llama3_2_ko_3b.ipynb  Llama3_2_MergeTest
3_Load_and_Merge_llama3_2.ipynb		    rag
4_BaseRag_llama3_2_ko_3b.ipynb		    Untitled.ipynb
chromadb


In [20]:
new_model = "llama-3.2-3b-bts"

trainer.model.save_pretrained(new_model)



In [None]:
!ls llama-3.2-3b-bts

In [None]:
#!pip list > requirements.txt

In [21]:
print(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

llama-3.2-3b-bts


adapter_model.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rssaem/llama-3.2-3b-bts/commit/8e920c1a95084fbb6b54860fd89d8ea7636b40ad', commit_message='Upload model', commit_description='', oid='8e920c1a95084fbb6b54860fd89d8ea7636b40ad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rssaem/llama-3.2-3b-bts', endpoint='https://huggingface.co', repo_type='model', repo_id='rssaem/llama-3.2-3b-bts'), pr_revision=None, pr_num=None)

In [None]:
#savePath = "finetuning/llama3_2_bts-50"
#trainer.save_model(savePath)