# Google Colab에서 Llama 3.2 finetuning

이 노트북은 Google Colab T4 GPU 환경에서 사용하는 것을 전제로 합니다.

In [None]:
# Install the libraries imported by this notebook (-q keeps pip's output short).
!pip install -q accelerate peft bitsandbytes transformers trl

In [None]:
# Upgrade transformers and trl to the newest releases pip can find.
!pip install -q --upgrade transformers trl

In [None]:
!pip install -q accelerate>=0.26.0

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Hugging Face Hub id of the base model to fine-tune
model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"

# Hugging Face Hub id of the instruction dataset used for training
dataset_name = "rssaem/testdb2"

# Name (local directory) under which the fine-tuned adapter is saved
new_model = "llama-3.2-3b-rssaem"



In [None]:
################################################################################
# QLoRA parameters (forwarded to LoraConfig)
################################################################################

# LoRA attention dimension (rank of the adapter matrices)
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters (forwarded to BitsAndBytesConfig)
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models; must be the name of a torch dtype
# attribute because it is resolved with getattr(torch, ...)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 20

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): not forwarded to TrainingArguments in the cells visible here.
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): not forwarded to TrainingArguments in the cells visible here,
# so this flag currently has no effect - confirm whether that is intended.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "cosine"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True
# Save a checkpoint every X update steps
save_steps = 0
# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters (forwarded to SFTTrainer)
################################################################################

# Maximum sequence length to use
max_seq_length = None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = False
# Load the entire model on GPU 0
# NOTE(review): the model-loading cell passes device_map="auto" instead of this
# value; it only appears in commented-out code there.
device_map = {"": 0}

In [None]:
# Load the "train" split of the instruction dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Display one record to inspect the available columns
dataset[7]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/374 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.32k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/41 [00:00<?, ? examples/s]

{'instruction': 'BTS 음악의 컨셉은 무엇입니까?',
 'output': '전반적으로 10대와 20대 청춘들의 생각과 고민, 삶과 사랑, 꿈과 역경을 주요 주제로 하는 노래들을 통해 자신들만의 세계관을 구축하고 있으며, 연계되는 이야기를 다양한 뮤직비디오들을 통해 유기적으로 풀어 나가는 모습을 보여준다.',
 'input': ''}

In [None]:
def create_text_column(example):
    """Add a ``text`` column containing the full training prompt for one row.

    The prompt is built from the row's ``instruction`` and ``output`` fields.
    When the row also carries a non-blank ``input`` field, it is inserted as
    an ``### Input:`` section between the instruction and the response, so
    the extra context is not silently dropped. Rows with an empty, blank,
    ``None`` or missing ``input`` produce exactly
    ``### Instruction:\\n...\\n\\n### Response:\\n...``.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key.

    Returns:
        The same mapping, mutated in place with the new ``text`` key.

    Raises:
        KeyError: If ``instruction`` or ``output`` is missing.
    """
    sections = [f"### Instruction:\n{example['instruction']}"]

    # Optional context; skipped when absent or whitespace-only.
    context = example.get("input")
    if context and context.strip():
        sections.append(f"### Input:\n{context}")

    sections.append(f"### Response:\n{example['output']}")

    example["text"] = "\n\n".join(sections)
    return example

# Apply create_text_column to every row to add the 'text' column
datasetrs = dataset.map(create_text_column)

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

In [None]:
# Display the same record again to check the generated 'text' column
datasetrs[7]

{'instruction': 'BTS 음악의 컨셉은 무엇입니까?',
 'output': '전반적으로 10대와 20대 청춘들의 생각과 고민, 삶과 사랑, 꿈과 역경을 주요 주제로 하는 노래들을 통해 자신들만의 세계관을 구축하고 있으며, 연계되는 이야기를 다양한 뮤직비디오들을 통해 유기적으로 풀어 나가는 모습을 보여준다.',
 'input': '',
 'text': '### Instruction:\nBTS 음악의 컨셉은 무엇입니까?\n\n### Response:\n전반적으로 10대와 20대 청춘들의 생각과 고민, 삶과 사랑, 꿈과 역경을 주요 주제로 하는 노래들을 통해 자신들만의 세계관을 구축하고 있으며, 연계되는 이야기를 다양한 뮤직비디오들을 통해 유기적으로 풀어 나가는 모습을 보여준다.'}

Load tokenizer and model with QLoRA configuration

In [None]:
# Turn the configured dtype name into the matching torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bitsandbytes quantization settings used when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
)

# When running 4-bit with float16 compute, tell the user if the GPU's major
# compute capability is 8 or higher, since bf16 could be used instead.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        banner = "=" * 80
        print(banner)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print(banner)


Load base model

In [None]:
# Load the base model with the quantization settings from bnb_config.
# Pass device_map=device_map instead of "auto" to pin the whole model to GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
# Disable the generation cache while training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer that belongs to the base model. trust_remote_code is not
# requested: the baseline test cell loads this same tokenizer without it, so
# there is no need to allow execution of code downloaded from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# LoRA adapter configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training parameters
# NOTE(review): gradient_checkpointing and per_device_eval_batch_size are
# defined in the parameter cell but are not forwarded here - confirm whether
# they should be.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)



config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:

# Set supervised fine-tuning parameters: train on the "text" column of
# datasetrs, passing the LoRA config and the training arguments built earlier.
# NOTE(review): the recorded run of this cell printed "Deprecated positional
# argument(s) used in SFTTrainer, please use the SFTConfig to set these
# arguments instead." - check these keyword arguments against the installed
# trl version before re-running.
trainer = SFTTrainer(
    model=model,
    train_dataset=datasetrs,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Step,Training Loss
25,3.2088
50,2.1463
75,1.6163
100,1.2301
125,0.9353
150,0.8326
175,0.7023
200,0.6971


TrainOutput(global_step=220, training_loss=1.3491450916637073, metrics={'train_runtime': 351.2839, 'train_samples_per_second': 2.334, 'train_steps_per_second': 0.626, 'total_flos': 1096050192162816.0, 'train_loss': 1.3491450916637073, 'epoch': 20.0})

In [None]:
# Save the trained model held by the trainer into the new_model directory
trainer.model.save_pretrained(new_model)

모델 테스트

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model.
# The prompt must use the same "### Instruction / ### Response" layout that
# create_text_column produced for training; the fine-tuned model never saw the
# Llama 2 "[INST] ... [/INST]" wrapper. No literal "<s>" is added to the prompt.
prompt = "BTS 음악의 컨셉은 무엇입니까?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"### Instruction:\n{prompt}\n\n### Response:\n")
print(result[0]['generated_text'])

<s>[INST] BTS 음악의 컨셉은 무엇입니까? [/INST] <s>[Response] ARMY의 마음을 담은 음악을 통해 음악과 음악ists가 만나는 컨셄트를 정의하고, 음악의 감정과 음악ists가 느끼는 감정들을 반영하는 것을 하고 있다. 음악과 음악ists가 만나는 방식과 방향을 정의하고, 음악과 음악ists가 만나는 의미를 정의하고, 음악과 음악ists가 만나는 결과를 정의하고 있다. </s>

10월 1일, 방탄소년단(BTS)이 데뷔 후 첫 번째 데뷔기념일을 기념하고 첫 번째 '오늘'을歌다 decision했다.  이날은 방탄소년단이 2003년 6월 13일에 데뷔한 날이며, 그 날은 한국에서 '오늘'이 의미하는 방식


모델 저장

In [None]:
# Directory that receives the model written by trainer.save_model
savePath = "finetuning/llama3_2_rs_1015_100"
trainer.save_model(savePath)

기본 모델 테스트

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Baseline check: ask the original (not fine-tuned) checkpoint the same
# question that was put to the fine-tuned model.
model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

instruction = "BTS 음악의 컨셉은 무엇입니까?"
messages = [{"role": "user", "content": instruction}]

# Render the conversation with the tokenizer's chat template and move the
# resulting token ids to the model's device.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
input_ids = input_ids.to(model.device)

# Token ids passed to generate() as end-of-sequence markers.
terminators = [
    tokenizer.convert_tokens_to_ids(stop_token)
    for stop_token in ("<|end_of_text|>", "<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    eos_token_id=terminators,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# Decode only the tokens that come after the prompt.
prompt_length = input_ids.shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

BTS는 K-pop 그룹으로, 다양한 컨셉과 주제를 다루고 있습니다. 그들의 음악은 주로 다양한 사회적, 문화적, 인류적 문제를 다루며, 또한 개인의 성장, 사랑, 그리고 인간의 감정을 다루고 있습니다. 다음은 BTS의 주요 컨셉 중 몇 가지입니다:

1. **사회적 문제와 사회적 변화를 주장하는 음악**: BTS는 다양한 사회적 문제를 다루고 있으며, 이를 통해 사회적 변화를 주장합니다. 예를 들어, "Boy With Luv"는 평화와 friendship를 주제로 하고 있으며, "Dynamite"는 개인의 성장과 성취를 다루고 있습니다.

2. **인류적 문제와 인류의 감정**: BTS는 인류의 감정과 삶에 대해 다루며, 이를 통해 사람들에게 감동과 생각을 주고 있습니다. 예를 들어, "Spring Day"는 개인의 삶과 죽음에 대해 다루고 있으며, "Euphoria"는 감정의 복잡성을 다루고 있습니다.

3. **성장과 개인의 발전**: BTS는 개인의 성장과 발전을 주제로 하는 음악을 많이 발매합니다. 예를 들어, "No More Dream"는 새로운 시작과 희망을 다루며, "DNA"는 자신감과 성취를 다루고 있습니다.

4. **사랑과 관계**: BTS는 사랑과 관계에 대해 다루며, 이를 통해 사람들에게 감동과 생각을 주고 있습니다. 예를 들어, "Love Myself"는 사랑과 자신에 대한 사랑을 다루며, "Not Today"는 사랑과 관계의 어려움을 다루고 있습니다.

BTS는 이러한 다양한 컨셉을 통해 K-pop을 전 세계적으로 인기를 끌고 있으며, 그들의 음악은 많은 사람들에게 감동과 생각을 주고 있습니다.
