# Fine-Tuning Llama2 Model in Colab

- Colab은 약 15GB 메모리의 T4 GPU 환경을 제공한다.

- 이 환경에서는 LLM의 전체 파라미터를 fine-tuning하는 것이 사실상 불가능하다.

- 따라서, LoRA와 QLoRA와 같은 PEFT 기법을 사용하여 fine-tuning을 수행한다.

In [1]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl datasets bitsandbytes

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
import torch, os, platform, warnings
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from huggingface_hub import notebook_login

In [3]:
def print_system_specs():
    """Print a short report of the CUDA devices and the host platform.

    Prints whether CUDA is available and how many CUDA devices are visible.
    When CUDA is available, also prints the name, compute capability and
    total memory (in bytes) of every device. Finally prints the processor,
    OS name/release and Python version reported by ``platform``.

    Returns:
        None. The report is written to standard output.
    """
    # Check whether CUDA can be used
    is_cuda_available = torch.cuda.is_available()
    print("CUDA Available:", is_cuda_available)

    # Count the visible CUDA devices
    num_cuda_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_cuda_devices)

    if is_cuda_available:
        for i in range(num_cuda_devices):
            # Properties of each CUDA device
            print(f"--- CUDA Device {i} ---")
            print("Name:", torch.cuda.get_device_name(i))
            print("Compute Capability:", torch.cuda.get_device_capability(i))
            print("Total Memory:", torch.cuda.get_device_properties(i).total_memory, "bytes")

    # Host (CPU / OS / interpreter) information
    print("--- CPU Information ---")
    print("Processor:", platform.processor())
    print("System:", platform.system(), platform.release())
    print("Python Version:", platform.python_version())
print_system_specs()

CUDA Available: True
Number of CUDA devices: 1
--- CUDA Device 0 ---
Name: Tesla T4
Compute Capability: (7, 5)
Total Memory: 15835660288 bytes
--- CPU Information ---
Processor: x86_64
System: Linux 6.1.58+
Python Version: 3.10.12


In [4]:
# Pre-trained base checkpoint to fine-tune
model_name: str = "NousResearch/Llama-2-7b-hf"

# Instruction dataset used for fine-tuning
dataset_name: str = "mlabonne/guanaco-llama2-1k"

# Name under which the fine-tuned model is saved. The value is a placeholder
# (Korean for "Hugging Face repo") and should be replaced with a real name.
new_model: str = "허깅페이스 repo"

In [5]:
# Log in to the Hugging Face Hub (original note: needed to fetch the Llama 2 model)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### QLoRA Parameters

In [6]:
# Rank of the LoRA update matrices (passed as `r` to LoraConfig)
lora_r: int = 64
# Value passed as `lora_alpha` to LoraConfig
lora_alpha: int = 16
# Value passed as `lora_dropout` to LoraConfig
lora_dropout: float = 0.1

### BitsAndBytes Parameters

In [7]:
# Load the base model with 4-bit quantization
use_4bit: bool = True

# Name of the torch dtype used for computation with the 4-bit base model
bnb_4bit_compute_dtype: str = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type: str = "nf4"

# Whether to enable double (nested) quantization
use_nested_quant: bool = False

### Training Arguments Parameters

In [8]:

# Directory where logs, checkpoints and other results are stored
output_dir: str = "./results"

# Number of training epochs
num_train_epochs: int = 1

# fp16/bf16 can reduce memory usage, possibly at some cost in accuracy
fp16 = bf16 = False

# Per-device (GPU) batch sizes for training and evaluation
per_device_train_batch_size = per_device_eval_batch_size = 4

# Gradient accumulation: accumulate gradients for n steps, then update once
gradient_accumulation_steps: int = 1

# Technique that lowers memory usage at the price of extra compute time
gradient_checkpointing: bool = True

# Gradient clipping threshold
max_grad_norm: float = 0.3

# Learning rate
learning_rate: float = 2e-4

# Weight decay
weight_decay: float = 0.001

# Optimizer
optim: str = "paged_adamw_8bit"

# Learning-rate scheduler
# NOTE(review): a plain 'constant' schedule presumably applies no warmup, so
# `warmup_ratio` below may have no effect — confirm against the transformers docs.
lr_scheduler_type: str = "constant"

# Warm-up ratio
warmup_ratio: float = 0.03

# With dynamic padding, group sequences of similar length together
group_by_length: bool = True

# Values passed as `save_steps` / `logging_steps` to TrainingArguments
save_steps = logging_steps = 25

# Passed as `device_map` when loading the model; presumably places the whole
# model on GPU 0 — confirm
device_map = {"": 0}

### Load Dataset

In [9]:
# Load the instruction dataset (first 10000 rows of the train split at most)
dataset = load_dataset(dataset_name, split="train[0:10000]")

# QLoRA configuration: resolve the dtype name to the actual torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load the base model with the quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)

# Prepare the quantized model for training. The notebook-level
# `gradient_checkpointing` switch is passed explicitly so that the setting is
# honoured instead of silently falling back to the helper's default.
model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=gradient_checkpointing
)
model.config.use_cache = False  # disabled for training; re-enabled for inference
model.config.pretraining_tp = 1


# Load the tokenizer from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the existing <unk> token for padding. Registering a brand-new '[PAD]'
# token can grow the tokenizer vocabulary without resizing the model's
# embedding matrix, so padded batches could index past the embedding table.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [10]:
# Build the LoRA adapter configuration for causal language modeling

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
    bias="none",
)

In [11]:
# Collect the training hyperparameters into a TrainingArguments object

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Forward the notebook-level switch; it was defined but never used before,
    # so changing it had no effect on the trainer.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)

In [12]:
# Create the supervised fine-tuning trainer: model and tokenizer first, then
# the data settings, then the LoRA and training configuration.

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
    peft_config=peft_config,
    args=training_arguments,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run the fine-tuning loop
trainer.train()

# Save the trained model under the name held in `new_model`
trainer.model.save_pretrained(new_model)



Step,Training Loss


## 평가

In [None]:
model.config.use_cache = True
model.eval()