<a href="https://colab.research.google.com/github/hajeong67/nlp_LoRA/blob/main/nlp_homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers accelerate datasets peft trl bitsandbytes wandb

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.12.1-py3-none-any.w

# 라이브러리 install

In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional
import re

import torch
import sys
from accelerate import Accelerator
from datasets import load_dataset, Dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import (
    HfArgumentParser,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TextStreamer,
    logging as hf_logging,
)
import logging
from trl import SFTTrainer, SFTConfig

from trl.trainer import ConstantLengthDataset

# 모델 설정

In [3]:
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"
device_map = "cuda"
torch_dtype = torch.bfloat16
output_dir = "./llama-order-analysis"
dataset_name = "./llm-modeling-lab.jsonl"
seq_length = 512

# 토크나이저 설정

In [4]:
from huggingface_hub import login

# Hugging Face 토큰 발급
login(token="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")

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

# 양자화 설정

In [6]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 모델 로딩

In [7]:
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
base_model.config.use_cache = False

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

# 패딩 토큰 설정

In [8]:
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
if base_model.config.pad_token_id != tokenizer.pad_token_id:
    base_model.config.pad_token_id = tokenizer.pad_token_id

# 데이터셋 로딩 및 전처리

In [13]:
from datasets import Dataset

# 데이터셋 로딩
full_dataset = Dataset.from_json(path_or_paths=dataset_name)

# 학습용 2,800개/ 검증용 200개 분리
train_dataset = full_dataset.select(range(2800))
val_dataset = full_dataset.select(range(2800, 3000))

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Training dataset size: 2800
Validation dataset size: 200


# 학습설정

1. LoRA 설정

In [14]:
from peft import LoraConfig

peft_config = LoraConfig(
    r=8,  # 랭크 설정
    lora_alpha=32,  # LoRA 학습률 스케일링 계수
    target_modules=["q_proj", "v_proj"],  # 학습 대상 모듈
    lora_dropout=0.05,  # 드롭아웃 비율
    bias="none",  # 바이어스 설정
    task_type="CAUSAL_LM",  # 작업 유형 설정
)

2. 학습 설정

In [15]:
from trl import SFTConfig

sft_config = SFTConfig(
    output_dir=output_dir,  # 결과가 저장될 디렉터리
    per_device_train_batch_size=2,  # 각 GPU/장치당 학습 배치 크기
    gradient_accumulation_steps=4,  # gradient accumulation 단계 수
    max_steps=100,  # 학습 최대 단계 수
    optim="paged_adamw_32bit",  # 옵티마이저 설정
    logging_steps=20,  # 로깅 빈도
    save_strategy="steps",  # 체크포인트 저장 전략
    save_steps=100,  # 체크포인트 저장 빈도
    save_total_limit=2,  # 저장할 체크포인트 개수 제한
    max_seq_length=seq_length,  # 입력 시퀀스의 최대 길이
    report_to="none",  # 로깅 도구 설정
    run_name="llama-fine-tuning"  # 실험 이름 설정
)

3. Trainer 설정

In [20]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=base_model,  # 학습할 모델
    train_dataset=train_dataset,  # 학습 데이터셋
    eval_dataset=val_dataset,  # 검증 데이터셋
    peft_config=peft_config,  # LoRA 설정
    tokenizer=tokenizer,  # 토크나이저
    args=sft_config  # 학습 설정
)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


4. 모델 학습 시작

In [25]:
trainer.train()

Step,Training Loss
20,5.9756
40,5.7875
60,5.6313
80,5.4694
100,5.4418


TrainOutput(global_step=100, training_loss=5.661101150512695, metrics={'train_runtime': 95.179, 'train_samples_per_second': 8.405, 'train_steps_per_second': 1.051, 'total_flos': 148308739645440.0, 'train_loss': 5.661101150512695, 'epoch': 0.2857142857142857})

# 모델 검증 및 평가

In [26]:
from tqdm import tqdm

# 검증 데이터셋을 사용하여 모델 출력 생성
generated_texts = []
for sample in tqdm(val_dataset):
    input_text = sample["text"]
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    outputs = base_model.generate(**inputs, max_new_tokens=50)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_texts.append(generated_text)

  0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 1/200 [00:05<18:22,  5.54s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 2/200 [00:12<20:29,  6.21s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 3/200 [00:16<17:39,  5.38s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 4/200 [00:21<17:03,  5.22s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▎         | 5/200 [00:25<15:49,  4.87s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 6/200 [00:28<13:51,  4.28s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▎         | 7/200 [00:32<13:06,  4.08s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 8/200 [00:36<12:29,  3.91s/it]Setting `pad_toke

sacrebleu 라이브러리 설치

In [27]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.10.1 sacrebleu-2.4.3


BLEU 스코어 계산

In [29]:
import sacrebleu

# BLEU 스코어 계산
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU score: {bleu.score}")

BLEU score: 0.9616295075150939
