#### 1. 라이브러리 불러오기

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

import time
import psutil
import copy
import gc

In [2]:
print(torch.cuda.is_available())

True


#### 2. 기본 모델 불러오기

In [3]:
model_name = "bert-base-uncased"
model_original = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# 깊은 복사
model_original_copy = copy.deepcopy(model_original)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 3. LoRA 설정

In [4]:
# 자세한 매개변수 설명은 아래 링크 참조
# https://huggingface.co/docs/peft/v0.14.0/en/package_reference/lora#peft.LoraConfig 
# https://huggingface.co/docs/peft/package_reference/peft_types

lora_config = LoraConfig(
    r=4,
    # Transformer architecture에서 query, key, value, output 행렬 중 어떤 행렬에 대해서 LoRA를 적용할 것인지 선택
    target_modules=["query", "value"],
    task_type=TaskType.SEQ_CLS,
    lora_alpha=32,
    lora_dropout=0.05
)

model_lora = get_peft_model(model=model_original_copy, peft_config=lora_config)

model_lora.print_trainable_parameters()

trainable params: 148,994 || all params: 109,632,772 || trainable%: 0.1359


In [5]:
model_original

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
model_lora

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (defaul

In [7]:
print("Base Model Trainable Params:", sum(p.numel() for p in model_original.parameters() if p.requires_grad))
print("LoRA Model Trainable Params:", sum(p.numel() for p in model_lora.parameters() if p.requires_grad))

Base Model Trainable Params: 109483778
LoRA Model Trainable Params: 148994


#### 4. 데이터셋 불러오기

In [8]:
# 텍스트 분류 (Text Classification)
# IMDb: 영화 리뷰 감성 분석 (긍정 / 부정)
dataset = load_dataset("imdb")


# 데이터 전처리 함수
def preprocess_function(tokenizer, data):
    return tokenizer(data["text"], padding="max_length", truncation=True)

# 3000개만 사용
train_dataset = dataset["train"].select(range(3000)).map(
    lambda data: preprocess_function(tokenizer, data), batched=True)

eval_dataset = dataset["test"].select(range(3000)).map(
    lambda data: preprocess_function(tokenizer, data), batched=True)

#### 5. 학습

In [9]:
# 1. 기본 모델 bert-base-uncased
start_time_base = time.time()

training_args_base = TrainingArguments(
    output_dir="./results/base",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs/base",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    seed=2024,
)

trainer_base = Trainer(
    model=model_original,
    args=training_args_base,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer_base.train()

end_time_base = time.time()
time_base = end_time_base - start_time_base

print(f"Base Model Training Time: {time_base} seconds")

  trainer_base = Trainer(


  0%|          | 0/376 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 1.1920928244535389e-07, 'eval_runtime': 186.1839, 'eval_samples_per_second': 16.113, 'eval_steps_per_second': 1.01, 'epoch': 1.0}


  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 1.1920928244535389e-07, 'eval_runtime': 185.5646, 'eval_samples_per_second': 16.167, 'eval_steps_per_second': 1.013, 'epoch': 2.0}
{'train_runtime': 3332.3809, 'train_samples_per_second': 1.801, 'train_steps_per_second': 0.113, 'train_loss': 0.012369127983742572, 'epoch': 2.0}
Base Model Training Time: 3332.8805775642395 seconds


In [10]:
results_base = trainer_base.evaluate()

print("기본 모델 성능:")
print(results_base)

  0%|          | 0/188 [00:00<?, ?it/s]

기본 모델 성능:
{'eval_loss': 1.1920928244535389e-07, 'eval_runtime': 189.5093, 'eval_samples_per_second': 15.83, 'eval_steps_per_second': 0.992, 'epoch': 2.0}


In [11]:
# 학습 후 메모리 비우기

del trainer_base
gc.collect()
torch.cuda.empty_cache()

In [12]:
# 2. LoRA 적용 모델
start_time_lora = time.time()

training_args_lora = TrainingArguments(
    output_dir="./results/LoRA",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs/LoRA",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    seed=2024,
)

trainer_lora = Trainer(
    model=model_lora,
    args=training_args_lora,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer_lora.train()

end_time_lora = time.time()
time_lora = end_time_lora - start_time_lora

print(f"LoRA Model Training Time: {time_lora} seconds")

  trainer_lora = Trainer(


  0%|          | 0/376 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 1.1761981966174062e-07, 'eval_runtime': 202.1309, 'eval_samples_per_second': 14.842, 'eval_steps_per_second': 0.93, 'epoch': 1.0}


  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 1.1281171197197182e-07, 'eval_runtime': 202.7794, 'eval_samples_per_second': 14.794, 'eval_steps_per_second': 0.927, 'epoch': 2.0}
{'train_runtime': 1168.244, 'train_samples_per_second': 5.136, 'train_steps_per_second': 0.322, 'train_loss': 0.0018770395124212225, 'epoch': 2.0}
LoRA Model Training Time: 1168.7037088871002 seconds


In [13]:
results_lora = trainer_lora.evaluate()

print("LoRA 모델 성능:")
print(results_lora)

  0%|          | 0/188 [00:00<?, ?it/s]

LoRA 모델 성능:
{'eval_loss': 1.1281171197197182e-07, 'eval_runtime': 197.7231, 'eval_samples_per_second': 15.173, 'eval_steps_per_second': 0.951, 'epoch': 2.0}
