<a href="https://colab.research.google.com/github/hanghae-plus-AI/AI-1-jhyeon-kim/blob/main/241107/%EC%8B%AC%ED%99%94%EA%B3%BC%EC%A0%9C/%ED%95%9C%EA%B5%AD%EC%96%B4_%EA%B0%90%EC%A0%95_%EB%B6%84%EB%A5%98_%EB%AA%A8%EB%8D%B8_%ED%8C%8C%EC%9D%B8%ED%8A%9C%EB%8B%9D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# -*- coding: utf-8 -*-
!pip install transformers datasets torch pandas scikit-learn peft wandb

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

file_path = '/content/drive/MyDrive/sentiment_classification_projects_241014/korean_text_sentiment_classification/datasets/감성대화말뭉치_training.csv'

# CSV 파일 읽기
df = pd.read_csv(file_path)
print(f"Number of rows: {len(df)}")

# 데이터셋 준비 및 토큰화 함수 정의
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# 학습/검증 데이터 분할 (80% train, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# HuggingFace Dataset 형식으로 변환
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# label_int를 labels로 변경
train_dataset = train_dataset.rename_column("label_int", "labels")
val_dataset = val_dataset.rename_column("label_int", "labels")

tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)

# 데이터셋 토큰화 함수 정의
def tokenize_function(examples):
    return tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=128)

# 학습 및 검증 데이터셋 토큰화
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 필요한 열만 남기기
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

print(train_dataset[0])  # 첫 번째 샘플 확인
print(val_dataset[0])    # 첫 번째 샘플 확인


# wandb 설정
import wandb
wandb.login()

# 평가 지표 함수 정의
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        "accuracy": accuracy,
        "f1": f1
    }

# 메모리 사용량 로깅
def log_memory_usage():
    mem_used = torch.cuda.memory_allocated() / 1024**3  # GB 단위로 변환
    wandb.log({"memory_usage": mem_used})

# 1. KoBERT 기본 모델 학습
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# wandb 설정 (base_model)
wandb.init(project="korean_emotion_classification", name="base_model")

# KoBERT 모델 초기화 (기본 모델)
base_model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=5).to('cuda')

training_args = TrainingArguments(
    output_dir='./results_base',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
log_memory_usage()

wandb.finish()  # base_model 로그 세션 종료

# 2. LoRA 적용 경량화 모델 학습 (기본 모델 초기화)
from peft import LoraConfig, get_peft_model

# wandb 설정 (lora_model)
wandb.init(project="korean_emotion_classification", name="lora_model")

# KoBERT 모델 다시 초기화하여 독립적으로 시작
fresh_model_for_lora = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=5).to('cuda')

# LoRA 설정
lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["query", "value"])
lora_model = get_peft_model(fresh_model_for_lora, lora_config).to('cuda')

training_args_lora = TrainingArguments(
    output_dir='./results_lora',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",
)

trainer_lora = Trainer(
    model=lora_model,
    args=training_args_lora,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer_lora.train()
trainer_lora.evaluate()
log_memory_usage()

wandb.finish()  # lora_model 로그 세션 종료


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of rows: 51630




Map:   0%|          | 0/41304 [00:00<?, ? examples/s]

Map:   0%|          | 0/10326 [00:00<?, ? examples/s]



{'labels': tensor(0), 'input_ids': tensor([   2, 3419, 4949, 5118, 6903, 4628, 7096, 1370, 6116,  517, 5330, 5907,
        5439, 2339, 5561, 2135, 7088,  517, 6483, 6060, 6553,  517, 5499, 7350,
        5671, 6855,   54, 3419, 4949, 6903, 4102, 5112, 5655, 6604, 5405, 6857,
          54, 3322, 5411, 7010, 5023, 5931, 6150, 1016, 6003, 6553, 4998,   54,
        4949,  517, 5330, 5561, 3059,   54, 4299, 2692, 2346, 6727, 2872, 3860,
        2270, 7096, 3854, 5655, 6999,  258, 1101, 6087, 8000, 1618, 7836, 6632,
        1698, 1618, 5804, 7784, 5023, 7788, 2095, 6705, 7811, 1100, 6797, 7340,
        3164,  258, 1607, 7136, 6559, 6542, 5778, 5468, 2434, 6897, 1718, 4483,
        7806, 4924, 6705, 5760, 5512, 6999,   54,    3,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1

0,1
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▁▂▃▃▃▃▄▅▅▅▆▆▇▇▇█
train/global_step,▁▁▂▃▃▃▃▄▅▅▅▆▆▇▇▇█
train/grad_norm,▁▁▃▂▃▂▂▂▂█▄▂▂▆▅
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/loss,█▅▄▂▃▃▂▁▁▂▂▁▂▄█

0,1
eval/runtime,18.9169
eval/samples_per_second,545.861
eval/steps_per_second,34.149
train/epoch,2.90473
train/global_step,7500.0
train/grad_norm,1.01066
train/learning_rate,0.0
train/loss,0.393


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6029,0.569676,0.786752,0.778926
2,0.5042,0.565784,0.793918,0.787925
3,0.4262,0.579104,0.794596,0.791191


0,1
eval/accuracy,▁▇██
eval/f1,▁▆██
eval/loss,▃▁██
eval/runtime,▂▁█▂
eval/samples_per_second,▇█▁▇
eval/steps_per_second,▇█▁▇
memory_usage,▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇████
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇█████
train/grad_norm,▃▃▃▁▅▃▃▁▃▃▄▅▁▅█

0,1
eval/accuracy,0.7946
eval/f1,0.79119
eval/loss,0.5791
eval/runtime,18.1244
eval/samples_per_second,569.73
eval/steps_per_second,35.643
memory_usage,1.43081
total_flos,8150873817249792.0
train/epoch,3.0
train/global_step,7746.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.2,No log
2,1.1378,No log
3,1.111,No log


0,1
eval/runtime,▂▁█▁
eval/samples_per_second,▇█▁█
eval/steps_per_second,▇█▁█
memory_usage,▁
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇████
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇█████
train/grad_norm,▂▁▂▂▂▆▆▂▄▃█▃▃▆▃
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/loss,█▅▅▄▃▃▂▂▂▂▂▁▁▁▁

0,1
eval/runtime,18.8747
eval/samples_per_second,547.082
eval/steps_per_second,34.226
memory_usage,1.78126
total_flos,8178938945501184.0
train/epoch,3.0
train/global_step,7746.0
train/grad_norm,1.30427
train/learning_rate,0.0
train/loss,1.111


In [7]:
base_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [8]:
lora_model

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(8002, 768, padding_idx=1)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bi

In [9]:
# Calculate the total number of parameters
params_base_model = sum(p.numel() for p in base_model.parameters())
params_lora_model = sum(p.numel() for p in lora_model.parameters())


In [10]:
params_base_model, params_lora_model

(92190725, 92485637)