In [1]:
# !pip install transformers
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# loss 2.8460 - > 0.2065

In [18]:
# # install Hugging Face Libraries
# !pip install "peft==0.2.0"
# !pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# # install additional dependencies needed for training
# !pip install rouge-score tensorboard py7zr
# !pip install loralib


[0m

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW, AutoTokenizer
from tqdm import tqdm
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
data = pd.read_csv('./data/train.csv')

In [4]:
# tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', eos_token='</s>')
tokenizer = PreTrainedTokenizerFast.from_pretrained('./skt2_augment_1_epoch/', eos_token='</s>')
model = GPT2LMHeadModel.from_pretrained("./skt2_augment_1_epoch/")

In [14]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): CastOutputToFloat(
    (0): Linear(in_features=768, out_features=51200, bias=False)
  )
)

In [20]:
# from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
# lora_config = LoraConfig(
#  r=16,
#  lora_alpha=32,
#  target_modules=["transformer"],
#  lora_dropout=0.05,
#  bias="none",
#  # task_type=TaskType.SEQ_2_SEQ_LM
# )
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

LORA_R = 256 # 512
LORA_ALPHA = 512 # 1024
LORA_DROPOUT = 0.05
# Define LoRA Config
lora_config = LoraConfig(
                 r = LORA_R, # the dimension of the low-rank matrices
                 lora_alpha = LORA_ALPHA, # scaling factor for the weight matrices
                 lora_dropout = LORA_DROPOUT, # dropout probability of the LoRA layers
                 bias="none",
                 task_type="CAUSAL_LM",
                 target_modules=["query_key_value"],
)
# prepare int-8 model for training
# model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = prepare_model_for_int8_training(model)
# initialize the model with the LoRA framework
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# model.print_trainable_parameters()


In [5]:
formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # 질문과 답변 쌍을 </s> token으로 연결
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)

644it [00:01, 373.45it/s]


In [6]:
# model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')


In [None]:
CFG = {
    'LR' : 2e-5, # Learning Rate
    'EPOCHS' : 30, # 학습 Epoch
}
model.to(device) # 모델을 GPU단으로 이동

# 모델 학습 설정
optimizer = AdamW(model.parameters(), lr=CFG['LR'])
model.train()

# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        # 데이터를 GPU단으로 이동
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # 진행률 표시줄에 평균 손실 업데이트
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # 에폭의 평균 손실을 출력
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")
    model.save_pretrained(f"./skt3_{epoch}_epoch")
    tokenizer.save_pretrained(f"./skt3_{epoch}_epoch")

Epoch 1 - Avg Loss: 0.1747: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6440/6440 [05:06<00:00, 20.98it/s]


Epoch 1/10, Average Loss: 0.1747176921249639


Epoch 2 - Avg Loss: 0.1664: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6440/6440 [05:03<00:00, 21.23it/s]


Epoch 2/10, Average Loss: 0.16635298613713395


Epoch 3 - Avg Loss: 0.1516:  86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 5557/6440 [04:23<00:43, 20.27it/s]