In [1]:
# !pip install git+https://github.com/haven-jeon/PyKoSpacing.git
# !pip install git+https://github.com/ssut/py-hanspell.git


In [2]:
# !pip install transformers
# !pip install accelerate

import os
# Expose only the GPU with index 1 to this process. This is set here, before
# torch is imported in the following cell, so that it is already in the
# environment when CUDA gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
import pandas as pd
import numpy as np
import torch
import random

from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW, AutoTokenizer
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset # 데이터를 모델에 사용할 수 있도록 정리해 주는 라이브러리

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Training hyper-parameters, gathered in one place so they can be tuned here.
CFG = dict(
    LR=5e-6,        # learning rate handed to the optimizer
    EPOCHS=15,      # number of passes over the training set
    BATCH_SIZE=1,   # samples per DataLoader batch
    # AUG_RATIO=0.15,  (disabled)
    AUG_PROB=0.5,   # NOTE(review): not read by any code visible in this notebook
)

In [5]:
# Load the training table and discard the 'id' and 'category' columns; only the
# question (질문_*) and answer (답변_*) text columns are read by HansolDataset.
data_df = pd.read_csv('./data/train.csv').drop(columns=['id', 'category'])


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal LM from the same local checkpoint directory.
# '</s>' is registered as the end-of-sequence token; HansolDataset uses
# tokenizer.eos_token to separate each question from its answer.
# NOTE(review): max_length / truncation are passed at load time here, while the
# later tokenizer.encode(...) call passes neither -- confirm whether truncation
# is actually applied to the training samples.
tokenizer = AutoTokenizer.from_pretrained("./alpha_4_29_epoch/", eos_token='</s>', max_length=4000, truncation=True)
model = AutoModelForCausalLM.from_pretrained("./alpha_4_29_epoch/")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
import random


class HansolDataset(Dataset):
    """Tokenised question/answer training samples with concatenation augmentation.

    On construction the input table is extended with one augmented row per
    original row (see ``_augment_question``). Every row of the extended table is
    then expanded into all question x answer combinations, and each combination
    is encoded as ``question + eos + answer + eos``.

    Args:
        data_df: DataFrame holding the text columns listed in
            ``QUESTION_COLUMNS`` and ``ANSWER_COLUMNS``. It is not modified.
        tokenizer: Object exposing ``eos_token`` (str) and
            ``encode(text, return_tensors='pt')``.

    Attributes are documented inline in ``__init__``.
    """

    QUESTION_COLUMNS = ('질문_1', '질문_2')
    ANSWER_COLUMNS = ('답변_1', '답변_2', '답변_3', '답변_4', '답변_5')

    def __init__(self, data_df, tokenizer):
        self.tokenizer = tokenizer
        # Extended table (original rows followed by augmented rows).
        self.data_df = None
        # One encoded sample per (row, question column, answer column).
        self.sentence_list = []
        self._augment_question(data_df)
        self._generate_data()

    def __getitem__(self, index):
        """Return the encoded sample stored at ``index``."""
        return self.sentence_list[index]

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.sentence_list)

    def _augment_question(self, data_df):
        """Append one augmented row per original row and store it in ``self.data_df``.

        For the i-th original row a partner row is drawn uniformly at random and
        every text column becomes ``own_text + ' ' + partner_text``. The partner
        pool contains the original rows plus the augmented rows created so far,
        so a partner may itself be an augmented row (this matches the previous
        behaviour of drawing from the growing table).

        Rows are addressed by position, so the index labels of ``data_df`` do
        not matter, and the new rows are collected first and concatenated once
        instead of re-allocating the DataFrame on every iteration.
        """
        text_columns = self.QUESTION_COLUMNS + self.ANSWER_COLUMNS
        n_original = len(data_df)

        # Per-column value pools; augmented values are appended as they are made.
        pool = {col: data_df[col].tolist() for col in text_columns}

        for position in range(n_original):
            # Current pool size is n_original + position (one row added per step).
            partner = random.choice(range(n_original + position))
            for col in text_columns:
                values = pool[col]
                values.append(values[position] + ' ' + values[partner])

        if n_original == 0:
            # Nothing to augment; keep an independent copy of the (empty) input.
            self.data_df = data_df.copy(deep=True)
            return

        augmented = pd.DataFrame(
            {col: pool[col][n_original:] for col in text_columns}
        )
        self.data_df = pd.concat([data_df, augmented], ignore_index=True)

    def _generate_data(self):
        """Encode every question/answer combination of ``self.data_df``.

        Uses the tokenizer passed to the constructor (not a module-level
        global), so the dataset works with whichever tokenizer it was given.
        """
        eos = self.tokenizer.eos_token
        for _, row in self.data_df.iterrows():
            for q_col in self.QUESTION_COLUMNS:
                for a_col in self.ANSWER_COLUMNS:
                    # Join question and answer with the EOS token and terminate with it.
                    input_text = row[q_col] + eos + row[a_col] + eos
                    # NOTE(review): no truncation is requested here; confirm that
                    # the longer augmented samples fit the model's context window.
                    input_ids = self.tokenizer.encode(input_text, return_tensors='pt')
                    self.sentence_list.append(input_ids)
                    


In [8]:
# Freeze part of the network before fine-tuning: inside the "gpt_neox"
# sub-module, turn off gradients for the "embed_in" and "emb_dropout" children
# and for every child of "layers" whose numeric name is below 26. All other
# parameters keep their current requires_grad setting.
for top_name, top_module in model.named_children():
    if top_name != "gpt_neox":
        continue
    for part_name, part_module in top_module.named_children():
        if part_name in ("embed_in", "emb_dropout"):
            for weight in part_module.parameters():
                weight.requires_grad = False
        elif part_name == "layers":
            for layer_name, layer_module in part_module.named_children():
                if int(layer_name) >= 26:
                    continue
                for weight in layer_module.parameters():
                    weight.requires_grad = False
            

In [None]:

model.to(device) # Move the model to the selected device

# Training setup: the optimizer is handed every model parameter (including the
# ones whose requires_grad was switched off earlier), with the learning rate
# from CFG. The model is put into training mode once, before the epoch loop.
optimizer = AdamW(model.parameters(), lr=CFG['LR'])
model.train()

# Training: one pass over a freshly built dataset per epoch.
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    # The dataset is rebuilt every epoch, so the random augmentation performed
    # inside HansolDataset is re-drawn each time.
    dataset = HansolDataset(data_df = data_df, tokenizer=tokenizer)
    dataloader = DataLoader(dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True) # Serve the samples as shuffled mini-batches
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for batch_idx, batch in progress_bar:
        # Take the first element of the batch and move it to the device.
        # NOTE(review): presumably this strips the batch axis added by the
        # DataLoader, which only keeps every sample when BATCH_SIZE is 1 --
        # confirm before raising BATCH_SIZE.
        batch = batch[0]
        batch = batch.to(device)
        # The input ids double as the labels for the language-model loss.
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Show the running average loss of this epoch on the progress bar.
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # Print the epoch's average loss, then checkpoint the model and tokenizer.
    # Note that the directory name uses the zero-based epoch index, while the
    # log messages use epoch+1.
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(dataloader)}")
    model.save_pretrained(f"./alpha_aug2_{epoch}_epoch")
    tokenizer.save_pretrained(f"./alpha_aug2_{epoch}_epoch")

Epoch 1 - Avg Loss: 0.2368:   4%|█████▋                                                                                                                                     | 527/12880 [01:58<41:22,  4.98it/s]

In [None]:
# (intentionally empty)