In [1]:
#=====================================================================
# gpt2 모델을 새롭게 만드는 예제
#
# [과정]

# 1. Sentencepiece tokenizer 생성
# => 생성은 toeknzier/new_token.ipynb 참조
# => 여기서는 만들어진 tokenizer를 사용 함
#
# 2. 빈껍데기 GPT-2 모델 생성 
# => **반드시 위 vocab_size와 같은 크기로 word_embedding 사이즈 설정해야 함
#
# 3. 훈련
# => vocab 만들때 동일한 말뭉치를 훈련 데이터로 사용
#
# 4. 모델과 tokenizer 저장

#=====================================================================
import torch
import numpy as np
import pandas as pd
from transformers import GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast
from transformers import TextDataset

from tqdm.notebook import tqdm
import os
import time
from myutils import GPU_info, seed_everything, mlogging

device = GPU_info()
print(device)

#seed 설정
seed_everything(222)

#logging 설정
logger =  mlogging(loggername="gpt2-scratch", logfilename="../log/gpt2-scratch")

model_path = '../model/gpt-2/mymodel'
OUTPATH = '../model/gpt-2/mymodel_Trainer/'

# WandB Disable 시킴
os.environ["WANDB_DISABLED"] = "true"


logfilepath:../log/bwdataset_2022-05-18.log
logfilepath:../log/qnadataset_2022-05-18.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:../log/gpt2-scratch_2022-05-18.log


In [2]:
# 1. Sentencepiece tokenizer 로딩 
# => 반드시 bos_token, eos_token, unk_token, pad_token, mask_token 들은 tokenizer 생성할때 사용한 vocab을 지정해야 함
# => 생성은 toeknzier/new_token.ipynb 참조
# => 여기서는 만들어진 tokenizer를 사용 함
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path,
                                                   bos_token='<cls>',
                                                   eos_token='<eos>',
                                                   unk_token='<unk>',
                                                   pad_token='<pad>',
                                                   mask_token='<mask>')

vocab_size = len(tokenizer.get_vocab())
print(f'*vocab_size:{vocab_size}')

*vocab_size:10661


In [3]:
# tokenizer 테스트 
sentence = "오늘은 날씨가 좋다"
encode = tokenizer.encode(sentence)
print(encode)
decode = tokenizer.decode(encode)
print(decode)

[2448, 703, 7074, 120, 2309, 271]
오늘은 날씨가 좋다


In [4]:
# 2. 빈껍데기 GPT-2 모델 생성 
# => **반드시 위 vocab_size와 같은 크기로 word_embedding 사이즈 설정해야 함
configuration = GPT2Config(vocab_size=vocab_size)
model = GPT2LMHeadModel(config=configuration) 
model.to(device)
print(model.num_parameters())


94030080


In [5]:
# vocab_size가 잘 설정되었는지 모델 출력 확인
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(10661, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [None]:
'''
# vocab 만들때 동일한 말뭉치를 불러와서 최대 토큰 계수를 얻어옴
corpus_path = "../korpora/kowiki_20190620/wiki_20190620_small.txt"
all_sentences = []

with open(corpus_path, 'r', encoding='utf-8') as f:
      for line in tqdm(f):
            all_sentences.append(line.strip())  # strip() 계행문자 제거
            
print(all_sentences[10:15])
print(len(all_sentences))

# 최대 토큰 계수를 구함.
max_token_len = max([len(tokenizer.encode(s)) for s in tqdm(all_sentences)])
print(f'max_token_len:{max_token_len}')
''''

In [6]:
# dataset 생성
corpus_path = "../korpora/kowiki_20190620/wiki_20190620_small.txt"
max_token_len = 256

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=corpus_path,
    block_size=max_token_len,  #최대 토큰 계수
)



In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [8]:
from transformers import Trainer, TrainingArguments

train_epochs = 10
batch_size = 32
save_steps = 100     # 10000 step마다 모델 저장
logging_steps = 10  # 훈련시, 로깅할 step 수 (크면 10000번 정도하고, 작으면 100번정도)
save_total_limit = 1 # 마지막 1개 모델 빼고 과거 모델은 삭제(100000번째마다 모델 저장하는데, 마지감 3개 빼고 나머지는 삭제)

training_args = TrainingArguments(
    output_dir=OUTPATH,
    overwrite_output_dir=True,
    num_train_epochs=train_epochs,
    per_device_train_batch_size=batch_size,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    prediction_loss_only=True
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [10]:
trainer.train()

***** Running training *****
  Num examples = 954
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 300


Step,Training Loss
10,8.8345
20,8.477
30,8.3057
40,8.142
50,8.0176
60,7.9663
70,7.8709
80,7.8815
90,7.8709
100,7.8147


Saving model checkpoint to ../model/gpt-2/mymodel_Trainer/checkpoint-100
Configuration saved in ../model/gpt-2/mymodel_Trainer/checkpoint-100/config.json
Model weights saved in ../model/gpt-2/mymodel_Trainer/checkpoint-100/pytorch_model.bin
Saving model checkpoint to ../model/gpt-2/mymodel_Trainer/checkpoint-200
Configuration saved in ../model/gpt-2/mymodel_Trainer/checkpoint-200/config.json
Model weights saved in ../model/gpt-2/mymodel_Trainer/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [../model/gpt-2/mymodel_Trainer/checkpoint-100] due to args.save_total_limit
Saving model checkpoint to ../model/gpt-2/mymodel_Trainer/checkpoint-300
Configuration saved in ../model/gpt-2/mymodel_Trainer/checkpoint-300/config.json
Model weights saved in ../model/gpt-2/mymodel_Trainer/checkpoint-300/pytorch_model.bin
Deleting older checkpoint [../model/gpt-2/mymodel_Trainer/checkpoint-200] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.

TrainOutput(global_step=300, training_loss=7.822224731445313, metrics={'train_runtime': 99.5643, 'train_samples_per_second': 95.818, 'train_steps_per_second': 3.013, 'total_flos': 1246362992640000.0, 'train_loss': 7.822224731445313, 'epoch': 10.0})