In [1]:
import transformers
import numpy as np
import torch

from myutils import seed_everything, TextDatasetForNextSentencePrediction, GPU_info, mlogging

logfilepath:bwdataset_2022-03-03.log


In [2]:
# 변수들 설정
input_corpus = "Korpora/kowikitext/kowikitext_20200920.train"
ouput_model_dir = "model/model_kowikitext_0207"
vocab_speical_path = "Tokenizer/kowikitext_20200920_speical"

# 훈련용 변수
batch_size = 64   # 128로 하면, GPU Out of memory 발생함(=>**따라서 64로 진행)
train_epochs = 5
 # embedding size 최대 몇 token까지 input으로 사용할 것인지 지정(기본:512) 512, 1024, 2048 식으로 지정함, 엄청난 장문을 다룰경우 10124까지
max_position_embeddings = 256 
logging_steps = 50000  # 훈련시, 로깅할 step 수 (크면 10000번 정도하고, 작으면 100번정도)
save_steps = 100000     # 10000 step마다 모델 저장
save_total_limit = 3 # 마지막 3개 모델 빼고 과거 모델은 삭제(100000번째마다 모델 저장하는데, 마지감 3개 빼고 나머지는 삭제)

# NSP 관련 변수
NSP_block_size = 140

In [3]:
cuda = GPU_info()
print(cuda)

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0


In [4]:
#logging 설정
logger = mlogging(loggername="bertpt", logfilname="bertpt")

In [5]:
#seed 설정
seed_everything(111)

In [6]:
# 토큰(special) 경로 폴더 설정 하여 불러옴
from transformers import BertTokenizerFast

vocab_special_path = vocab_speical_path
tokenizer_check = BertTokenizerFast.from_pretrained(vocab_special_path)

print('check special tokens : %s'%tokenizer_check.all_special_tokens)
#print('speical tokens size : {}'.format(tokenizer_check.all_special_tokens
print('vocab size : %d' % tokenizer_check.vocab_size)
tokenized_input_for_pytorch = tokenizer_check("나는 오늘 아침 밥을 먹었다.", return_tensors="pt")
print("Tokens (str)      : {}".format([tokenizer_check.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int)      : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

check special tokens : ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]', '[BOS]', '[EOS]', '[UNK0]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]', '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]']
vocab size : 32000
Tokens (str)      : ['[CLS]', '나', '##는', '오늘', '아침', '밥', '##을', '먹', '##었', '##다', '.', '[SEP]']
Tokens (int)      : [2, 265, 1356, 2932, 3768, 502, 1207, 459, 1605, 1151, 17, 3]
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]



In [7]:
# speical 토큰 계수 + vocab 계수 - 이미 vocab에 포함된 speical 토큰 계수(5)
vocab_size = len(tokenizer_check.all_special_tokens) + tokenizer_check.vocab_size - 5
print('vocab_size: {}'.format(vocab_size))

vocab_size: 32022


In [8]:
# MLM(Markup Language Model), NSP(Next Sentence Prediction) 구성
from transformers import DataCollatorForLanguageModeling

# NSP 만들기
train_dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer_check,
    file_path=input_corpus,
    block_size=NSP_block_size,
    overwrite_cache=False,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)

2022-02-07 10:06:27,118 - test - INFO - *init=>File: Korpora/kowikitext/kowikitext_20200920.train, block_size:137, cached_features_file: Korpora/kowikitext/cached_nsp_BertTokenizerFast_140_kowikitext_20200920.train
2022-02-07 10:06:27,118 - test - INFO - *init=>File: Korpora/kowikitext/kowikitext_20200920.train, block_size:137, cached_features_file: Korpora/kowikitext/cached_nsp_BertTokenizerFast_140_kowikitext_20200920.train
2022-02-07 10:06:27,119 - test - INFO - Loading start cashed file Korpora/kowikitext/cached_nsp_BertTokenizerFast_140_kowikitext_20200920.train [starttime: 1644195987.120 s]=>wait 30 min...
2022-02-07 10:06:27,119 - test - INFO - Loading start cashed file Korpora/kowikitext/cached_nsp_BertTokenizerFast_140_kowikitext_20200920.train [starttime: 1644195987.120 s]=>wait 30 min...
2022-02-07 10:27:26,232 - test - INFO - Loading features from cached file Korpora/kowikitext/cached_nsp_BertTokenizerFast_140_kowikitext_20200920.train [took 1259.112 s]
2022-02-07 10:27:26,

In [9]:
# NSP 출력 해보기
for example in train_dataset.examples[0:1]:
    print(example)

{'input_ids': tensor([    2,    32,  1944,    29,  2926,  1407,  1303,  2888,  1236, 17196,
         1262,    32,     3,    32,    32,   687, 24475,    32,    32,     3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]), 'next_sentence_label': tensor(1)}


In [10]:
# MLM 만들기
data_collator = DataCollatorForLanguageModeling(    # [MASK] 를 씌우는 것은 저희가 구현하지 않아도 됩니다! :-)
    tokenizer=tokenizer_check, mlm=True, mlm_probability=0.15
)

In [11]:
# MLM 출력 해보기
print(data_collator(train_dataset.examples[0:2]))

{'input_ids': tensor([[    2,    32,  1944,    29,  2926,  1407,  1303,  2888,  1236, 17196,
          1262,    32,     3,    32,    32,  9134,     4,    32,    32,     3],
        [    2, 17196,  1262,     3,     5,     4,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'next_sentence_label': tensor([1, 1]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,   687, 24475,  -100,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  2210,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  

In [12]:
from transformers import BertConfig, BertForPreTraining

# BERT 껍데기 만들기
# 다양한 조건들을 설정해보자

config = BertConfig(    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig
    vocab_size=vocab_size, # default는 영어 기준이므로 내가 만든 vocab size에 맞게 수정해줘야 함
    # hidden_size=768, # hidden_layer 임베딩수(768)
    # num_hidden_layers=12,    # layer 수(base=12개, large=24개)
    # num_attention_heads=12,    # transformer attention head number
    # intermediate_size=3072,   # transformer 내에 있는 feed-forward network의 dimension size
    # hidden_act="gelu",
    # hidden_dropout_prob=0.1,
    # attention_probs_dropout_prob=0.1,
    max_position_embeddings=max_position_embeddings,    # embedding size 최대 몇 token까지 input으로 사용할 것인지 지정(기본:512) 512, 1024, 2048 식으로 지정함, 엄청난 장문을 다룰경우 10124까지
    # type_vocab_size=2,    # token type ids의 범위 (BERT는 segmentA, segmentB로 2종류)
    # pad_token_id=0,
    # position_embedding_type="absolute"
)

model = BertForPreTraining(config=config)
model.num_parameters()

111063320

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=ouput_model_dir,
    overwrite_output_dir=True,
    num_train_epochs=train_epochs,
    per_gpu_train_batch_size=batch_size,
    save_steps=save_steps,    # step 수마다 모델을 저장
    save_total_limit=save_total_limit, # 마지막 두 모델 빼고 과거 모델은 삭제
    logging_steps=logging_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  #MLM(Masked Language Model)
    train_dataset=train_dataset   #NSP(Next Setence Predictions)
)

In [14]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 10814774
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 844905


Step,Training Loss
50000,3.6749
100000,2.5905
150000,2.3486
200000,2.2085
250000,2.1125
300000,2.0426
350000,1.9839
400000,1.9294
450000,1.8893
500000,1.8536


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

TrainOutput(global_step=844905, training_loss=2.0498080405489376, metrics={'train_runtime': 278530.1621, 'train_samples_per_second': 194.14, 'train_steps_per_second': 3.033, 'total_flos': 3.834691455718223e+18, 'train_loss': 2.0498080405489376, 'epoch': 5.0})

In [15]:
trainer.save_model(ouput_model_dir)

Saving model checkpoint to model/model_kowikitext_0207
Configuration saved in model/model_kowikitext_0207/config.json
Model weights saved in model/model_kowikitext_0207/pytorch_model.bin
