In [1]:
# 여기서는 기존 사전훈련된 bert 모델에 추가적으로 MLM 학습 시키는 과정을 설명한다.
# => 사전은 기존 사전에 단어가 추가된 사전을 이용한다.(추가된 사전은 별도로 만들어야 함)
#
# 참고 사이트 
# https://www.fatalerrors.org/a/further-pre-training-of-chinese-language-model-bert-roberta.html
# 소스는 : https://github.com/zhusleep/pytorch_chinese_lm_pretrain 에 run_language_model_bert.py 참조함
# MLM 다른 소스는 : https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c 참조 바람

import transformers
import numpy as np
import torch

from myutils import seed_everything, TextDatasetForNextSentencePrediction, GPU_info, mlogging
from transformers import BertTokenizer, BertModel, BertTokenizerFast, BertConfig, AutoModelWithLMHead, BertForMaskedLM

In [2]:
# 변수들 설정

# 훈련시킬 말뭉치(사전 만들때 동일한 말뭉치 이용)
input_corpus = "Korpora/kowikitext/kowikitext_20200920.train"

# 기존 사전훈련된 모델
input_model_path = "model/bert-multilingual-cased/"
# 기존 사전 + 추가된 사전 파일
vocab_file="Tokenizer/kowikitext_20200920.train_0216_false_speical/bert-multiligual-cased_add_kowikitext_20200920.train_0216_false.txt"
# 출력 모델 저장 경로
ouput_model_dir = "model/bmc_fpt_kowiki20200920.train_model_epoch10_226"

# 토큰활 할때 최대 길이 
token_max_len = 130

# 훈련용 변수
batch_size = 32   # 64 하면, GPU Out of memory 발생함(=>**따라서 32 진행)
train_epochs = 10
 # embedding size 최대 몇 token까지 input으로 사용할 것인지 지정(기본:512) 512, 1024, 2048 식으로 지정함, 엄청난 장문을 다룰경우 10124까지
#max_position_embeddings = 128 
logging_steps = 10000  # 훈련시, 로깅할 step 수 (크면 10000번 정도하고, 작으면 100번정도)
save_steps = 50000     # 10000 step마다 모델 저장
save_total_limit = 2 # 마지막 3개 모델 빼고 과거 모델은 삭제(100000번째마다 모델 저장하는데, 마지감 3개 빼고 나머지는 삭제)

# NSP 관련 변수 (*여기서는 필요없음)
#NSP_block_size = 140

In [3]:
cuda = GPU_info()
print(cuda)

#seed 설정
seed_everything(111)

#logging 설정
logger =  mlogging(loggername="bertfpt", logfilname="bertfpt")

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:bertfpt_2022-02-26.log


In [4]:
# tokeinzier 생성
# tokenizer 생성
# => BertTokenizer, BertTokenizerFast 둘중 사용하면됨

tokenizer = BertTokenizer(vocab_file=vocab_file, 
                          max_len=token_max_len, 
                          do_lower_case=False)
'''
#tokenizer = BertTokenizerFast(vocab_speical_path)
tokenizer = BertTokenizerFast(
    vocab_file=vocab_file,
    max_len=token_max_len,
    do_lower_case=False,
    )
'''

# speical 토큰 계수 + vocab 계수 - 이미 vocab에 포함된 speical 토큰 계수(5)
vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5 + 1
#vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5
print('special_token_size: {}, tokenizer.vocab_size: {}'.format(len(tokenizer.all_special_tokens), tokenizer.vocab_size))
print('vocab_size: {}'.format(vocab_size))
print('tokenizer_len: {}'.format(len(tokenizer)))

special_token_size: 5, tokenizer.vocab_size: 143772
vocab_size: 143773
tokenizer_len: 143772


In [5]:
# 모델 생성
#output_hidden_states = False # 기본은 False=>output 2개 출력됨, True로 지정하면 output이 3개 출력됨
#return_dict = True   #False로 지정하는 경우 일반적인 tuple을 리턴, True인 경우는 transformers.file_utils.ModelOutput(ouput.logisc) 으로 리턴
#model = BertModel.from_pretrained(input_model_path, output_hidden_states = output_hidden_states, return_dict = return_dict)

# AutoModelWithLMHead 대신에 -> AutoModelForMaskedLM, BertForMaskedLM 해도됨
#=>원래 AutoModelWithLMHead 하면 내부적으로 BertForMaskedLM 가 로딩됨

if input_model_path:
    # further pre-training 인 경우 (기존 거에 추가적으로 하는 경우)
    config = BertConfig.from_pretrained(input_model_path)
    
    model = AutoModelWithLMHead.from_pretrained(input_model_path,
                                                from_tf=bool(".ckpt" in input_model_path),
                                                config=config) 
    print('further pre-training')
else:
    # Training new model from scratch 인 경우 (완전 새롭게 모델을 만드는 경우)
    model = AutoModelWithLMHead.from_config(config)
    print('Training new model from scratch')
 
#################################################################################
# 모델 embedding 사이즈를 tokenizer 크기 만큼 재 설정함.
# 재설정하지 않으면, 다음과 같은 에러 발생함
# CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)` CUDA 에러가 발생함
#  indexSelectLargeIndex: block: [306,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
#
#     해당 오류는 기존 Embedding(8002, 768, padding_idx=1) 처럼 입력 vocab 사이즈가 8002인데,
#     0~8001 사이를 초과하는 word idx 값이 들어가면 에러 발생함.
#################################################################################
model.resize_token_embeddings(len(tokenizer))

# 훈련모드로 변경(평가모드 : model.eval())
model.train()



Some weights of the model checkpoint at model/bert-multilingual-cased/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


further pre-training


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(143772, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          

In [6]:
model.num_parameters()

196603548

In [7]:
# MLM(Markup Language Model), NSP(Next Sentence Prediction) 구성
from transformers import DataCollatorForLanguageModeling, TextDataset
from bwpdataset import MyTextDataset

'''
# NSP 만들기
train_dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path=input_corpus,
    block_size=NSP_block_size,
    overwrite_cache=False,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)
'''

# further-pretrain 일때는 일단 NSP는 입력안함.
# => 따라서 입력 corpus에 대해, NSP Dataset 이 아니라, TextDataset 으로 만듬
train_dataset = MyTextDataset(tokenizer=tokenizer, file_path=input_corpus, block_size=token_max_len, overwrite_cache=True)

# NSP 출력 해보기
print(train_dataset)
count = 0
for example in train_dataset:
    token_str = [[tokenizer.convert_ids_to_tokens(s) for s in example.tolist()]]
    print(token_str)
    print('count:{}=>{}'.format(count,example))
    count +=1
    if count > 5:
        break

2022-02-26 16:09:05,287 - bwpdataset - INFO - Creating features from dataset file at Korpora/kowikitext
2022-02-26 16:09:05,288 - bwpdataset - INFO - ==>[Start] file read: Korpora/kowikitext/kowikitext_20200920.train


logfilepath:bwdataset_2022-02-26.log


2022-02-26 16:09:16,855 - bwpdataset - INFO - <==[End] file read: Korpora/kowikitext/kowikitext_20200920.train
2022-02-26 16:09:16,858 - bwpdataset - INFO - ==>[Start] tokenizer convert_tokens_to_ids..wait max 30minute...
2022-02-26 16:47:15,487 - bwpdataset - INFO - <==[End] tokenizer convert_tokens_to_ids
2022-02-26 16:47:15,496 - bwpdataset - INFO - ==>[Start] tokenizer


  0%|          | 0/2957673 [00:00<?, ?it/s]

2022-02-26 16:48:14,074 - bwpdataset - INFO - ==>[End] tokenizer
2022-02-26 16:48:14,077 - bwpdataset - INFO - ==>[Start] cached file create: Korpora/kowikitext/cached_lm_BertTokenizer_128_kowikitext_20200920.train
2022-02-26 16:48:23,380 - bwpdataset - INFO - <==[End] Saving features into cached file Korpora/kowikitext/cached_lm_BertTokenizer_128_kowikitext_20200920.train [took 9.303 s]


<bwpdataset.MyTextDataset object at 0x7fdc59f812e0>
[['[CLS]', '=', '분류', ':', '중화', '##인', '##민', '##공화국', '##의', '외교부', '##장', '=', '외교부', '##장', '외교부', '##장', '=', '분류', ':', '헝가리', '##의', '공원', '=', '공원', '공원', '=', '김세', '##권', '=', '김세', '##권', '(', '1931', '##년', '~', ',', '金', '世', '權', ')', '은', '제', '##16', '##대', '서울', '##고', '##등', '##검', '##찰청', '검사장', '##을', '역임', '##한', '법조인', '##이다', '.', '=', '=', '생애', '=', '=', '1931', '##년', '서울시', '##에서', '태어나', '경기', '##중', '##학교', ',', '1982년', '4월', '12일', '##자', '매일', '##경제', '서울', '##고', '##등학교', ',', '1981년', '4월', '25일', '##자', '동아일보', '1956', '##년', '서울대', '##학교', '법학', '##과', '##를', '나온', '후', '1956', '##년', '제', '##8', '##회', '고등', '##고시', '사법', '##과', '##에서', '합격', '##하였다', '.', '1958', '##년', '서울', '##지', '##방', '##검', '##찰청', '검사', '##에', '임용', '##되었다', '.', '김세', '##권', '##은', '두산', '##그룹', '창업', '##주', '##인', '박', '##두', '[SEP]']]
count:0=>tensor([   101,    134, 119564,    131, 120053,  12030,  36553, 100084,  10459

In [1]:
# MLM 만들기
data_collator = DataCollatorForLanguageModeling(    # [MASK] 를 씌우는 것은 저희가 구현하지 않아도 됩니다! :-)
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
'''
# MLM 출력 해보기
for example in train_dataset:
    print(type(example))
    mlm_sample = data_collator(example['input_ids'])
    token_str = [[tokenizer.convert_ids_to_tokens(s) for s in mlm_sample['input_ids']]]
    print(token_str)
    print('count:{}=>{}'.format(count,mlm_sample))
    count +=1
    if count > 5:
        break
'''        
# MLM 출력 해보기
mlm_sample = data_collator(train_dataset.examples[2:3])
print(type(mlm_sample))
print(mlm_sample.keys())
token_str = [[tokenizer.convert_ids_to_tokens(s) for s in mlm_sample['input_ids']]]
print(token_str)
print(mlm_sample)


NameError: name 'DataCollatorForLanguageModeling' is not defined

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=ouput_model_dir,
    overwrite_output_dir=True,
    num_train_epochs=train_epochs,
    per_gpu_train_batch_size=batch_size,
    save_steps=save_steps,    # step 수마다 모델을 저장
    save_total_limit=save_total_limit, # 마지막 두 모델 빼고 과거 모델은 삭제
    logging_steps=logging_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  #MLM(Masked Language Model)
    train_dataset=train_dataset   #TEXT 혹은 NSP(Next Setence Predictions)
)

In [None]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 2957673
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 924280


Step,Training Loss
10000,2.9421
20000,2.4133
30000,2.2532
40000,2.1657
50000,2.1024
60000,2.0571
70000,2.0201
80000,1.9892
90000,1.965
100000,1.9344


Saving model checkpoint to model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-50000
Configuration saved in model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-50000/config.json
Model weights saved in model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-50000/pytorch_model.bin
Saving model checkpoint to model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-100000
Configuration saved in model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-100000/config.json
Model weights saved in model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-100000/pytorch_model.bin
Saving model checkpoint to model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-150000
Configuration saved in model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-150000/config.json
Model weights saved in model/bmc_fpt_kowiki20200920.train_model_epoch10_226/checkpoint-150000/pytorch_model.bin
Deleting older checkpoint [model/bmc_fpt_kowiki20200920

In [None]:
# 학습한 모델 저장
trainer.save_model(ouput_model_dir)

In [None]:
# tokeinizer 파일 저장(vocab)
tokenizer.save_pretrained(ouput_model_dir)