In [1]:
# MLM 방식을 이용한 Further pre-training 방식 구현 예제
# 참고 소스 : https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c 참조 바람
import torch
import os

from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, BertTokenizerFast
from transformers import AdamW, get_linear_schedule_with_warmup
from myutils import GPU_info, seed_everything, mlogging

logfilepath:bwdataset_2022-03-24.log
logfilepath:qnadataset_2022-03-24.log


In [2]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Training corpus (the same corpus that was used when building the vocabulary).
input_corpus = "korpora/kowiki_20190620/wiki_20190620_mecab_false_0311.txt"

# Previously pre-trained checkpoint that training continues from.
model_path = "model/bert/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0325/100000/"

# Tokenizer files (original vocab + added vocab). They are read from the same
# directory as the model checkpoint, so the path is shared.
# An earlier standalone vocab file, kept here for reference only:
#   tokenizer/wiki_20190620_false_0311_speical/bmc_add_wiki_20190620_false_0311.txt
vocab_path = model_path

# Output directory for checkpoints and the final model.
OUTPATH = 'model/bert/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327/'

# ---------------------------------------------------------------------------
# Data settings
# ---------------------------------------------------------------------------
token_max_len = 128   # maximum sequence length passed to the tokenizer and dataset
batch_size = 32       # DataLoader batch size

# ---------------------------------------------------------------------------
# Runtime setup (project helpers from myutils)
# ---------------------------------------------------------------------------
# GPU_info() returns the device the model and batches are later moved to.
device = GPU_info()
print(device)

# Seed helper from myutils, called with a fixed seed (presumably for reproducibility — confirm in myutils).
seed_everything(222)

# Logger used for the training-loss and checkpoint messages.
logger = mlogging(loggername="bertfpt3", logfilename="bertfpt3")

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:bertfpt3_2022-03-24.log


In [3]:
# Build the tokenizer from the saved tokenizer directory.
# BertTokenizerFast could be used instead of BertTokenizer.
# `model_max_length` is the documented keyword for the maximum input length;
# `max_len` is only a legacy fallback and is ignored when the saved tokenizer
# config already carries a `model_max_length` entry.
tokenizer = BertTokenizer.from_pretrained(vocab_path, model_max_length=token_max_len, do_lower_case=False)

# Hand-made estimate of the vocabulary size, printed for inspection only:
# special tokens + base vocab - special tokens already counted in the vocab
# (the original note says 5) + 1.
# NOTE(review): this estimate need not equal len(tokenizer); the embedding
# resize below deliberately uses len(tokenizer), not this value.
vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5 + 1
print('special_token_size: {}, tokenizer.vocab_size: {}'.format(len(tokenizer.all_special_tokens), tokenizer.vocab_size))
print('vocab_size: {}'.format(vocab_size))
print('tokenizer_len: {}'.format(len(tokenizer)))

# Load the checkpoint to continue (further) pre-training from.
model = BertForMaskedLM.from_pretrained(model_path)

#################################################################################
# Resize the model's token embeddings to the tokenizer size.
# Without this, token ids beyond the embedding table trigger CUDA errors such as:
#   CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
#   indexSelectLargeIndex: ... Assertion `srcIndex < srcSelectDimSize` failed.
# Example: with Embedding(8002, 768, padding_idx=1) only ids 0..8001 are valid;
# any larger word index raises the assertion above.
#################################################################################
model.resize_token_embeddings(len(tokenizer))

# Move the model to the configured device; as the last expression of the cell
# this also displays the model architecture.
model.to(device)

special_token_size: 27, tokenizer.vocab_size: 167537
vocab_size: 167560
tokenizer_len: 167550


Some weights of the model checkpoint at model/bert/bert-multilingual-cased/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(167550, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          

In [5]:
from torch.utils.data import DataLoader, RandomSampler
import sys
sys.path.append('..')
from myutils import MLMDataset

# Resolve the ids of the five special tokens in one lookup
# (convert_tokens_to_ids returns a list of ids when given a list of tokens).
CLStokenid, SEPtokenid, UNKtokenid, PADtokenid, MASKtokenid = tokenizer.convert_tokens_to_ids(
    ['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]']
)
print('CLSid:{}, SEPid:{}, UNKid:{}, PADid:{}, MASKid:{}'.format(CLStokenid, SEPtokenid, UNKtokenid, PADtokenid, MASKtokenid))


# Masked-LM training dataset built from the corpus file.
# Keyword names (including the spelling `CLStokeinid`) are those used by the
# project's MLMDataset and must not be "corrected" here.
train_dataset = MLMDataset(
    corpus_path=input_corpus,
    tokenizer=tokenizer,
    max_sequence_len=token_max_len,   # maximum sequence length
    mlm_probability=0.15,             # masking probability passed to the dataset
    overwrite_cache=False,
    CLStokeinid=CLStokenid,           # [CLS] token id
    SEPtokenid=SEPtokenid,            # [SEP] token id
    UNKtokenid=UNKtokenid,            # [UNK] token id
    PADtokenid=PADtokenid,            # [PAD] token id
    Masktokenid=MASKtokenid,          # [MASK] token id
)


# Training DataLoader: batches are drawn in random order without replacement
# (an explicit RandomSampler is used rather than shuffle=True).
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset, replacement=False),
    num_workers=3,
)

# Show the first example as a sanity check.
print(train_dataset[0])

CLSid:101, SEPid:102, UNKid:100, PADid:0, MASKid:103
*corpus:korpora/kowiki_20190620/wiki_20190620_mecab_false_0311.txt
*max_sequence_len:128
*mlm_probability:0.15
*CLStokenid:101, SEPtokenid:102, UNKtokenid:100, PADtokeinid:0, Masktokeid:103
*total_line: 3748586


  0%|          | 0/3748586 [00:00<?, ?it/s]

  0%|          | 0/3748586 [00:00<?, ?it/s]

2022-03-24 16:51:45,190 - bwpdataset - INFO - ==>[Start] cached file create: korpora/kowiki_20190620/cached_lm_BertTokenizer_128_wiki_20190620_mecab_false_0311.txt
2022-03-24 16:52:38,633 - bwpdataset - INFO - <==[End] Saving features into cached file korpora/kowiki_20190620/cached_lm_BertTokenizer_128_wiki_20190620_mecab_false_0311.txt [took 53.441 s]


{'input_ids': tensor([   101, 120501,   9551,    107, 125318,    107, 125598, 122449,    103,
        120397, 119606,  23545,  11303,   9338,   9719,  70672,   9638,   9056,
           119,    102,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,   

In [6]:
##################################################
epochs = 4            # number of passes over the training data
learning_rate = 2e-5  # peak learning rate
p_itr = 15000         # log the mean training loss every p_itr steps
save_steps = 50000    # write a checkpoint every save_steps steps
##################################################

# Optimizer. eps avoids division by zero in the Adam update.
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=1e-8)

# Total number of optimizer steps over the whole run; the first 10% are warm-up.
total_steps = len(train_loader) * epochs
warmup_steps = int(total_steps * 0.1)  # the scheduler expects an integer step count

# Linear warm-up followed by linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

itr = 1                 # global step counter (1-based)
total_loss = 0          # running loss sum since the last log line
total_len = 0           # unused in this cell; kept so later cells referencing it still work
total_correct = 0       # unused in this cell; kept for the same reason
list_training_loss = [] # mean training loss recorded at every p_itr steps
list_acc_loss = []
list_validation_acc_loss = []

model.zero_grad()  # start from clean gradients
for epoch in tqdm(range(epochs)):

    model.train()  # training mode
    for data in tqdm(train_loader):

        model.zero_grad()  # reset gradients for this step

        # Move the batch to the target device. The dataset yields a dict with
        # input_ids / attention_mask / token_type_ids / labels.
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['labels'].to(device)

        # Forward pass; passing labels makes the model return the loss.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        loss = outputs.loss

        # Backward pass and parameter update.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clip the gradient norm to 1.0
        optimizer.step()   # update the weights
        scheduler.step()   # advance the learning-rate schedule

        # loss.item() is a plain Python float, so no torch.no_grad() guard is needed.
        total_loss += loss.item()

        # Periodically log the mean training loss of the last p_itr steps.
        if itr % p_itr == 0:
            mean_loss = total_loss / p_itr
            logger.info('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}'.format(epoch+1, epochs, itr, mean_loss))
            list_training_loss.append(mean_loss)
            total_loss = 0

        # Periodically save a checkpoint (model + tokenizer) under OUTPATH/<step>.
        if itr % save_steps == 0:
            TMP_OUT_PATH = OUTPATH + str(itr)
            # Model and tokenizer share one directory, so create it exactly once.
            # exist_ok=True also keeps a rerun over an existing directory from
            # failing; the original second makedirs() on the same path raised
            # FileExistsError.
            os.makedirs(TMP_OUT_PATH, exist_ok=True)

            # Per the original note, save_pretrained writes config.json and
            # pytorch_model.bin.
            model.save_pretrained(TMP_OUT_PATH)

            # Tokenizer files (vocab) are stored next to the model.
            VOCAB_PATH = TMP_OUT_PATH
            tokenizer.save_pretrained(VOCAB_PATH)

            logger.info('Iteration {} -> save model:{}'.format(itr, TMP_OUT_PATH))

        itr += 1
   

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/112041 [00:00<?, ?it/s]

2022-03-24 18:01:13,947 - bertfpt3 - INFO - [Epoch 1/2] Iteration 15000 -> Train Loss: 0.5035
2022-03-24 19:08:02,952 - bertfpt3 - INFO - [Epoch 1/2] Iteration 30000 -> Train Loss: 0.1304
2022-03-24 20:15:26,767 - bertfpt3 - INFO - [Epoch 1/2] Iteration 45000 -> Train Loss: 0.1153
2022-03-24 20:38:30,994 - bertfpt3 - INFO - Iteration 50000 -> save model:model/bert/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0325/50000
2022-03-24 21:24:45,044 - bertfpt3 - INFO - [Epoch 1/2] Iteration 60000 -> Train Loss: 0.1080
2022-03-24 22:34:14,504 - bertfpt3 - INFO - [Epoch 1/2] Iteration 75000 -> Train Loss: 0.1034
2022-03-24 23:43:41,653 - bertfpt3 - INFO - [Epoch 1/2] Iteration 90000 -> Train Loss: 0.1002
2022-03-25 00:30:19,363 - bertfpt3 - INFO - Iteration 100000 -> save model:model/bert/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0325/100000
2022-03-25 00:53:03,700 - bertfpt3 - INFO - [Epoch 1/2] Iteration 105000 -> Train Loss: 0.0980


  0%|          | 0/112041 [00:00<?, ?it/s]

RuntimeError: unique_by_key: failed to synchronize: cudaErrorECCUncorrectable: uncorrectable ECC error encountered

In [None]:
### Save the final model and tokenizer
# Both are written into OUTPATH; the tokenizer path is just an alias of it.
VOCAB_PATH = OUTPATH
os.makedirs(OUTPATH, exist_ok=True)

# Per the original note, save_pretrained produces config.json and pytorch_model.bin.
model.save_pretrained(OUTPATH)

# Tokenizer files (vocab).
tokenizer.save_pretrained(VOCAB_PATH)