In [1]:
# MLM 방식을 이용한 Further pre-traning 방식 구현 예제
# 참고 소스 : https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c 참조 바람
import torch
import os

from tqdm.notebook import tqdm
from transformers import DistilBertTokenizer, BertConfig, DistilBertForMaskedLM
from transformers import AdamW, get_linear_schedule_with_warmup

import sys
sys.path.append("..")
from myutils import GPU_info, seed_everything, mlogging, MLMDatasetbyDistilBert, AccuracyForMLM, SaveBERTModel

logfilepath:../../log/bwdataset_2022-04-21.log
logfilepath:../../log/qnadataset_2022-04-21.log


In [2]:
# 훈련시킬 말뭉치(사전 만들때 동일한 말뭉치 이용)
#input_corpus = "../../korpora/mycorpus/bong_corpus_mecab.txt"
input_corpus = "../../korpora/kowiki_20190620/wiki_20190620_small.txt"
#input_corpus = "../../korpora/kowiki_20190620/wiki_20190620_mecab_false_0311.txt"

# eval 말뭉치 
#eval_corpus = "../../korpora/kowiki_20190620/wiki_20190620_small.txt"
eval_corpus = "../../korpora/kowiki_20190620/wiki_eval_test.txt"

# 기존 사전훈련된 모델
model_path = "../../model/distilbert/distilbert-base-multilingual-cased/"

# 기존 사전 + 추가된 사전 파일
vocab_path="../../tokenizer/my_vocab/"

# 출력
OUTPATH = '../../model/distilbert/distilbert-base-multilingual-cased-bong-04-20/'

batch_size = 32
token_max_len = 128

device = GPU_info()
print(device)

#seed 설정
seed_everything(111)

#logging 설정
logger =  mlogging(loggername="distilbertfpt-1", logfilename="../../log/distilbertfpt-1")

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:../../log/distilbertfpt-1_2022-04-21.log


In [3]:
# tokeinzier 생성
# tokenizer 생성
# => BertTokenizer, BertTokenizerFast 둘중 사용하면됨

#tokenizer = DistilBertTokenizer(vocab_file=vocab_path, max_len=token_max_len, do_lower_case=False)
tokenizer = DistilBertTokenizer.from_pretrained(vocab_path, max_len=token_max_len, do_lower_case=False)
#tokenizer = BertTokenizerFast(vocab_speical_path)
# tokenizer = BertTokenizerFast(vocab_file=vocab_file, max_len=token_max_len, do_lower_case=False)


# speical 토큰 계수 + vocab 계수 - 이미 vocab에 포함된 speical 토큰 계수(5)
vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5 + 1
#vocab_size = len(tokenizer.all_special_tokens) + tokenizer.vocab_size - 5
print('special_token_size: {}, tokenizer.vocab_size: {}'.format(len(tokenizer.all_special_tokens), tokenizer.vocab_size))
print('vocab_size: {}'.format(vocab_size))
print('tokenizer_len: {}'.format(len(tokenizer)))

# 모델 로딩 further pre-training 
#config = BertConfig.from_pretrained(model_path)
model = DistilBertForMaskedLM.from_pretrained(model_path, from_tf=bool(".ckpt" in model_path)) 
#model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')    

#################################################################################
# 모델 embedding 사이즈를 tokenizer 크기 만큼 재 설정함.
# 재설정하지 않으면, 다음과 같은 에러 발생함
# CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)` CUDA 에러가 발생함
#  indexSelectLargeIndex: block: [306,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
#
#     해당 오류는 기존 Embedding(8002, 768, padding_idx=1) 처럼 입력 vocab 사이즈가 8002인데,
#     0~8001 사이를 초과하는 word idx 값이 들어가면 에러 발생함.
#################################################################################
model.resize_token_embeddings(len(tokenizer))

model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


special_token_size: 5, tokenizer.vocab_size: 149793
vocab_size: 149794
tokenizer_len: 149793


DistilBertForMaskedLM(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(149793, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [4]:
from torch.utils.data import DataLoader, RandomSampler

# true이면, 무조건 원본파일 읽고, cache 파일을 만듬.
# False로 하면 cache파일이 있으면 cache파일 이용함. cache파일 없으면 원본파일 일고, cache파일은 만들지 않음
overwrite_cache = False

# 각 스페셜 tokenid를 구함
CLStokenid = tokenizer.convert_tokens_to_ids('[CLS]')
SEPtokenid = tokenizer.convert_tokens_to_ids('[SEP]')
UNKtokenid = tokenizer.convert_tokens_to_ids('[UNK]')
PADtokenid = tokenizer.convert_tokens_to_ids('[PAD]')
MASKtokenid = tokenizer.convert_tokens_to_ids('[MASK]')
print('CLSid:{}, SEPid:{}, UNKid:{}, PADid:{}, MASKid:{}'.format(CLStokenid, SEPtokenid, UNKtokenid, PADtokenid, MASKtokenid))

# distilberttoknizer에는 token_type_ids(문장구분자) 가 없음
# 따라서 MLMDatasetbyDistilBert 함수를 이용하여 MLM 생성함
train_dataset = MLMDatasetbyDistilBert(corpus_path = input_corpus,
                           tokenizer = tokenizer, 
                           CLStokeinid = CLStokenid ,   # [CLS] 토큰 id
                           SEPtokenid = SEPtokenid ,    # [SEP] 토큰 id
                           UNKtokenid = UNKtokenid ,    # [UNK] 토큰 id
                           PADtokenid = PADtokenid,    # [PAD] 토큰 id
                           Masktokenid = MASKtokenid,   # [MASK] 토큰 id
                           max_sequence_len=token_max_len,  # max_sequence_len)
                           mlm_probability=0.15,
                           overwrite_cache=overwrite_cache
                          )


# 학습 dataloader 생성
# => tenosor로 만듬
train_loader = DataLoader(train_dataset, 
                          batch_size=batch_size, 
                          #shuffle=True, # dataset을 섞음
                          sampler=RandomSampler(train_dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                          num_workers=3
                         )
#===============================================================================
# eval dataloader 생성
eval_dataset = MLMDatasetbyDistilBert(corpus_path = eval_corpus,
                        tokenizer = tokenizer, 
                        CLStokeinid = CLStokenid ,   # [CLS] 토큰 id
                        SEPtokenid = SEPtokenid ,    # [SEP] 토큰 id
                        UNKtokenid = UNKtokenid ,    # [UNK] 토큰 id
                        PADtokenid = PADtokenid,    # [PAD] 토큰 id
                        Masktokenid = MASKtokenid,   # [MASK] 토큰 id
                        max_sequence_len=token_max_len,  # max_sequence_len)
                        mlm_probability=0.15,
                        overwrite_cache=False
                        )


# eval dataloader 생성
# => tenosor로 만듬
eval_loader = DataLoader(eval_dataset, 
                         batch_size=batch_size, 
                         #shuffle=True, # dataset을 섞음
                         sampler=RandomSampler(eval_dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                         num_workers=3
                        )
#===============================================================================

print(train_dataset[0])

2022-04-21 11:52:47,414 - bwpdataset - INFO - ==>[Start] cached file read: ../../korpora/kowiki_20190620/cached_lm_DistilBertTokenizer_128_wiki_20190620_small.txt
2022-04-21 11:52:47,584 - bwpdataset - INFO - <==[End] Loading features from cached file ../../korpora/kowiki_20190620/cached_lm_DistilBertTokenizer_128_wiki_20190620_small.txt [took 0.168 s]


CLSid:101, SEPid:102, UNKid:100, PADid:0, MASKid:103
*corpus:../../korpora/kowiki_20190620/wiki_20190620_small.txt
*max_sequence_len:128
*mlm_probability:0.15
*CLStokenid:101, SEPtokenid:102, UNKtokenid:100, PADtokeinid:0, Masktokeid:103
*corpus:../../korpora/kowiki_20190620/wiki_eval_test.txt
*max_sequence_len:128
*mlm_probability:0.15
*CLStokenid:101, SEPtokenid:102, UNKtokenid:100, PADtokeinid:0, Masktokeid:103
*total_line: 114


  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

{'input_ids': tensor([   101, 123809,   9551,    107, 135472,    107, 138135, 126023,  11018,
        120115, 120169,  23545,  11303,  48506,  70672,  30919,    119,    102,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,   

In [5]:
##################################################
epochs = 3            # epochs
learning_rate = 3e-5  # 학습률
##################################################

# optimizer 적용
optimizer = AdamW(model.parameters(), 
                 lr=learning_rate, 
                 eps=1e-8) # 0으로 나누는 것을 방지하기 위한 epsilon 값(10^-6 ~ 10^-8 사이 이값 입력합)

# 총 훈련과정에서 반복할 스탭
total_steps = len(train_loader)*epochs
warmup_steps = total_steps * 0.1 #10% of train data for warm-up

# 손실률 보여줄 step 수
p_itr = int(len(train_loader)*0.1)  
    
# step마다 모델 저장
save_steps = int(total_steps * 0.5)
    
# 스캐줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=warmup_steps, 
                                            num_training_steps=total_steps)

itr = 1
total_loss = 0
total_len = 0
total_correct = 0
total_test_correct = 0
total_test_len = 0
            
list_train_loss = []
list_train_acc = []
list_validation_acc = []

model.zero_grad()# 그래디언트 초기화
for epoch in tqdm(range(epochs)):

    model.train() # 훈련모드로 변환
    for data in tqdm(train_loader):
    
        #optimizer.zero_grad()
        model.zero_grad()# 그래디언트 초기화
        
        # 입력 값 설정
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        #token_type_ids = data['token_type_ids'].to(device)         
        labels = data['labels'].to(device)
        #print('Labels:{}'.format(labels))
        
        # 모델 실행
        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_mask,
                        #token_type_ids=token_type_ids,
                        labels=labels)
        
        # 출력값 loss,logits를 outputs에서 얻어옴
        loss = outputs.loss
        logits = outputs.logits
        #print('Loss:{}, logits:{}'.format(loss, logits))
        
        # optimizer 과 scheduler 업데이트 시킴
        loss.backward()   # backward 구함
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # 그래디언트 클리핑 (gradient vanishing이나 gradient exploding 방지하기 위한 기법)
        optimizer.step()  # 가중치 파라미터 업데이트(optimizer 이동)
        scheduler.step()  # 학습률 감소
        
        # ***further pretrain 에는 손실률 계산을 넣지 않음
        # 정확도 계산하는 부분은 no_grade 시켜서, 계산량을 줄임.
        
        # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
        # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
        with torch.no_grad():
                        
            # 손실(loss) 계산 
            total_loss += loss.item()
                
            #===========================================
            # 정확도(Accuracy) 계산
            correct = AccuracyForMLM(logits, labels, attention_mask)
            total_correct += correct.sum().item() 
            total_len += attention_mask.sum().item()  # 단어 총 수는 attension_mask가 1(True) 인 것들의 합
            #=========================================   
         
            # 주기마다 test(validataion) 데이터로 평가하여 손실류 계산함.
            if itr % p_itr == 0:
                train_loss = total_loss/p_itr
                train_acc = total_correct/total_len
                         
                ####################################################################
                # 주기마다 eval(validataion) 데이터로 평가하여 손실류 계산함.
                # 평가 시작
                model.eval()

                #for data in tqdm(eval_loader):
                for data in eval_loader:
                    # 입력 값 설정
                    input_ids = data['input_ids'].to(device)
                    attention_mask = data['attention_mask'].to(device)
                    labels = data['labels'].to(device)

                    # 손실률 계산하는 부분은 no_grade 시켜서, 계산량을 줄임.
                    # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
                    # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
                    with torch.no_grad():
                        # 모델 실행
                        outputs = model(input_ids=input_ids, 
                                       attention_mask=attention_mask,
                                       labels=labels)

                        # 출력값 loss,logits를 outputs에서 얻어옴
                        #loss = outputs.loss
                        logits = outputs.logits

                        #===========================================
                        # 정확도(Accurarcy) 계산
                        correct = AccuracyForMLM(logits, labels, attention_mask)
                        total_test_correct += correct.sum().item() 
                        total_test_len += attention_mask.sum().item()  # 단어 총 수는 attension_mask가 1(True) 인 것들의 합
                        #========================================= 

                val_acc = total_test_correct/total_test_len
                    
                logger.info('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Train Acc: {:.4f}, Val Acc:{}'.format(epoch+1, epochs, itr, train_loss, train_acc, val_acc))
                    
                list_train_loss.append(train_loss)
                list_train_acc.append(train_acc)
                list_validation_acc.append(val_acc)
                 
                total_loss = 0
                total_len = 0
                total_correct = 0
                total_test_correct = 0
                total_test_len = 0
                ####################################################################

            # 모델 저장
            if itr % save_steps == 0:
                SaveBERTModel(model, tokenizer, OUTPATH, epochs, learning_rate, batch_size)

        itr+=1
   

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/307 [00:00<?, ?it/s]

/opt/conda/conda-bld/pytorch_1639180487213/work/aten/src/ATen/native/cuda/Indexing.cu:699: indexSelectLargeIndex: block: [410,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1639180487213/work/aten/src/ATen/native/cuda/Indexing.cu:699: indexSelectLargeIndex: block: [410,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1639180487213/work/aten/src/ATen/native/cuda/Indexing.cu:699: indexSelectLargeIndex: block: [410,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1639180487213/work/aten/src/ATen/native/cuda/Indexing.cu:699: indexSelectLargeIndex: block: [410,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1639180487213/work/aten/src/ATen/native/cuda/Indexing.cu:699: indexSelectLargeIndex: block: [410,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/cond

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# 그래프로 loss 표기
#!pip install matplotlib
import matplotlib.pyplot as plt

plt.plot(list_train_loss, label='Train Loss')
#plt.plot(list_train_acc, label='Train Accuracy')
#plt.plot(list_validation_acc, label='Eval Accuracy')
plt.legend()

plt.show()

plt.plot(list_train_acc, label='Train Accuracy')
plt.plot(list_validation_acc, label='Eval Accuracy')
plt.legend()
plt.show()

In [None]:
### 전체모델 저장
#SaveBERTModel(model, tokenizer, OUTPATH, epochs, learning_rate, batch_size)