In [1]:
# MLM 테스트 예제
import torch
import os

from tqdm.notebook import tqdm
from transformers import AutoTokenizer, DistilBertTokenizer, BertConfig, DistilBertForMaskedLM, BertForMaskedLM, RobertaForMaskedLM
from transformers import AdamW, get_linear_schedule_with_warmup

import sys
sys.path.append("..")
from myutils import GPU_info, seed_everything, mlogging, MLMDatasetbyDistilBert, MLMDataset, AccuracyForMaskedToken

# Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
# so the Hugging Face tokenizers library does not have to guess.
# NOTE(review): presumably set to "false" because the evaluation DataLoader
# further down uses worker processes (num_workers=4) and tokenizers complains
# when a process is forked after parallelism was used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# ---------------------------------------------------------------------------
# Evaluation settings (everything a reader may want to tune lives here)
# ---------------------------------------------------------------------------

# Batching / sequence length
token_max_len = 128
batch_size = 16

# Model type: 0 = DistilBERT, 1 = BERT, 2 = RoBERTa
# In this notebook only type 1 (BERT) is fed token_type_ids; DistilBERT and
# RoBERTa are called without them.
model_type = 0

# Previously pre-trained checkpoint to evaluate
# model_path = "monologg/distilkobert"
model_path = "../../data11/model/distilbert/bert-re-kowiki-mecab/"

# Evaluation corpus (alternatives kept for quick switching)
# eval_corpus = "../../data11/my_data/moco_eval.txt"
# eval_corpus = "../../data11/my_data/bong_eval.txt"
eval_corpus = "../../data11/my_data/klue_dp_valid_text.txt"

# Pick the compute device via the project helper and show it
device = GPU_info()
print(device)

# Fix the random seed through the project helper
seed_everything(333)

# Logger (the log file path is derived by the helper from this prefix)
logger = mlogging(
    loggername="distilbert-MLM-Test",
    logfilename="../../log/distilbert-MLM-Test",
)

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0
logfilepath:../../log/distilbert-MLM-Test_2022-10-20.log


In [3]:
# Create the tokenizer from the same checkpoint as the model.
# NOTE(review): `max_len` looks like the legacy spelling of `model_max_length`
# in Transformers — confirm it still takes effect for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path, max_len=token_max_len, strip_accents=False, do_lower_case=False)

# Masked-LM class for each supported `model_type`
# (0 = DistilBERT, 1 = BERT, 2 = RoBERTa).
MLM_MODEL_CLASSES = {
    0: DistilBertForMaskedLM,
    1: BertForMaskedLM,
    2: RobertaForMaskedLM,
}

# Fail early with a clear message instead of a NameError on `model` below.
if model_type not in MLM_MODEL_CLASSES:
    raise ValueError(
        "Unsupported model_type {!r}; expected one of {}".format(
            model_type, sorted(MLM_MODEL_CLASSES)
        )
    )

# Load the pre-trained weights; TensorFlow loading is requested only when the
# path contains ".ckpt".
model = MLM_MODEL_CLASSES[model_type].from_pretrained(
    model_path,
    from_tf=".ckpt" in model_path,
)

# Move the model to the selected device (as the cell's last expression this
# also displays the module structure).
model.to(device)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(139547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, in

In [4]:
from torch.utils.data import DataLoader, RandomSampler

# Cache policy handed to the dataset class.
# Per the original author's note (behaviour lives in myutils — confirm there):
#   True  -> always read the source corpus and write a cache file.
#   False -> use the cache file when present; otherwise read the source corpus
#            without writing a cache file.
overwrite_cache = False

# Look up the ids of the special tokens
CLStokenid = tokenizer.convert_tokens_to_ids('[CLS]')
SEPtokenid = tokenizer.convert_tokens_to_ids('[SEP]')
UNKtokenid = tokenizer.convert_tokens_to_ids('[UNK]')
PADtokenid = tokenizer.convert_tokens_to_ids('[PAD]')
MASKtokenid = tokenizer.convert_tokens_to_ids('[MASK]')
print('CLSid:{}, SEPid:{}, UNKid:{}, PADid:{}, MASKid:{}'.format(CLStokenid, SEPtokenid, UNKtokenid, PADtokenid, MASKtokenid))

#===============================================================================
# Build the evaluation dataset.
# DistilBERT (0) and RoBERTa (2) share one dataset class; BERT (1) uses the other.
if model_type in (0, 2):
    eval_dataset_cls = MLMDatasetbyDistilBert
elif model_type == 1:
    eval_dataset_cls = MLMDataset
else:
    # Fail early instead of hitting a NameError on `eval_dataset` below.
    raise ValueError(
        "Unsupported model_type {!r}; expected 0, 1 or 2".format(model_type)
    )

# Both dataset classes are called with the same keyword arguments.
# `CLStokeinid` is spelled this way on purpose: it is the keyword the myutils
# classes are called with in this notebook.
eval_dataset = eval_dataset_cls(
    corpus_path=eval_corpus,
    tokenizer=tokenizer,
    CLStokeinid=CLStokenid,            # [CLS] token id
    SEPtokenid=SEPtokenid,             # [SEP] token id
    UNKtokenid=UNKtokenid,             # [UNK] token id
    PADtokenid=PADtokenid,             # [PAD] token id
    Masktokenid=MASKtokenid,           # [MASK] token id
    max_sequence_len=token_max_len,    # maximum sequence length
    mlm_probability=0.15,
    overwrite_cache=overwrite_cache,   # honour the setting above (was hard-coded False)
)

# Build the evaluation DataLoader (batches the dataset items into tensors).
eval_loader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    # Draw every sample exactly once, in random order.
    sampler=RandomSampler(eval_dataset, replacement=False),
    num_workers=4,
)
#===============================================================================


# Show the first item as a sanity check
print(eval_dataset[0])

CLSid:101, SEPid:102, UNKid:100, PADid:0, MASKid:103
*corpus:../../data11/my_data/klue_dp_valid_text.txt
*max_sequence_len:128
*mlm_probability:0.15
*CLStokenid:101, SEPtokenid:102, UNKtokenid:100, PADtokeinid:0, Masktokeid:103
*total_line: 2000


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

{'input_ids': tensor([   101,    112,    148, 119380, 120007,  10884,    100, 129268,  79604,
         10739,    103,  16605,    103,  10459, 121536,  10530, 124704,  10622,
         10018,  45554,    119,    102,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,   

In [5]:
import time

# Running totals for the evaluation pass
total_test_loss = 0.0        # sum of per-batch losses, kept as a Python float
total_test_correct = 0       # number of correctly predicted masked tokens
total_test_len = 0           # number of masked tokens seen
list_validation_acc = []     # not used in the cells shown; kept in case other cells rely on it
count = 0                    # number of batches processed

start = time.time()
logger.info(f'---------------------------------------------------------')

# Switch to evaluation mode
model.eval()

# torch.no_grad() disables the autograd engine for the whole pass, which
# reduces memory use and speeds up the forward computation.
with torch.no_grad():
    for eval_data in tqdm(eval_loader):
        # Move the batch to the target device
        input_ids = eval_data['input_ids'].to(device)
        attention_mask = eval_data['attention_mask'].to(device)
        labels = eval_data['labels'].to(device)

        model_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
        # Only the BERT model (model_type == 1) is given token_type_ids;
        # DistilBERT / RoBERTa are called without them.
        if model_type == 1:
            model_inputs['token_type_ids'] = eval_data['token_type_ids'].to(device)

        # Run the model
        outputs = model(**model_inputs)

        # Accumulate the scalar loss as a float so no device tensor is carried
        # across iterations and the final average is a plain number.
        total_test_loss += outputs.loss.item()

        #===========================================
        # Accuracy over the masked tokens
        correct, masked_len = AccuracyForMaskedToken(outputs.logits, labels, input_ids, MASKtokenid)
        total_test_correct += correct.sum().item()
        total_test_len += masked_len
        #=========================================

        count += 1

# Guard the averages: an empty loader or a run without any masked token would
# otherwise raise ZeroDivisionError; report NaN and warn instead.
if count and total_test_len:
    val_acc = total_test_correct/total_test_len
    val_loss = total_test_loss/count
else:
    logger.warning('No batches or no masked tokens were evaluated; metrics are undefined.')
    val_acc = float('nan')
    val_loss = float('nan')

logger.info(f'*model: {model_path}')
logger.info(f'*evalcorpus: {eval_corpus}')
logger.info(f'---------------------------------------------------------')
logger.info('*Val loss: {:.5f}, *Val Acc:{:.5f}, total_test_correct:{:.1f}, total_test_len:{:.1f}'.format(val_loss, val_acc, total_test_correct, total_test_len))
logger.info(f'---------------------------------------------------------')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info(f'-END-\n')
####################################################################


2022-10-21 08:51:29,294 - distilbert-MLM-Test - INFO - ---------------------------------------------------------


  0%|          | 0/125 [00:00<?, ?it/s]

2022-10-21 08:51:34,960 - distilbert-MLM-Test - INFO - *model: ../../data11/model/distilbert/bert-re-kowiki-mecab/
2022-10-21 08:51:34,961 - distilbert-MLM-Test - INFO - *evalcorpus: ../../data11/my_data/klue_dp_valid_text.txt
2022-10-21 08:51:34,962 - distilbert-MLM-Test - INFO - ---------------------------------------------------------
2022-10-21 08:51:34,964 - distilbert-MLM-Test - INFO - *Val loss: 16.59837, *Val Acc:0.38320, total_test_correct:2756.0, total_test_len:7192.0
2022-10-21 08:51:34,964 - distilbert-MLM-Test - INFO - ---------------------------------------------------------
2022-10-21 08:51:34,965 - distilbert-MLM-Test - INFO - === 처리시간: 5.671 초 ===
2022-10-21 08:51:34,965 - distilbert-MLM-Test - INFO - -END-

