In [1]:
# NLI(Natural Language Interference:자연어 추론) 테스트 예제
#
# => input_ids : [CLS]senetence1(전제)[SEP]sentence2(가설)
# => attention_mask : 1111111111(전체,가설)0000000(그외)
# => token_type_ids : 0000000(전제)1111111(가설)00000000(그외)
# => laels : 참(수반:entailment), 거짓(모순:contradiction), 모름(중립:neutral)

import numpy as np
import pandas as pd
import torch
import os
import torch.nn.functional as F
import sys

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, BertForSequenceClassification

from tqdm.notebook import tqdm

sys.path.append("../../")
from myutils import seed_everything, GPU_info, mlogging

logger = mlogging(loggername="distilbertnlitest", logfilename="distilbertnlitest")
device = GPU_info()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

logfilepath:distilbertnlitest_2022-10-25.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [2]:
#############################################################################################
# 변수들 설정
# - model_path : from_pretrained() 로 호출하는 경우에는 모델파일이 있는 폴더 경로나 
#          huggingface에 등록된 모델명(예:'bert-base-multilingual-cased')
#          torch.load(model)로 로딩하는 경우에는 모델 파일 풀 경로
#
# - vocab_path : from_pretrained() 호출하는 경우에는 모델파일이 있는 폴더 경로나
#          huggingface에 등록된 모델명(예:'bert-base-multilingual-cased')   
#          BertTokenizer() 로 호출하는 경우에는 vocab.txt 파일 풀 경로,
#############################################################################################
#############################################################################
# 변수 설정
#############################################################################
seed = 111
max_seq_len = 72   # 글자 최대 토큰 길이 해당 토큰 길이 이상은 잘린다.
batch_size = 64        # 배치 사이즈(64면 GUP Memory 오류 나므로, 32 이하로 설정할것=>max_seq_length 를 줄이면, 64도 가능함)
cache = True   # 캐쉬파일 생성할거면 True로 (True이면 loding할때 캐쉬파일있어도 이용안함)
#############################################################################
seed_everything(seed)

use_kornli = 0     #  kornli 파일
use_kluenli = 0    # kluests_v1.1 파일
use_gluenli = 1    # glue 파일

kornli_eval_file_fpath = '../../../data11/korpora/kornli/xnli.test.ko.tsv'
kluenli_eval_file_fpath = '../../../data11/korpora/klue-nli/klue-nli-v1.1_dev.json'
gulenli_eval_file_fpath = '../../../data11/korpora/gluemnli/glue-mnli-valid.tsv'

# model 타입 : 0=distilbert, 1=bert, 2=Roberta
#=>Roberta 모델에는 distilbert처럼 token_type_id 입력 없음.
model_type = 0
model_path = '../../../data11/model/distilbert/bert-re-kowiki-mecab'
vocab_path = '../../../data11/model/distilbert/bert-re-kowiki-mecab'

# tokeniaer 및 model 설정
# strip_accents=False : True로 하면, 가자 => ㄱ ㅏ ㅈ ㅏ 식으로 토큰화 되어 버림(*따라서 한국어에서는 반드시 False)
# do_lower_case=False : # 소문자 입력 사용 안함(한국어에서는 반드시 False)
tokenizer = AutoTokenizer.from_pretrained(vocab_path, strip_accents=False, do_lower_case=False) 
                       
# NLI 모델에서 레벨은 3개지(참,거짓,모름) 이므로, num_labels=3을 입력함
if model_type == 0:
    model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=3)
elif model_type == 1:
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)

# 레벨을 멀티로 선택해야 하는 경우
#model = BertForSequenceClassification.from_pretrained(model_path, problem_type="multi_label_classification",num_labels=6)

model.to(device)

Some weights of the model checkpoint at ../../../data11/model/distilbert/bert-re-kowiki-mecab were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ../../../data11/model/distilbert/bert-re-kowiki-mecab and are newly initializ

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(139547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [3]:
# tokenier 테스트
print(len(tokenizer))
print(tokenizer.encode("눈에 보이는 반전이었지만 영화의 흡인력은 사라지지 않았다", "정말 재미있다"))
print(tokenizer.convert_ids_to_tokens(131027))
print(tokenizer.convert_tokens_to_ids('정말'))

139547
[101, 9034, 10530, 119686, 11018, 126914, 10739, 69708, 42428, 10459, 10020, 12030, 28143, 10892, 123311, 12508, 49137, 102, 124709, 131451, 11903, 102]
자부
124709


In [4]:
# 평가 data loader 생성
from torch.utils.data import DataLoader, RandomSampler

sys.path.append("..")
from myutils import ClassificationDataset, KlueNLICorpus, data_collator, KorNLICorpus, GlueMNLICorpus
dataset = []

if use_kornli == 1:
    corpus = KorNLICorpus()
    dataset += ClassificationDataset(file_fpath=kornli_eval_file_fpath, max_seq_length=max_seq_len, tokenizer=tokenizer, corpus=corpus, overwrite_cache=cache)

if use_kluenli == 1:
    corpus = KlueNLICorpus()
    dataset += ClassificationDataset(file_fpath=kluenli_eval_file_fpath, max_seq_length=max_seq_len, tokenizer=tokenizer, corpus=corpus, overwrite_cache=cache)

if use_gluenli == 1:
    corpus = GlueMNLICorpus()
    dataset += ClassificationDataset(file_fpath=gulenli_eval_file_fpath, max_seq_length=max_seq_len, tokenizer=tokenizer, corpus=corpus, overwrite_cache=cache)

print('*dataset_len: {}'.format(len(dataset)))

# 평가 dataloader 생성
eval_loader = DataLoader(dataset, 
                          batch_size=batch_size, 
                          #shuffle=True, # dataset을 섞음
                          sampler=RandomSampler(dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                          collate_fn=data_collator, # dataset을 tensor로 변환(예시 {'input_ids':tensor[0,1,2,3,1,], 'token_type_id:tensor[0,0,0,0,0], 'attention_mask:tensor[1,1,1,1,1], 'labels':tensor[5]}
                          num_workers=4)

print('*eval_loader_len: {}'.format(len(eval_loader)))

Creating features from dataset file at ../../../data11/korpora/gluemnli/glue-mnli-valid.tsv
loading data... LOOKING AT ../../../data11/korpora/gluemnli/glue-mnli-valid.tsv
tokenize sentences, it could take a lot of time...
tokenize sentences [took %.3f s] 1.2919492721557617


  0%|          | 0/9815 [00:00<?, ?it/s]

*** Example ***
sentence A, B: The new rights are nice enough + Everyone really likes the newest benefits 
tokens: [CLS] The new rights are nic ##e enough [SEP] Every ##one really like ##s the new ##est benefits [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
label: neutral
features: ClassificationFeatures(input_ids=[101, 10117, 10751, 16691, 10301, 46267, 10112, 21408, 102, 30929, 12926, 30181, 11850, 10107, 10105, 10751, 13051, 48297, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [5]:
import time

logger.info(f"=== model: {model_path} ===")

if use_kornli == 1:
    logger.info(f"kornli_path : {kornli_eval_file_fpath}")
if use_kluenli == 1:
    logger.info(f"kluenli_path : {kluenli_eval_file_fpath}")
if use_gluenli == 1:
    logger.info(f"gluenli_path : {gulenli_eval_file_fpath}")
    
logger.info(f"num_parameters: {model.num_parameters()}")

# 평가 시작
model.eval()

total_loss = 0
total_len = 0
total_correct = 0

start = time.time()
logger.info(f'---------------------------------------------------------')

for data in tqdm(eval_loader):
    # 입력 값 설정
    # =>**distilbert에는 token_type_ids가 없다
    labels = data['labels'].to(device)
    input_ids = data['input_ids'].to(device)
    if model_type == 1:
        token_type_ids = data['token_type_ids'].to(device) 
    attention_mask = data['attention_mask'].to(device)
 
    # 손실률 계산하는 부분은 no_grade 시켜서, 계산량을 줄임.
    # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
    # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
    with torch.no_grad():
        # 모델 실행
        if model_type == 0:
            outputs = model(input_ids=input_ids, 
                            attention_mask=attention_mask,
                            labels=labels)
        else:
            outputs = model(input_ids=input_ids, 
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask,
                            labels=labels)
    
        # 출력값 loss,logits를 outputs에서 얻어옴
        loss = outputs.loss
        logits = outputs.logits

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

logger.info(f"eval-accuracy: {total_correct / total_len}")
logger.info(f'---------------------------------------------------------')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info(f'-END-\n')

2022-10-25 11:45:16,555 - distilbertnlitest - INFO - === model: ../../../data11/model/distilbert/bert-re-kowiki-mecab ===
2022-10-25 11:45:16,557 - distilbertnlitest - INFO - gluenli_path : ../../../data11/korpora/gluemnli/glue-mnli-valid.tsv
2022-10-25 11:45:16,560 - distilbertnlitest - INFO - num_parameters: 150686979
2022-10-25 11:45:16,562 - distilbertnlitest - INFO - ---------------------------------------------------------


  0%|          | 0/154 [00:00<?, ?it/s]

  pred = torch.argmax(F.softmax(logits), dim=1)
2022-10-25 11:45:26,311 - distilbertnlitest - INFO - eval-accuracy: 0.32847682119205296
2022-10-25 11:45:26,314 - distilbertnlitest - INFO - ---------------------------------------------------------
2022-10-25 11:45:26,315 - distilbertnlitest - INFO - === 처리시간: 9.753 초 ===
2022-10-25 11:45:26,317 - distilbertnlitest - INFO - -END-

