In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
#from torch.optim import Adam
import torch.nn.functional as F
from myutils import seed_everything, GPU_info, mlogging
from tqdm.notebook import tqdm
import os
#from torch.nn import BCEWithLogitsLoss, BCELoss

logger = mlogging(loggername="bertfttest", logfilename="bertftmultitest")
device = GPU_info()
seed_everything(111)

logfilepath:bwdataset_2022-03-29.log
logfilepath:qnadataset_2022-03-29.log
logfilepath:bertftmultitest_2022-03-29.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [2]:
model_path = 'model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-nscm-0329'
vocab_path = 'model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-nscm-0329'

#model_path = 'model/distilbert/distilbert-0327-TS-classi'
#vocab_path = 'model/distilbert/distilbert-0327-TS-classi'

# strip_accents=False : True로 하면, 가자 => ㄱ ㅏ ㅈ ㅏ 식으로 토큰화 되어 버림(*따라서 한국어에서는 반드시 False)
# do_lower_case=False : # 소문자 입력 사용 안함(한국어에서는 반드시 False)
#tokenizer = BertTokenizer(vocab_file=vocab_path, strip_accents=False, do_lower_case=False) 
#model = torch.load(model_path) # 기존 모델 파일을 로딩하는 경우

tokenizer = BertTokenizer.from_pretrained(vocab_path, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(167550, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [4]:
# eval data loader 생성
from os import sys
sys.path.append('..')
from myutils import ClassificationCSVCorpus, ClassificationDataset, data_collator
from torch.utils.data import DataLoader, RandomSampler

#############################################################################
# 변수 설정
#############################################################################
max_seq_len = 256   # 글자 최대 토큰 길이 해당 토큰 길이 이상은 잘린다.
batch_size = 32        # 배치 사이즈(64면 GUP Memory 오류 나므로, 32 이하로 설정할것=>max_seq_length 를 줄이면, 64도 가능함)

# 훈련할 csv 파일
#file_fpath = 'Korpora/감성대화말뭉치/감성대화말뭉치(최종데이터)_renew_labelenc_Validation.csv'
file_fpath = 'korpora/nsmc/ratings_test.txt'
column_num = 3           # .csv 파일에 컬럼수(예: text, label만 있으면 =2)
csvfile = 0              # 0:tsv 파일, 1: csv 파일
label_list = ["0","1"]  # .csv 파일에 레벨 목록( list로 입력해야 함)
cache = True   # 캐쉬파일 생성할거면 True로 (True이면 loding할때 캐쉬파일있어도 이용안함)
#############################################################################

# corpus 파일 로딩
corpus = ClassificationCSVCorpus(column_num=column_num, iscsvfile=csvfile, label_list=label_list)

# 학습 dataset 생성
dataset = ClassificationDataset(file_fpath=file_fpath, max_seq_length=max_seq_len, tokenizer=tokenizer, corpus=corpus, overwrite_cache=cache)


# 학습 dataloader 생성
eval_loader = DataLoader(dataset, 
                          batch_size=batch_size, 
                          #shuffle=True, # dataset을 섞음
                          sampler=RandomSampler(dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                          collate_fn=data_collator, # dataset을 tensor로 변환(예시 {'input_ids':tensor[0,1,2,3,1,], 'token_type_id:tensor[0,0,0,0,0], 'attention_mask:tensor[1,1,1,1,1], 'labels':tensor[5]}
                          num_workers=4)

2022-03-29 12:06:31,903 - bwpdataset - INFO - Creating features from dataset file at korpora/nsmc/ratings_test.txt
2022-03-29 12:06:31,905 - bwpdataset - INFO - loading data... LOOKING AT korpora/nsmc/ratings_test.txt
2022-03-29 12:06:31,907 - bwpdataset - INFO - tsv file open
2022-03-29 12:06:32,407 - bwpdataset - INFO - tokenize sentences, it could take a lot of time...
2022-03-29 12:06:41,337 - bwpdataset - INFO - tokenize sentences [took 8.929 s]


  0%|          | 0/50000 [00:00<?, ?it/s]

2022-03-29 12:06:41,854 - bwpdataset - INFO - *** Example ***
2022-03-29 12:06:41,855 - bwpdataset - INFO - sentence: 굳 ㅋ
2022-03-29 12:06:41,856 - bwpdataset - INFO - tokens: [CLS] 굳 [UNK] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
# 평가 시작
model.eval()

total_loss = 0
total_len = 0
total_correct = 0

for data in tqdm(eval_loader):
     # 입력 값 설정
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)       
    labels = data['labels'].to(device)
 
    # 손실률 계산하는 부분은 no_grade 시켜서, 계산량을 줄임.
    # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
    # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
    with torch.no_grad():
        # 모델 실행
        outputs = model(input_ids=input_ids, 
                       attention_mask=attention_mask,
                       token_type_ids=None,
                       labels=labels)
    
        # 출력값 loss,logits를 outputs에서 얻어옴
        loss = outputs.loss
        logits = outputs.logits

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

logger.info(f"eval-accuracy: {total_correct / total_len}")

  0%|          | 0/1563 [00:00<?, ?it/s]

  pred = torch.argmax(F.softmax(logits), dim=1)
