<a href="https://colab.research.google.com/github/kmouleejunhyuk/NLP_KOBERT-review/blob/main/eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (the data/model paths used later live under this mount point)
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Show the GPU attached to this runtime
!nvidia-smi

Sat Jan  2 16:11:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the required libraries (pip output is discarded via the /dev/null redirect)
# NOTE(review): mxnet-cu101 and gluonnlp are installed without a version pin here.
!pip install mxnet-cu101 &> /dev/null
!pip install gluonnlp &> /dev/null
!pip install sentencepiece==0.1.85 &> /dev/null
!pip install transformers==2.1.1 &> /dev/null

# Install KoBERT
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master &> /dev/null

In [None]:
####### Library versions #######
# mxnet-cu101: 1.7.0
# gluonnlp: 0.10.0
# pandas: 1.1.5
# sentencepiece: 0.1.85
# transformers: 2.1.1
# torch: 1.7.0+cu101
# numpy: 1.19.4


####### Development environment #######
# Google Colab, Python
# Runtime accelerator type: GPU; requires at least the GPU memory of a Tesla T4

## **데이터 불러오기**

In [None]:
# Load pandas
import pandas as pd
# Data path (root_path should be the path of the directory that contains the submission folder.)
root_path='/gdrive/My Drive/'
# Read the test set; 'utf-8-sig' is the encoding passed to pandas for this CSV
raw_test=pd.read_csv(root_path + 'torque/1.Data/news_test.csv', encoding = 'utf-8-sig')

## **시간 측정 시작**

In [None]:
# Record the start time; the elapsed time is printed after the prediction cell
import time
start=time.time()

## **라이브러리 불러오기**

In [None]:
# Model paths
# The train code (nh_origin) must be run beforehand to dump the model weights, the model and the vocab to files with pickle.
bertmodel_path = root_path+'torque/5.Model/bertmodel.dat'
# Path of the vocab used by the tokenizer
vocab_path = root_path+'torque/3.tokenizer/vocab.dat'
# Path of the classifier weights (read with torch.load and passed to load_state_dict)
model_path = root_path+'torque/5.Model/model.pth'
# Path where the submission file is saved
save_path = root_path+'torque/submission.csv'

In [None]:
import html
import pickle

import numpy as np

# `import torch` is the documented way to bind the package; the previous
# `from torch import torch` depended on the package exposing itself as an attribute.
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp

from kobert.utils import get_tokenizer

# All model inputs and the model itself are moved to this device in later cells.
device = torch.device("cuda:0")

## **pos_Tagger, Tokenizer, pretrained_embedding, Model 불러오기**

In [None]:
# Load the pretrained model
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced by your own training run.
with open(bertmodel_path, 'rb') as f:
  bertmodel = pickle.load(f)
# Load the vocab file
with open(vocab_path, 'rb') as f:
  vocab = pickle.load(f)
# Load the tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

[██████████████████████████████████████████████████]


In [None]:
# KoBERT classifier
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            must return a 2-tuple whose second element is fed to the
            classifier.
        hidden_size: Input feature size of the linear classifier.
        num_classes: Output size of the linear classifier.
        dr_rate: Dropout probability applied to the pooled output. When falsy
            (``None`` or ``0``), no dropout layer is created or applied.
        params: Unused; kept so existing callers that pass it keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)

        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask that is 1 for the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor; only its second dimension and device are used.
            valid_length: One length per row of ``token_ids`` (tensor or sequence).

        Returns:
            Float tensor of shape ``(len(valid_length), token_ids.size(1))`` on
            the same device as ``token_ids``.
        """
        # Compare every position index with the row's length in one vectorised
        # operation instead of writing one row slice at a time.
        lengths = torch.as_tensor(valid_length, device=token_ids.device)
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Run the encoder and return the classifier output for each row."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask)

        # Fall back to the raw pooled output when dropout is disabled;
        # previously `out` was left unassigned in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler

        return self.classifier(out)

In [None]:
# Instantiate the model on the target device, then load the saved weights into it
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

## **형태소 분석 + 전처리**

In [None]:
# Dataset wrapper that turns raw rows into BERT model inputs
class BERTDataset(Dataset):
    """Pre-transformed sentences paired with their integer labels.

    Each row of ``dataset`` is indexed with ``sent_idx`` for the text and
    ``label_idx`` for the label. The text is passed through
    ``nlp.data.BERTSentenceTransform`` once, at construction time.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence.
        encoded = []
        for row in dataset:
            encoded.append(to_features([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: collect the labels as int32.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target, )

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

In [None]:
# Preprocessing
max_len = 64
batch_size = 512

# NOTE(review): this cell rebinds raw_test to a frame holding only 'content'/'info',
# so re-running it without reloading the CSV fails on the 'title'/'ord' lookups below.
raw_test=raw_test.astype('string')
# Join content, title and ord into one text field, separated by '. '
raw_test['content']=raw_test['content']+'. '+raw_test['title']+'. '+raw_test['ord']

# Keep the ids for the submission file and only the text for the model input
submission=raw_test[['id']].copy()
raw_test=raw_test[['content']].copy()

# Decode HTML entities in the text
raw_test['content']=np.vectorize(html.unescape)(raw_test['content'])
# Placeholder label column: BERTDataset reads a label from column 1, but the
# prediction loop never uses its value
raw_test['info']=0

# Build the dataset and the dataloader
data_test = BERTDataset(raw_test.values, 0, 1, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

## **예측**

In [None]:
# Ask cuDNN for deterministic algorithms and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
model.half()  # convert to half precision

model.eval()
answer = []
with torch.no_grad():
    # The fourth item of each batch is the placeholder label; it is not needed
    # for prediction, so it is no longer copied to the device.
    for token_ids, valid_length, segment_ids, _unused_label in test_dataloader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        # No .detach() needed: no graph is recorded under torch.no_grad().
        logits = model(token_ids, valid_length, segment_ids)
        answer.extend(torch.argmax(logits, dim=1).tolist())

# One predicted class index per test row, in dataloader order.
submission['info'] = answer

In [None]:
# Seconds elapsed since `start` was recorded
print(time.time()-start)

127.24531888961792


In [None]:
# Write the id/info columns to save_path as comma-separated UTF-8, without the index column
submission.to_csv(save_path, encoding='utf-8', header=submission.columns, index=None, sep=',')

#### **고찰**

In [None]:
# Achieved 93.38% accuracy on the submission (leaderboard)