In [None]:
# Change the working directory to the project folder so that the relative
# paths used later ('./final_idiom_dataset_for_ko.pkl', saved models, CSV) resolve.
# NOTE(review): absolute path, presumably on a Colab-mounted Google Drive — adjust per environment.
%cd '/content/drive/MyDrive/mulcam_project/Korean BERT'

/content/drive/MyDrive/mulcam_project/Korean BERT


In [None]:
# Install the libraries this notebook imports (MXNet/GluonNLP for the KoBERT
# tokenizer pipeline, SentencePiece, Transformers, PyTorch and KoBERT itself).
# NOTE(review): no versions are pinned, so a fresh run may pull incompatible releases.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master


In [None]:
import gc
import torch
import pickle
import re
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import gluonnlp as nlp
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the pretrained KoBERT encoder together with its vocabulary.
bertmodel, vocab = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [None]:
# Dataset description (English rendering of the note in the string below):
#  - label 1: 3,376 sentences that contain an idiom (KISS data)
#  - label 0: 3,376 sentences without an idiom (AI Hub data with the KISS
#    sentences excluded and duplicates removed, then 3,376 randomly sampled)
"""
final_idiom_dataset_for_ko.pkl
 - 관용구 포함 문장(1): 3,376개 ( KISS data )
 - 관용구 미포함 문장(0): 3,376개 ( AI Hub에서 위 KISS data 제외, 중복 제거한 뒤 3,376개를 random sampling )
"""
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file when it comes from a trusted source.
with open('./final_idiom_dataset_for_ko.pkl', 'rb') as f:
  corpus = pickle.load(f)

In [None]:
# Show how many rows carry each value of the 'Label' column (class balance check).
corpus['Label'].value_counts()

1    3376
0    3376
Name: Label, dtype: int64

In [None]:
# Preview the first rows of the corpus.
corpus.head()

Unnamed: 0,ko,en,Label
0,다만 지난 7월부터 수면 무호흡증 진단을 위한 수면다원검사와 치료에 필요한 양압기 ...,"However, since July, as health insurance (20% ...",0
1,하지만 몇몇 사람들은 그 표현이 관용구인지 알아차리지 못합니다.,"Some people, however, cant recognize that it w...",0
2,끈끈한 승부근성으로 찬스에서 더 영양가 만점의 활약을 하는 오재원이 있기에 선두질주...,The Doosan Bears are gaining momentum in the l...,1
3,협약내용은 삼성화재서비스손해사정㈜에서 매월 임직원들의 기부를 통해 향후 3년간 30...,The contents of the agreement are to support m...,0
4,이 같은 대중적 관심을 겨냥해 이동통신사가 발 빠르게 움직였다.,"In response to such public interest, mobile op...",1


In [None]:
# Hold out 20% of the corpus as the test split; the fixed random_state keeps the split reproducible.
train, test = train_test_split(corpus, test_size=0.2, random_state=42)

In [None]:
# Split the training portion further into train / validation sets.
# Every example is a [sentence, label] pair.
dataset = [[text, label] for text, label in zip(train.ko.to_list(), train.Label.to_list())]

# The training slice used to be dataset[:], i.e. it contained every validation
# example as well, so the reported validation accuracy was measured on data the
# model had already been trained on. The two slices are now disjoint:
# examples before `val_start` are used for training, the rest for validation.
val_start = 5001
dataset_train = dataset[:val_start]
dataset_val = dataset[val_start:]

In [None]:
# Display the number of training and validation examples.
len(dataset_train), len(dataset_val)

(5401, 400)

In [None]:
# Load the pretrained BERT SentencePiece tokenizer and wrap it, together with
# the KoBERT vocabulary, in a GluonNLP tokenizer (case is preserved: lower=False).
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [None]:
# Dataset wrapper that prepares examples for KoBERT training
class BERTDataset(Dataset):
    """Pre-tokenised view over a list of rows for use with a DataLoader.

    Every row of ``dataset`` is indexable; ``row[sent_idx]`` is the sentence
    and ``row[label_idx]`` the label. Sentences are run through
    ``nlp.data.BERTSentenceTransform`` once, at construction time.

    Args:
        dataset: sequence of rows.
        sent_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row.
        bert_tokenizer: tokenizer handed to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        def encode(row):
            # The transform takes a list holding the sentence.
            return to_bert_input([row[sent_idx]])

        self.sentences = list(map(encode, dataset))
        self.labels = [np.int32(row[label_idx]) for row in dataset]
        # Echo the first encoded example as a quick sanity check.
        print(self.sentences[0])

    def __getitem__(self, i):
        # One sample = the transformed sentence tuple followed by its label.
        features = self.sentences[i]
        return features + (self.labels[i], )

    def __len__(self):
        return len(self.labels)

In [None]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1 # a large lr from the very start makes training unstable, so the lr is ramped up gradually first
num_epochs = 3
max_grad_norm = 1 # gradient-clipping threshold: gradients are clipped so their norm does not exceed this value
log_interval = 50 # number of batches between the manually printed training-log lines
learning_rate =  1e-5

# Tokenise both splits up front. Positional arguments: sentence index 0,
# label index 1, tokenizer, max_len, pad=True, pair=False.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_val = BERTDataset(dataset_val, 0, 1, tok, max_len, True, False)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = torch.utils.data.DataLoader(data_train, shuffle=True, batch_size=batch_size, num_workers=4)
val_dataloader = torch.utils.data.DataLoader(data_val, batch_size=batch_size, num_workers=4)


(array([   2,  517, 5660, 5940, 3501, 6116, 1740, 3304, 2270, 7096, 3867,
        517,   54,    3,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32), array(14, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32))
(array([   2, 1594, 4202, 6903, 2047, 5330, 4983, 2064, 5184, 7794, 1058,
       7095, 4805, 6896, 2199, 7997, 5084, 6903, 2332, 2235, 6527, 7095,
       4065, 7088,  517, 5627, 7852, 4965,  517,   54,    3,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
       

In [None]:
# Attach a feed-forward classification head on top of BERT
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classifier.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``return_dict=False`` and expected to return
            a pair whose second element is the pooled sentence representation.
        hidden_size: width of the pooled representation.
        num_classes: number of output classes (change for multi-class tasks).
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout.
        params: unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2, # change this for multi-class classification
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i`` and 0 elsewhere."""
        # Compare every position index against the row's valid length in one
        # vectorised operation instead of writing row by row in a Python loop.
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict=False)
        # Without dropout the pooled output goes straight to the classifier.
        # Previously `out` was only assigned inside the dropout branch, so the
        # default dr_rate=None raised UnboundLocalError here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows in X whose highest-scoring column equals Y.

    X holds one row of class scores per example and Y the matching class
    indices; the result is a NumPy scalar (correct count / number of rows).
    """
    _, predicted = X.max(dim=1)
    n_correct = predicted.eq(Y).sum()
    n_examples = predicted.size(0)
    return n_correct.data.cpu().numpy() / n_examples

# Run Python's garbage collector, then ask PyTorch to release its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Build the test data.
# Each sentence is paired with a placeholder label of 0 (BERTDataset needs a
# label column); the real labels in test['Label'] are used at evaluation time.
test_dataset = [
    [sentence, placeholder]
    for sentence, placeholder in zip(test.ko.to_list(), np.zeros(len(test)))
]
test_data = BERTDataset(test_dataset, 0, 1, tok, max_len, True, False)
# batch_size=1: one sentence per step, so one prediction is collected per batch.
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

(array([   2, 1316, 7178, 6865, 2149, 6645,  517, 7563, 6054, 7609, 6116,
       2526, 1316, 5666, 6629, 2095, 7003, 7724, 6084, 5330,  517,   46,
       3394, 7096, 6084, 7088, 4012, 5452, 7828, 3826, 5424, 7003, 1698,
       6412, 6607, 6493, 7096, 6855, 7673, 1674, 1815, 1575, 2479, 7095,
       4035, 7096, 2008, 6116, 1970, 5835, 5782,  517,   54,    3,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32), array(54, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32))


In [None]:
# Freeze the pretrained model: turn off gradient tracking for every KoBERT
# parameter so that only layers added on top of it can be updated.
for params in bertmodel.parameters():
  params.requires_grad = False

In [None]:
# Wrap KoBERT with the classification head (dropout 0.5) and move it to the target device.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [None]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [ # weight decay to guard against overfitting
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
# Total number of optimizer steps, and how many of them are warmup steps.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
# Cosine learning-rate schedule with warmup.
# NOTE(review): the training cells below rebuild the model, optimizer and scheduler themselves.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
# Train the classifier while the KoBERT weights are frozen (see the freeze cell
# above): report train/validation accuracy per epoch, save the model, then
# predict the test set.
NUM_OF_MODELS = 1
answers=[]  # one list of test-set predictions per trained model
import random  # not used in this cell; left in place in case other cells rely on it
for NUM in range(NUM_OF_MODELS):
  dr_r = 0.5
  # NOTE: NUM counts trained models, not epochs, despite the printed label.
  print("epoch: ",NUM+1)
  print("drop out rate: ",dr_r)
  model = BERTClassifier(bertmodel,  dr_rate=dr_r ).to(device)
  # Prepare optimizer and schedule (warmup followed by cosine decay)
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
  loss_fn = nn.CrossEntropyLoss()
  t_total = len(train_dataloader) * num_epochs
  warmup_step = int(t_total * warmup_ratio)
  scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
  for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0  # accumulates *validation* accuracy for this epoch
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
      optimizer.zero_grad()
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      label = label.long().to(device)
      out = model(token_ids, valid_length, segment_ids)
      loss = loss_fn(out, label)
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
      optimizer.step()
      scheduler.step()  # Update learning rate schedule
      train_acc += calc_accuracy(out, label)
      if batch_id % log_interval == 0:
        print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    # Evaluation needs no gradients; no_grad avoids building the autograd graph.
    with torch.no_grad():
      for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} val acc {}".format(e+1, test_acc / (batch_id+1)))

  answer=[]
  torch.save(model, './KoBERT(9.30)_freeze.model')
  # evaluate test_data
  model.eval()
  with torch.no_grad():
    # The labels in test_dataloader are placeholders, so only inputs are used.
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      out = model(token_ids, valid_length, segment_ids)
      max_vals, max_indices = torch.max(out, 1)
      # batch_size is 1, so the first index is the prediction for this sentence.
      answer.append(max_indices[0].item())
  answers.append(answer)

epoch:  1
drop out rate:  0.5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/85 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.7139754891395569 train acc 0.46875
epoch 1 batch id 51 loss 0.7098119854927063 train acc 0.48743872549019607
epoch 1 train acc 0.4915735294117647


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 1 val acc 0.47767857142857145


  0%|          | 0/85 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.6907586455345154 train acc 0.53125
epoch 2 batch id 51 loss 0.7612102031707764 train acc 0.49325980392156865
epoch 2 train acc 0.491125


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 2 val acc 0.49107142857142855


  0%|          | 0/85 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.7267736792564392 train acc 0.453125
epoch 3 batch id 51 loss 0.6768519878387451 train acc 0.4947916666666667
epoch 3 train acc 0.4885073529411765


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 3 val acc 0.484375


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1351 [00:00<?, ?it/s]

In [None]:
# Train with the whole model: switch gradient tracking back on for every
# KoBERT parameter so the encoder is fine-tuned as well.
for param in bertmodel.parameters():
  param.requires_grad = True

In [None]:

# Fine-tune the whole model (KoBERT encoder + classifier): report
# train/validation accuracy per epoch, save the model, then predict the test set.
NUM_OF_MODELS = 1
answers=[]  # one list of test-set predictions per trained model
import random  # not used in this cell; left in place in case other cells rely on it
for NUM in range(NUM_OF_MODELS):
  dr_r = 0.5
  # NOTE: NUM counts trained models, not epochs, despite the printed label.
  print("epoch: ",NUM+1)
  print("drop out rate: ",dr_r)
  model = BERTClassifier(bertmodel,  dr_rate=dr_r ).to(device)
  # Prepare optimizer and schedule (warmup followed by cosine decay)
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
  loss_fn = nn.CrossEntropyLoss()
  t_total = len(train_dataloader) * num_epochs
  warmup_step = int(t_total * warmup_ratio)
  scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
  for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0  # accumulates *validation* accuracy for this epoch
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
      optimizer.zero_grad()
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      label = label.long().to(device)
      out = model(token_ids, valid_length, segment_ids)
      loss = loss_fn(out, label)
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
      optimizer.step()
      scheduler.step()  # Update learning rate schedule
      train_acc += calc_accuracy(out, label)
      if batch_id % log_interval == 0:
        print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    # Evaluation needs no gradients; no_grad avoids building the autograd graph.
    with torch.no_grad():
      for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    # This figure comes from val_dataloader, so it is labelled "val acc"
    # (it used to be printed as "test acc", unlike the frozen-training cell).
    print("epoch {} val acc {}".format(e+1, test_acc / (batch_id+1)))

  answer=[]
  torch.save(model, './KoBERT(9.30)_total.model')
  # evaluate test_data
  model.eval()
  with torch.no_grad():
    # The labels in test_dataloader are placeholders, so only inputs are used.
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      out = model(token_ids, valid_length, segment_ids)
      max_vals, max_indices = torch.max(out, 1)
      # batch_size is 1, so the first index is the prediction for this sentence.
      answer.append(max_indices[0].item())
  answers.append(answer)


epoch:  1
drop out rate:  0.5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/85 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.7259151339530945 train acc 0.515625
epoch 1 batch id 51 loss 0.6532925367355347 train acc 0.5741421568627451
epoch 1 train acc 0.6378455882352941


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 1 test acc 0.8348214285714286


  0%|          | 0/85 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.5488540530204773 train acc 0.703125
epoch 2 batch id 51 loss 0.41261130571365356 train acc 0.8100490196078431
epoch 2 train acc 0.8319632352941176


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 2 test acc 0.9129464285714286


  0%|          | 0/85 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.2480756640434265 train acc 0.90625
epoch 3 batch id 51 loss 0.2942594885826111 train acc 0.8930759803921569
epoch 3 train acc 0.8948308823529412


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 3 test acc 0.921875


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1351 [00:00<?, ?it/s]

In [None]:
# Performance evaluation on the held-out test split.
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# number is unchanged, but the argument order now matches the report below.
print(accuracy_score(test['Label'], answer))
evaluation_report = classification_report(test['Label'], answer)
print(evaluation_report)


0.8793486306439674
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       690
           1       0.86      0.89      0.88       661

    accuracy                           0.88      1351
   macro avg       0.88      0.88      0.88      1351
weighted avg       0.88      0.88      0.88      1351



In [None]:
# Evaluate on a separate set of 50 idiom sentences + 50 non-idiom sentences.
new_test = pd.read_csv('./new_idiom_dataset(100).csv')

In [None]:
# Look at one example sentence from the new evaluation set.
new_test['ko'][10]

'이지훈은 3일 JTBC 엔터뉴스팀을 통해 "코로나 19로 인한 사회적 거리두기 4단계 방침과 자가격리를 해야 하는 일본인 예비신부인 미우라 아야네의 가족들 상황을 고려해 예식을 연기한다"고 말했다.'

In [None]:
# Confirm the label balance of the new evaluation set.
new_test['Label'].value_counts()

1    50
0    50
Name: Label, dtype: int64

In [None]:
# Shuffle the rows and renumber the index. The fixed seed (the same value used
# for train_test_split) makes the row order reproducible across runs.
new_test = new_test.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Reload the model object saved by the full fine-tuning cell and switch it to
# evaluation mode.
# NOTE(review): the file holds a whole pickled model (saved with torch.save(model, ...)),
# so it should only be loaded from a trusted source; loading presumably also
# needs the BERTClassifier class to be defined in the session — confirm.
test_model = torch.load('./KoBERT(9.30)_total.model')
test_model.eval()

In [None]:
# Build the dataset/loader for the new evaluation sentences.
# Each sentence is paired with a placeholder label of 0 (BERTDataset needs a
# label column); the real labels in new_test['Label'] are used at evaluation time.
new_test_dataset = [
    [sentence, placeholder]
    for sentence, placeholder in zip(new_test.ko.to_list(), np.zeros(len(new_test)))
]
new_test_data = BERTDataset(new_test_dataset, 0, 1, tok, max_len, True, False)
# batch_size=1: one sentence per step, so one prediction is collected per batch.
new_test_dataloader = torch.utils.data.DataLoader(new_test_data, batch_size=1, num_workers=1)

(array([   2,  517, 5330, 6527, 7941, 7727, 4033, 2000, 4665, 7120, 6682,
       6896, 1838, 3295, 6519, 6273, 5859, 6113, 5330, 4012, 2802, 1133,
       4257, 4469, 2514, 7659, 7533, 7119, 7088, 2329, 7639, 6079, 4456,
       7848, 3084, 5561, 2989, 1407, 2514, 7659, 7533, 7119, 1815,  517,
       5330, 6527, 7941, 7727,  730, 1253, 7869,  517,   54,    3,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32), array(54, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32))


In [None]:
# Predict a label for every sentence of the new evaluation set with the reloaded model.
new_answer = []
# Put the model that is actually used below (test_model) into eval mode.
# This cell used to call model.eval() on the training-session `model`, which
# is a different object and does not exist when only the saved model is loaded.
test_model.eval()
# Inference needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
  # The labels in new_test_dataloader are placeholders, so only inputs are used.
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(new_test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    out = test_model(token_ids, valid_length, segment_ids)
    max_vals, max_indices = torch.max(out, 1)
    # batch_size is 1, so the first index is the prediction for this sentence.
    new_answer.append(max_indices[0].item())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Performance evaluation on the new 100-sentence set.
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# number is unchanged, but the argument order now matches the report below.
print(accuracy_score(new_test['Label'], new_answer))
evaluation_report = classification_report(new_test['Label'], new_answer)
print(evaluation_report)


0.92
              precision    recall  f1-score   support

           0       0.96      0.88      0.92        50
           1       0.89      0.96      0.92        50

    accuracy                           0.92       100
   macro avg       0.92      0.92      0.92       100
weighted avg       0.92      0.92      0.92       100

