In [None]:
# Run from the project directory on the mounted drive so the relative paths used below resolve.
%cd '/content/drive/MyDrive/mulcam_project/Korean BERT'

/content/drive/MyDrive/mulcam_project/Korean BERT


In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install imblearn


In [None]:
import gc
import torch
import pickle
import re
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import gluonnlp as nlp
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Encoder and vocabulary; the vocabulary is later handed to the tokenizer.
bertmodel, vocab = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [None]:
# Load the two pickled inputs: the labelled corpus and the non-idiom sentences.
with open('./2/train_34614.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

with open('./3/no_idiom_30000.pkl', 'rb') as no_idiom_file:
    no_idiom = pickle.load(no_idiom_file)

In [None]:
# Keep only the idiom sentences (Label == 1) and show how many remain.
idiom_rows = corpus['Label'].eq(1)
corpus = corpus[idiom_rows]
len(corpus)

17307

In [None]:
# Inspect the non-idiom frame (columns: 'Unnamed: 0', 'ko', 'Label').
no_idiom

Unnamed: 0,ko,Label
0,최창식 구청장은 “동대문디자인플라자 완공에 맞춰 이 지역 일대의 간판을 깨끗이 정비...,0
1,그와 함께 있는 것 자체가 행운이었습니다.,0
2,한국관광공사는 공사가 유치한 ‘홍콩 프루덴셜’ 임직원 1800명이 인센티브 관광 목...,0
3,새로운 그대가 나에게 환하게 웃고 있군요.,0
4,"이해관계가 걸린 법안 로비 성격을 전혀 띠지 않았으나, 이 가운데 일부는 2014년...",0
...,...,...
29995,구청장은 법 제11조의 규정에 의한 정보공개 여부 등을 심의하기 위하여 구에 정보공...,0
29996,그는 구청에는 “현장에서 수고하시는 사회복지사들이 고통과 좌절을 겪지 않고 사회복지...,0
29997,바닐레 킵페르는 바닐라와 아몬드로 만들어지며 쇼트빵 쿠키와 비슷하게 생겼습니다.,0
29998,당신은 최종 BI 원본이 발행되기 전에 우리가 리뷰를 할 수 있도록 하드카피를 아직...,0


In [None]:
# Preview the first rows that survived the Label == 1 filter.
corpus.head()

Unnamed: 0,ko,Label
0,그럼에도 불구하고 올림픽에서 가장 멋진 부분 중 하나는 바로 사진입니다.,1
4,이 자리에서 이 대표는 “일자리 문제가 어려운데 당에서도 민생연석회의 출범을 조만간...,1
5,"한국이 당면한, 현대 한국의 정체성과 상처를 상징하는 특수한 것은 다름 아닌 ‘휴전...",1
6,전국 정부출연연구기관 지역조직 57개 관계자들이 지난 3월 6일 한자리 모여 지역혁...,1
8,내 휴가는 물 건너가는구나.,1


In [None]:
# Merge idiom and non-idiom rows, then shuffle them.
# The shuffle is seeded: train_test_split picks rows by position, so an unseeded
# shuffle would put different sentences in each split on every run even though
# the splits themselves use random_state=42.
new_df = (
    pd.concat([corpus, no_idiom])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Split off test first, then validation, so that oversampling is later applied to the training rows only.
train_val, test = train_test_split(new_df, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.1, random_state=42)

In [None]:
# Label distribution of the train, validation and test splits.
train['Label'].value_counts(), val['Label'].value_counts(), test['Label'].value_counts()

(0    21610
 1    12450
 Name: Label, dtype: int64, 0    2439
 1    1346
 Name: Label, dtype: int64, 0    5951
 1    3511
 Name: Label, dtype: int64)

In [None]:
# Randomly oversample the training split so both labels are equally frequent.
# The sampler is seeded so the duplicated rows are the same on every run.
oversample = RandomOverSampler(random_state=42)
ko_t = train.ko.to_numpy().reshape(-1, 1)  # features must be a 2-D NumPy array
label_t = train.Label.to_numpy()           # target stays 1-D; a column vector triggers a DataConversionWarning

x_over, y_over = oversample.fit_resample(ko_t, label_t)

  y = column_or_1d(y, warn=True)


In [None]:
# x_over and y_over now hold the same number of rows
len(x_over), len(y_over)

(43220, 43220)

In [None]:
# Rebuild the training frame from the oversampled arrays, each flattened to one dimension.
train = pd.DataFrame({
    'ko': x_over.ravel(),
    'Label': y_over.ravel(),
})

In [None]:
# Training examples as [sentence, label] pairs
dataset = [[text, label] for text, label in zip(train.ko.to_list(), train.Label.to_list())]
dataset_train = list(dataset)

In [None]:
# Validation examples as [sentence, label] pairs
dataset_val = [[text, label] for text, label in zip(val.ko.to_list(), val.Label.to_list())]

In [None]:
# Number of training and validation examples.
len(dataset_train), len(dataset_val)

(43220, 3785)

In [None]:
# Load the pretrained BERT sentencepiece tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [None]:
# Dataset wrapper that prepares sentences for KoBERT training
class BERTDataset(Dataset):
    """Torch dataset of BERT-transformed sentences paired with integer labels.

    Every row of `dataset` is indexed with `sent_idx` to obtain the sentence
    and with `label_idx` to obtain its label. All sentences are transformed
    once, in the constructor, by `nlp.data.BERTSentenceTransform` configured
    with the given tokenizer, `max_len`, `pad` and `pair`; labels are stored
    as `np.int32`.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = []
        for row in dataset:
            self.sentences.append(to_bert_input([row[sent_idx]]))

        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

        # Print the first transformed sample as a quick sanity check.
        print(self.sentences[0])

    def __getitem__(self, i):
        """Return the i-th transformed sentence with its label appended as the last element."""
        encoded = self.sentences[i]
        return encoded + (self.labels[i], )

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

In [None]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1 # a large lr from the very start destabilizes training, so the lr is ramped up gradually first
num_epochs = 3
max_grad_norm = 1 # gradient-clipping threshold: gradients are clipped so they do not exceed this value
log_interval = 50 # number of batches between printed training-log lines, for checking progress from the output
learning_rate =  1e-5


In [None]:

# Transform both splits, then wrap them in loaders; only the training loader shuffles.
data_train = BERTDataset(dataset_train, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
data_val = BERTDataset(dataset_val, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                       max_len=max_len, pad=True, pair=False)

train_dataloader = DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=4)
val_dataloader = DataLoader(data_val, batch_size=batch_size, num_workers=4)


In [None]:
# Attach a linear classification head to BERT
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classifier.

    Args:
        bert: encoder module. It is called with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and must return a pair whose second element is the pooled output.
        hidden_size: width of the pooled output fed to the classifier.
        num_classes: number of output logits (change this for multi-class tasks).
        dr_rate: dropout probability applied to the pooled output; ``None``
            or ``0`` disables dropout.
        params: not used; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like `token_ids`: 1 on the first
        `valid_length[i]` positions of row i, 0 on the rest."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict=False)
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was never assigned in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column index equals the label in Y."""
    predicted = torch.max(X, 1)[1]
    hits = (predicted == Y).sum().data.cpu().numpy()
    return hits / predicted.size()[0]

# Collect unreferenced Python objects, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Build the test set. Each sentence is paired with a zero placeholder label.
test_dataset = [[text, label] for text, label in zip(test.ko.to_list(), np.zeros(len(test)))]
test_data = BERTDataset(test_dataset, 0, 1, tok, max_len, True, False)
# batch_size=1
test_dataloader = DataLoader(test_data, batch_size=1, num_workers=1)

(array([   2, 1208,  695,  322, 4578, 2292, 1330, 7970, 7096, 1783, 5767,
       7086, 3844, 6234, 7078, 1815, 1732, 6133, 4824, 1918, 1763, 5868,
       6135, 2872, 3881,  517,   54,    3,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32), array(28, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32))


In [None]:
# Freeze the pretrained encoder: none of its parameters receives gradients.
for weight in bertmodel.parameters():
    weight.requires_grad = False

In [None]:
# Classifier on top of `bertmodel` with dropout 0.5, moved to `device`.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [None]:
# Weight decay (to limit overfitting) is applied to every parameter except those
# whose name contains one of the `no_decay` markers.
no_decay = ['bias', 'LayerNorm.weight']
named_params = list(model.named_parameters())
decay_params = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
plain_params = [p for n, p in named_params if any(nd in n for nd in no_decay)]
optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': plain_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of batches over all epochs; the first `warmup_ratio` share of them is warm-up.
t_total = num_epochs * len(train_dataloader)
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
# Train NUM_OF_MODELS classifiers and collect each one's test-set predictions.
NUM_OF_MODELS = 1
answers = []  # one list of test predictions per trained model
for NUM in range(NUM_OF_MODELS):
    dr_r = 0.5
    print("model: ", NUM + 1)
    print("drop out rate: ", dr_r)
    model = BERTClassifier(bertmodel, dr_rate=dr_r).to(device)

    # Optimizer and schedule (warm-up followed by cosine decay).
    # Parameters whose name contains a `no_decay` marker get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

    for e in range(num_epochs):
        train_acc = 0.0
        val_acc = 0.0

        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # the learning-rate schedule advances once per batch
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

        # Validation: no gradients are needed, so skip building the autograd graph.
        model.eval()
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                out = model(token_ids, valid_length, segment_ids)
                val_acc += calc_accuracy(out, label)
        print("epoch {} val acc {}".format(e+1, val_acc / (batch_id+1)))

    torch.save(model, './KoBERT(10.5)_freeze_oversample.model')

    # Predict the test set. `answer` stays at module level for the scoring cell.
    answer = []
    model.eval()
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in tqdm_notebook(test_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # extend() keeps one prediction per sentence for any batch size
            answer.extend(torch.max(out, 1)[1].tolist())
    answers.append(answer)

epoch:  1
drop out rate:  0.5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/676 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.7650063633918762 train acc 0.375
epoch 1 batch id 51 loss 0.7366164922714233 train acc 0.49111519607843135
epoch 1 batch id 101 loss 0.7451227903366089 train acc 0.4958230198019802
epoch 1 batch id 151 loss 0.6959648132324219 train acc 0.5014486754966887
epoch 1 batch id 201 loss 0.7177838087081909 train acc 0.49992226368159204
epoch 1 batch id 251 loss 0.7195535898208618 train acc 0.5029880478087649
epoch 1 batch id 301 loss 0.6962070465087891 train acc 0.5042047342192691
epoch 1 batch id 351 loss 0.7341471910476685 train acc 0.5024483618233618
epoch 1 batch id 401 loss 0.7357300519943237 train acc 0.5009741271820449
epoch 1 batch id 451 loss 0.7042489647865295 train acc 0.5017322616407982
epoch 1 batch id 501 loss 0.7042932510375977 train acc 0.5025261976047904
epoch 1 batch id 551 loss 0.6906235814094543 train acc 0.5040834845735027
epoch 1 batch id 601 loss 0.7180185317993164 train acc 0.5046537021630616
epoch 1 batch id 651 loss 0.6399808526039124 train a

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/60 [00:00<?, ?it/s]

epoch 1 val acc 0.5727430555555556


  0%|          | 0/676 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.7515535950660706 train acc 0.390625
epoch 2 batch id 51 loss 0.7467991709709167 train acc 0.49693627450980393
epoch 2 batch id 101 loss 0.7189623713493347 train acc 0.5146967821782178
epoch 2 batch id 151 loss 0.7119548916816711 train acc 0.5145902317880795
epoch 2 batch id 201 loss 0.7176350951194763 train acc 0.5141480099502488
epoch 2 batch id 251 loss 0.6949524283409119 train acc 0.5159362549800797
epoch 2 batch id 301 loss 0.7256926894187927 train acc 0.5146387043189369
epoch 2 batch id 351 loss 0.7357155084609985 train acc 0.5139779202279202
epoch 2 batch id 401 loss 0.6925645470619202 train acc 0.5148456982543641
epoch 2 batch id 451 loss 0.6872038245201111 train acc 0.5152785476718403
epoch 2 batch id 501 loss 0.7037340402603149 train acc 0.5150948103792415
epoch 2 batch id 551 loss 0.6923876404762268 train acc 0.5176950998185118
epoch 2 batch id 601 loss 0.6831713318824768 train acc 0.5171329034941764
epoch 2 batch id 651 loss 0.7419895529747009 train

  0%|          | 0/60 [00:00<?, ?it/s]

epoch 2 val acc 0.5868055555555556


  0%|          | 0/676 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.6746351718902588 train acc 0.640625
epoch 3 batch id 51 loss 0.6860417127609253 train acc 0.522671568627451
epoch 3 batch id 101 loss 0.736508846282959 train acc 0.5221225247524752
epoch 3 batch id 151 loss 0.7020180821418762 train acc 0.5222475165562914
epoch 3 batch id 201 loss 0.6880327463150024 train acc 0.5245646766169154
epoch 3 batch id 251 loss 0.783594012260437 train acc 0.5207295816733067
epoch 3 batch id 301 loss 0.6535577178001404 train acc 0.5241901993355482
epoch 3 batch id 351 loss 0.6990736126899719 train acc 0.5234597578347578
epoch 3 batch id 401 loss 0.6964598894119263 train acc 0.5238466334164589
epoch 3 batch id 451 loss 0.7173520922660828 train acc 0.5229351441241685
epoch 3 batch id 501 loss 0.681636393070221 train acc 0.5237649700598802
epoch 3 batch id 551 loss 0.6904134750366211 train acc 0.5224591651542649
epoch 3 batch id 601 loss 0.6413049101829529 train acc 0.5231385191347754
epoch 3 batch id 651 loss 0.6692578792572021 train acc 

  0%|          | 0/60 [00:00<?, ?it/s]

epoch 3 val acc 0.5936053240740741


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/9462 [00:00<?, ?it/s]

In [None]:
# Unfreeze the encoder: every parameter receives gradients again.
for weight in bertmodel.parameters():
    weight.requires_grad = True

In [None]:

# Train NUM_OF_MODELS classifiers and collect each one's test-set predictions.
NUM_OF_MODELS = 1
answers = []  # one list of test predictions per trained model
for NUM in range(NUM_OF_MODELS):
    dr_r = 0.5
    print("model: ", NUM + 1)
    print("drop out rate: ", dr_r)
    model = BERTClassifier(bertmodel, dr_rate=dr_r).to(device)

    # Optimizer and schedule (warm-up followed by cosine decay).
    # Parameters whose name contains a `no_decay` marker get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

    for e in range(num_epochs):
        train_acc = 0.0
        val_acc = 0.0

        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # the learning-rate schedule advances once per batch
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

        # Validation: no gradients are needed, so skip building the autograd graph.
        # This loop iterates val_dataloader, so the figure is reported as validation accuracy.
        model.eval()
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                out = model(token_ids, valid_length, segment_ids)
                val_acc += calc_accuracy(out, label)
        print("epoch {} val acc {}".format(e+1, val_acc / (batch_id+1)))

    torch.save(model, './KoBERT(10.5)_total_oversample.model')

    # Predict the test set. `answer` stays at module level for the scoring cell.
    answer = []
    model.eval()
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in tqdm_notebook(test_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # extend() keeps one prediction per sentence for any batch size
            answer.extend(torch.max(out, 1)[1].tolist())
    answers.append(answer)


epoch:  1
drop out rate:  0.5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/676 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.6947736740112305 train acc 0.546875
epoch 1 batch id 51 loss 0.6863440871238708 train acc 0.522671568627451
epoch 1 batch id 101 loss 0.6886696219444275 train acc 0.5541460396039604
epoch 1 batch id 151 loss 0.634591281414032 train acc 0.5977855960264901
epoch 1 batch id 201 loss 0.5095459222793579 train acc 0.6361162935323383
epoch 1 batch id 251 loss 0.48030897974967957 train acc 0.6678286852589641
epoch 1 batch id 301 loss 0.2818962633609772 train acc 0.7024501661129569
epoch 1 batch id 351 loss 0.30519211292266846 train acc 0.7315259971509972
epoch 1 batch id 401 loss 0.19722023606300354 train acc 0.7551823566084788
epoch 1 batch id 451 loss 0.19744355976581573 train acc 0.7738359201773836
epoch 1 batch id 501 loss 0.14339962601661682 train acc 0.7903879740518962
epoch 1 batch id 551 loss 0.06938280165195465 train acc 0.8044748185117967
epoch 1 batch id 601 loss 0.15710845589637756 train acc 0.8165557404326124
epoch 1 batch id 651 loss 0.22949856519699097 

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/60 [00:00<?, ?it/s]

epoch 1 test acc 0.9557002314814814


  0%|          | 0/676 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.08528152853250504 train acc 0.953125
epoch 2 batch id 51 loss 0.10115640610456467 train acc 0.9620098039215687
epoch 2 batch id 101 loss 0.23486927151679993 train acc 0.9647277227722773
epoch 2 batch id 151 loss 0.15707577764987946 train acc 0.9634726821192053
epoch 2 batch id 201 loss 0.05530591681599617 train acc 0.9631529850746269
epoch 2 batch id 251 loss 0.19233612716197968 train acc 0.9627116533864541
epoch 2 batch id 301 loss 0.10656677186489105 train acc 0.9637147009966778
epoch 2 batch id 351 loss 0.03431662917137146 train acc 0.9634081196581197
epoch 2 batch id 401 loss 0.13845206797122955 train acc 0.9641131546134664
epoch 2 batch id 451 loss 0.07023510336875916 train acc 0.9648004434589801
epoch 2 batch id 501 loss 0.14706115424633026 train acc 0.9650386726546906
epoch 2 batch id 551 loss 0.03001309745013714 train acc 0.9654888838475499
epoch 2 batch id 601 loss 0.1708841621875763 train acc 0.9657601913477537
epoch 2 batch id 651 loss 0.08329910039

  0%|          | 0/60 [00:00<?, ?it/s]

epoch 2 test acc 0.9640625


  0%|          | 0/676 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.162031888961792 train acc 0.953125
epoch 3 batch id 51 loss 0.025500312447547913 train acc 0.9791666666666666
epoch 3 batch id 101 loss 0.01609197072684765 train acc 0.9808168316831684
epoch 3 batch id 151 loss 0.014526613056659698 train acc 0.980546357615894
epoch 3 batch id 201 loss 0.011585964821279049 train acc 0.9800217661691543
epoch 3 batch id 251 loss 0.025908926501870155 train acc 0.9795816733067729
epoch 3 batch id 301 loss 0.11652131378650665 train acc 0.9798068936877077
epoch 3 batch id 351 loss 0.021201083436608315 train acc 0.9794782763532763
epoch 3 batch id 401 loss 0.05022777244448662 train acc 0.9796212593516209
epoch 3 batch id 451 loss 0.025863613933324814 train acc 0.979559312638581
epoch 3 batch id 501 loss 0.09762544929981232 train acc 0.9797592315369261
epoch 3 batch id 551 loss 0.13653790950775146 train acc 0.9793840744101633
epoch 3 batch id 601 loss 0.09782317280769348 train acc 0.9794353161397671
epoch 3 batch id 651 loss 0.00893872

  0%|          | 0/60 [00:00<?, ?it/s]

epoch 3 test acc 0.9669270833333333


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/9462 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the predictions held in `answer` against the true labels of the test split.
y_true = test['Label']
print(accuracy_score(y_true=y_true, y_pred=answer))  # accuracy is the same for either argument order
evaluation_report = classification_report(y_true, answer)
print(evaluation_report)


0.9639611075882477
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5951
           1       0.95      0.96      0.95      3511

    accuracy                           0.96      9462
   macro avg       0.96      0.96      0.96      9462
weighted avg       0.96      0.96      0.96      9462



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints you trust.
model = torch.load('./KoBERT(10.5)_total_oversample.model')
model.eval()


def _make_inference_loader(texts, batch_size):
    """Wrap raw sentences in a DataLoader; every sentence gets a placeholder label of 0."""
    pairs = [[text, label] for text, label in zip(texts, np.zeros(len(texts)))]
    data = BERTDataset(pairs, 0, 1, tok, 64, True, False)
    return torch.utils.data.DataLoader(data, batch_size=batch_size, num_workers=1)


def _predict_labels(loader):
    """Return the predicted class index of every sentence served by `loader`, in order."""
    predictions = []
    model.eval()
    with torch.no_grad():  # inference only: skip building the autograd graph
        for token_ids, valid_length, segment_ids, _ in tqdm_notebook(loader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            predictions.extend(torch.max(out, 1)[1].tolist())
    return predictions


## Setting parameters
# Kept from the original cell; none of these values is read by the inference code in this cell.
max_len = 64
warmup_ratio = 0.1
num_epochs = 3
max_grad_norm = 1
log_interval = 50
learning_rate = 1e-6

# --- 1. Labelled hold-out set: predict and score ---
with open('./2/test_3000.pkl', 'rb') as f:
    Final_BL = pickle.load(f)

data_total = Final_BL['ko'].to_list()
new_test_dataloader = _make_inference_loader(data_total, batch_size=1)
new_answer = _predict_labels(new_test_dataloader)
assert len(new_answer) == len(Final_BL['Label'])

print(accuracy_score(Final_BL['Label'], new_answer))
evaluation_report = classification_report(Final_BL['Label'], new_answer)
print(evaluation_report)

# --- 2. KCC940 sentences: predict once, save, then list the positives ---
# The `with` block closes the file; the encoding is stated explicitly because the file name says UTF-8.
with open('./KCC940_Korean_sentences_UTF8_V2.txt', encoding='utf-8') as f:
    kcc940_sentences = [line.strip() for line in f]

new_test_dataloader = _make_inference_loader(kcc940_sentences, batch_size=64)
result = _predict_labels(new_test_dataloader)

df = pd.DataFrame({'ko': kcc940_sentences,
                   'Label': result})

with open('./KCC940_KoBERT_prediction.pkl', 'wb') as f:
    pickle.dump(df, f)

# Sentences predicted as label 1
idiom_pos = df[df['Label'] == 1]
for s in idiom_pos['ko']:
    print(s)


In [None]:
# Load the pickled hold-out frame.
with open('./2/test_3000.pkl', 'rb') as holdout_file:
    Final_BL = pickle.load(holdout_file)

In [None]:
# Sentences of the hold-out frame as a plain Python list.
data_total = Final_BL['ko'].to_list()

In [None]:
# Pair every hold-out sentence with a zero placeholder label, then wrap the set in a loader.
new_test_dataset = [[text, label] for text, label in zip(data_total, np.zeros(len(data_total)))]
new_test_data = BERTDataset(new_test_dataset, 0, 1, tok, 64, True, False)
new_test_dataloader = DataLoader(new_test_data, batch_size=1, num_workers=1)

(array([   2, 1698, 4688, 2127, 3886, 6953, 7155, 2692, 4698, 5357, 7789,
       3567, 2455, 6116, 5125, 7788, 3868, 3948, 6398, 6738, 6896, 1682,
       3500, 4654, 3864,  913,  517,   54,    3,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32), array(29, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32))


In [None]:
## Setting parameters
# NOTE(review): none of these values is referenced by the remaining cells visible in this notebook -- confirm before relying on them.
max_len = 64
warmup_ratio = 0.1 
num_epochs = 3
max_grad_norm = 1 
log_interval = 50
learning_rate =  1e-6

In [None]:
# Predict a class index for every sentence served by new_test_dataloader.
new_answer = []
model.eval()
with torch.no_grad():  # inference only: skip building the autograd graph
    for token_ids, valid_length, segment_ids, _ in tqdm_notebook(new_test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        # extend() keeps one prediction per sentence for any batch size
        new_answer.extend(torch.max(out, 1)[1].tolist())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
# The number of predictions should equal the number of labels.
len(new_answer), len(Final_BL['Label'])

(3000, 3000)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the hold-out predictions against the true labels.
holdout_labels = Final_BL['Label']
print(accuracy_score(y_true=holdout_labels, y_pred=new_answer))  # accuracy is the same for either argument order
evaluation_report = classification_report(holdout_labels, new_answer)
print(evaluation_report)


0.9536666666666667
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1500
           1       0.96      0.95      0.95      1500

    accuracy                           0.95      3000
   macro avg       0.95      0.95      0.95      3000
weighted avg       0.95      0.95      0.95      3000

