# KoBERT finetuning

In [None]:
# Change the Python version to 3.7.0: download the source tarball, unpack it, then configure, build and install it
!wget https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz
!tar xvfz Python-3.7.0.tgz
!Python-3.7.0/configure
!make
!sudo make install

In [2]:
# Print the version of the `python` executable found by the shell
!python --version

Python 3.7.0


In [None]:
# Install the required packages
!pip install mxnet
!pip install gluonnlp
!pip install pandas
!pip install tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install openpyxl

# Pin numpy to resolve: /usr/local/lib/python3.10/dist-packages/mxnet/numpy/utils.py:37: FutureWarning: In the future `np.bool` will be defined as the corresponding NumPy scalar. bool = onp.bool
!pip install numpy==1.23.1

In [None]:
# Install the kobert_tokenizer package from the kobert_hf subdirectory of the KoBERT GitHub repository
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [8]:
import os

import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gluonnlp as nlp
import pandas as pd

In [9]:
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from google.colab import drive
from sklearn.model_selection import train_test_split

In [10]:
# Compute device: use the first GPU when CUDA is available, otherwise fall back
# to the CPU so the notebook still runs on a runtime without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
# Mount Google Drive and load the Excel file used for training into a DataFrame
drive.mount('/content/drive/')
data_input = pd.read_excel('/content/drive/MyDrive/junho/data/data_conversation.xlsx')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [12]:
# Check that the data was loaded correctly by displaying 5 random rows
data_input.sample(n=5)

Unnamed: 0,Text,Class
112631,애들이 인제 세금으로 맛있는 걸 사 먹을 수 있게 과자 같은 걸 배치해 놓고는,9
1028664,어어어,11
297378,스카이패밀리,9
666487,있잖아,11
471073,롤러코스터 거의 다 버스,3


In [27]:
# Convert the loaded training data into a list of [text, label-as-string] pairs.
# Rows whose text cell is a float are skipped (presumably NaN from empty cells — TODO confirm).
data_list = [
    [text, str(label)]
    for text, label in zip(data_input['Text'], data_input['Class'])
    if type(text) != float
]

In [28]:
# Sanity check: number of converted samples, then the first and the last entry
print(len(data_list))
print(data_list[0])
print(data_list[-1])

1043936
['안녕하세요 잘 들리시나요', '9']
['하면은 다 똑같은 간호산데 뭐', '1']


In [29]:
# Hold out 1% of the samples for validation; random_state=1 is passed so the split is seeded
dataset_train, dataset_validation = train_test_split(data_list, test_size=0.01, random_state=1)

In [30]:
# Sizes of the two splits
print(len(dataset_train))
print(len(dataset_validation))

# Last sample of each split
print(dataset_train[-1])
print(dataset_validation[-1])

1033496
10440
['너가 자주 이용하는 인터넷 쇼핑몰은 뭔데', '11']
['아 좀 측정해', '11']


In [31]:
# Hyper-parameters
max_len = 64            # passed to the datasets as max_len (forwarded as max_seq_length to the sentence transform)
batch_size = 64         # batch size of the DataLoaders
warmup_ratio = 0.1      # fraction of the total training steps used as scheduler warmup steps
num_epochs = 5          # upper bound of the epoch loop; also used to compute the total step count
max_grad_norm = 1       # max norm passed to clip_grad_norm_ in the training loop
log_interval = 200      # a progress line is printed every log_interval training batches
learning_rate =  5e-5   # lr passed to the AdamW optimizer

In [32]:
# Dataset class that preprocesses the train/test data
class BERTDataset(Dataset):
    """Dataset of BERT-ready sentences with their integer labels.

    Every row of ``dataset`` is indexed with ``sent_idx`` for the sentence and
    ``label_idx`` for the label. Sentences are converted eagerly in
    ``__init__`` with ``nlp.data.BERTSentenceTransform``; labels are cast to
    ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        encode = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        # First pass: transform every sentence.
        self.sentences = []
        for row in dataset:
            self.sentences.append(encode([row[sent_idx]]))

        # Second pass: convert every label.
        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        """Return the transformed fields of sample ``i`` with its label appended."""
        return (*self.sentences[i], self.labels[i])

    def __len__(self):
        """Return the number of samples."""
        sample_count = len(self.labels)
        return sample_count


In [33]:
# Dataset tokenization setup: load the KoBERT tokenizer and pretrained BERT model,
# build the vocabulary from the tokenizer's sentencepiece file, and keep the
# tokenize function that the datasets use.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [34]:
# Tokenize, integer-encode and pad every sample so it can be fed to the BERT model.
# Column 0 holds the sentence and column 1 the label; pad=True, pair=False.
data_train = BERTDataset(dataset_train, 0, 1, tok, vocab, max_len, True, False)
data_validation = BERTDataset(dataset_validation, 0, 1, tok, vocab, max_len, True, False)

In [35]:
# Wrap the datasets in torch DataLoaders, which completes the input preprocessing.
# NOTE(review): no shuffle argument is passed, so the training batches are served in the
# same order every epoch — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size = batch_size, num_workers = 5)
validation_dataloader = torch.utils.data.DataLoader(data_validation, batch_size = batch_size, num_workers = 5)

In [36]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and must return a pair whose second element is the pooled output.
        hidden_size: Size of the pooled output fed to the linear head.
        num_classes: Number of logits produced per sample; adjust to the
            number of emotion classes.
        dr_rate: Dropout probability applied to the pooled output. ``None``
            (the default) or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 19,   # adjust to the number of emotion classes
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids; only its shape and device are used.
            valid_length: One length per row of ``token_ids`` (tensor or sequence).

        Returns:
            Float tensor with the shape and device of ``token_ids``.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).view(-1, 1)
        # Broadcasting compares every position index against its row's length,
        # replacing the per-row Python loop with a single tensor op.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch.

        Args:
            token_ids: Token-id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row lengths used to build the attention mask.
            segment_ids: Passed to the encoder as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict = False)
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unbound in that case and forward() crashed.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [37]:
# Build the classifier on top of the pretrained BERT model (dropout 0.5) and move it to the selected device
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [None]:
# Optimizer setup: parameters whose name contains an entry of `no_decay` get
# weight_decay 0.0, every other parameter gets weight_decay 0.01.
no_decay = ['bias', 'LayerNorm.weight']
_decayed, _undecayed = [], []
for _name, _param in model.named_parameters():
    if any(nd in _name for nd in no_decay):
        _undecayed.append(_param)
    else:
        _decayed.append(_param)
optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _undecayed, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [39]:
# Loss function and learning-rate schedule setup
loss_fn = nn.CrossEntropyLoss() # loss function for multi-class classification
t_total = len(train_dataloader) * num_epochs  # total training steps: batches per epoch times epochs
warmup_step = int(t_total * warmup_ratio)  # number of warmup steps handed to the scheduler
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
# Checkpoint location: directory and file name (concatenated by the checkpoint helpers' default `path`)
checkpoint_path = '/content/drive/MyDrive/junho/checkpoint/'
checkpoint_file = 'emotion_classification_model_checkpoint.pt'

In [None]:
# Checkpoint save helper
def save_checkpoint(model, optimizer, epoch, path=checkpoint_path+checkpoint_file):
  """Write the epoch number and the model/optimizer state dicts to ``path``.

  Args:
      model: Module whose ``state_dict()`` is stored.
      optimizer: Optimizer whose ``state_dict()`` is stored.
      epoch: Value stored under the ``'epoch'`` key.
      path: Destination file; its parent directory is created when missing.
  """
  # torch.save does not create missing directories, so make sure the target exists.
  directory = os.path.dirname(path)
  if directory:
    os.makedirs(directory, exist_ok=True)
  torch.save({
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
  }, path)
  print(f"체크포인트 저장 - epoch {epoch}")

# Checkpoint load helper
def load_checkpoint(model, optimizer, path=checkpoint_path+checkpoint_file):
  """Restore model and optimizer state from ``path`` when the file exists.

  Args:
      model: Module that receives the stored ``model_state_dict``.
      optimizer: Optimizer that receives the stored ``optimizer_state_dict``.
      path: Checkpoint file to read.

  Returns:
      The value stored under ``'epoch'`` in the checkpoint, or 0 when no
      checkpoint file exists.
  """
  if not os.path.isfile(path):
    return 0
  # Map the stored tensors onto the device the model currently lives on, so a
  # checkpoint written on a GPU can also be resumed on a CPU-only runtime.
  target_device = next(model.parameters()).device
  checkpoint = torch.load(path, map_location=target_device)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  print(f"체크포인트 로드 -  epoch {epoch}")
  return epoch

In [40]:
# Accuracy helper
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals the label in ``Y``."""
    _, predicted = torch.max(X, 1)
    hits = (predicted == Y).sum().data.cpu().numpy()
    batch_len = predicted.size()[0]
    return hits / batch_len

In [41]:
# Start training. When a checkpoint exists, training resumes after the last
# completed epoch (the checkpoint stores the number of finished epochs).
# NOTE(review): the scheduler state is not part of the checkpoint, so a resumed
# run restarts the warmup/cosine schedule from its first step — confirm this is acceptable.
os.makedirs(checkpoint_path, exist_ok=True)  # torch.save needs the directory to exist
start = load_checkpoint(model, optimizer)
for e in range(start, num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # Evaluate on the held-out validation split. The loader defined by this
    # notebook is `validation_dataloader`; `test_dataloader` does not exist here.
    model.eval()
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(validation_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

    # Persist progress after every epoch; e + 1 finished epochs means a later
    # run's range(start, num_epochs) continues with the next one.
    save_checkpoint(model, optimizer, e + 1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/16149 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 3.0316402912139893 train acc 0.046875
epoch 1 batch id 201 loss 2.7544097900390625 train acc 0.0681747512437811
epoch 1 batch id 401 loss 2.1203088760375977 train acc 0.17491427680798005
epoch 1 batch id 601 loss 2.212980031967163 train acc 0.21862000831946754
epoch 1 batch id 801 loss 2.176074981689453 train acc 0.24471363920099876


KeyboardInterrupt: 

In [None]:
# Create the folder the model is saved into, then make it the working directory
import os

path = '/content/drive/MyDrive/junho/models/'
# makedirs with exist_ok=True also creates missing parent folders and does not
# fail when the folder already exists.
os.makedirs(path, exist_ok=True)

os.chdir(path)
os.getcwd()

In [None]:
# Model file names: one for the torch.save copy (.pt) and one for the joblib copy (.pkl)
pt_name = 'emotion_classification_model.pt'
pkl_name = 'emotion_classification_model.pkl'

In [None]:
import joblib

# Save the trained model into the folder, once with torch.save and once with joblib.
# Both calls serialize the whole model object rather than only its state_dict.
torch.save(model, path + pt_name)
joblib.dump(model, path + pkl_name)

In [None]:
# Load the saved models back.
# NOTE(review): both files hold a pickled model object; unpickling can execute arbitrary
# code, so only load files that this notebook wrote itself.
pt_model = torch.load(path + pt_name)
pkl_model = joblib.load(path + pkl_name)

In [None]:
# Convert real-valued scores into percentage probabilities with the softmax function
def new_softmax(a) :
    """Return softmax(a) as percentages rounded to 3 decimals.

    The maximum of ``a`` is subtracted before exponentiating to prevent overflow.
    """
    shifted = a - np.max(a)
    weights = np.exp(shifted)
    percentages = (weights / np.sum(weights)) * 100
    return np.round(percentages, 3)

In [None]:
def predict(predict_sentence, model=None):
    """Classify one sentence and return its class percentages plus an emotion label.

    Args:
        predict_sentence: Sentence to classify.
        model: Classifier to use; defaults to the notebook-level ``pt_model``.

    Returns:
        A list holding one rounded percentage per model output class, followed
        by the emotion name chosen from the highest percentage.
    """
    if model is None:
        model = pt_model

    # NOTE(review): only indices 0-5 have their own label and every other index is
    # reported as '혐오', while BERTClassifier defaults to num_classes=19 — confirm
    # that this mapping matches the label scheme of the training data.
    emotions = ("공포", "놀람", '분노', '슬픔', '중립', '행복')

    sample = [[predict_sentence, '0']]  # '0' is a placeholder label; it is not used for prediction
    another_test = BERTDataset(sample, 0, 1, tok, vocab, max_len, True, False)  # tokenized sentence
    # A single sentence does not need worker processes; load it in the main process.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    probability = []
    with torch.no_grad():  # inference only, no gradients needed
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)

            for row in out:
                scores = new_softmax(row.detach().cpu().numpy()).tolist()
                probability = [np.round(score, 3) for score in scores]

                top = int(np.argmax(scores))
                emotion = emotions[top] if top < len(emotions) else '혐오'
                probability.append(emotion)
    return probability

In [None]:
# Interactive loop: classify every entered sentence; entering 0 at the prompt ends the loop
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0" :
        break
    print(predict(sentence))
    print("\n")

In [None]:
def predict(predict_sentence, model=None):
    """Classify one sentence and return its class percentages plus an emotion label.

    This definition replaces the earlier ``predict`` in the notebook namespace;
    it differs only in that it defaults to the joblib-loaded ``pkl_model``.

    Args:
        predict_sentence: Sentence to classify.
        model: Classifier to use; defaults to the notebook-level ``pkl_model``.

    Returns:
        A list holding one rounded percentage per model output class, followed
        by the emotion name chosen from the highest percentage.
    """
    if model is None:
        model = pkl_model

    # NOTE(review): only indices 0-5 have their own label and every other index is
    # reported as '혐오', while BERTClassifier defaults to num_classes=19 — confirm
    # that this mapping matches the label scheme of the training data.
    emotions = ("공포", "놀람", '분노', '슬픔', '중립', '행복')

    sample = [[predict_sentence, '0']]  # '0' is a placeholder label; it is not used for prediction
    another_test = BERTDataset(sample, 0, 1, tok, vocab, max_len, True, False)  # tokenized sentence
    # A single sentence does not need worker processes; load it in the main process.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    probability = []
    with torch.no_grad():  # inference only, no gradients needed
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)

            for row in out:
                scores = new_softmax(row.detach().cpu().numpy()).tolist()
                probability = [np.round(score, 3) for score in scores]

                top = int(np.argmax(scores))
                emotion = emotions[top] if top < len(emotions) else '혐오'
                probability.append(emotion)
    return probability

In [None]:
# Interactive loop: classify every entered sentence; entering 0 at the prompt ends the loop
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0" :
        break
    print(predict(sentence))
    print("\n")