# KoBERT를 이용해 감정 분류하기


`필요 환경 및 패키지 설치`

In [None]:
!pip install mxnet
!pip install pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install gluonnlp==0.10.0
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import urllib.request
from google.colab import drive
from sklearn.model_selection import train_test_split

# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
drive.mount('/content/drive')

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained KoBERT tokenizer from the 'skt/kobert-base-v1' checkpoint.
kobert_tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

# Plain callable (text -> tokens) handed to the gluonnlp sentence transform later on.
word_tokenizer = kobert_tokenizer.tokenize

# Pretrained KoBERT encoder; return_dict=False makes it return plain tuples.
kobertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

# gluonnlp vocabulary built from the tokenizer's sentencepiece file, with '[PAD]' as padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(kobert_tokenizer.vocab_file, padding_token='[PAD]')

`사전 작업`

- 사전 파라미터 정의

In [None]:
# Training hyperparameters shared by the cells below.
max_len = 200  # max_seq_length passed to the BERT sentence transform
batch_size = 100  # batch size of the train / validation DataLoaders
warmup_ratio = 0.1  # fraction of the estimated total steps used for LR warm-up
num_epochs = 35  # number of passes of the training loop
max_grad_norm = 1  # gradient-norm clipping threshold
log_interval = 100  # log training metrics every this many batches
# NOTE(review): 2e-4 is fairly high for fine-tuning every BERT weight - confirm this is intended.
learning_rate =  2e-4

- Dataset 사전 정의

In [None]:
class SENT_BERTDataset(Dataset):
    """Dataset pairing pre-encoded sentence features with (positive, negative) labels.

    Args:
        dataset: Sequence of rows; ``row[att_idx]`` holds the encoded sentence
            (a tuple of features) and ``row[label_idx]`` holds a pair of
            values convertible to ``int``.
        att_idx: Position of the encoded sentence inside each row.
        label_idx: Position of the label pair inside each row.
    """

    def __init__(self, dataset, att_idx, label_idx) :
        # Collect the encoded sentences in their original order.
        features = []
        for row in dataset:
            features.append(row[att_idx])
        self.sentences = features

        # Convert every label pair to integers and stack them into one tensor.
        label_pairs = []
        for row in dataset:
            label_pairs.append((int(row[label_idx][0]), int(row[label_idx][1])))
        self.labels = torch.tensor(label_pairs)

    def __getitem__(self, i):
        # Append the label tensor to the feature tuple so a sample is one flat tuple.
        target = self.labels[i]
        return self.sentences[i] + (target, )

    def __len__(self):
        return len(self.labels)

- 모델 사전 정의

In [None]:
class BERT_SENT_Classifier(nn.Module):
    """Sentence classifier head on top of a BERT encoder.

    The encoder's pooled output optionally goes through dropout, is projected
    to ``num_classes`` scores by a linear layer and is normalised with softmax.

    Args:
        bert: Encoder called as ``bert(input_ids=..., token_type_ids=...,
            attention_mask=..., return_dict=False)``; it must return a 2-tuple
            whose second element is the pooled output.
        hidden_size: Feature size of the pooled output.
        num_classes: Number of output classes.
        dr_rate: Dropout probability; a falsy value (``None`` / ``0``)
            disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2,
                 dr_rate = None,
                 params = None):
        super(BERT_SENT_Classifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.sent_classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)
        self.activation_softmax = nn.Softmax(dim = 1)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask with ones on the first ``valid_length[i]`` tokens of row ``i``.

        Args:
            token_ids: Token id tensor, expected shape ``(batch, seq_len)``.
            valid_length: One length per row (tensor or sequence).

        Returns:
            Float tensor shaped like ``token_ids`` on the same device.
        """
        target_device = token_ids.device
        # Compare every position index with its row's length in one broadcasted
        # operation instead of filling the mask row by row.
        lengths = torch.as_tensor(valid_length, device = target_device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device = target_device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return class probabilities of shape ``(batch, num_classes)``.

        NOTE(review): the output is already softmax-normalised. Losses that
        expect raw logits (e.g. ``nn.CrossEntropyLoss``) apply log-softmax
        again; returning logits instead would change the contract callers
        rely on (0.5 thresholding), so it is left as is here.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.to(token_ids.device), return_dict = False)
        # Without dropout the pooled output feeds the classifier directly
        # (previously `out` was left undefined in that case).
        out = self.dropout(pooler) if self.dr_rate else pooler
        posneg = self.sent_classifier(out)
        activation_posneg = self.activation_softmax(posneg)

        return activation_posneg

- 검증을 위한 긍부정 정확도 함수 사전 정의

In [None]:
def sent_calc_accuracy(X,Y):
    """Return per-column accuracy for the positive and the negative output.

    Exact-match accuracy over both columns is too strict, so each column is
    scored on its own. Scores above 0.5 count as a prediction of 1, all others
    as 0.

    Args:
        X: Model scores, shape ``(batch, 2)``; column 0 is the positive score
            and column 1 the negative score. The tensor is not modified.
        Y: Ground-truth 0/1 labels with the same shape as ``X``.

    Returns:
        ``(positive_accuracy, negative_accuracy)`` as floats computed over the
        actual number of rows in ``X``; ``(0.0, 0.0)`` for an empty batch.
    """
    num_samples = len(X)
    if num_samples == 0:
        return 0.0, 0.0

    with torch.no_grad():
        # Threshold into a new tensor so the caller's scores stay untouched.
        predictions = (X > 0.5).to(X.dtype)
        correct = (predictions - Y) == 0
        p_answer = correct[:, 0].sum().item()
        n_answer = correct[:, 1].sum().item()

    # Divide by the real batch length so a smaller final batch is scored correctly.
    p_train_acc = p_answer / num_samples
    n_train_acc = n_answer / num_samples
    return p_train_acc, n_train_acc

- valid data를 이용한 성능 측정 함수

In [None]:
def predict_valid_data_acc(sent_test_dataloader):
    """Evaluate the global ``model`` on a validation loader.

    Switches ``model`` to eval mode (it is left in eval mode afterwards) and
    averages the per-batch positive / negative accuracies returned by
    ``sent_calc_accuracy``.

    Args:
        sent_test_dataloader: Iterable yielding
            ``(token_ids, valid_length, segment_ids, label)`` batches.

    Returns:
        ``(positive_mean_accuracy, negative_mean_accuracy)``; ``(0.0, 0.0)``
        when the loader yields no batch.
    """
    model.eval()
    p_batch_acc = []
    n_batch_acc = []
    # Gradients are not needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, label in sent_test_dataloader:
            # Prepare the arguments of the model's forward pass.
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            # Compute the model output for this batch.
            out = model(token_ids, valid_length, segment_ids)
            p_train_acc, n_train_acc = sent_calc_accuracy(out, label)
            p_batch_acc.append(p_train_acc)
            n_batch_acc.append(n_train_acc)

    if not p_batch_acc:
        # Empty loader: nothing to average.
        return 0.0, 0.0

    p_valid_mean_acc = sum(p_batch_acc) / len(p_batch_acc)
    n_valid_mean_acc = sum(n_batch_acc) / len(n_batch_acc)
    return p_valid_mean_acc, n_valid_mean_acc

`실전 데이터 학습 구조 짜기`

- 학습 / 검증용 데이터 불러오기

In [None]:
# Load the labelled sentence dataset from Google Drive. Later cells read its
# '문장', '긍정', '부정' and '조합' columns.
total_inputs = pd.read_csv('/content/drive/MyDrive/kobert_modeling/실전용/csv파일 실전용/모델별최종학습데이터셋/최종긍부정분류데이터셋.csv')

# Bare expression: displays the DataFrame as the notebook cell output.
total_inputs

Unnamed: 0.1,Unnamed: 0,문장,긍정,부정,조합
0,0,아 더빙.. 진짜 짜증나네요 목소리,0,1,"(0, 1)"
1,1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1,0,"(1, 0)"
2,2,너무재밓었다그래서보는것을추천한다,0,1,"(0, 1)"
3,3,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0,1,"(0, 1)"
4,4,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1,0,"(1, 0)"
...,...,...,...,...,...
221218,71218,체계없는 인사구조. 배울게 없는 리더십. 갑론을박의 브랜딩 방향성,0,1,"(0, 1)"
221219,71219,워라밸 부분은 여타 회사를 비교하더라도 상위권에 속할 것이라 판단됨,1,0,"(1, 0)"
221220,71220,과거의 성장 동력이었던 요소들이 현재는 일정부분 발목을 잡고있는 형국임. 그 타개책...,0,1,"(0, 1)"
221221,71221,자유로운 분위기고 야근 없는 팀은 야근이 없는 편. 복지가 나쁘지 않음,1,0,"(1, 0)"


- 학습을 위한 모델, 옵티마이저, 손실함수, 스케줄러 초기화 및 requires_grad 설정

In [None]:
# Build the classifier around the pretrained KoBERT encoder and move it to the device.
model = BERT_SENT_Classifier(kobertmodel,  dr_rate = 0.5).to(device)

# Originally a "freeze KoBERT" step, but the condition is always True, so this
# marks every parameter (encoder included) as trainable.
for name, para in model.named_parameters() :
    if True:
        para.requires_grad = True

# Print requires_grad of every parameter to verify what will be back-propagated.
for name, para in model.named_parameters() :
    print(para.requires_grad)

# Parameter groups for the optimizer: biases and LayerNorm weights get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [weight for name, weight in model.named_parameters() if not any(nd in name for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [weight for name, weight in model.named_parameters() if any(nd in name for nd in no_decay)], 'weight_decay': 0.0} ]

# Optimizer.
optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate)

# Loss function.
# NOTE(review): the model's forward already applies softmax, while
# nn.CrossEntropyLoss expects raw logits - confirm this is intended.
loss_fn = nn.CrossEntropyLoss()

# Scheduler. The step count below is only an estimate (and a float, because of
# the division); the real number of batches per epoch is decided in the training loop.
test_size = 0.1
scale_weight = 0.8
len_esg_train_dataloader = len(total_inputs) * (1-test_size) * scale_weight / batch_size # total samples x train split ratio x min-count scale weight / batch size
t_total = len_esg_train_dataloader * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_step, num_training_steps = t_total)

`모델 학습 코드`

In [None]:
# Metric histories filled in by the training loop and plotted afterwards.
train_data_loss = []
p_train_data_accuracy = []
n_train_data_accuracy = []
p_valid_data_mean_accuracy = []
n_valid_data_mean_accuracy = []

# Encodes one sentence into the BERT input features used by the model.
transform = nlp.data.BERTSentenceTransform(word_tokenizer, max_seq_length = max_len, vocab = vocab, pad = True, pair = False)

# Cache: sentence text -> encoded features, computed once per distinct sentence.
# BERTSentenceTransform reads `line[0]` as the text, so the sentence must be
# wrapped in a list; passing the bare string would encode only its first
# character (and fail on an empty string).
unique_sentences = dict.fromkeys(str(sent) for sent in total_inputs['문장'])
matching_sent_att = {sent : transform([sent]) for sent in unique_sentences}

In [None]:
import os

# Create the checkpoint directory up front so a missing folder cannot abort
# the run after a full epoch of training.
checkpoint_dir = '/content/drive/MyDrive/model_checkpoint/sent'
os.makedirs(checkpoint_dir, exist_ok = True)

for epoch in range(num_epochs) : # train for num_epochs epochs
    # predict_valid_data_acc switches the model to eval mode, so training mode
    # has to be restored at the start of every epoch.
    model.train()
    print(f'{epoch+1}번째 학습에 대한 정확도입니다.')
    # NOTE(review): the split is re-drawn every epoch (random_state = epoch + 1),
    # so sentences used for validation in one epoch can be trained on in another;
    # the validation accuracy is therefore optimistic. Kept because it is deliberate.
    train_input, valid_input = train_test_split(total_inputs, test_size = test_size, shuffle = True, random_state = epoch + 1, stratify = total_inputs['조합'])

    # Balance the label combinations: keep the same number of rows per
    # combination, scaled from the size of the rarest one.
    train_scale_num = train_input['조합'].value_counts().min()
    valid_scale_num = valid_input['조합'].value_counts().min()

    train_input = train_input.groupby('조합').head(train_scale_num * scale_weight)
    valid_input = valid_input.groupby('조합').head(valid_scale_num * scale_weight)

    # Rows of [encoded sentence, (positive label, negative label)].
    sent_train_data = [[matching_sent_att[str(sent)], (str(p_label), str(n_label))] for sent, p_label, n_label in zip(train_input['문장'], train_input['긍정'], train_input['부정'])]
    sent_valid_data = [[matching_sent_att[str(sent)], (str(p_label), str(n_label))] for sent, p_label, n_label in zip(valid_input['문장'], valid_input['긍정'], valid_input['부정'])]

    sent_data_train = SENT_BERTDataset(sent_train_data, 0, 1)
    sent_data_test = SENT_BERTDataset(sent_valid_data, 0, 1)

    sent_train_dataloader = torch.utils.data.DataLoader(sent_data_train, batch_size = batch_size, num_workers = 2)
    sent_test_dataloader = torch.utils.data.DataLoader(sent_data_test, batch_size = batch_size, num_workers = 2)

    # Training pass over this epoch's batches.
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(sent_train_dataloader)):
        optimizer.zero_grad()

        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.float().to(device)

        out = model(token_ids, valid_length, segment_ids)

        # NOTE(review): `out` is already softmax-normalised by the model while
        # nn.CrossEntropyLoss applies log-softmax again; consider feeding logits.
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        # Accuracy is only needed when logging, so compute it only then.
        if batch_id % log_interval == 0:
            p_train_acc, n_train_acc = sent_calc_accuracy(out, label)
            loss_value = loss.data.cpu().numpy()
            print(f'train_data_accuracy = epoch : {epoch+1} | batch_id : {batch_id} | loss : {loss_value} | positive accuracy : {p_train_acc} | negative accuracy : {n_train_acc}')
            train_data_loss.append(loss_value)
            p_train_data_accuracy.append(p_train_acc)
            n_train_data_accuracy.append(n_train_acc)

    # Epoch finished: measure accuracy on the validation split.
    p_valid_mean_acc, n_valid_mean_acc = predict_valid_data_acc(sent_test_dataloader)
    print(f'{epoch+1}번째 valid_data 긍부정 평균 정확도 : [긍정 평균 정확도 : {p_valid_mean_acc} , 부정 평균 정확도 : {n_valid_mean_acc}]')
    p_valid_data_mean_accuracy.append(p_valid_mean_acc)
    # Record the negative accuracy (the positive value used to be stored here by mistake).
    n_valid_data_mean_accuracy.append(n_valid_mean_acc)
    # random_state is tied to the epoch, so each epoch's data is reproducible;
    # save a checkpoint every epoch to guard against an interrupted session.
    torch.save({'model.state_dict': model.state_dict()},  os.path.join(checkpoint_dir, f'sent_version1-{epoch + 1}.pt'))


`시각화`

In [None]:
import matplotlib.pyplot as plt

# Training metrics are recorded once per logging interval (not once per epoch),
# so their x-axis counts logged steps.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))

# Training loss.
axs[0].plot(train_data_loss, label='train loss')
axs[0].set_title('Train Data Loss')
axs[0].set_xlabel('Logged step')
axs[0].set_ylabel('Loss')
axs[0].legend()

# Positive / negative training accuracy.
axs[1].plot(p_train_data_accuracy, label='p_train accuracy', color='orange')
axs[1].plot(n_train_data_accuracy, label='n_train accuracy', color='green')
axs[1].set_title('Train Data Accuracy')
axs[1].set_xlabel('Logged step')
axs[1].set_ylabel('Accuracy')
axs[1].legend()
plt.tight_layout()
plt.show()

# Validation accuracy is recorded once per epoch: one figure per output.
validation_curves = [
    (p_valid_data_mean_accuracy, 'p_validation accuracy', 'blue', 'P Validation Data Mean Accuracy'),
    (n_valid_data_mean_accuracy, 'n_validation accuracy', 'red', 'N Validation Data Mean Accuracy'),
]
for curve_values, curve_label, curve_color, curve_title in validation_curves:
    plt.figure(figsize=(8, 6))
    plt.plot(curve_values, label=curve_label, color=curve_color)
    plt.title(curve_title)
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

