# KoBERT 모델을 활용한 영화 리뷰 감성분석

## kobert를 사용하기 위한 kobert_transformers 패키지 다운로드

https://github.com/monologg/KoBERT-Transformers.git


In [None]:
!wget https://raw.githubusercontent.com/monologg/KoBERT-Transformers/master/kobert_transformers/tokenization_kobert.py -O ./tokenization_kobert.py

In [None]:
!pip3 install kobert-transformers

## 패키지 임포트

- kobert_transformers를 활용해 kobert 모델을 불러옵니다.
- tokenization_kobert는 KoBERT GitHub 저장소에서 tokenizer를 가져오기 위해 사용하는 모듈입니다.

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from kobert_transformers import get_kobert_model, get_distilkobert_model
from tokenization_kobert import KoBertTokenizer

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import numpy as np

## 데이터 준비

- 네이버 영화 리뷰 데이터
- 학습 데이터 15만개, 테스트 데이터 5만개로 이루어져 있습니다.

https://github.com/e9t/nsmc.git

In [None]:
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt -O ./nsmc_train.tsv
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt -O ./nsmc_test.tsv

In [None]:
def _read_nsmc(path):
    """Read a tab-separated ratings file and return its rows as lists of fields.

    The first line (column header) is skipped; every remaining line is
    right-stripped and split on tab characters.
    """
    # Explicit UTF-8: the reviews are Korean text and the platform default
    # encoding is not UTF-8 everywhere. `with` closes the handle promptly
    # instead of leaking it until garbage collection.
    with open(path, encoding='utf-8') as f:
        next(f, None)  # skip the header row; tolerates an empty file
        return [line.rstrip().split('\t') for line in f]


train_reviews = _read_nsmc('./nsmc_train.tsv')
test_reviews = _read_nsmc('./nsmc_test.tsv')

In [None]:
# id, review, label
test_reviews[0]

In [None]:
# Turn the (id, review, label) rows into two parallel tuples: the review
# texts and their labels converted to int. The id column is discarded.
train_sentences, train_labels = zip(
    *((text, int(tag)) for _id, text, tag in train_reviews)
)
test_sentences, test_labels = zip(
    *((text, int(tag)) for _id, text, tag in test_reviews)
)

### EDA : 탐색적 데이터 분석 (Exploratory Data Analysis)

## Configs 설정

In [None]:
max_len = 128 # 문장 최대길이, EDA 필요
batch_size = 64

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [None]:
device

## Input Transformer

- 입력 문장을 bert 입력에 필요한 값들로 바꿔줍니다.
- token_ids : 입력 문장을 토큰화한 값
    - 입력 문장을 토큰화합니다.
    - [CLS], [SEP]토큰을 추가합니다.
    - 토큰들을 id (숫자값)으로 치환합니다.
    - 문장이 길 경우 max_len까지만 사용합니다.
- valid_lengths : 유효 토큰 길이 값
- segment_ids : NSP task에서 문장 구분을 위해 사용합니다.
앞 문장 0, 뒷 문장 1. 여기서는 모두 0입니다.


In [None]:
class InputTransformer:
    """Convert raw sentences into fixed-length inputs for a BERT model.

    Args:
        tokenizer: object providing ``tokenize(sentence)`` and
            ``convert_tokens_to_ids(tokens)``; its vocabulary must contain
            the '[PAD]', '[CLS]' and '[SEP]' tokens.
        max_len: length every produced id sequence is padded/truncated to.
            Values below 2 leave no room for both special tokens.
    """

    def __init__(self, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_token = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]

    def transform(self, sentences):
        """Tokenize ``sentences`` and build per-sentence model inputs.

        Returns:
            A list with one ``(token_ids, valid_length, segment_ids)`` tuple
            per sentence:
              * token_ids: int64 array of length ``max_len`` holding
                [CLS] + tokens + [SEP], right-padded with the [PAD] id.
              * valid_length: number of non-padding positions, counting
                both [CLS] and [SEP].
              * segment_ids: int64 array of ``max_len`` zeros (single
                segment).
        """
        # Reserve two positions for [CLS] and [SEP] so that truncating a long
        # sentence never cuts the closing [SEP] off.
        max_tokens = max(self.max_len - 2, 0)

        features = []
        for sentence in sentences:
            tokens = self.tokenizer.tokenize(sentence)[:max_tokens]
            input_id = self.tokenizer.convert_tokens_to_ids(
                ['[CLS]'] + tokens + ['[SEP]'])
            input_id = input_id[:self.max_len]  # only matters when max_len < 2

            # The valid length is simply the unpadded length. Searching for a
            # hard-coded [SEP] id would tie the code to one vocabulary and
            # would leave [SEP] outside the attention mask.
            valid_length = len(input_id)
            padding = [self.pad_token] * (self.max_len - valid_length)

            token_ids = np.array(input_id + padding, dtype=np.int64)
            segment_ids = np.zeros(self.max_len, dtype=np.int64)
            features.append((token_ids, valid_length, segment_ids))

        return features

# class InputTransformer:

#     def __init__(self, tokenizer, max_len=128):
#         self.tokenizer = tokenizer
#         self.max_len = max_len
#         self.pad_token = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]

#     def transform(self, sentences):
#         inputs = [self.tokenizer(sentence,
#                                  padding='max_length',
#                                  max_length=self.max_len,
#                                  truncation=True)
#                                  for sentence in sentences] #{input_ids, token_type_ids, attention_mask}

#         return inputs

## BERT Dataset

- 문장들을 Input Transformer 처리하고 label을 텐서 자료형으로 변환합니다.

In [None]:
class BERTDataset(Dataset):
    """Dataset of transformed sentences with optional labels.

    Args:
        transformer: object whose ``transform(sentences)`` returns one
            feature item per sentence.
        sentences: raw input sentences.
        labels: integer class labels aligned with ``sentences``. Leave empty
            (or pass None) for unlabeled data; the dataset is then in test
            mode.

    Indexing returns ``(features, label)`` when labels were given and just
    ``features`` in test mode.
    """

    def __init__(self, transformer, sentences, labels=()):
        self.sentences = transformer.transform(sentences)
        if labels is None:
            labels = ()
        # len() instead of truthiness so array-like labels (numpy arrays,
        # tensors) do not raise an "ambiguous truth value" error.
        self.is_test = len(labels) == 0
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, i):
        if self.is_test:
            # No label exists in test mode; the previous code returned the
            # undefined name `_` here, which raises NameError outside IPython.
            return self.sentences[i]
        return self.sentences[i], self.labels[i]

    def __len__(self):
        return len(self.sentences)


In [None]:
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
input_transformer = InputTransformer(tokenizer, max_len)

# Split the labelled training data into training and validation subsets
# (75% / 25%, stratified by label, fixed seed for reproducibility).
from sklearn.model_selection import train_test_split

trn_sents, valid_sents, trn_labels, valid_labels = train_test_split(
    train_sentences, train_labels, test_size=0.25, random_state=0,
    stratify=train_labels, shuffle=True)

train_dataset = BERTDataset(input_transformer, trn_sents, trn_labels)
# shuffle=True reshuffles the training samples at every epoch; without it
# each epoch would see exactly the same batches in the same order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Validation order does not affect the metrics, so it is left unshuffled.
valid_dataset = BERTDataset(input_transformer, valid_sents, valid_labels)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=0)

In [None]:
length_list = [l for _, l, _ in train_dataset.sentences]

In [None]:
import seaborn as sns

sns.displot(length_list, kde=True)

## 모델링

BERT 모델의 PyTorch 구현 참고 자료:
https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial

BERT Classifier 모델입니다.

forward에서 사용하는 attention_mask는 gen_attention_mask에서 생성합니다.
<br /><br />

#### _, pooler = self.bert(...) 에 대한 설명

- bert에 Feed Forward 하여 앞서 input_transformer 적용 시 문장별로 맨 앞에 주었던 [CLS] 토큰에 대한 임베딩 벡터를 반환합니다. (pooled)
- 앞의 _는 bert의 output으로 전체 sequence에 대한 embedding 벡터입니다.

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by an optional dropout and a linear classifier.

    Args:
        bert: encoder module. It is called with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and must return a ``(sequence_output, pooled_output)`` pair.
        hidden_size: feature size of the pooled output.
        num_classes: number of output classes.
        dr_rate: dropout probability applied to the pooled output; no
            dropout is applied when it is None or 0.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=2,
                 dr_rate=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        # Attribute name kept as `regressor` so existing checkpoints / code
        # that reference it keep working.
        self.regressor = nn.Linear(hidden_size, num_classes)

        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` is 1.0 for the first ``valid_length[i]`` positions and 0.0
        for the rest. The mask is created on the device of ``token_ids``.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device)
        # Broadcast (1, seq_len) against (batch, 1) instead of looping rows.
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier output for a batch, shape (batch, num_classes)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask,
                              return_dict=False)

        # Fall back to the raw pooled output when dropout is disabled;
        # previously `out` was left unbound in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler

        return self.regressor(out)

# Load the KoBERT encoder, wrap it in the classifier (dropout rate 0.5) and
# move the resulting model to the selected device.
kobert = get_kobert_model()
model = BERTClassifier(bert=kobert, dr_rate=0.5)
model = model.to(device)

In [None]:
# class BERTRegressor(nn.Module):
#     def __init__(self,
#                  bert,
#                  hidden_size = 768,
#                  num_areas=4,
#                  dr_rate=None):
#         super(BERTRegressor, self).__init__()
#         self.bert = bert
#         self.dr_rate = dr_rate
#         self.regressor = nn.Linear(hidden_size , num_areas)
#         if dr_rate:
#             self.dropout = nn.Dropout(p=dr_rate)

#     def forward(self, inputs):
#         if type(self.bert) == DistilBertModel:
#             inputs.pop('token_type_ids')
#             sequence_output = self.bert(**inputs,
#                                   return_dict=False)
#             pooler = sequence_output[0][:, 0]
#         else:
#             _, pooler = self.bert(**inputs, return_dict=False)

#         if self.dr_rate:
#             out = self.dropout(pooler)
#         return self.regressor(out)


## 모델 학습

In [None]:
num_epochs = 5
learning_rate = 5e-5
warmup_ratio = 0.1
max_grad_norm = 1
log_interval = 50

In [None]:
# Parameters whose names contain one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Optimizer and loss function
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8)
# Follow `device` instead of calling .cuda() unconditionally: `device` falls
# back to CPU when CUDA is unavailable, and .cuda() raises in that case.
loss_fn = nn.CrossEntropyLoss().to(device)

# The scheduler is stepped once per batch, so the total number of steps is
# batches-per-epoch times the number of epochs.
t_total = len(train_loader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
from tqdm.auto import tqdm

In [None]:
def calc_accuracy(X, Y):
    """Return the fraction of rows of X whose arg-max along dim 1 equals Y.

    X holds one row of class scores per sample and Y the target class
    indices. The result is a NumPy scalar.
    """
    predicted = torch.max(X, dim=1).indices
    n_correct = (predicted == Y).sum()
    n_samples = predicted.shape[0]
    return n_correct.detach().cpu().numpy() / n_samples

In [None]:
from copy import deepcopy

In [None]:
loss_history = []  # training loss of every batch
# Mean training loss per epoch. The misspelled name is kept because other
# cells may reference it; a correctly spelled alias is provided as well.
avg_loss_hisroty = []
avg_loss_history = avg_loss_hisroty
test_history = []  # validation accuracy per epoch
test_loss_history = []  # mean validation loss per epoch

best_acc = 0.0
best_acc_model = deepcopy(model)
for e in range(num_epochs):
    total_loss = 0.0
    train_acc = 0.0
    test_acc = 0.0
    test_total_loss = 0.0

    model.train()  # training mode (enables dropout)
    # Pass the loader itself (not iter(loader)) so tqdm can show the total.
    for batch_id, ((token_ids, valid_length, segment_ids), label) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()
        token_ids = token_ids.to(device)
        segment_ids = segment_ids.to(device)
        label = label.to(device)
        out = model(token_ids, valid_length, segment_ids)

        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # update the learning-rate schedule

        # Record the loss for every batch. Previously this happened only in
        # the logging branch, so the epoch average was computed from nothing.
        batch_loss = loss.item()
        loss_history.append(batch_loss)
        total_loss += batch_loss
        train_acc += calc_accuracy(out, label)

        # Parentheses are required: `batch_id+1 % log_interval` parses as
        # `batch_id + (1 % log_interval)` and is never 0.
        if (batch_id + 1) % log_interval == 0:
            print("[train]\tepoch: {}\t batch id: {}/{}\t train_loss: {:.4f}\t train_acc: {:.4f}".format(e+1, batch_id+1,
                len(train_loader), batch_loss, train_acc / (batch_id+1)))

    avg_loss = total_loss / len(train_loader)  # mean training loss of this epoch
    avg_loss_hisroty.append(avg_loss)

    model.eval()  # evaluation mode (disables dropout)
    with torch.no_grad():
        for (token_ids, valid_length, segment_ids), label in tqdm(valid_loader):
            token_ids = token_ids.to(device)
            segment_ids = segment_ids.to(device)
            label = label.to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
            test_total_loss += loss_fn(out, label).item()

    test_avg_loss = test_total_loss / len(valid_loader)
    test_avg_acc = test_acc / len(valid_loader)
    # One entry per epoch, matching test_loss_history (previously a running
    # average was appended after every validation batch).
    test_history.append(test_avg_acc)
    test_loss_history.append(test_avg_loss)
    print("[valid]\tepoch: {}\t test loss: {:.4f}\t test acc: {:.4f}".format(e+1, test_avg_loss,
        test_avg_acc))

    # Keep a snapshot of the model with the best validation accuracy so far.
    if test_avg_acc > best_acc:
        best_acc = test_avg_acc
        best_acc_model = deepcopy(model)


## 예측, 평가

학습때 사용하지 않은 데이터로 검증을 진행합니다.

전체적인 구조는 validation 때 사용한 코드와 비슷합니다.

In [None]:
# Evaluate on the first `n_predict` test samples. Sentences and labels are
# sliced identically so the two sequences stay aligned and no unused labels
# are converted to a tensor.
n_predict = 256
predict_dataset = BERTDataset(input_transformer, test_sentences[:n_predict], test_labels[:n_predict])
test_loader = torch.utils.data.DataLoader(predict_dataset, batch_size=batch_size, num_workers=2)

In [None]:
def predict(model, test_loader, device):
    """Evaluate ``model`` on ``test_loader``; return (mean loss, mean accuracy).

    The model is put in eval mode and run without gradient tracking. Loss and
    accuracy are computed per batch with the module-level ``loss_fn`` and
    ``calc_accuracy`` and averaged over the number of batches.
    """
    model.eval()
    loss_sum = 0.0
    acc_sum = 0.0

    with torch.no_grad():
        for features, label in test_loader:
            token_ids, valid_length, segment_ids = features
            target = label.to(device)
            logits = model(token_ids.to(device), valid_length, segment_ids.to(device))

            acc_sum += calc_accuracy(logits, target)
            loss_sum += loss_fn(logits, target).detach().cpu().numpy()

    n_batches = len(test_loader)
    return loss_sum / n_batches, acc_sum / n_batches

In [None]:
predict(model, test_loader, device)