<font color=CC3D3D>

# 잡플래닛 기업 리뷰 데이터 감성분석 </font>
    
다양한 토큰화, 임베딩 방법을 사용해서 RNN 성능을 높여주세요!

# Recurrent Neural Network

- [Reading Data](#Reading-Data)
- [Pre-processing Data](#Pre-processing-Data)
- [Making Vocab & Setting Embedding](#Making-Vocab-&-Setting-Embedding)
- [Spliting Validation Data & Making Data Iterator](#Spliting-Validation-Data-&-Making-Data-Iterator)
    - [Sample Data](#Sample-Data)
- [Modeling](#Modeling)
    - [Checking feed-forward](#Checking-feed-forward)
    - [Training](#Training)
    - [bi-RNN](#bi-RNN)

## Reading Data

In [1]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F


from torchtext import data
from torchtext import datasets

In [2]:
#!pip uninstall transformers
#pip install transformers

In [2]:
import pandas as pd
korean_stopwords = pd.read_csv('./korean_stopwords.txt', sep='\n', header=None)
korean_stopwords.columns = ['stopwords']

In [3]:
from kobert_transformers import get_tokenizer

def tokenizer_kobert(text):
    tokenizer = get_tokenizer()
    result = tokenizer.tokenize(text)
    for word in result:
        if (word not in list(korean_stopwords['stopwords'])):
            yield word
# 너무 오래걸림
tokenizer = get_tokenizer() # 바로 kobert로 tokenize하는 방법 선택

In [4]:
from kiwipiepy import Kiwi, Option
kiwi = Kiwi(num_workers=4, options=Option.LOAD_DEFAULT_DICTIONARY | Option.INTEGRATE_ALLOMORPH)
kiwi.prepare()

def tokenizer_kiwi(text):
    result = kiwi.analyze(text)
    for token, _, _, _ in result[0][0]:
            yield f'{token}'
# 성능이 생각보다 안나옴

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

In [7]:
TEXT = data.Field(batch_first = True, 
                  fix_length = 200, 
                  tokenize = tokenizer.tokenize, 
                  pad_first = True, 
                  pad_token ='[PAD]', 
                  unk_token ='[UNK]')

LABEL = data.LabelField(dtype = torch.float)

train_data = data.TabularDataset(path = 'train_data.csv', 
                    format = 'csv', 
                    fields = [("text",TEXT),
                            ("label",LABEL)],  
                    skip_header=True)

test_data = data.TabularDataset(path='test_data.csv', 
                    format='csv', 
                    fields=[("text",TEXT),
                            ("label",LABEL)],  
                    skip_header=True)

In [8]:
print(f'Train Data Length : {len(train_data.examples)}')
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 50000
Test Data Length : 15000


In [9]:
print('---- Data Sample ----')
print('Input : ')
print(' '.join(vars(train_data.examples[0])['text']),'\n') 
print('Label : ')
print(vars(train_data.examples[1])['label'])  

---- Data Sample ----
Input : 
절대 ##가지 ##마세요 몸 다 상 ##합니다 부서 절대 ##요 

Label : 
0


## Pre-processing Data

In [10]:
def PreProcessingText(input_sentence):
    #input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"#$%&\()*+,-./:;<=>?@[\\]^_`▁{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    return input_sentence

In [None]:
# 간단한 Data Cleansing 작업
for example in train_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()
    
for example in test_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()

## Making Vocab & Setting Embedding

In [12]:
model_config = {'emb_type' : '', 'emb_dim' : 300}

In [13]:
# build_vocab() : Text Data와 Label Data의 Vocab을 만듦
TEXT.build_vocab(train_data,  
                 min_freq = 2,   
                 max_size = None, 
                 vectors = f"glove.6B.{model_config['emb_dim']}d") 

LABEL.build_vocab(train_data)

model_config['vocab_size'] = len(TEXT.vocab) 

In [14]:
# 다양한 임베딩 방법 

## pre-trained vector list
# charngram.100d
# fasttext.en.300d
# fasttext.simple.300d
# glove.42B.300d
# glove.840B.300d
# glove.twitter.27B.25d
# glove.twitter.27B.50d
# glove.twitter.27B.100d
# glove.twitter.27B.200d
# glove.6B.50d
# glove.6B.100d
# glove.6B.200d
# glove.6B.300d

In [15]:
# Vocabulary Info
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
for idx, (k, v) in enumerate(TEXT.vocab.stoi.items()):
    if idx >= 10:  
        break    
    print('\t', k, v)

print('---------------------------------')

# Label Info
print(f'Label Size : {len(LABEL.vocab)}')

print('Lable Examples : ')
for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()):
    print('\t', k, v)

Vocab Size : 4777
Vocab Examples : 
	 [UNK] 0
	 [PAD] 1
	 이 2
	 회사 3
	 가 4
	 수 5
	 는 6
	 도 7
	 고 8
	 에 9
---------------------------------
Label Size : 2
Lable Examples : 
	 0 0
	 1 1


## Spliting Validation Data & Making Data Iterator

In [16]:
train_data, valid_data = train_data.split(random_state = random.seed(0), split_ratio=0.8)

In [19]:
model_config['batch_size'] = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(datasets=(train_data, valid_data, test_data), 
                                                                           batch_size=model_config['batch_size'], device=device, sort=False)

### Sample Data

In [20]:
# Check batch data
sample_for_check = next(iter(train_iterator))  
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 128]
	[.text]:[torch.cuda.LongTensor of size 128x200 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 128 (GPU 0)]
tensor([[   1,    1,    1,  ...,    3,  163,   13],
        [   1,    1,    1,  ...,  324,    2,  101],
        [   1,    1,    1,  ...,   55,  705, 1394],
        ...,
        [   1,    1,    1,  ...,   47,   24,    3],
        [   1,    1,    1,  ...,   38,  442,  389],
        [   1,    1,    1,  ...,   36,    4,  303]], device='cuda:0')
tensor([0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0.,
        0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
        1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1.,
        0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
        1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
        0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1.,
        0., 0., 1., 0.,

In [21]:
print(' '.join([TEXT.vocab.itos[int(x)] for x in sample_for_check.text[0,:] if x not in [0,1]]))
print(LABEL.vocab.itos[int(sample_for_check.label[0])]) 

그냥 너무 힘 듬 ᅲᅲ 젊 음을 갈 아 서 성장 하는 회사 같 음
0


## Modeling

In [22]:
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()

        if model_config['emb_type'] == 'glove' or 'fasttext':   
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],  
                                    embedding_dim = model_config['emb_dim'],
                                    _weight = TEXT.vocab.vectors)  
        else:
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],
                                    embedding_dim = model_config['emb_dim'])
        
        self.bidirectional = model_config['bidirectional'] 
        self.num_direction = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type'] 

        self.RNN = nn.RNN (input_size = model_config['emb_dim'],  
                           hidden_size = model_config['hidden_dim'],  
                           dropout = model_config['dropout'],  
                           bidirectional = model_config['bidirectional'], 
                           batch_first = model_config['batch_first'])  
    
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_direction, model_config['output_dim'])
        
        self.drop = nn.Dropout(model_config['dropout'])

    def forward(self, x):
        
        emb = self.emb(x) 
        output, hidden = self.RNN(emb) 
        
        last_output = output[:,-1,:]

        return self.fc(self.drop(last_output))

### Checking feed-forward

In [23]:
model_config.update(dict(batch_first = True,
                         model_type = 'RNN',
                         bidirectional = True,
                         hidden_dim = 128,
                         output_dim = 1,
                         dropout = 0))

In [24]:
model = SentenceClassification(**model_config).to(device)

In [25]:
predictions = model.forward(sample_for_check.text).squeeze()

In [26]:
loss_fn = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() 
    acc = correct.sum()/len(correct)
    return acc

In [27]:
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

In [28]:
print(predictions)
print(loss, acc)

tensor([-0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072, -0.0072,
        -0.0072, -0.0072, -0.0072, -0.00

### Training

In [29]:
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() 
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator):
        
        # Initializing
        optimizer.zero_grad()
        
        # Forward 
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}")

        # Backward 
        loss.backward()
        optimizer.step()
        
        # Update Epoch Performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator) , epoch_acc/len(iterator) 

In [30]:
def evaluate(model, iterator, loss_fn):
    
    epoch_loss = 0
    epoch_acc = 0
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### bi-RNN

In [31]:
model_config['model_type'] = 'RNN'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [32]:
N_EPOCH = 10

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-RNN_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.5432 | Train Acc : 0.7245
	 Epoch : 0 | Valid Loss : 0.4954 | Valid Acc : 0.7712
	 Saved at 1-epoch
	 Epoch : 1 | Train Loss : 0.461 | Train Acc : 0.79
	 Epoch : 1 | Valid Loss : 0.4612 | Valid Acc : 0.7893
	 Epoch : 2 | Train Loss : 0.4115 | Train Acc : 0.8205
	 Epoch : 2 | Valid Loss : 0.4854 | Valid Acc : 0.7696
	 Epoch : 3 | Train Loss : 0.3788 | Train Acc : 0.8379
	 Epoch : 3 | Valid Loss : 0.4854 | Valid Acc : 0.783
	 Epoch : 4 | Train Loss : 0.3426 | Train Acc : 0.858
	 Epoch : 4 | Valid Loss : 0.5039 | Valid Acc : 0.7697
	 Epoch : 5 | Train Loss : 0.3228 | Train Acc : 0.8671
	 Epoch : 5 | Valid Loss : 0.5299 | Valid Acc : 0.7668
	 Epoch : 6 | Train Loss : 0.2875 | Train Acc : 0.8873
	 Epoch : 6 | Valid Loss : 0.5426 | Valid Acc : 0.7662
	 Epoch : 7 | Train Loss : 0.2594 | Train Acc : 0.8997
	 Epoch : 7 | Valid Loss : 0.5948 | Valid Acc : 0.755

In [34]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.4779 | Test Acc : 0.7807
