# Recurrent Neural Network

- [Reading Data](#Reading-Data)
- [Pre-processing Data](#Pre-processing-Data)
- [Making Vocab & Setting Embedding](#Making-Vocab-&-Setting-Embedding)
- [Spliting Validation Data & Making Data Iterator](#Spliting-Validation-Data-&-Making-Data-Iterator)
    - [Sample Data](#Sample-Data)
- [Modeling](#Modeling)
    - [Checking feed-forward](#Checking-feed-forward)
    - [Function Definition](#Function-Definition)
    - [bi-RNN](#bi-RNN)
    - [LSTM](#LSTM)
    - [GRU](#GRU)

In [1]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data  # 텍스트에 대한 여러 추상화 기능을 제공하는 자연어 처리 라이브러리
from torchtext import datasets

## Reading Data

_data.Field (sequential , batch_first, lower) : 데이터를 어떻게 처리할 것인지를 지정 (default : 띄어쓰기 되어있는 부분을 split한 단어의  list 제공)_
* fix_length -> 최대 단어를 몇개까지 list에 저장할 것인지 
* pad_first -> 단어 갯수가 fix_length 미만인 경우 padding을 통해 길이를 맞추는데 이때 padding을 단어 앞쪽에 할지, 뒤쪽에 할지 설정 
* sequential  -> 순차적인 데이터인지에 대한 설정
* batch_first  -> 신경망에 입력되는 텐서의 첫번째 차원값이 batch_size가 되도록 설정
* lower -> 데이터 속 모든 영문 알파벳이 소문자가 되도록 설정



In [2]:
# Data Setting
TEXT = data.Field(batch_first = True,  # Batch-Size를 Data Shape Axis의 가장 앞으로 설정
                  fix_length = 500,  # 문장의 길이 제한
                  tokenize = str.split,  # tokenize를 설정하는 옵션, 기본값은 띄어쓰기 기반
                  pad_first = True,  # 패딩을 앞에서 줄 것인지(fix_length 대비 짧은 문장의 경우)
                  pad_token = '[PAD]',  # padding에 대한 특수 토큰
                  unk_token = '[UNK]')  # dict에 없는 token 표현방법

LABEL = data.LabelField(dtype=torch.float)  # 가져올 데이터에 대한 Type 설정

train_data, test_data = datasets.IMDB.splits(text_field = TEXT, label_field = LABEL)  # datasets안의 IMDB 데이터로 split

downloading aclImdb_v1.tar.gz


.data\imdb\aclImdb_v1.tar.gz: 100%|███████████████████████████████████████████████| 84.1M/84.1M [01:20<00:00, 1.05MB/s]


In [3]:
# Data Length
print(f'Train Data Length : {len(train_data.examples)}')  # 데이터의 개수를 확인
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 25000
Test Data Length : 25000


In [4]:
# Data Sample
print('---- Data Sample ----')
print('Input : ')
print(' '.join(vars(train_data.examples[1])['text']),'\n')  # vars() : 데이터의 값을 직접 확인
print('Label : ')
print(vars(train_data.examples[1])['label'])  

---- Data Sample ----
Input : 
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the str

## Pre-processing Data

In [5]:
def PreProcessingText(input_sentence):
    input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    if input_sentence:
        return input_sentence

In [6]:
# 간단한 Data Cleansing 작업
for example in train_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()
    
for example in test_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()

## Making Vocab & Setting Embedding

In [7]:
model_config = {'emb_type' : '', 'emb_dim' : 300}

In [8]:
# build_vocab() : Text Data와 Label Data의 Vocab을 만듦
TEXT.build_vocab(train_data,  
                 min_freq = 2,  # vocab에 해당하는 token에 최소한으로 등장하는 횟수 
                 max_size = None,  # 전체 vocab size 자체에 제한
                 vectors = f"glove.6B.{model_config['emb_dim']}d")  # pre-trained vector

LABEL.build_vocab(train_data)

model_config['vocab_size'] = len(TEXT.vocab) 

.vector_cache\glove.6B.zip: 862MB [07:58, 1.80MB/s]                                                                    
100%|███████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:44<00:00, 8938.80it/s]


In [9]:
## pre-trained vector list
# charngram.100d
# fasttext.en.300d
# fasttext.simple.300d
# glove.42B.300d
# glove.840B.300d
# glove.twitter.27B.25d
# glove.twitter.27B.50d
# glove.twitter.27B.100d
# glove.twitter.27B.200d
# glove.6B.50d
# glove.6B.100d
# glove.6B.200d
# glove.6B.300d

In [10]:
# Vocabulary Info
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
for idx, (k, v) in enumerate(TEXT.vocab.stoi.items()):
    if idx >= 10:  # 상위 10개 Vacab의 단어와 index 값을 가져옴
        break    
    print('\t', k, v)

print('---------------------------------')

# Label Info
print(f'Label Size : {len(LABEL.vocab)}')

print('Lable Examples : ')
for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()):
    print('\t', k, v)

Vocab Size : 51956
Vocab Examples : 
	 [UNK] 0
	 [PAD] 1
	 the 2
	 and 3
	 a 4
	 of 5
	 to 6
	 is 7
	 in 8
	 it 9
---------------------------------
Label Size : 2
Lable Examples : 
	 neg 0
	 pos 1


## Spliting Validation Data & Making Data Iterator

In [11]:
# Spliting Valid set
train_data, valid_data = train_data.split(random_state = random.seed(0), split_ratio=0.8)

In [12]:
model_config['batch_size'] = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(datasets=(train_data, valid_data, test_data), 
                                                                           batch_size=model_config['batch_size'], 
                                                                           device=device)

## Sample Data

In [13]:
# Check batch data
sample_for_check = next(iter(train_iterator))  # 돌아가지 않을 경우 torchtext 버전이 0.3.1 버전이 맞는지 확인
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 32x500 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 32 (GPU 0)]
tensor([[   1,    1,    1,  ...,    7,  181, 2530],
        [  51,    2, 1031,  ...,    2, 1110,  147],
        [   1,    1,    1,  ...,   21,   57,   75],
        ...,
        [   1,    1,    1,  ...,    2,  950,  267],
        [   1,    1,    1,  ...,  150,   15,    9],
        [   1,    1,    1,  ..., 6062,    2,  265]], device='cuda:0')
tensor([0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0.,
        1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0.],
       device='cuda:0')


In [14]:
# Check reverting data
print(' '.join([TEXT.vocab.itos[int(x)] for x in sample_for_check.text[0,:] if x not in [0,1]]))
print(LABEL.vocab.itos[int(sample_for_check.label[0])]) 

this film is really bad it maybe harsh but it is it really is poor script every vampire cliché in the book is used and no sympathy is given at all to the origins of the main character i e ole dracula there have been some truly brilliant dracula vampire movies in the past but this doesn't even make it into the dire slot take a selection of people who seem to have dropped out of a teen slasher move add a dribble of dracula lore and mix in a heady tonic of religious surreal day dreaming and you get a confusing mess of a film dracula 2000 i really cannot find any good things to say about this movie as if it wasn't bad enough that it was made in the first place they seem to have made johnny lee miller effect an english accent whats the problem with that i hear you cry well he is english but he sounds like an american trying to do an english accent all in all you may as well say your money if you were thinking of buying it or rent it out watch it and discover for yourself why it's about as s

## Modeling

In [15]:
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()

        if model_config['emb_type'] == 'glove' or 'fasttext':  # Vocab size * Embedding_dimension 행렬을 만들어 학습 
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],  
                                    embedding_dim = model_config['emb_dim'],  # 원하는 Embedding_dimension을 설정 
                                    _weight = TEXT.vocab.vectors)  # Pre-Trained Vector를 Embedding 행렬의 Initial Value로 설정
                                                                   # 이 옵션이 없는 경우 정규 분포에서 생성한 값을 Initial Value로 설정
        else:
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],
                                    embedding_dim = model_config['emb_dim'])
        
        self.bidirectional = model_config['bidirectional']  # 두 개의 독립적인 RNN을 합친 것으로 정방향과 역방향 순서 정보를 모두 학습
        self.num_direction = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type'] 

        self.RNN = nn.RNN (input_size = model_config['emb_dim'],  
                           hidden_size = model_config['hidden_dim'],  
                           dropout = model_config['dropout'],  
                           bidirectional = model_config['bidirectional'], 
                           batch_first = model_config['batch_first'])  # 사용자 지정 Hyperparameter
    
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_direction, model_config['output_dim'])
        
        self.drop = nn.Dropout(model_config['dropout'])

    def forward(self, x):
        # x : (Batch_Size, Max_Seq_Length)
        
        emb = self.emb(x) 
        # emb : (Batch_Size, Max_Seq_Length, Emb_dim)

        output, hidden = self.RNN(emb) 
        # output : (Batch_Size, Max_Seq_Length, Hidden_dim * num_direction) 
        # hidden : (num_direction, Batch_Size, Hidden_dim)
        # hidden의 경우, batch_first 옵션이 안먹는 문제가 있음
        
        last_output = output[:,-1,:]
        # last_output : (Batch_Size, Hidden_dim * num_direction)
        
        return self.fc(self.drop(last_output))

### Checking feed-forward

In [16]:
model_config.update(dict(batch_first = True,
                         model_type = 'RNN',
                         bidirectional = True,
                         hidden_dim = 128,
                         output_dim = 1,
                         dropout = 0))

In [17]:
model = SentenceClassification(**model_config).to(device)

In [18]:
predictions = model.forward(sample_for_check.text).squeeze()

In [19]:
loss_fn = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() 
    acc = correct.sum()/len(correct)
    return acc

In [20]:
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

In [21]:
print(predictions)
print(loss, acc)

tensor([-0.1094, -0.0898, -0.1592,  0.0513, -0.3095, -0.2103, -0.1794,  0.1216,
        -0.1977, -0.0390, -0.0838, -0.1451, -0.1663, -0.1654, -0.0389, -0.1569,
        -0.1224,  0.0015, -0.1426, -0.4569, -0.0562, -0.2034,  0.0802, -0.2307,
        -0.0546, -0.0217, -0.2358,  0.0091, -0.1971, -0.0353, -0.1321, -0.3449],
       device='cuda:0', grad_fn=<SqueezeBackward0>)
tensor(0.6878, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>) tensor(0.5312, device='cuda:0')


### Function Definition

In [22]:
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() 
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator):
        
        # Initializing
        optimizer.zero_grad()
        
        # Forward 
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}")

        # Backward 
        loss.backward()
        optimizer.step()
        
        # Update Epoch Performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator) , epoch_acc/len(iterator) 

In [23]:
def evaluate(model, iterator, loss_fn):
    
    epoch_loss = 0
    epoch_acc = 0
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### bi-RNN

In [24]:
model_config['model_type'] = 'RNN'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [25]:
model_config

{'emb_type': '',
 'emb_dim': 300,
 'vocab_size': 51956,
 'batch_size': 32,
 'batch_first': True,
 'model_type': 'RNN',
 'bidirectional': True,
 'hidden_dim': 128,
 'output_dim': 1,
 'dropout': 0}

In [26]:
N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-RNN_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.6235 | Train Acc : 0.6451
	 Epoch : 0 | Valid Loss : 0.6221 | Valid Acc : 0.6485
	 Epoch : 1 | Train Loss : 0.5704 | Train Acc : 0.6987
	 Epoch : 1 | Valid Loss : 0.6407 | Valid Acc : 0.6393
	 Epoch : 2 | Train Loss : 0.5513 | Train Acc : 0.7088
	 Epoch : 2 | Valid Loss : 0.6572 | Valid Acc : 0.6385
	 Epoch : 3 | Train Loss : 0.5024 | Train Acc : 0.7381
	 Epoch : 3 | Valid Loss : 0.6916 | Valid Acc : 0.625
	 Epoch : 4 | Train Loss : 0.4757 | Train Acc : 0.7535
	 Epoch : 4 | Valid Loss : 0.7023 | Valid Acc : 0.6224


In [28]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.6189 | Test Acc : 0.6521


### LSTM

In [29]:
model_config['model_type'] = 'LSTM'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [30]:
model_config

{'emb_type': '',
 'emb_dim': 300,
 'vocab_size': 51956,
 'batch_size': 32,
 'batch_first': True,
 'model_type': 'LSTM',
 'bidirectional': True,
 'hidden_dim': 128,
 'output_dim': 1,
 'dropout': 0}

In [31]:
N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-LSTM_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.627 | Train Acc : 0.6439
	 Epoch : 0 | Valid Loss : 0.6945 | Valid Acc : 0.587
	 Saved at 1-epoch
	 Epoch : 1 | Train Loss : 0.5817 | Train Acc : 0.6892
	 Epoch : 1 | Valid Loss : 0.5794 | Valid Acc : 0.6933
	 Epoch : 2 | Train Loss : 0.5225 | Train Acc : 0.7339
	 Epoch : 2 | Valid Loss : 0.6764 | Valid Acc : 0.6095
	 Epoch : 3 | Train Loss : 0.5205 | Train Acc : 0.7213
	 Epoch : 3 | Valid Loss : 0.7088 | Valid Acc : 0.6166
	 Epoch : 4 | Train Loss : 0.483 | Train Acc : 0.7425
	 Epoch : 4 | Valid Loss : 0.7023 | Valid Acc : 0.6421


In [32]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.5773 | Test Acc : 0.6946


### GRU

In [33]:
model_config['model_type'] = 'GRU'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [34]:
model_config

{'emb_type': '',
 'emb_dim': 300,
 'vocab_size': 51956,
 'batch_size': 32,
 'batch_first': True,
 'model_type': 'GRU',
 'bidirectional': True,
 'hidden_dim': 128,
 'output_dim': 1,
 'dropout': 0}

In [35]:
N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-GRU_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.6277 | Train Acc : 0.6412
	 Epoch : 0 | Valid Loss : 0.5906 | Valid Acc : 0.6967
	 Epoch : 1 | Train Loss : 0.5914 | Train Acc : 0.6802
	 Epoch : 1 | Valid Loss : 0.6355 | Valid Acc : 0.632
	 Epoch : 2 | Train Loss : 0.5051 | Train Acc : 0.7488
	 Epoch : 2 | Valid Loss : 0.6942 | Valid Acc : 0.66
	 Epoch : 3 | Train Loss : 0.4123 | Train Acc : 0.8092
	 Epoch : 3 | Valid Loss : 0.7084 | Valid Acc : 0.6117
	 Epoch : 4 | Train Loss : 0.4208 | Train Acc : 0.7944
	 Epoch : 4 | Valid Loss : 0.7428 | Valid Acc : 0.6073


In [36]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.5922 | Test Acc : 0.69
