In [1]:
from collections import defaultdict

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np
import urllib.request

In [2]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x242d8900160>)

In [3]:
# data를 읽어옴
def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file, encoding='UTF8') as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls

In [4]:
x_train, y_train = read_txt('./ratings_train.txt')
x_test, y_test = read_txt('./ratings_test.txt')

x_train = [x.split() for x in x_train]
x_test = [x.split() for x in x_test]

print(x_train[:5])

[['아', '더빙..', '진짜', '짜증나네요', '목소리'], ['흠...포스터보고', '초딩영화줄....오버연기조차', '가볍지', '않구나'], ['너무재밓었다그래서보는것을추천한다'], ['교도소', '이야기구먼', '..솔직히', '재미는', '없다..평점', '조정'], ['사이몬페그의', '익살스런', '연기가', '돋보였던', '영화!스파이더맨에서', '늙어보이기만', '했던', '커스틴', '던스트가', '너무나도', '이뻐보였다']]


In [5]:
def remove_empty_review(X, Y):
    empty_idx_ls = []
    
    for idx, review in enumerate(X):
        if len(review) == 0:
            empty_idx_ls.append(idx)
    
    # idx 값이 큰 것부터 제거 (앞으로 밀리는 것을 방지)
    empty_idx_ls = sorted(empty_idx_ls, reverse = True)
    
    for empty_idx in empty_idx_ls:
        del X[empty_idx], Y[empty_idx]
    
    return X, Y

In [6]:
x_train, y_train = remove_empty_review(x_train, y_train)
x_test, y_test = remove_empty_review(x_test, y_test)

In [7]:
# Sequence Length를 맞추기 위한 padding
def add_padding(token_ls, max_len):
    pad = '<PAD>'
    seq_length_ls = []
    
    for i, tokens in enumerate(token_ls):
        seq_length = len(tokens)
        
        # 짧으면 padding을 추가
        if seq_length < max_len:
            seq_length_ls.append(seq_length)
            token_ls[i] += [pad] * (max_len - seq_length)
        
        # 길이가 길면, max_len까지의 token만 사용
        elif seq_length >= max_len:
            seq_length_ls.append(max_len)
            token_ls[i] = tokens[:max_len]
            
    return token_ls, seq_length_ls

In [8]:
max_sequence_length = 30
x_train, x_train_seq_length = add_padding(x_train, max_sequence_length)
x_test, x_test_seq_length = add_padding(x_test, max_sequence_length)

In [9]:
# 단어에 대한 idx 부여
def convert_token_to_idx(token_ls):
    for tokens in token_ls:
        yield [token2idx[token] for token in tokens]
    return

In [10]:
token2idx = defaultdict(lambda: len(token2idx))
pad = token2idx['<PAD>']

x_train = list(convert_token_to_idx(x_train))
x_test = list(convert_token_to_idx(x_test))

idx2token = {val : key for key,val in token2idx.items()}

In [11]:
' '.join([idx2token[x] for x in x_train[20]])

'나름 심오한 뜻도 있는 듯. 그냥 학생이 선생과 놀아나는 영화는 절대 아님 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [12]:
def sort_by_sequence_length(x, y, seq_len):
    sorted_idx = np.argsort(np.array(seq_len))[::-1]
    
    x = Variable(torch.LongTensor(np.array(x)[sorted_idx]))
    y = Variable(torch.LongTensor(np.array(y)[sorted_idx]))
    seq_len = Variable(torch.LongTensor(np.array(seq_len)[sorted_idx]))
    
    return x, y, seq_len

In [13]:
x_train, y_train, x_train_seq_length = sort_by_sequence_length(x_train, y_train, x_train_seq_length)
x_test, y_test, x_test_seq_length = sort_by_sequence_length(x_test, y_test, x_test_seq_length)

In [14]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, pad_index, hid_size, n_layers, dropout, n_category):
        super(RNN, self).__init__()
        self.vocab_size = vocab_size             # 고유 토큰의 갯수
        self.embed_size = embed_size             # 임베딩 차원의 크기
        self.pad_index = pad_index               # 패딩 토큰 (dummy)
        
        self.embed = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim=embed_size, 
            padding_idx=self.pad_index
        )
        
        self.hid_size = hid_size           # RNN layer의 뉴런의 갯수
        self.n_layers = n_layers           # RNN layer의 수
        self.drouput = dropout             # 드롭아웃 비율
        self.n_category = n_category       # 카테고리 갯수
        
        self.rnn = nn.RNN(embed_size, hid_size, n_layers, batch_first=True)
        self.lin = nn.Linear(hid_size, n_category)

        
        self.outputs = []
        
    def init_hidden(self, batch_size):
        # the weights are of the form (nb_layers, batch_size, hid_size(n_neuron))
        hidden = Variable(torch.randn(self.n_layers, batch_size, self.hid_size))
        return hidden    
    
    def forward(self, x, x_sequence_length):
        # init h randomly
        batch_size = x.size(0)
        self.h = self.init_hidden(batch_size)
        
        # embedding
        x = self.embed(x) # sequence_length(max_len), batch_size, embed_size
        
        # packing for rnn
        x = torch.nn.utils.rnn.pack_padded_sequence(x, x_sequence_length, batch_first=True)
        
        # RNN
        output, self.h = self.rnn(x, self.h)
        
        # unpack
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        
        # cbow
        x = x.sum(dim = 1)  # flat하게 펼쳐서 fully-connet하는 것도 가능
        
        # fully-connect
        logit = self.lin(x)
        return logit

In [15]:
params = {
    'vocab_size' : len(token2idx),
    'embed_size' : 64,
    'pad_index' : token2idx['<PAD>'],
    'hid_size' : 64,
    'n_layers' : 2,
    'dropout' : 0.5,
    'n_category' : 2,
}

In [17]:
print(torch.cuda.is_available())
if torch.cuda.is_available(): 
    device='cuda'
else:
    device='cpu'
torch.manual_seed(777)

model = RNN(**params)
model

True


RNN(
  (embed): Embedding(448964, 64, padding_idx=0)
  (rnn): RNN(64, 64, num_layers=2, batch_first=True)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)

In [18]:
def adjust_learning_rate(optimizer, epoch, init_lr=0.001, lr_decay_epoch=10):
    """Decay learning rate by a factor of 0.1 every lr_decay_epoch epochs."""
    lr = init_lr * (0.1**(epoch // lr_decay_epoch))

    if epoch % lr_decay_epoch == 0:
        print('LR is set to %s'%(lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return optimizer

In [22]:
lossSet = []
accSet = []

epochs = 50
lr = 0.01
batch_size = 10000

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)
criterion = nn.CrossEntropyLoss(reduction='sum')

loss_ls = []

for epoch in range(1, epochs+1):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train, y_train = x_train[train_idx], y_train[train_idx]
    x_train_seq_length = x_train_seq_length[train_idx]
    
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        # batch 뽑기
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()
        x_batch_seq_length = x_train_seq_length[start_idx: end_idx]
        
        # sequence 순서대로 정렬하기
        x_batch, y_batch, x_batch_seq_length = sort_by_sequence_length(x_batch, y_batch, x_batch_seq_length)
        
        scores = model(x_batch, x_batch_seq_length)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(scores, y_batch)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch, train_loss / batch_size, acc))
    # print('=================================================================================================')
    
    lossSet.append(train_loss/batch_size)
    accSet.append(acc)
    print("lossSet :" + str(lossSet))
    print("accSet :" + str(accSet))
    loss_ls.append(train_loss)
    optimizer = adjust_learning_rate(optimizer, epoch, lr, 10) # adjust learning_rate while training
    
    if (epoch) % 5 == 0:
        model.eval()
        scores = model(x_test, x_test_seq_length)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_test.long()).sum().item() / y_test.size(0)
        loss = criterion(scores, y_test.long())
        
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch, loss.item()/y_test.size(0), acc))
        print('*************************************************************************************************')

RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor

In [None]:
import matplotlib.pyplot as plt

RMSEloss = np.array(list(map(np.sqrt, lossSet)))
xdomain = np.arange(epochs)
xdomain += 1

plt.plot(xdomain, RMSEloss, marker='o', color='white')
plt.xlabel("Epochs")
plt.ylabel("RMSELoss")
plt.title("RMSELoss of RNN")
plt.show()

In [None]:
plt.plot(xdomain, accSet, marker='o', color='white')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy of RNN")
plt.show()