In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from pytorch_transformers import BertTokenizer
from torchtext import data 
from torchtext.data import TabularDataset
import string

In [2]:
BATCH_SIZE = 64
lr = 0.001
EPOCHS = 10
SEED = 0

In [4]:
TEXT = data.Field(sequential=True, batch_first=True, lower=True,  fix_length=500)
LABEL = data.Field(sequential=False, batch_first=True)

In [5]:
train_data, test_data = TabularDataset.splits(
        path='.', train='../data/imdb/train.csv', test='../data/imdb/test.csv', format='csv',
        fields=[('text', TEXT), ('label', LABEL)], skip_header=True)

In [7]:
def polish_data(data):
    for example in data.examples:
        temp = [x.replace("<br","") for x in vars(example)['text']]
        temp = [''.join(c for c in x if c not in string.punctuation) for x in temp]
        vars(example)['text'] = temp
    return

In [8]:
polish_data(train_data)
polish_data(test_data)

In [9]:
train_data, val_data = train_data.split(random_state = random.seed(SEED), 
                                        split_ratio=0.8)

In [10]:
TEXT.build_vocab(train_data, min_freq=5)
LABEL.build_vocab(train_data)

vocab_size = len(TEXT.vocab)
label_size = 2

In [11]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")

Using GPU: GeForce RTX 2080 Ti


In [12]:
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=BATCH_SIZE,
        device=device, sort_key=lambda x: len(vars(x)),
        sort_within_batch=False, repeat=False)

In [14]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, embed_dim,
                 n_vocab, n_classes=2, dropout=0.2):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embed = nn.Embedding(n_vocab, embed_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)

    def forward(self, x):
        batch_size = x.size(0)
        h_0 = self.__init__state(batch_size=batch_size)
        
        x = self.embed(x)
        x,_ = self.gru(x, h_0)
        h_t = x[:,-1,:]
        
        self.dropout(h_t)
        logit = self.out(h_t)
        return logit
    
    def __init__state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, 
                          self.hidden_dim).zero_()


In [15]:
n_layers = 1
hidden_dim = 256
embed_dim = 400
dropout = 0.5

model = GRU(n_layers, hidden_dim, embed_dim, vocab_size, 
            2, dropout)
print(model)

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

RNN(
  (embed): Embedding(27438, 400, padding_idx=0)
  (dropout): Dropout(p=0.5, inplace=False)
  (gru): GRU(400, 256, batch_first=True)
  (rnn): RNN(400, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=2, bias=True)
)


In [16]:
def train(model, optimizer, iterator):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        x, y = batch.text.to(device), batch.label.to(device)
        predictions = model(x).squeeze(1)
        y.data.sub_(1)
        loss = F.cross_entropy(predictions, y)
        acc = binary_accuracy(predictions, y)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss/len(iterator) , epoch_acc/len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    model.eval()
    
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
    
        for batch in iterator:
            x, y = batch.text.to(device), batch.label.to(device)
            predictions = model(x).squeeze(1)
            y.data.sub_(1)
            loss = F.cross_entropy(predictions, y, reduction='sum')
            acc = binary_accuracy(predictions, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
import os

criterion = nn.BCEWithLogitsLoss().to(device)

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter, criterion)

    print("[Epoch: %d] Train loss: %.3f | Train acc: %.2f | Val loss : %5.2f | Val accuracy : %5.2f" 
          % (epoch+1,train_loss,train_acc, val_loss, val_accuracy))

[Epoch: 1] Train loss: 0.011 | Train acc: 1.00 | Val loss : 42.75 | Val accuracy :  0.88
[Epoch: 2] Train loss: 0.008 | Train acc: 1.00 | Val loss : 40.80 | Val accuracy :  0.88
[Epoch: 3] Train loss: 0.005 | Train acc: 1.00 | Val loss : 46.09 | Val accuracy :  0.87
[Epoch: 4] Train loss: 0.006 | Train acc: 1.00 | Val loss : 39.27 | Val accuracy :  0.87
[Epoch: 5] Train loss: 0.012 | Train acc: 1.00 | Val loss : 45.68 | Val accuracy :  0.88
[Epoch: 6] Train loss: 0.010 | Train acc: 1.00 | Val loss : 39.56 | Val accuracy :  0.88
[Epoch: 7] Train loss: 0.008 | Train acc: 1.00 | Val loss : 40.97 | Val accuracy :  0.88
[Epoch: 8] Train loss: 0.004 | Train acc: 1.00 | Val loss : 46.93 | Val accuracy :  0.88
[Epoch: 9] Train loss: 0.009 | Train acc: 1.00 | Val loss : 37.39 | Val accuracy :  0.88
[Epoch: 10] Train loss: 0.006 | Train acc: 1.00 | Val loss : 42.46 | Val accuracy :  0.88


In [28]:
test_loss, test_acc = evaluate(model, test_iter, criterion)
print('테스트 오차: %.3f | 테스트 정확도: %.3f' % (test_loss, test_acc))

테스트 오차: 49.290 | 테스트 정확도: 0.864
