In [11]:
# import numpy as np
# import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.nn as nn
from torchtext.legacy import data, datasets

In [9]:
# --- Training hyper-parameters ---
BATCH_SIZE = 100   # examples per mini-batch
EPOCHS = 5         # epoch count
LR = 1e-3          # learning rate

# --- Hardware ---
USE_CUDA = torch.cuda.is_available()  # True when a CUDA device is usable

In [13]:
# Pick the compute device once: GPU when CUDA is available, CPU otherwise.
DEVICE = torch.device("cuda") if USE_CUDA else torch.device("cpu")

# Field for the review text: a lower-cased token sequence, batch dimension first.
TEXT = data.Field(lower=True, batch_first=True, sequential=True)
# Field for the label: a single (non-sequential) value per example.
LABEL = data.Field(batch_first=True, sequential=False)

# Load the IMDB train/test splits (the archive is downloaded on first use).
trainset, testset = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [01:58<00:00, 709kB/s] 


In [14]:
# Label vocabulary: index every label value found in the training set.
LABEL.build_vocab(trainset)
# Text vocabulary: sentences are split into words and counted over the
# training set; only words that occur at least 5 times are kept.
TEXT.build_vocab(trainset, min_freq=5)

In [17]:
# Hold out 20% of the training examples as a validation set.
# NOTE(review): this rebinds `trainset`, so re-running the cell splits the
# already-reduced set again. The sizes recorded in this notebook
# (12800 + 3200 = 16000 train/valid vs. 25000 test) suggest that happened;
# restart the kernel and run all cells once to get a clean 80/20 split.
trainset, valset = trainset.split(split_ratio=0.8)

# repeat=False: each iterator must stop after one pass over its dataset.
# With repeat=True the iterators never raise StopIteration, so an ordinary
# `for batch in val_iter:` loop (or a per-epoch training loop) never ends.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (trainset, valset, testset),
    batch_size=BATCH_SIZE,
    shuffle=True,
    repeat=False,
)

In [23]:
# Sizes needed to build the model, plus a one-line summary of the data splits.
n_classes = 2                 # number of output classes used for the model
vocab_size = len(TEXT.vocab)  # number of entries in the text vocabulary
print(
    f"[TRAIN]:{len(trainset)} \t [VALID]:{len(valset)} \t "
    f"[TEST]:{len(testset)} \t [VOCAB]:{vocab_size} \t [CLASSES]:{n_classes}"
)

[TRAIN]:12800 	 [VALID]:3200 	 [TEST]:25000 	 [VOCAB]:46159 	 [CLASSES]:2


In [24]:
class BasicRNN(nn.Module):
    """Text classifier: token embedding -> (stacked) vanilla RNN -> linear head.

    Args:
        n_layers: number of stacked recurrent layers.
        hidden_dim: size of the RNN hidden state.
        n_vocab: number of rows in the embedding table (vocabulary size).
        embed_dim: dimensionality of each token embedding.
        n_classes: number of output units of the final linear layer.
        dropout_p: dropout probability applied to the last hidden state.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        # `super(self)` raises TypeError; the zero-argument form is the correct call.
        super().__init__()
        print("Building RNN")
        self.n_layers = n_layers

        # The vocabulary is usually far too large for one-hot encoding, so each
        # word is looked up in an (n_vocab x embed_dim) embedding matrix, where
        # embed_dim is the number of dimensions used to represent one word.
        self.embed = nn.Embedding(n_vocab, embed_dim)

        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)
        # The keyword is `num_layers`; `num_layer` is rejected by nn.RNN.
        self.rnn = nn.RNN(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, indexed as (batch, time)
               because the RNN is built with batch_first=True.

        Returns:
            Tensor of shape (batch, n_classes) holding sigmoid-squashed scores.
        """
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.rnn(x, h_0)
        # Keep only the output at the last time step of every sequence.
        h_t = x[:, -1, :]
        # nn.Dropout is not in-place: its result must be assigned, otherwise
        # dropout is silently never applied.
        h_t = self.dropout(h_t)
        # NOTE(review): the values returned are sigmoid outputs in (0, 1), not
        # raw logits. If the training loss applies its own softmax/sigmoid
        # (e.g. cross-entropy), consider returning `self.out(h_t)` instead.
        logit = torch.sigmoid(self.out(h_t))
        return logit

    def _init_state(self, batch_size=1):
        """Return an all-zero initial hidden state of shape
        (n_layers, batch_size, hidden_dim) with the dtype and device of the
        model's parameters."""
        weight = next(self.parameters())
        return weight.new_zeros((self.n_layers, batch_size, self.hidden_dim))
