### 0) References

https://wikidocs.net/60314

https://wikidocs.net/64904

### 1) Import Libraries

In [3]:
import os
import time
import random
import datetime
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
import torchtext
from torchtext import data, datasets
from torchtext.vocab import GloVe
from torchinfo import summary

### 2) Hyperparameters

In [2]:
n_hid = 128
batch_size = 128
epochs = 50
embedding = 300
lr = 6e-4

random_seed = 42
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
n_hid = 128
batch_size = 64
epochs = 100
embedding = 100
lr = 6e-4
delta = 5.4e-2
gamma = 4.9
epsilon = 4.8

random_seed = 42
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 3) Prepare data

In [5]:
TEXT = torchtext.data.Field(sequential=True, batch_first=True, lower=True)
LABEL = torchtext.data.Field(sequential=False, batch_first=True)

trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

TEXT.build_vocab(trainset, vectors=GloVe(name='6B', dim=300), max_size=30000, min_freq=5)
LABEL.build_vocab(trainset)

vocab_size = len(TEXT.vocab)
n_classes = 2

print('단어 집합의 크기 : {}'.format(vocab_size))
print('클래스의 개수 : {}'.format(n_classes))
print('임베딩 벡터의 개수와 차원 : {} '.format(TEXT.vocab.vectors.shape))

trainset, valset = trainset.split(split_ratio=0.8)

train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (trainset, valset, testset), batch_size=batch_size,
        shuffle=True, sort=False)



단어 집합의 크기 : 30002
클래스의 개수 : 2
임베딩 벡터의 개수와 차원 : torch.Size([30002, 300]) 




### 4) Build Network

##### 1. GRU type

In [4]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embed = nn.Embedding(n_vocab, embed_dim)
#         self.embedding_layer = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim, n_classes)
        
    def forward(self, x):
        x = self.embed(x)
        h0 = self._init_state(batch_size=x.size(0)) # 첫번째 hidden state를 0벡터로 초기화
        x, _ = self.gru(x, h0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, hidden state의 크기)
        ht = x[:, -1, :]  # 마지막 time-step의 hidden state만 가져옴
        out = self.dropout(ht)
        out = self.fc(out)
        return out
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [7]:
model = GRU(4, n_hid, vocab_size, embedding, n_classes).to(device)
print(summary(model, verbose=0))

Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         9,000,600
├─Dropout: 1-2                           --
├─GRU: 1-3                               462,336
├─Linear: 1-4                            258
Total params: 9,463,194
Trainable params: 9,463,194
Non-trainable params: 0


##### 2. LSTM type

In [6]:
class LSTM(nn.Module):
    def __init__(self, n_vocab, hidden_dim, n_layers, embed_dim, n_classes, dropout_p=0.2):
        super(LSTM, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
#         self.embed = nn.Embedding(n_vocab, embed_dim)
        self.embedding_layer = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(self.hidden_dim*2, n_classes)
        
    def forward(self, x):
        x = self.embedding_layer(x)
#         (h0, c0) = self._init_state(batch_size=x.size(0))
        x, _ = self.lstm(x)
        ht = x[:,-1,:]
        out = self.dropout(ht)
        out = self.fc(out)
        return out
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

In [8]:
model = LSTM(vocab_size, n_hid, 4, embedding, n_classes).to(device)
print(summary(model, verbose=0))

Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         9,000,600
├─Dropout: 1-2                           --
├─LSTM: 1-3                              1,626,112
├─Linear: 1-4                            514
Total params: 10,627,226
Trainable params: 10,627,226
Non-trainable params: 0


### 5) Set Optim, loss, Acc

In [11]:
optimizer = optim.Adam(model.parameters(), lr=lr)

criterion = nn.CrossEntropyLoss()

def acc(output, target):
    return (output.max(1)[1].view(target.size()).data == target.data).sum()

### 6) Train model / save

In [22]:
def train(model, optimizer, tran_iter):
    
    train_loss, train_acc = 0, 0
    model.train()
    with tqdm(total=len(train_iter.dataset)) as progress_bar: 
        for i, batch in enumerate(train_iter):
            
            optimizer.zero_grad()
            
            text = batch.text.to(device)
            target = batch.label.to(device)
            target.data.sub_(1) # 레이블 값을 0과 1로 변환
            print(text.shape)
            output = model(text)
            print(output.shape)
            print(target.shape)
            loss = criterion(output, target)
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            train_acc += acc(output,target)
            
            progress_bar.set_postfix(loss=(train_loss/(i+1)))
            progress_bar.update(text.size(0))

    return train_loss / len(train_iter.dataset), train_acc / len(train_iter.dataset)


def evaluate(model, val_iter):
    
    t_acc, t_loss = 0, 0
    model.eval()
    for batch in val_iter:
        text = batch.text.to(device)
        target = batch.label.to(device)
        target.data.sub_(1) # 레이블 값을 0과 1로 변환
        
        output = model(text)
        loss = criterion(output, target)
        
        t_loss += loss.item()
        t_acc += acc(output, target)
    
    return t_loss / len(val_iter.dataset), t_acc / len(val_iter.dataset)

In [None]:
best_val_loss = None
for e in range(30):
    train_loss, train_acc = train(model, optimizer, train_iter)
    val_loss, val_acc = evaluate(model, val_iter)

    print(f"[Epoch: %d] train loss : %.4f | train acc : %.4f | val loss : %.4f | val acc : %.4f" 
          % (e+1, train_loss, train_acc, val_loss, val_acc))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("models"):
            os.makedirs("models")
        torch.save(model.state_dict(), f'models/{model.__class__.__name__}_4-layer-e{e+1}.pth')
        best_val_loss = val_loss

In [12]:
torch.save(model.state_dict(), f'models/{model.__class__.__name__}_4-layer-e{e+1}.pth')

### 7) Load and test

In [8]:
criterion = nn.CrossEntropyLoss()

def acc(output, target):
    return (output.max(1)[1].view(target.size()).data == target.data).sum()

In [15]:
model = GRU(4, n_hid, vocab_size, embedding, n_classes).to(device)
model.load_state_dict(torch.load('models/GRU_h128_4-layer-e50.pth'))

<All keys matched successfully>

In [10]:
model2 = LSTM(vocab_size, n_hid, 4, embedding, n_classes).to(device)
model2.load_state_dict(torch.load('models/LSTM_h128_4-layer-e23.pth'))

<All keys matched successfully>

In [17]:
test_loss, test_acc = evaluate(model, test_iter)
print('테스트 오차: %.4f | 테스트 정확도: %.4f' % (test_loss, test_acc))

테스트 오차: 0.0094 | 테스트 정확도: 0.8370


In [11]:
test_loss, test_acc = evaluate(model2, test_iter)
print('테스트 오차: %.4f | 테스트 정확도: %.4f' % (test_loss, test_acc))



테스트 오차: 0.0028 | 테스트 정확도: 0.8743


In [6]:
model2 = LSTM(vocab_size, n_hid, 1, embedding, n_classes).to(device)
model2.load_state_dict(torch.load('models/LSTM_h256_pre-embed-e45.pth'))

<All keys matched successfully>

In [10]:
test_loss, test_acc = evaluate(model2, test_iter)
print('테스트 오차: %.4f | 테스트 정확도: %.4f' % (test_loss, test_acc))



테스트 오차: 0.0026 | 테스트 정확도: 0.8580


In [15]:
model3 = LSTM(vocab_size, 256, 1, embedding, n_classes).to(device)
model3.load_state_dict(torch.load('models/LSTM_h256-e22.pth'))

<All keys matched successfully>

In [12]:
test_loss, test_acc = evaluate(model3, test_iter)
print('테스트 오차: %.4f | 테스트 정확도: %.4f' % (test_loss, test_acc))

테스트 오차: 0.0044 | 테스트 정확도: 0.8645


In [21]:
test_loss, test_acc = evaluate(model3, test_iter)
print('테스트 오차: %.4f | 테스트 정확도: %.4f' % (test_loss, test_acc))

테스트 오차: 0.0026 | 테스트 정확도: 0.8585


In [18]:
test_loss, test_acc = evaluate(model3, test_iter)
print('테스트 오차: %.4f | 테스트 정확도: %.4f' % (test_loss, test_acc))

테스트 오차: 0.0035 | 테스트 정확도: 0.8056


## LSTM (Bidirectional, 4-layers stacked, 128 Hids, Ep 23) -> Test Acc : 87.43%