# 2. Learning Phrase Representations using RNN Encoder-Decoder for SMT
- RNN EncDec 모델을 구현하자!
- 위 논문은 GRU를 소개한 논문!
- 성능을 향상시킴!
- 파라미터 수가 적고 더 빠름

## Introduction
- remind!

![image.png](https://github.com/bentrevett/pytorch-seq2seq/raw/49df8404d938a6edbf729876405558cc2c2b3013/assets/seq2seq4.png)

## Preparing Data
- 1과 동일합니다.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data, datasets
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
# Show which torchtext release this notebook was run against.
import torchtext
print(torchtext.__version__)

0.5.0


In [3]:
# Installed PyTorch version (displayed as the cell's result).
torch.__version__

'1.7.1+cu101'

In [4]:
# Installed spaCy version (displayed as the cell's result).
spacy.__version__

'2.2.4'

In [5]:
# Installed NumPy version (displayed as the cell's result).
np.__version__

'1.19.5'

In [6]:
# Seed every random number generator the notebook touches with the same value.
SEED = 1234

random.seed(SEED)  # Python's built-in RNG (used for the teacher-forcing coin flip)
np.random.seed(SEED)  # NumPy global RNG
torch.manual_seed(SEED)  # PyTorch RNG
torch.cuda.manual_seed(SEED)  # PyTorch CUDA RNG
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN algorithms

In [7]:
# Load the spaCy pipelines whose tokenizers are used below:
# 'de' for the German source text and 'en' for the English target text.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

- 이번에도 동일하게 tokenize

In [9]:
def tokenize_de(text):
    """Split German ``text`` into a list of token strings using the spaCy tokenizer."""
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Split English ``text`` into a list of token strings using the spaCy tokenizer."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [10]:
# Source (German) field: tokenize with spaCy, lowercase the text, and wrap
# every sentence in <sos> ... <eos> markers.
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

# Target (English) field, configured the same way but with the English tokenizer.
TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

- load data

In [11]:
# Load the Multi30k train / validation / test splits. The '.de' side is
# processed with SRC and the '.en' side with TRG (extensions and fields pair up by position).
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

- vocab을 만들고 한 번 나온 단어는 UNK 처리

In [12]:
# Build both vocabularies from the training split only. With min_freq = 2,
# tokens that occur just once are left out and handled as unknown tokens.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

- device 처리

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

- iterator 생성

In [15]:
# Number of sentence pairs per batch.
BATCH_SIZE = 64

# One iterator per split, all sharing the same batch size and target device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

## Building the Seq2Seq Model
- 달라진 것?
    - n_layers = 1
    - LSTM 대신에 GRU
    - Decoder GRU input에 context도 추가

In [16]:
class Encoder(nn.Module):
    """Single-layer GRU encoder that summarizes a source sentence as one hidden state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Size of each token embedding.
        hid_dim: Size of the GRU hidden state.
        dropout: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        # Kept as an attribute so Seq2Seq can compare it with the decoder's size.
        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        # The GRU has a single layer, so no dropout argument is passed to it.
        self.rnn = nn.GRU(emb_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: Token indices, shape [src len, batch size].

        Returns:
            The final GRU hidden state, shape
            [n layers * n directions, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)

        # A GRU returns (outputs, hidden) with no cell state. The per-step
        # outputs [src len, batch size, hid dim * n directions] are not needed.
        _, final_hidden = self.rnn(rnn_input)

        return final_hidden

#### GRU도 LSTM이랑 비슷해요! cell state가 없다는 점만 달라요.

In [17]:
x = torch.randn(3, 5, 10) # (batch_size, seq_len, input_dim); 10 is the feature size fed to the nn.GRU(10, 5) demos

In [21]:
# batch_first=False: the GRU expects [seq len, batch size, input dim], so the
# batch-first tensor x is permuted before the call.
rnn = nn.GRU(10, 5, batch_first=False, bidirectional=False, num_layers=1)
output, hidden = rnn(x.permute(1, 0, 2)) # [3, 5, 10] -> [5, 3, 10]
print(output.size())  # [seq len, batch size, hid dim]
print(hidden.size())  # [n layers * n directions, batch size, hid dim]

torch.Size([5, 3, 5])
torch.Size([1, 3, 5])


In [22]:
# batch_first=True: x can be passed as-is, and output is batch-major too.
# hidden keeps its [n layers * n directions, batch size, hid dim] layout.
rnn = nn.GRU(10, 5, batch_first=True, bidirectional=False, num_layers=1)
output, hidden = rnn(x)
print(output.size())  # [batch size, seq len, hid dim]
print(hidden.size())  # [1, batch size, hid dim]

torch.Size([3, 5, 5])
torch.Size([1, 3, 5])


In [23]:
# bidirectional=True: the last output dimension doubles (2 * hid dim) and
# hidden gains one entry per direction.
rnn = nn.GRU(10, 5, batch_first=True, bidirectional=True, num_layers=1)
output, hidden = rnn(x)
print(output.size())  # [batch size, seq len, hid dim * 2]
print(hidden.size())  # [2, batch size, hid dim]

torch.Size([3, 5, 10])
torch.Size([2, 3, 5])


In [24]:
# Two bidirectional layers: output keeps the same shape as with one layer,
# while hidden now has n layers * n directions = 4 entries.
rnn = nn.GRU(10, 5, batch_first=True, bidirectional=True, num_layers=2)
output, hidden = rnn(x)
print(output.size())  # [batch size, seq len, hid dim * 2]
print(hidden.size())  # [4, batch size, hid dim]

torch.Size([3, 5, 10])
torch.Size([4, 3, 5])


In [25]:
# warning: dropout=0.2 together with num_layers=1 makes PyTorch emit a warning,
# because GRU dropout acts only between stacked layers and there is no second layer here.
nn.GRU(10, 5, batch_first=True, bidirectional=True, num_layers=1, dropout=0.2)

  "num_layers={}".format(dropout, num_layers))


GRU(10, 5, batch_first=True, dropout=0.2, bidirectional=True)

### Decoder
- decoder 구현은 조금 색달라요!
- bottleneck, 즉 information compression 부담을 덜어주는 기법을 소개합니다.
- 수식 말고 아래 그림으로 우리 이해해봐요!

![image.png](https://github.com/bentrevett/pytorch-seq2seq/raw/49df8404d938a6edbf729876405558cc2c2b3013/assets/seq2seq6.png)

- 구현의 핵심?
    - GRU로 바꿈
    - GRU에 input으로 y_t와 z를 넣어줄건데 이를 concat해서 넣어줄 것임!
    - 이 idea 꼭 기억하세요

In [26]:
class Decoder(nn.Module):
    """Single-layer GRU decoder that is shown the encoder context at every step.

    The context vector is concatenated to the token embedding before the GRU,
    and again (with the new hidden state) before the output layer.

    Args:
        output_dim: Number of rows in the embedding table and size of the
            prediction vector (target vocabulary size).
        emb_dim: Size of each token embedding.
        hid_dim: Size of the GRU hidden state and of the context vector.
        dropout: Dropout probability applied to the embedded token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # GRU input = token embedding concatenated with the context vector.
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)

        # Output layer input = embedding + new hidden state + context.
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        """Decode a single time step.

        Args:
            input: Current token indices, shape [batch size].
            hidden: Previous decoder hidden state, shape [1, batch size, hid dim].
            context: Encoder context vector, shape [1, batch size, hid dim].

        Returns:
            A ``(prediction, hidden)`` tuple: prediction has shape
            [batch size, output dim], hidden has shape [1, batch size, hid dim].
        """
        # Add a sequence axis of length 1: [batch size] -> [1, batch size]
        step_tokens = input.unsqueeze(0)

        # [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))

        # [1, batch size, emb dim + hid dim]
        rnn_input = torch.cat((embedded, context), dim=2)

        # With one step, one layer and one direction the GRU output equals the
        # new hidden state, so only the hidden state is kept.
        _, hidden = self.rnn(rnn_input, hidden)

        # Drop the length-1 leading axis of each tensor and join them:
        # [batch size, emb dim + hid dim * 2]
        features = torch.cat(
            [part.squeeze(0) for part in (embedded, hidden, context)],
            dim=1,
        )

        # [batch size, output dim]
        prediction = self.fc_out(features)

        return prediction, hidden

## Seq2Seq
- 어떤 부분이 달라졌는가?
    - decoder에 hidden과 cell이 들어가는 것이 아니라 hidden과 context가 들어감

![image.png](https://github.com/bentrevett/pytorch-seq2seq/raw/49df8404d938a6edbf729876405558cc2c2b3013/assets/seq2seq7.png)

In [27]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    Args:
        encoder: Callable mapping ``src`` to a context tensor; must expose ``hid_dim``.
        decoder: Callable ``(input, hidden, context) -> (prediction, hidden)``;
            must expose ``hid_dim`` and ``output_dim``.
        device: Device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final hidden state seeds the decoder, so sizes must agree.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce per-step vocabulary scores for every target position.

        Args:
            src: Source token indices, shape [src len, batch size].
            trg: Target token indices, shape [trg len, batch size].
            teacher_forcing_ratio: Probability of feeding the ground-truth
                token (rather than the model's own prediction) as the next
                decoder input, e.g. 0.75 uses ground truth 75% of the time.

        Returns:
            Tensor of shape [trg len, batch size, output dim]. Row 0 is never
            written and stays all zeros.
        """
        n_steps = trg.shape[0]
        n_sentences = trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer that collects the decoder predictions for every step.
        outputs = torch.zeros(n_steps, n_sentences, vocab_size, device=self.device)

        # The encoder's last hidden state is both the fixed context and the
        # decoder's initial hidden state.
        context = self.encoder(src)
        hidden = context

        # Decoding starts from the <sos> tokens in the first target row.
        dec_input = trg[0, :]

        for step in range(1, n_steps):
            scores, hidden = self.decoder(dec_input, hidden, context)
            outputs[step] = scores

            # One coin flip per step decides between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                dec_input = trg[step]
            else:
                dec_input = scores.argmax(1)

        return outputs

### Training the seq2seq model
- 모델 초기화
- 파라미터 설정

In [30]:
# Vocabulary sizes set the embedding-table sizes and the decoder's output size.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
# No N_LAYERS setting: this Encoder/Decoder take no layer-count argument
# and always use a single-layer GRU.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

# Combine both halves and move the parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

- 초기화는 uniform 대신 normal(0, 0.01^2) 분포(평균 0, 표준편차 0.01)를 사용할 거예요!

In [31]:
def init_weights(m):
    """Re-initialize every parameter of ``m`` in place from N(0, 0.01^2).

    Written to be passed to ``nn.Module.apply``. All parameters of ``m`` and
    its submodules are overwritten, biases included.

    Args:
        m: The module whose parameters are re-initialized.
    """
    # An earlier uniform(-0.08, 0.08) definition with the same name was dead
    # code (this definition replaced it before any call), so only this one remains.
    for param in m.parameters():
        # nn.init functions already run under no_grad, so the parameter can be
        # passed directly instead of going through ``.data``.
        nn.init.normal_(param, mean=0, std=0.01)
        
# Run init_weights on the model and each of its submodules; the returned
# module is what the notebook prints below.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

- 어? 모델 parameter 더 적다고 하지 않았나요?
- 잘 생각해보세요! 지금 GRU에 input으로 emb_dim + hidden_dim을 넣어주고 있어요!
- 즉, context까지 넣어주니 이에 대응하여 parameter가 추가로 많이 생긴거죠!
- 결과적으로 1과 parameter 수가 비슷해진거죠!

In [32]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,220,293 trainable parameters


- Adam optimizer 사용

In [33]:
# Adam over all model parameters; no learning rate is passed, so the optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())

- Cross Entropy loss 사용

In [34]:
# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Target positions holding the padding index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

- 학습 및 평가 코드! (동일합니다.)

In [35]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Callable as ``model(src, trg)`` returning scores of shape
            [trg len, batch size, output dim].
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss taking (scores, targets).
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()

        # scores = [trg len, batch size, output dim]
        scores = model(src, trg)

        # Skip position 0 of both tensors, then flatten time and batch into
        # one axis:
        #   flat_scores  = [(trg len - 1) * batch size, output dim]
        #   flat_targets = [(trg len - 1) * batch size]
        flat_scores = scores[1:].view(-1, scores.shape[-1])
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Rescale gradients whose total norm exceeds ``clip`` before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [36]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode, gradients are disabled, and teacher
    forcing is turned off by passing 0 as the third model argument.

    Args:
        model: Callable as ``model(src, trg, 0)`` returning scores of shape
            [trg len, batch size, output dim].
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: Loss taking (scores, targets).

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # 0 = never feed ground-truth tokens back into the decoder.
            scores = model(src, trg, 0)

            # Skip position 0 of both tensors, then flatten time and batch:
            #   flat_scores  = [(trg len - 1) * batch size, output dim]
            #   flat_targets = [(trg len - 1) * batch size]
            flat_scores = scores[1:].view(-1, scores.shape[-1])
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

- 학습에 소요된 시간을 체크

In [37]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

- 동일한 코드로 학습

In [38]:
N_EPOCHS = 10  # number of full passes over the training data
CLIP = 1  # maximum gradient norm handed to train()

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    # The reported time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights from the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    # Perplexity (PPL) is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 33s
	Train Loss: 4.789 | Train PPL: 120.192
	 Val. Loss: 4.966 |  Val. PPL: 143.484
Epoch: 02 | Time: 0m 32s
	Train Loss: 4.152 | Train PPL:  63.562
	 Val. Loss: 4.788 |  Val. PPL: 120.081
Epoch: 03 | Time: 0m 31s
	Train Loss: 3.708 | Train PPL:  40.773
	 Val. Loss: 4.186 |  Val. PPL:  65.732
Epoch: 04 | Time: 0m 31s
	Train Loss: 3.249 | Train PPL:  25.754
	 Val. Loss: 3.804 |  Val. PPL:  44.896
Epoch: 05 | Time: 0m 31s
	Train Loss: 2.844 | Train PPL:  17.182
	 Val. Loss: 3.607 |  Val. PPL:  36.859
Epoch: 06 | Time: 0m 31s
	Train Loss: 2.539 | Train PPL:  12.670
	 Val. Loss: 3.565 |  Val. PPL:  35.357
Epoch: 07 | Time: 0m 31s
	Train Loss: 2.272 | Train PPL:   9.699
	 Val. Loss: 3.600 |  Val. PPL:  36.593
Epoch: 08 | Time: 0m 31s
	Train Loss: 2.038 | Train PPL:   7.672
	 Val. Loss: 3.606 |  Val. PPL:  36.822
Epoch: 09 | Time: 0m 32s
	Train Loss: 1.869 | Train PPL:   6.483
	 Val. Loss: 3.636 |  Val. PPL:  37.949
Epoch: 10 | Time: 0m 31s
	Train Loss: 1.716 | Train PPL

- 성능이 조금 더 개선되었군요!

In [39]:
# Restore the weights saved at the epoch with the lowest validation loss.
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a GPU can still be loaded when only the CPU is available.
model.load_state_dict(torch.load('tut2-model.pt', map_location = device))

# Score the restored model on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.523 | Test PPL:  33.891 |
