# Sequence to Sequence Learning with Neural Networks Implementation

Dutch → English translation task

## 1. Preprocessing

spaCy: tokenization & tagging library

In [None]:
!python -m spacy download en_core_web_sm # Engilish
!python -m spacy download nl_core_news_sm # Deustch

In [2]:
import spacy

spacy_en = spacy.load("en_core_web_sm")
spacy_de = spacy.load("nl_core_news_sm")

In [3]:
# tokenization test
tokenized = spacy_en.tokenizer("These days, i'm really into LostArk")

for i, token in enumerate(tokenized):
  print(f"{i}번째 토큰: {token.text}")

0번째 토큰: These
1번째 토큰: days
2번째 토큰: ,
3번째 토큰: i
4번째 토큰: 'm
5번째 토큰: really
6번째 토큰: into
7번째 토큰: LostArk


In [4]:
# Tokenizer 함수 정의

def tokenizer_de(text):
  """
  논문에서 토큰의 순서를 거꾸로 뒤집어서 넣었을 때 성능이 향상됨을 보였으므로 토큰의 순서를 뒤집어서 반환
  """
  return [token.text for token in spacy_de.tokenizer(text)][::-1]

def tokenizer_en(text):
  return [token.text for token in spacy_en.tokenizer(text)]

torchtext는 자연어 처리(NLP) 분야에서 사용하는 데이터로더(DataLoader)

In [None]:
pip install --user torchtext

In [9]:
import torchtext
from torchtext.legacy.data import Field
# Field를 통해 앞으로 어떤 전처리를 할 것인지를 정의

SRC = Field(tokenize=tokenizer_de, init_token="<SOS>", eos_token="<EOS>", lower=True)
TRG = Field(tokenize=tokenizer_en, init_token="<SOS>", eos_token="<EOS>", lower=True)

## 2. Load Dataset

In [10]:
from torchtext.legacy.datasets import Multi30k

train_dataset, valid_dataset, test_dataset = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 564kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 176kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 165kB/s]


In [11]:
print(f"train dataset: {len(train_dataset.examples)}")
print(f"valid dataset: {len(valid_dataset.examples)}")
print(f"test dataset: {len(test_dataset.examples)}")

train dataset: 29000
valid dataset: 1014
test dataset: 1000


In [13]:
# 독일어 문장과 영어 문장 샘플 출력
print(vars(train_dataset.examples[16])['src'])
print(vars(train_dataset.examples[16])['trg'])

['.', 'regenbogen', 'gemalten', 'großen', 'einem', 'vor', 'sitzt', 'mädchen', 'kleines', 'ein']
['a', 'little', 'girl', 'is', 'sitting', 'in', 'front', 'of', 'a', 'large', 'painted', 'rainbow', '.']


최소 2번 이상 등장한 단어들을 이용해 영어와 독일어 단어 사전을 생성(field 객체의 build_vocab 메서드)

In [14]:
SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

print(f"SRC vocab length: {len(SRC.vocab)}")
print(f"TRG vocab length: {len(TRG.vocab)}")

SRC vocab length: 7853
TRG vocab length: 5893


논문에서 학습 속도를 위해 하나의 배치에 포함된 문장들이 가지는 단어의 개수가 유사하게 만들었다. <br/>
이를 구현하기 위해 BucketIterator를 사용한다.

In [15]:
import torch
from torchtext.legacy.data import BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    device = device
)

In [20]:
for i, batch in enumerate(train_iterator):
  src = batch.src
  trg = batch.trg

  print(src.shape)
  break

torch.Size([30, 128])


## 3. Model Architecture

### Encoder

In [40]:
import torch.nn as nn

class Encoder(nn.Module):

  def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, dropout_ratio):
    super().__init__()

    self.embedding = nn.Embedding(input_dim, embed_dim)

    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout_ratio)

    self.dropout = nn.Dropout(dropout_ratio)
  

  def forward(self, src):
    """
      src 문장을 입력받아 context vector 반환
    """
    # src = |단어 개수, bs|
    embedded = self.dropout(self.embedding(src)) # |단어 개수, bs, embed_dim|

    outputs, (hidden, cell) = self.rnn(embedded)
    # outputs = |단어 개수, bs, hidden_dim|
    # hidden = |레이어 개수, bs, hidden_dim|
    # cell = [레이어 개수, bs, hidden_dim|

    return hidden, cell

### Decoder

In [41]:
class Decoder(nn.Module):
  
  def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, dropout_ratio):
    super().__init__()

    self.embedding = nn.Embedding(output_dim, embed_dim)

    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout_ratio)

    # Encoder와의 차이점: FC layer
    self.output_dim = output_dim
    self.fc_out = nn.Linear(hidden_dim, output_dim)

    self.dropout = nn.Dropout(dropout_ratio)


  def forward(self, input, hidden, cell):
    # input = |bs|
    # hidden = |레이어 개수, bs, hidden_dim|
    # cell = |레이어 개수, bs, hidden_dim|
    input = input.unsqueeze(0) # input = |1(단어 개수), bs|

    embedded = self.dropout(self.embedding(input)) # embedded = |단어 개수, bs, embed_dim|

    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    # outputs = |1(단어 개수), bs, hidden_dim|
    # hidden = |레이어 개수, bs, hidden_dim|
    # cell = [레이어 개수, bs, hidden_dim|

    pred = self.fc_out(output.squeeze(0)) # |bs, output_dim|

    return pred, hidden, cell

### Seq2Seq

In [42]:
import random

In [43]:
class Seq2Seq(nn.Module):

  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg, teacher_forcing_ratio=0.5):
    # src = |단어 개수, bs|
    # trg = |단어 개수, bs|

    hidden, cell = self.encoder(src)

    trg_len = trg.shape[0] # 단어 개수
    batch_size = trg.shape[1] # bs
    trg_vocab_size = self.decoder.output_dim 
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

    input = trg[0, :]

    for t in range(1, trg_len):
      output, hidden, cell = self.decoder(input, hidden, cell)

      outputs[t] = output # FC를 거쳐 나온 현재의 출력 단어 정보
      top1 = output.argmax(1) # 가장 확률이 높은 단어의 idx

      # teacjer forcing 여부
      teacher_force = random.random() < teacher_forcing_ratio
      input = trg[t] if teacher_force else top1 
    
    return outputs

## 4. Training

In [44]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENCODER_EMBED_DIM = 256
DEOCDER_EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
ENC_DROPOUT_RATIO = 0.5
DEC_DROPOUT_RATIO = 0.5

In [45]:
enc = Encoder(INPUT_DIM, ENCODER_EMBED_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT_RATIO)
dec = Decoder(OUTPUT_DIM, DEOCDER_EMBED_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT_RATIO)

model = Seq2Seq(enc, dec, device).to(device)

모델 가중치 파라미터 : (-0.08, 0.08)

In [46]:
def init_weights(m):
  for name, param in m.named_parameters():
    nn.init.uniform(param.data, -0.08, 0.08)

model.apply(init_weights)

  This is separate from the ipykernel package so we can avoid doing imports until


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [47]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

# padding 무시
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [48]:
def train(model, iterator, optimizer, criterion, clip):
  model.train()
  epoch_loss = 0

  for i, batch in enumerate(iterator):
    src = batch.src
    trg = batch.trg

    optimizer.zero_grad()

    output = model(src, trg) # |출력단어개수, bs, output_dim|
    output_dim = output.shape[-1]

    output = output[1:].view(-1, output_dim) # |(출력 단어의 개수 - 1) * bs, output_dim|
    trg = trg[1:].view(-1) # |(타겟 단어의 개수 -1) * bs|

    loss = criterion(output, trg)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(),clip)

    optimizer.step()

    epoch_loss += loss.item()
  
  return epoch_loss / len(iterator)

In [49]:
def evaluate(model, iterator, criterion):
  model.eval()
  epoch_loss = 0

  with torch.no_grad():
    for i, batch in enumerate(iterator):
      src = batch.src
      trg = batch.trg

      output = model(src, trg) # no teacher forcing
      output_dim = output.shape[-1]

      output = output[1:].view(-1, output_dim)
      trg = trg[1:].view(-1)

      loss = criterion(output, trg)

      epoch_loss += loss.item()
  
  return epoch_loss / len(iterator)

In [50]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [66]:
import time
import math

N_EPOCHS = 20
CLIP = 1
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()

  train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss = evaluate(model, valid_iterator, criterion)

  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'seq2seq_ver1.pt')
  
  print(f"Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s")
  print(f"Train loss: {train_loss} | Train perplexity: {math.exp(train_loss)}")
  print(f"Valid loss: {valid_loss} | Valid perplexity: {math.exp(valid_loss)}")



Epoch: 1 | Time: 0m 28s
Train loss: 2.9396154407887733 | Train perplexity: 18.908573447830648
Valid loss: 3.2289921045303345 | Valid perplexity: 25.254190555125582
Epoch: 2 | Time: 0m 29s
Train loss: 2.8466261515008195 | Train perplexity: 17.229553765336256
Valid loss: 3.007067322731018 | Valid perplexity: 20.2279906852768
Epoch: 3 | Time: 0m 28s
Train loss: 2.7633949996091194 | Train perplexity: 15.853574566696373
Valid loss: 2.8605929613113403 | Valid perplexity: 17.4718840168608
Epoch: 4 | Time: 0m 28s
Train loss: 2.693400189740017 | Train perplexity: 14.781851668951798
Valid loss: 3.063153862953186 | Valid perplexity: 21.394927537063577
Epoch: 5 | Time: 0m 28s
Train loss: 2.610236561771006 | Train perplexity: 13.602268247925894
Valid loss: 2.827407330274582 | Valid perplexity: 16.90158374524378
Epoch: 6 | Time: 0m 28s
Train loss: 2.5190156482914996 | Train perplexity: 12.416368572575989
Valid loss: 2.7784290313720703 | Valid perplexity: 16.09371835180147
Epoch: 7 | Time: 0m 28s
Tra

In [67]:
model.load_state_dict(torch.load('/content/seq2seq_ver1.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'Test loss: {test_loss:.3f} | Test Perplexity: {math.exp(test_loss):.3f}')

Test loss: 2.840 | Test Perplexity: 17.110


## 5. Inference

In [68]:
model.load_state_dict(torch.load('/content/seq2seq_ver1.pt'))

<All keys matched successfully>

In [69]:
def translate(sent, src_field, trg_field, model, device, max_len=45):
  model.eval()

  if isinstance(sent, str):
    spacy_de = spacy.load("nl_core_news_sm")
    tokens = [token.text.lower() for token in spacy_de(sent)]
  else:
    tokens = [token.lower() for token in sent]
  
  # <SOS> <EOS> token
  tokens = [src_field.init_token] + tokens + [src_field.eos_token]
  print(f"soruce token: {tokens}")

  src_indexes = [src_field.vocab.stoi[token] for token in tokens]
  print(f"source token idx: {src_indexes}")

  src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

  with torch.no_grad():
    hidden, cell = model.encoder(src_tensor)
  
  trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]] # 처음에는 <SOS>토큰을 갖고 있음

  for i in range(max_len):
    # 이전 출력 단어가 현재 단어로 입력되도록
    trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device) 

    with torch.no_grad():
      output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

    pred_token = output.argmax(1).item()
    trg_indexes.append(pred_token)

    if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
      break
    
  trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

  return trg_tokens

In [70]:
example_idx = 10

src = vars(test_dataset.examples[example_idx])['src']
trg = vars(test_dataset.examples[example_idx])['trg']

print(f"Source Dutch: {src}")
print(" ".join(translate(src, SRC, TRG, model, device)))
print(trg)

Source Dutch: ['.', 'freien', 'im', 'tag', 'schönen', 'einen', 'genießen', 'sohn', 'kleiner', 'ihr', 'und', 'mutter', 'eine']
soruce token: ['<SOS>', '.', 'freien', 'im', 'tag', 'schönen', 'einen', 'genießen', 'sohn', 'kleiner', 'ihr', 'und', 'mutter', 'eine', '<EOS>']
source token idx: [2, 4, 87, 20, 200, 781, 19, 566, 625, 70, 134, 10, 365, 8, 3]
<SOS> a mother and her girl are enjoying themselves on a outdoor day . <EOS>
['a', 'mother', 'and', 'her', 'young', 'song', 'enjoying', 'a', 'beautiful', 'day', 'outside', '.']
