In [1]:
import os
import random
import numpy as np

import functools

import torch
import torch.nn as nn

from torchtext import datasets
from torchtext.data import Field
from torchtext.data import BucketIterator


SEED = 241

In [2]:
def seed_everything(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)

  if torch.cuda.is_available(): 
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [3]:
import spacy

!python -m spacy download de --quiet
!python -m spacy download en --quiet

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

[K     |████████████████████████████████| 14.9MB 3.4MB/s 
[?25h  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [4]:
def tokenize_de(text):
  return [token.text for token in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
  return [token.text for token in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(lower=True,
            use_vocab=True, 
            sequential=True, 
            init_token='<sos>',
            eos_token='<eos>',
            batch_first=True,
            include_lengths=True,
            tokenize=tokenize_de)

TRG = Field(lower=True, 
            use_vocab=True,
            sequential=True,
            init_token='<sos>',
            eos_token='<eos>',
            batch_first=True,
            include_lengths=True,
            tokenize=tokenize_en)

In [6]:
train_data, valid_data, test_data = datasets.Multi30k.splits(exts=('.de', '.en'),
                                                             fields=[('de', SRC), ('en', TRG)])

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.02MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 267kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 258kB/s]


In [7]:
SRC.build_vocab(train_data)
TRG.build_vocab(train_data)

In [8]:
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


train_iterator = BucketIterator.splits((train_data,), batch_size=batch_size, device=device)[0]
valid_iterator = BucketIterator.splits((valid_data,), batch_size=batch_size, device=device)[0]
test_iterator = BucketIterator.splits((test_data,), batch_size=batch_size, device=device)[0]

In [9]:
class Config:
  def __init__(self,
               vocab_size,
               emb_size,
               hidden_size,
               num_layers, 
               dropout, 
               pad_index,
               device):
    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = dropout
    self.pad_index = pad_index
    self.device = device

In [10]:
class Encoder(nn.Module):

  def __init__(self, config):
    super().__init__()
    self.embedding = nn.Embedding(config.vocab_size,
                                  config.emb_size, 
                                  padding_idx=config.pad_index)
    self.dropout = nn.Dropout(config.dropout)
    self.rnn = nn.LSTM(config.emb_size,
                       config.hidden_size, 
                       config.num_layers, 
                       batch_first=True, 
                       dropout=config.dropout)
    self.device = config.device

  def forward(self, text, text_lens):
    text_embedded = self.dropout(self.embedding(text))
    packed_sequence = nn.utils.rnn.pack_padded_sequence(text_embedded, 
                                                        text_lens,
                                                        batch_first=True, 
                                                        enforce_sorted=False)
    
    packed_outputs, (hidden, cell) = self.rnn(packed_sequence)
    outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
    # hidden = hidden.permute(1, 0, 2)
    # cell = cell.permute(1, 0, 2)
    return hidden, cell

In [11]:
class Decoder(nn.Module):

  def __init__(self, config):
    super().__init__()
    self.embedding = nn.Embedding(config.vocab_size, 
                                  config.emb_size, 
                                  padding_idx=config.pad_index)
    self.dropout = nn.Dropout(config.dropout)
    self.rnn = nn.LSTM(config.emb_size,
                       config.hidden_size,
                       config.num_layers,
                       batch_first=True, 
                       dropout=config.dropout)
    self.device = config.device
    self.output_dim = config.vocab_size
    self.output = nn.Linear(config.hidden_size, config.vocab_size)

  def forward(self, trg_input, hidden, cell):
    trg_input = trg_input.unsqueeze(1)
    text_embedded = self.dropout(self.embedding(trg_input))
    outputs, (hidden, cell) = self.rnn(text_embedded, (hidden, cell))
    outputs = outputs.squeeze(1)
    return self.output(outputs), hidden, cell

In [12]:
class Seq2Seq(nn.Module):

  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, 
              src_text, 
              src_text_lens,
              trg_text,
              trg_text_lens, 
              teacher_forcing_ratio=0.5):
    batch_size = src_text.size(0)
    max_trg_seq_len = trg_text.size(1)
    trg_vocab_size = self.decoder.output_dim

    outputs = torch.zeros(batch_size, max_trg_seq_len, trg_vocab_size, device=self.device)
    
    hidden, cell = self.encoder(src_text, src_text_lens)
    trg_input = trg_text[:, 0]

    for t in range(1, max_trg_seq_len):
      
      dec_output, hidden, cell = self.decoder(trg_input, hidden, cell)
      outputs[:, t, :] = dec_output

      teacher_force = random.random() < teacher_forcing_ratio

      top1 = dec_output.argmax(dim=1)
      trg_input = trg_text[:, t] if teacher_force else top1
      
    return outputs

In [13]:
encoder_config = Config(vocab_size=len(SRC.vocab),
                        emb_size=512,
                        hidden_size=256,
                        num_layers=2,
                        dropout=0.3,
                        pad_index=SRC.vocab.stoi[SRC.pad_token],
                        device=device)

decoder_config = Config(vocab_size=len(TRG.vocab),
                        emb_size=512,
                        hidden_size=256,
                        num_layers=2,
                        dropout=0.3,
                        pad_index=TRG.vocab.stoi[TRG.pad_token],
                        device=device)

encoder = Encoder(encoder_config)
decoder = Decoder(decoder_config)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=decoder_config.pad_index).to(device)
optimizer = torch.optim.Adam(seq2seq.parameters())

In [14]:
def train_epoch(model, iterator, optimizer, criterion):
  model.train()

  error = 0.
  for batch in iterator:
    optimizer.zero_grad()
    
    src_text, src_text_lens = batch.de
    src_text_lens = src_text_lens.cpu()
    trg_text, trg_text_lens = batch.en
    trg_text_lens = trg_text_lens.cpu()

    outputs = seq2seq(src_text, src_text_lens, trg_text, trg_text_lens)

    outputs = outputs[:, 1:, :]
    trg_text = trg_text[:, 1:]

    # outputs = outputs.contiguous()
    # trg_text = trg_text.contiguous()

    batch_size, seq_len, output_dim = outputs.shape

    outputs = outputs.reshape(batch_size * seq_len, output_dim)
    trg_text = trg_text.reshape(-1)

    loss = criterion(outputs, trg_text)
    loss.backward()
    optimizer.step()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    error += loss.detach().cpu().numpy()

  return error / len(iterator)


def valid_epoch(model, iterator, criterion):
  model.eval()

  error = 0.
  with torch.no_grad():
    for batch in iterator:
      
      src_text, src_text_lens = batch.de
      src_text_lens = src_text_lens.cpu()
      trg_text, trg_text_lens = batch.en
      trg_text_lens = trg_text_lens.cpu()

      outputs = seq2seq(src_text, src_text_lens, trg_text, trg_text_lens)

      outputs = outputs[:, 1:, :]
      trg_text = trg_text[:, 1:]

      # outputs = outputs.contiguous()
      # trg_text = trg_text.contiguous()

      batch_size, seq_len, output_dim = outputs.shape

      outputs = outputs.reshape(batch_size * seq_len, output_dim)
      trg_text = trg_text.reshape(-1)

      loss = criterion(outputs, trg_text)

      error += loss.detach().cpu().numpy()

  return error / len(iterator)

In [15]:
for i in range(15):
  train_error = train_epoch(seq2seq, train_iterator, optimizer, criterion)
  valid_error = valid_epoch(seq2seq, valid_iterator, criterion)
  print(f'Epoch: {i + 1}, Train Error: {train_error}, Valid Error {valid_error}')

Epoch: 1, Train Error: 4.600419607551516, Valid Error 3.956821009516716
Epoch: 2, Train Error: 3.835227196902661, Valid Error 3.678058870136738
Epoch: 3, Train Error: 3.5293373980096314, Valid Error 3.476690746843815
Epoch: 4, Train Error: 3.3301586492321853, Valid Error 3.347179137170315
Epoch: 5, Train Error: 3.164411359217096, Valid Error 3.313545249402523
Epoch: 6, Train Error: 3.027661863209263, Valid Error 3.1964599937200546
Epoch: 7, Train Error: 2.911147862048501, Valid Error 3.1830659583210945
Epoch: 8, Train Error: 2.8123396578539563, Valid Error 3.1906793415546417
Epoch: 9, Train Error: 2.72774759174839, Valid Error 3.1143994107842445
Epoch: 10, Train Error: 2.6405388830515095, Valid Error 3.1161623001098633
Epoch: 11, Train Error: 2.577104933338102, Valid Error 3.115902528166771
Epoch: 12, Train Error: 2.495716616494916, Valid Error 3.16264209151268
Epoch: 13, Train Error: 2.4386067537646308, Valid Error 3.1543151289224625
Epoch: 14, Train Error: 2.3780244065750527, Valid E

In [62]:
def translate_sentence(sentence, model, SRC, TRG, tokenizer, device, max_len=50):
  tokens = tokenizer(sentence)
  tokens = [SRC.init_token] + tokens + [TRG.eos_token]
  sentence_len = torch.LongTensor([len(tokens)])
  tokens_ids = [SRC.vocab.stoi[token] for token in tokens]
  token_tensor = torch.LongTensor([tokens_ids]).to(device)
  decoder_input = torch.LongTensor([TRG.vocab.stoi[TRG.init_token]]).to(device)

  pred_tokens = [TRG.vocab.stoi[TRG.init_token]]

  encoder = model.encoder
  decoder = model.decoder

  with torch.no_grad():
    hidden, cell = encoder(token_tensor, sentence_len)

  curr_len = 0
  while pred_tokens[-1] != TRG.vocab.stoi[TRG.eos_token] and pred_tokens[-1] != TRG.vocab.stoi['.'] and curr_len < max_len:
    with torch.no_grad():
      prediction, hidden, cell = decoder(decoder_input, hidden, cell)
    prediction = prediction.detach().cpu().numpy()
    pred_token_idx = np.argmax(prediction[0])
    pred_tokens.append(pred_token_idx)
    curr_len += 1
  result = ' '.join([TRG.vocab.itos[token] for token in pred_tokens[1:]])
  return result

In [69]:
print(translate_sentence('ein schwarzer hund und ein gefleckter hund kämpfen .', seq2seq, SRC, TRG, tokenize_de, device))

a black black dog dog dog dog dog dog .
