In [1]:
import datasets
import spacy
from collections import Counter
import torch.nn as nn

In [2]:
# Load the Multi30k dataset from the local HuggingFace-format directory
# and pull out its three splits by name.
dataset = datasets.load_dataset("data/Multi30k_HuggingFace")
train_set = dataset['train']
val_set = dataset['validation']
test_set = dataset['test']
# Show the first training pair as the cell output
train_set[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [3]:
# Load the spaCy pipelines (English, then German) whose tokenizers are used below
en_nlp, de_nlp = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "de_core_news_sm")
)

In [5]:
# Count how often every lower-cased token occurs in the training split.
# The counts are used later to drop tokens that are too rare.
en_token_dict, de_token_dict = Counter(), Counter()

for example in train_set:
    en_token_dict.update(
        token.text.lower() for token in en_nlp.tokenizer(example['en'])
    )
    de_token_dict.update(
        token.text.lower() for token in de_nlp.tokenizer(example['de'])
    )

In [10]:
# Special tokens inserted during tokenization / vocabulary construction.
sos = '<sos>'  # start-of-sequence marker
eos = '<eos>'  # end-of-sequence marker
unk = '<unk>'  # stand-in for rare tokens (was '<unk', missing the closing '>')
pad = '<pad>'  # padding token
# Tokens seen fewer than this many times in the training split become `unk`
min_freq = 2

def tokenize_example(example, en_nlp, de_nlp, sos, eos, unk, min_freq):
    """Add lower-cased token lists for the 'en' and 'de' text of one example.

    Every token is lower-cased; a token whose count in the module-level
    ``en_token_dict`` / ``de_token_dict`` is below ``min_freq`` is replaced
    by ``unk``.

    Args:
        example: mapping with the raw sentences under 'en' and 'de'.
        en_nlp, de_nlp: spaCy pipelines; only their ``tokenizer`` is used.
        sos, eos, unk: special token strings.
        min_freq: minimum training-set count for a token to be kept.

    Returns:
        The same ``example`` object with two added keys:
        'en_tokens' (tokens + eos) and 'de_tokens' (sos + tokens + eos).
    """
    def _lowered_or_unk(nlp, text, freq_table):
        # Keep frequent tokens (lower-cased); swap the rest for `unk`.
        kept = []
        for token in nlp.tokenizer(text):
            word = token.text.lower()
            if freq_table.get(word, 0) >= min_freq:
                kept.append(word)
            else:
                kept.append(unk)
        return kept

    source_tokens = _lowered_or_unk(en_nlp, example['en'], en_token_dict)
    target_tokens = _lowered_or_unk(de_nlp, example['de'], de_token_dict)

    # The input side only needs the end marker
    example['en_tokens'] = source_tokens + [eos]
    # The output side needs both the start and the end marker
    example['de_tokens'] = [sos] + target_tokens + [eos]

    return example


In [11]:
# Keyword arguments forwarded unchanged to tokenize_example for every example
fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    sos=sos,
    eos=eos,
    unk=unk,
    min_freq=min_freq,
)

# Tokenize each split in turn (train, validation, test)
train_set, val_set, test_set = (
    split.map(tokenize_example, fn_kwargs=fn_kwargs)
    for split in (train_set, val_set, test_set)
)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# Sanity check: raw text next to its token list, for both languages
first_example = train_set[0]
for field in ('en', 'en_tokens', 'de', 'de_tokens'):
    print(first_example[field])

Two young, White males are outside near many bushes.
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>']
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
['<sos>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', '<unk', 'büsche', '.', '<eos>']


In [12]:
# Vocabulary: the special tokens first, followed by every training token that
# occurs at least `min_freq` times. Previously the filtered lists were built
# and then overwritten with *all* counted tokens, so the filter had no effect.
# `unk` is placed at index 0, the fallback id that lookup_tokenids uses for
# tokens missing from the vocabulary.
en_vocab = [unk, pad, sos, eos] + [k for (k, v) in en_token_dict.items() if v >= min_freq]
de_vocab = [unk, pad, sos, eos] + [k for (k, v) in de_token_dict.items() if v >= min_freq]

In [None]:
def lookup_tokenids(example, en_vocab, de_vocab):
    """Convert the token lists of one example into lists of integer ids.

    Args:
        example: mapping holding 'en_tokens' and 'de_tokens' (lists of tokens).
        en_vocab, de_vocab: either a sequence of tokens, where a token's id is
            the position of its first occurrence, or a ready-made
            ``{token: id}`` dict. Passing a dict avoids rebuilding the index
            on every call.

    Returns:
        The same ``example`` object with 'en_ids' and 'de_ids' added.
        Tokens that are not in the vocabulary get id 0.
    """
    def _as_index(vocab):
        # A dict is already a token -> id mapping; use it directly.
        if isinstance(vocab, dict):
            return vocab
        # Build the mapping once instead of scanning the list twice per token.
        # setdefault keeps the first position, matching list.index().
        index = {}
        for position, token in enumerate(vocab):
            index.setdefault(token, position)
        return index

    en_index = _as_index(en_vocab)
    de_index = _as_index(de_vocab)

    example['en_ids'] = [en_index.get(w, 0) for w in example['en_tokens']]
    example['de_ids'] = [de_index.get(w, 0) for w in example['de_tokens']]

    return example

In [None]:
# Attach 'en_ids' / 'de_ids' to every split, using the same vocabularies each time
id_lookup_kwargs = {'en_vocab': en_vocab, 'de_vocab': de_vocab}
train_set = train_set.map(lookup_tokenids, fn_kwargs=id_lookup_kwargs)
val_set = val_set.map(lookup_tokenids, fn_kwargs=id_lookup_kwargs)
test_set = test_set.map(lookup_tokenids, fn_kwargs=id_lookup_kwargs)

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a (multi-layer) GRU.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        embedding_dim: size of each embedding vector.
        rnn_hidden_dim: GRU hidden size.
        rnn_num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_dim, embedding_dim, rnn_hidden_dim, rnn_num_layers):
        super().__init__()
        # The latent space is the hidden space of the last GRU layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.encoder = nn.GRU(embedding_dim, rnn_hidden_dim, num_layers=rnn_num_layers, batch_first=True, bias=True)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: integer token ids, (batch, sequence) since the GRU is batch_first.

        Returns:
            state_sequence: last-layer hidden state for every position,
                (batch, sequence, hidden_dim).
            state_layer: final hidden state of every layer,
                (layer, batch, hidden_dim).
        """
        # Embed exactly once: nn.Embedding expects integer ids, so passing the
        # already-embedded float tensor through it a second time raises.
        embedded = self.embedding(x)
        state_sequence, state_layer = self.encoder(embedded)
        return state_sequence, state_layer

In [None]:
class Decoder(nn.Module):
    """Embedding layer followed by a GRU, laid out like ``Encoder``.

    Args:
        output_dim: number of rows in the embedding table (target vocabulary size).
        embedding_dim: size of each embedding vector.
        rnn_hidden_dim: GRU hidden size.
        rnn_num_layers: number of stacked GRU layers (default 1). It must equal
            the first dimension of any ``hidden`` tensor passed to ``forward``.

    NOTE(review): no projection from the GRU output to vocabulary logits is
    defined here; add one if this module is meant to emit token scores.
    """

    def __init__(self, output_dim, embedding_dim, rnn_hidden_dim, rnn_num_layers=1):
        super().__init__()
        # nn.Embedding takes (num_embeddings, embedding_dim): table size first.
        # The arguments were previously swapped.
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        # nn.GRU takes (input_size, hidden_size): it consumes embedding vectors.
        # The arguments were previously swapped. batch_first matches Encoder.
        self.decoder = nn.GRU(embedding_dim, rnn_hidden_dim, num_layers=rnn_num_layers, batch_first=True)

    def forward(self, x, hidden=None):
        """Run the decoder GRU over a batch of token ids.

        Args:
            x: integer token ids, (batch, sequence).
            hidden: optional initial hidden state, (layer, batch, hidden_dim);
                when omitted the GRU starts from zeros.

        Returns:
            state_sequence: (batch, sequence, hidden_dim).
            state_layer: (layer, batch, hidden_dim).
        """
        embedded = self.embedding(x)
        state_sequence, state_layer = self.decoder(embedded, hidden)
        return state_sequence, state_layer

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = Decoder()
        
    
    def forward(self, x):
