In [24]:
import datasets
import spacy
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn
from collections import Counter
from torch.utils.data import DataLoader

In [2]:
# Read the dataset using dataset.load_dataset()
dataset = datasets.load_dataset("data/Multi30k_HuggingFace")
train_set, val_set, test_set = dataset['train'], dataset['validation'], dataset['test']
train_set[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [3]:
# Use tokenizer from spacy
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [4]:
# Build the token frequency dict, ignore tokens with low frequency
en_token_dict = Counter()
de_token_dict = Counter()
unk, pad, sos, eos = '<unk>', '<pad>', '<sos>', '<eos>'
special_tokens = [unk, pad, sos, eos]
min_freq = 2

for example in train_set:
    en_tokens = [token.text.lower() for token in en_nlp.tokenizer(example['en'])]
    de_tokens = [token.text.lower() for token in de_nlp.tokenizer(example['de'])]
    en_token_dict.update(en_tokens)
    de_token_dict.update(de_tokens)

# No need to keep track of the frequency
en_token_dict = [k for (k, v) in en_token_dict.items() if v >= min_freq]
en_token_dict = special_tokens + en_token_dict
en_token_dict = {value: index for (index, value) in enumerate(en_token_dict)}

de_token_dict = [k for (k, v) in de_token_dict.items() if v >= min_freq]
de_token_dict = special_tokens + de_token_dict
de_token_dict = {value: index for (index, value) in enumerate(de_token_dict)}

In [5]:
# Create token list and token IDs for each sentence in the dataset
def tokenize_example(example, en_nlp, de_nlp, sos, eos):
    en_tokens, de_tokens = [], []
    en_ids, de_ids = [], []
    for token in en_nlp.tokenizer(example['en']):
        token = token.text.lower()
        if token not in en_token_dict:
            token = unk

        en_tokens.append(token)
        en_ids.append(en_token_dict[token])
    
    # input only needs eos token
    en_tokens = en_tokens + [eos]
    en_ids = en_ids + [en_token_dict[eos]]

    for token in de_nlp.tokenizer(example['de']):
        token = token.text.lower()
        if token not in de_token_dict:
            token = unk
            
        de_tokens.append(token)
        de_ids.append(de_token_dict[token])

    # output needs both sos and eos tokens
    de_tokens = [sos] + de_tokens + [eos]
    de_ids = [de_token_dict[sos]] + de_ids + [de_token_dict[eos]] 

    example['en_tokens'] = en_tokens
    example['en_ids'] = en_ids
    example['de_tokens'] = de_tokens
    example['de_ids'] = de_ids

    return example


In [6]:
fn_kwargs = {
    'en_nlp': en_nlp,
    'de_nlp': de_nlp,
    'sos': sos,
    'eos': eos,
}
train_set = train_set.map(tokenize_example, fn_kwargs=fn_kwargs)
val_set = val_set.map(tokenize_example, fn_kwargs=fn_kwargs)
test_set = test_set.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
print(train_set[0]['en'])
print(train_set[0]['en_tokens'])
print(train_set[0]['en_ids'])
print(train_set[0]['de'])
print(train_set[0]['de_tokens'])
print(train_set[0]['de_ids'])

Two young, White males are outside near many bushes.
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>']
[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 3]
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
['<sos>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.', '<eos>']
[2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 3]


In [8]:
# Write a collate_fn to pad sequences with variable length into a batch of tensors for Dataloader
def get_collate_fn(pad_index):
    def collate_fn(batch):
        # pad each sequence in the batch to the same length
        en_ids = [sequence['en_ids'] for sequence in batch]
        de_ids = [sequence['de_ids'] for sequence in batch]
        en_ids = rnn.pad_sequence(en_ids, padding_value=pad_index)
        de_ids = rnn.pad_sequence(de_ids, padding_value=pad_index)

        for i in range(len(en_ids)-1):
            print(len(en_ids[i]) == len(en_ids[i+1]))
        
        for i in range(len(de_ids)-1):
            print(len(de_ids[i]) == len(de_ids[i+1]))

        return en_ids, de_ids


In [31]:
en_ids = [torch.tensor(s) for s in train_set[:10]['en_ids']]
pad_en_ids = rnn.pad_sequence(en_ids, padding_value=0, batch_first=True)
print(en_ids)
print(pad_en_ids)

[tensor([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  3]), tensor([15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  3]), tensor([21, 25, 26, 27, 28, 21, 29, 30, 14,  3]), tensor([21, 31, 17, 21, 32, 33, 34, 35, 36, 21, 37, 38, 21, 39, 14,  3]), tensor([ 4, 16,  9, 40, 41, 42, 43, 44, 14,  3]), tensor([21, 31, 17, 45, 46, 21, 47, 48, 41, 49, 31, 50, 51, 33, 14,  3]), tensor([21, 31, 34, 52, 40, 21, 53, 54,  3]), tensor([21, 55, 26, 56, 36, 57, 58, 48, 59, 60, 61, 41, 62, 14,  3]), tensor([21, 63, 64, 21, 65, 66, 34, 67, 68, 21, 69, 14,  3]), tensor([70, 71, 36, 72, 17, 41, 73, 74, 41, 75, 14,  3])]
tensor([[ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  3,  0,  0,  0,  0],
        [15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  3,  0,  0,  0],
        [21, 25, 26, 27, 28, 21, 29, 30, 14,  3,  0,  0,  0,  0,  0,  0],
        [21, 31, 17, 21, 32, 33, 34, 35, 36, 21, 37, 38, 21, 39, 14,  3],
        [ 4, 16,  9, 40, 41, 42, 43, 44, 14,  3,  0,  0,  0,  0,  0,  0],
        [21, 31, 17, 45, 

In [9]:
pad_idx = en_token_dict[pad]
collate_fn = get_collate_fn(pad_idx)
train_dl = DataLoader(train_set, collate_fn=collate_fn, batch_size=64, shuffle=True)

In [None]:
# for dl in train_dl:
#     en, de = dl
#     print(en.size())
#     print(de.size())
#     break

RuntimeError: each element in list of batch should be of equal size

In [None]:
class Encoder(nn.Module):
    def __init__(self, token_count, embedding_dim, rnn_hidden_dim, rnn_num_layers):
        super().__init__()
        # 1 layer Embedding
        # 2 layers GRU
        # the latent space is the same as the hidden space of the last layer of the GRU
        self.embedding = nn.Embedding(token_count, embedding_dim)
        self.encoder = nn.GRU(embedding_dim, rnn_hidden_dim, num_layers=rnn_num_layers, batch_first=True, bias=True)
    
    def forward(self, x):
        # hidden state at the last layer for every word in the sequence:
        #       batch, sequence, hidden_dim
        # final hidden state at every layer
        #       layer, batch, hidden_dim
        x = self.embedding(x)
        state_sequence, state_layer = self.encoder(x)
        return state_sequence, state_layer

In [None]:
class Decoder(nn.Module):
    def __init__(self, token_count, embedding_dim, rnn_num_layers):
        super().__init__()
        self.embedding = nn.Embedding(token_count, embedding_dim)
        self.decoder = nn.GRU(embedding_dim, token_count, num_layers=rnn_num_layers, batch_first=True, bias=True)
    
    def forward(self, x, latent):
        x = self.embedding(x)
        state_sequence, state_layer = self.decoder(x, latent)
        return state_sequence, state_layer


In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, token_count, embedding_dim, rnn_hidden_dim, rnn_num_layers):
        super().__init__()
        self.encoder = Encoder(token_count, embedding_dim, rnn_hidden_dim, rnn_num_layers)
        self.decoder = Decoder(token_count, embedding_dim, rnn_num_layers)
    
    def forward(self, en, de):
        z, _ = self.encoder(en)
        z = z[:,-1,:]
        self.decoder()


