In [36]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import pickle as pkl

# Load tokenized dataset

In [37]:
# Directory that holds the pickled train/validation splits.
dataset_dir_path = '../../data/processed/tokenized_data/'

# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project's own preprocessing step.
train_pickle_path = dataset_dir_path + 'train_data.pkl'
valid_pickle_path = dataset_dir_path + 'valid_data.pkl'

with open(train_pickle_path, 'rb') as train_file:
    tokenized_train_data = pkl.load(train_file)

with open(valid_pickle_path, 'rb') as valid_file:
    tokenized_valid_data = pkl.load(valid_file)

In [38]:
# Peek at the first training sample: element 0 is printed as the de input,
# element 1 as the en output.
first_sample = tokenized_train_data[0]
print(f'Input(de) {first_sample[0]}')
print(f'Output(en) {first_sample[1]}')

Input(de) tensor([   2,   21,   85,  256,   31,   86,   22,   93,    7,   16,  114, 5645,
        3245,    3])
Output(en) tensor([   2,   19,   25,   15, 1197,  817,   17,   58,   84,  332, 1319,    3])


# Load vocab

In [39]:
# Directory that holds the pickled vocabulary mappings.
vocab_dir_path = '../../data/processed/vocab/'


def _load_vocab_pickle(file_name):
    """Unpickle and return the object stored in ``vocab_dir_path + file_name``.

    NOTE: unpickling can execute arbitrary code; only use it on files created
    by this project's own preprocessing.
    """
    with open(vocab_dir_path + file_name, 'rb') as vocab_file:
        return pkl.load(vocab_file)


# token -> index and index -> token mappings for the de and en vocabularies.
token2idx_de = _load_vocab_pickle('token2idx_de.pkl')
token2idx_en = _load_vocab_pickle('token2idx_en.pkl')
idx2token_de = _load_vocab_pickle('idx2token_de.pkl')
idx2token_en = _load_vocab_pickle('idx2token_en.pkl')

# Making the batches

In [40]:
# Ids of the special tokens, looked up in the loaded vocabularies.
# NOTE(review): the pad id is read from the de vocab only, yet generate_batch
# pads the target (en) sequences with it as well — this assumes both vocabs
# map '<pad>' to the same id; TODO confirm.
START_INDEX = token2idx_en['<start>']
END_INDEX = token2idx_en['<end>']
PAD_INDEX = token2idx_de['<pad>']

# Number of sentence pairs per mini-batch.
batch_size = 128

In [41]:
def generate_batch(data_batch):
    """Collate ``(src, tgt)`` tensor pairs into two padded batch tensors.

    Args:
        data_batch: iterable of ``(src, tgt)`` pairs, e.g. the list of samples
            a ``DataLoader`` hands to its ``collate_fn``.

    Returns:
        Tuple ``(batch_src, batch_tgt)``. Each is produced by ``pad_sequence``
        with its default ``batch_first=False``, so token positions run along
        dim 0 and samples along dim 1; shorter sequences are filled with
        ``PAD_INDEX``.
    """
    # Consume the input exactly once, unpacking every item as a pair.
    pairs = [(src, tgt) for src, tgt in data_batch]

    padded_src = pad_sequence([src for src, _ in pairs], padding_value=PAD_INDEX)
    padded_tgt = pad_sequence([tgt for _, tgt in pairs], padding_value=PAD_INDEX)
    return padded_src, padded_tgt

In [42]:
# Training batches are reshuffled at every epoch so the model does not see the
# sentence pairs in a fixed order.
train_iter = DataLoader(tokenized_train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
# Validation only evaluates the model, so batch order carries no benefit;
# keeping it fixed makes evaluation runs reproducible and comparable.
valid_iter = DataLoader(tokenized_valid_data, batch_size=batch_size, shuffle=False, collate_fn=generate_batch)

In [43]:
# Show one batch from train_iter.
# Each column is one text; rows are token positions (pad_sequence default layout).
# Only the first batch is pulled: list(train_iter) would collate every batch
# of the training set just to display a single one.
next(iter(train_iter))

(tensor([[   2,    2,    2,  ...,    2,    2,    2],
         [   5,    5,   21,  ...,    5,   21,    5],
         [ 542, 4415,   31,  ...,   12,   31, 4183],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]),
 tensor([[   2,    2,    2,  ...,    2,    2,    2],
         [ 890,    6,   19,  ...,    6,   19,    6],
         [  11,  345,   36,  ...,   12,   36, 1529],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]))

# Model

In [44]:
import math
import torch
import torch.nn as nn
from torch import Tensor

## Token embedding

In [None]:
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(embedding_size)``."""

    def __init__(self, vocab_size, embedding_size):
        """Create the lookup table.

        Args:
            vocab_size: number of rows in the table (one per token index).
            embedding_size: length of each embedding vector.
        """
        super().__init__()
        self.embedding_size = embedding_size
        # nn.Embedding is a plain lookup table: given a token index it returns
        # the embedding vector stored for that index.
        self.embedding = nn.Embedding(vocab_size, embedding_size)

    def forward(self, tokens: Tensor):
        """Return the scaled embedding vectors for ``tokens``.

        ``tokens`` is cast to int64 before the lookup.
        """
        # Multiplying by sqrt(embedding_size) enlarges the embedding values so
        # that the positional encoding added to them is comparatively small
        # and the token information is not lost in the sum.
        scale = math.sqrt(self.embedding_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

In [46]:
# One embedding row per entry of the de vocabulary, 512 values per token.
token_embedding = TokenEmbedding(
    vocab_size=len(token2idx_de),
    embedding_size=512,
)

In [47]:
# Number of entries in the de vocabulary, i.e. the vocab_size given to TokenEmbedding above.
len(token2idx_de)

19620

In [49]:
# Sanity-check the embedding layer on a single training batch.
for src, tgt in train_iter:
    print(f'bached data shape : {src.shape[0]} words, {src.shape[1]} sentences')
    print(f'bached data {src}')
    # Call the module itself rather than .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    res = token_embedding(src)
    print(f'result shape : {res.shape}')
    print(f'result {res}')
    print(f'sqrt : {math.sqrt(token_embedding.embedding_size)}')
    # One batch is enough for this check.
    break

bached data shape : 29 words, 128 sentences
bached data tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,  21,  14,  ...,  14,   5,   5],
        [ 12,  27,  17,  ...,  17, 507, 310],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
result shape : torch.Size([29, 128, 512])
result tensor([[[-10.5732,   5.5090, -45.3570,  ...,  -2.8929,  -1.4875, -19.1218],
         [-10.5732,   5.5090, -45.3570,  ...,  -2.8929,  -1.4875, -19.1218],
         [-10.5732,   5.5090, -45.3570,  ...,  -2.8929,  -1.4875, -19.1218],
         ...,
         [-10.5732,   5.5090, -45.3570,  ...,  -2.8929,  -1.4875, -19.1218],
         [-10.5732,   5.5090, -45.3570,  ...,  -2.8929,  -1.4875, -19.1218],
         [-10.5732,   5.5090, -45.3570,  ...,  -2.8929,  -1.4875, -19.1218]],

        [[ -1.0904, -35.0455, -63.6212,  ...,  21.4234,  28.6437, -42.5667],
         [-22.8153, -11.8565,  -0.7828,  ...,  44.