In [1]:
import datasets
import numpy as np
import spacy
import random
import evaluate
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from typing import List, Dict
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [3]:
dataset = datasets.load_dataset("bentrevett/multi30k")
dataset



DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [4]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [5]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [6]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower=True, sos_token="<sos>", eos_token="<eos>"):
    en_tokens = [token.text for token in en_nlp(example['en'])][:max_length]
    de_tokens = [token.text for token in de_nlp(example['de'])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}


In [7]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

In [8]:
specials = {pad_token: 0, unk_token: 1, sos_token: 2, eos_token: 3}

class Vocab:
    def __init__(self, token_count, min_freq, specials: Dict[str, int]):
        self.token2idx: Dict[str:int] = {}
        for token, idx in specials.items():
            self.token2idx[token] = idx

        for token, cnt in token_count.items():
            if cnt >= min_freq and token not in self.token2idx:

                self.token2idx[token] = len(self.token2idx)

        self.idx2token = {idx: token for token, idx in self.token2idx.items()}

        self.token2idx = dict(sorted(self.token2idx.items(), key=lambda kv:kv[1]))
        self.idx2token = dict(sorted(self.idx2token.items()))

    @classmethod
    def build_vocab_from_iterator(cls, iterator: List[List[str]], min_freq, specials: Dict[str, int]):
        """token_count: dict[str: int]"""
        token_count = Counter([token for sentence in iterator for token in sentence ])
        return cls(token_count, min_freq, specials)

    def get_itos(self):
        """int to string"""
        return list(self.token2idx.keys())

    def get_stoi(self):
        """string to int"""
        return self.token2idx

    def lookup_indices(self, tokens):
        return [self[token] for token in tokens]

    def lookup_tokens(self, indices):
        return [self.idx2token[index] for index in indices]

    def __getitem__(self, token: str) -> int:
        return self.token2idx.get(token, self.token2idx['<unk>'])

    def __contains__(self, token: str) -> bool:
        return token in self.token2idx

    def __len__(self) -> int:
        return len(self.token2idx)


In [9]:
en_vocab = Vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=specials,
)

de_vocab = Vocab.build_vocab_from_iterator(
    train_data["de_tokens"],
    min_freq=min_freq,
    specials=specials,
)

In [10]:
tokens = ["i", "love", "watching", "crime", "shows"]

indices = en_vocab.lookup_indices(tokens)
print(indices)
en_vocab.lookup_tokens(indices)

[171, 4010, 225, 1, 1130]


['i', 'love', 'watching', '<unk>', 'shows']

In [11]:

def numericalize_example(example, en_vocab, de_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    de_ids = de_vocab.lookup_indices(example["de_tokens"])
    return {"en_ids": en_ids, "de_ids": de_ids}

In [12]:

fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)
train_data

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens', 'en_ids', 'de_ids'],
    num_rows: 29000
})

In [13]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(type=data_type, columns=format_columns, output_all_columns=True)
valid_data = valid_data.with_format(type=data_type, columns=format_columns, output_all_columns=True)
test_data = test_data.with_format(type=data_type, columns=format_columns, output_all_columns=True)


In [14]:
def get_collate_fn(pad_idx):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_idx)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_idx)

        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids
        }
        return batch
    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )
    return dataloader



In [15]:
batch_size = 128
pad_index = 0

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [16]:
class Encoder(nn.Module):
    def __init__(self, vocab_size_in, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.n_layers = n_layers
        self.embedding = nn.Embedding(vocab_size_in, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    def forward(self, src):
        """src (L, B)
            embeded (L, B, D)
            hidden (n_layers * n_directions, B, H)
            cell (n_layers * n_directions, B, H)
            outputs (L, B, H * n_directions)

        """

        embeded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embeded)

        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, vocab_size_out, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.vocab_size_out = vocab_size_out
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding =  nn.Embedding(num_embeddings=vocab_size_out, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, vocab_size_out)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, hidden, cell):
        """inputs (B)
           hidden (n_layers * n_directions, B, H)
           cell (n_layers * n_direction, B, H)
           """
        "(B), (1, B)"
        inputs = inputs.unsqueeze(0)
        "(1, B, D)"
        embeded = self.dropout(self.embedding(inputs))
        "output: (1, B, hidden)"
        "hidden: (n_layers * n_directions, B, H)"
        "cell: (n_layers * n_directions, B, H)"

        output, (hidden, cell) = self.rnn(embeded, (hidden, cell))

        "prediction: (B, vocab_size_out)"
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder  = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        """
        src: (L, B)
        trg: (L*, B)
        """
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.vocab_size_out

        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        """hidden: (n_layers * n_direction, B, L)
           cell: (n_layers * n_direction, B, L)
           output: (L*, B, vocab_size_out)
           """

        hidden, cell = self.encoder(src)
        "<sos> token (B)"
        inputs = trg[0,:]
        for t in range(1, trg_length):
            "output: (B, trg_vocab_size)"
            "random.random() 0~1"
            output, hidden, cell = self.decoder(inputs, hidden, cell)

            outputs[t] = output
            teacher_force:bool = random.random() < teacher_forcing_ratio
            "(B, trg_vocab_size) => (B)"
            top1 = output.argmax(dim=1)
            inputs = trg[t] if teacher_force else top1

        return outputs

In [17]:
vocab_size_in = len(de_vocab)
vocab_size_out = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    vocab_size_in=vocab_size_in, embedding_dim=encoder_embedding_dim,
    hidden_dim=hidden_dim, n_layers=n_layers, dropout=encoder_dropout
)

decoder = Decoder(
    vocab_size_out=vocab_size_out, embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim, n_layers=n_layers, dropout=decoder_dropout
)

model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters(),lr=0.001)

criterion = nn.CrossEntropyLoss(ignore_index=pad_index)




### Training loop

In [18]:
def train_fn(
        model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        "src [L, B]"
        "trg [L*, B]"
        src = batch['de_ids'].to(device)
        trg = batch['en_ids'].to(device)

        optimizer.zero_grad()
        "output: (L*, B, vocab_size_out)"
        output = model(src, trg, teacher_forcing_ratio)
        output_dim = output.shape[-1]
        "output: ((L*-1) * batch_size, vocab_size_out  )"
        output = output[1:].view(-1, output_dim) #start from <sos>
        "trg ((L*-1) * B)"
        trg = trg[1:].view(-1)
        "output: (N, C, d_1,...,d_k) or (N, C)"
        "trg: (N, d_1,...,d_k) or (N)"
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)


def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch['de_ids'].to(device)
        trg = batch['en_ids'].to(device)
        optimizer.zero_grad()
        "output: (L*, B, vocab_size_out)"
        output = model(src, trg, 0) #turn off teacher forcing
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)



In [None]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5
best_valid_loss = float("inf")

for epoch in tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device
    )

    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device
    )

    if valid_loss < best_valid_loss:
        torch.save(model.state_dict(), "tutl-model.pt")

    print(f"\tTrain loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")



 10%|█         | 1/10 [00:21<03:09, 21.00s/it]

	Train loss:   3.943 | Train PPL:  51.571
	Valid loss:   4.255 | Valid PPL:  70.470


 20%|██        | 2/10 [00:42<02:48, 21.02s/it]

	Train loss:   3.717 | Train PPL:  41.144
	Valid loss:   4.166 | Valid PPL:  64.479


 30%|███       | 3/10 [01:03<02:27, 21.08s/it]

	Train loss:   3.540 | Train PPL:  34.479
	Valid loss:   3.926 | Valid PPL:  50.716


 40%|████      | 4/10 [01:24<02:06, 21.11s/it]

	Train loss:   3.394 | Train PPL:  29.795
	Valid loss:   3.923 | Valid PPL:  50.549


 50%|█████     | 5/10 [01:45<01:45, 21.16s/it]

	Train loss:   3.280 | Train PPL:  26.574
	Valid loss:   3.926 | Valid PPL:  50.710


 60%|██████    | 6/10 [02:06<01:24, 21.15s/it]

	Train loss:   3.167 | Train PPL:  23.729
	Valid loss:   3.811 | Valid PPL:  45.178


 70%|███████   | 7/10 [02:28<01:03, 21.22s/it]

	Train loss:   3.086 | Train PPL:  21.879
	Valid loss:   3.794 | Valid PPL:  44.455


 80%|████████  | 8/10 [02:48<00:42, 21.06s/it]

	Train loss:   2.978 | Train PPL:  19.642
	Valid loss:   3.793 | Valid PPL:  44.367


 90%|█████████ | 9/10 [03:09<00:21, 21.01s/it]

	Train loss:   2.914 | Train PPL:  18.436
	Valid loss:   3.743 | Valid PPL:  42.235


100%|██████████| 10/10 [03:30<00:00, 21.07s/it]

	Train loss:   2.836 | Train PPL:  17.055
	Valid loss:   3.744 | Valid PPL:  42.253





In [19]:
model.load_state_dict(torch.load("tutl-model.pt"))
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 3.711 | Test PPL:  40.894 |


In [None]:
def translate_sentence(
        sentence: str,
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
        max_output_length=25
):
    model.eval()
    with torch.no_grad():
        tokens = [token.text for token in de_nlp(sentence)]

        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = de_vocab.lookup_indices(tokens)
        "(L) => (L,1)  1 is batch_size"
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device) #(1)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell) #output: [1, vocab_size]
            predicted_token = output.argmax(-1).item() #(1)
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = en_vocab.lookup_tokens(inputs)
        return tokens




In [41]:
sentence = test_data[0]['de']
expected_translation = test_data[0]['en']
sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [48]:
translation = translate_sentence(
    sentence=sentence,
    model=model,
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    en_vocab=en_vocab,
    de_vocab=de_vocab,
    lower=True,
    sos_token=sos_token,
    eos_token=eos_token,
    device=device
)
translation

['<sos>', 'a', 'man', 'sitting', 'on', 'a', 'bench', '.', '<eos>']

In [49]:
sentence = "Ein Mann sitzt auf einer Bank."
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)
translation

['<sos>', 'a', 'man', 'sitting', 'on', 'a', 'bench', '.', '<eos>']

In [51]:
translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm(test_data)
]

100%|██████████| 1000/1000 [00:06<00:00, 161.06it/s]


In [52]:
bleu = evaluate.load("bleu")

Downloading builder script: 5.94kB [00:00, 1.99MB/s]
Downloading extra modules: 4.07kB [00:00, 2.83MB/s]                   
Downloading extra modules: 3.34kB [00:00, 2.99MB/s]


In [58]:
predictions = [" ".join(translation[1:-1]) for translation in translations]

references = [example["en"] for example in test_data]

In [61]:
predictions[1], references[1]

('a brown and white dog is running through a field of a large white . .',
 'A Boston Terrier is running on lush green grass in front of a white fence.')

In [62]:
def get_tokenizer_fn(nlp, lower):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens
    return tokenizer_fn

In [64]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

tokenizer_fn(predictions[0]), tokenizer_fn(references[0])

(['a', 'man', 'wearing', 'a', 'hat', 'hat', 'is', 'his', '.', '.'],
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.'])

In [65]:
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

In [66]:
results

{'bleu': 0.16522427300599896,
 'precisions': [0.5139862894964936,
  0.2273543751603798,
  0.11411467589561314,
  0.06273862346507068],
 'brevity_penalty': 0.9714959966565813,
 'length_ratio': 0.9718946239852964,
 'translation_length': 12691,
 'reference_length': 13058}