In [1]:
!pip install -U torchdata
!pip install -U torchtext
!pip install -U spacy
!pip install sacrebleu
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 7.2 MB/s 
Collecting portalocker>=2.0.0
  Downloading portalocker-2.6.0-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 67.1 MB/s 
[?25hCollecting torch==1.13.0
  Downloading torch-1.13.0-cp37-cp37m-manylinux1_x86_64.whl (890.2 MB)
[K     |██████████████████████████████  | 834.1 MB 1.2 MB/s eta 0:00:49tcmalloc: large alloc 1147494400 bytes == 0x34e4000 @  0x7f449fc29615 0x58ead6 0x4f355e 0x4d222f 0x51041f 0x5b4ee6 0x58ff2e 0x510325 0x5b4ee6 0x58ff2e 0x50d482 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4bac0a 0x538a76 0x590ae5 0x510280 0x5b4ee6 0x58ff2e 0x50d482 0x5b4ee6 0x58ff2e 0x50c4fc 0x58fd37

##Data Sourcing and Processing

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List, Tuple


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}


# Create source and target language tokenizer. Make sure to install the dependencies.
# pip install -U torchdata
# pip install -U spacy
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)


##Seq2Seq Network using Transformer

In [3]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

##Hide source and target padding tokens using masks

In [4]:
def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src: Tensor, tgt: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

##Define and instatiate the parameters of the model and the loss function

In [5]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

##Collation function

In [6]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

##Define training and evaluation loop

In [7]:
from torch.utils.data import DataLoader

def train_epoch(model: torch.nn.Module, optimizer: torch.optim) -> float:
    """
    Train the model

    Parameters:
      model: the model to train
      optimizer: optimization method
    Returns:
      The training loss
    """
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
    return losses / len(list(train_dataloader))


def evaluate(model: torch.nn.Module) -> float:
    """
    Evaluate the model

    Parameters:
      model: the model to evaluate
    Returns:
      The evaluation loss
    """
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

##Train model

In [8]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


Epoch: 1, Train loss: 5.344, Val loss: 4.114, Epoch time = 44.242s
Epoch: 2, Train loss: 3.761, Val loss: 3.320, Epoch time = 41.208s
Epoch: 3, Train loss: 3.162, Val loss: 2.895, Epoch time = 41.728s
Epoch: 4, Train loss: 2.768, Val loss: 2.639, Epoch time = 42.290s
Epoch: 5, Train loss: 2.481, Val loss: 2.441, Epoch time = 42.120s
Epoch: 6, Train loss: 2.250, Val loss: 2.317, Epoch time = 41.654s
Epoch: 7, Train loss: 2.060, Val loss: 2.202, Epoch time = 41.998s
Epoch: 8, Train loss: 1.897, Val loss: 2.113, Epoch time = 42.037s
Epoch: 9, Train loss: 1.755, Val loss: 2.062, Epoch time = 41.538s
Epoch: 10, Train loss: 1.631, Val loss: 2.004, Epoch time = 42.009s
Epoch: 11, Train loss: 1.524, Val loss: 1.973, Epoch time = 41.621s
Epoch: 12, Train loss: 1.419, Val loss: 1.941, Epoch time = 42.419s
Epoch: 13, Train loss: 1.333, Val loss: 1.968, Epoch time = 41.877s
Epoch: 14, Train loss: 1.252, Val loss: 1.943, Epoch time = 41.574s
Epoch: 15, Train loss: 1.173, Val loss: 1.932, Epoch time

## Greedy decoding

- Greedy decoding is by choosing the most probable token at any stage. 
- This method is effective in its ease of use but the shor-term most optimal solution is hardly the most optimal long term. (sentence integrity, semantic is not preserved as well)

In [9]:
def greedy_decode(model: torch.nn.Module, src: torch.Tensor, src_mask: torch.Tensor, max_len: int, start_symbol: int) -> torch.Tensor:
  """
  Generate output sequence using greedy algorithm
  
  Parameters:
    model: the transformer model
    src: the input
    src_mask: the input's mask
    max_len: maximum length of the input
    start_symbol: the start symbol of the input
  """
  src = src.to(DEVICE)
  src_mask = src_mask.to(DEVICE)

  memory = model.encode(src, src_mask)
  ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
  for i in range(max_len-1):
      memory = memory.to(DEVICE)
      tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                  .type(torch.bool)).to(DEVICE)
      out = model.decode(ys, memory, tgt_mask)
      out = out.transpose(0, 1)
      prob = model.generator(out[:, -1])
      _, next_word = torch.max(prob, dim=1)
      next_word = next_word.item()

      ys = torch.cat([ys,
                      torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
      if next_word == EOS_IDX:
          break
  return ys

def translate(model: torch.nn.Module, src_sentence: str) -> str:
    """
    Translate input sentence into target language

    Parameters:
      model: the transformer model
      src_sentence: the input sentence
    Returns:
      The translated sentence
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

#Decoding functions (5 points)

## Top-K sampling (without temperature)

- At a given stage, selects the most-k probable next token.
- Redistribute the probability of each of the token based on its current probability.
- Randomly sample a token based on its weight and put it as a next token.

In [46]:
import random
import numpy as np

def top_k(model: torch.nn.Module, src: torch.Tensor, src_mask: torch.Tensor, max_len: int, start_symbol: int, k=2) -> torch.Tensor:
    """
    Function to generate output sequence using Top-K sampling (without temperature)

    Parameters:
      model: the transformer model
      src: the input
      src_mask: the input's mask
      max_len: the maximum length of the input
      start_symbol: the start symbol of the input
      k: the number of top tokens chosen (2 by default)
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        next_probs, next_words = torch.topk(prob, k, dim=1)
        distribution = next_probs.cpu().detach().numpy()
        norm = [float(i)/sum(distribution[0]) for i in distribution[0]]
        sampleNumbers = np.random.choice(list(range(k)), 1, p=norm)
        # added weights to random function
        index = sampleNumbers[0]
        next_word = next_words[0][index].item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

def translate_topk(model: torch.nn.Module, src_sentence: str, k=4) -> str:
    """
    Translate input sentence into target language
    Parameters:
      model: the model
      src_sentence: the input sentence
      k: the number of top tokens chosen (4 by default)
    Returns:
      The translated sentence
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = top_k(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, k=k).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

## Beam search

- In the beginning, keep the top-K most probable token and compute its score by timing its probability.
- Next stage, compute the score from the top-K most probable token previously with its top-K and at the end only keep in totality the top-K highest score output.
- Keep going until a hypothesis reaches the end and remove that from the pool of candidate and minus k by one.

In [39]:
import torch.nn as nn
import bisect
softmax = nn.Softmax(dim=1)

def beam_search(model: torch.nn.Module, src: torch.Tensor, src_mask: torch.Tensor, max_len: int, start_symbol: int, beams=2) -> torch.Tensor:
    """
    Function to generate output sequence using Top-K sampling (without temperature)

    Parameters:
      model: the transformer model
      src: the input
      src_mask: the input's mask
      max_len: the maximum length of the input
      start_symbol: the start symbol of the input
      beams: the size of beam (2 by default)
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    # negate the probability to have the right ordering since this is a minheap
    queue = [(-1, ys)]
    res = []
    fixed_beams = beams
    while queue and len(res) != fixed_beams:
        curr_prob, curr_ys = queue.pop(0)
        curr_prob = -curr_prob
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(curr_ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(curr_ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        prob = softmax(prob)
        next_probs, next_words = torch.topk(prob, beams, dim=1, largest=True)
        for i in range(beams):
            next_prob = (next_probs[0][i].item() * curr_prob) / (curr_ys.shape[0] + 1)
            next_ys = torch.cat([curr_ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_words[0][i])], dim=0)
            if next_words[0][i].item() == EOS_IDX:
                beams -= 1
                res.append((next_prob,next_ys))
                continue
            if len(queue) < beams:
                bisect.insort(queue, (-next_prob, next_ys))
        queue = queue[:beams]
    return max(res)[1]

def translate_beam(model: torch.nn.Module, src_sentence: str, beams=4) -> str:
    """
    Translate input sentence into target language
    Parameters:
      model: the model
      src_sentence: the input sentence
      beams: the size of beam (4 by default)
    Returns:
      The translated sentence
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = beam_search(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, beams=beams)
    
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.flatten().cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



In [47]:
print(translate_beam(transformer, "Der Fuß ist mein Leben"))
print(translate_beam(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print(translate_beam(transformer, "Die Katze schläft"))
print(translate_topk(transformer, "Der Fuß ist mein Leben"))
print(translate_topk(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print(translate_topk(transformer, "Die Katze schläft"))
print(translate(transformer, "Der Fuß ist mein Leben"))
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))
print(translate(transformer, "Die Katze schläft"))

 This is foot just got off . 
 A group of people standing in front of an igloo 
 The cat is sleeping . 
 A high athlete just released life . The 
 Group in middle of watermelons . Located on a ATM . " A lot 
 The brown cat sits 
 The foot is just released the wood . 
 A group of people standing in front of an igloo . 
 The cat is sleeping . 


As we can see from the results, "Beam search" has the best results for these samples, followed by "Greedy decoding" and lastly "Top-k sampling".

#Compute the BLEU score of the model (2 points)

In [48]:
from sacrebleu.metrics import BLEU
bleu = BLEU()

In [49]:
i = 0
ref = []
hyp_beam = []
hyp_topk = []
hyp_greedy = []
val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
for src, tgt in val_iter:
    ref.append(tgt)
    hyp_beam.append(translate_beam(transformer, src))
    hyp_topk.append(translate_topk(transformer, src))
    hyp_greedy.append(translate(transformer, src))
    if i == 10:
        break
    i += 1
refs = [ref]
result_greedy = bleu.corpus_score(hyp_greedy, refs)
result_topk = bleu.corpus_score(hyp_topk, refs)
result_beam= bleu.corpus_score(hyp_beam, refs)


In [50]:
print(f"Greedy:{result_greedy}")
print(f"Top-K Sampling:{result_topk}")
print(f"Beam: {result_beam}")

Greedy:BLEU = 31.77 62.4/37.0/24.4/18.1 (BP = 1.000 ratio = 1.028 hyp_len = 149 ref_len = 145)
Top-K Sampling:BLEU = 2.61 30.7/5.8/0.7/0.4 (BP = 1.000 ratio = 1.145 hyp_len = 166 ref_len = 145)
Beam: BLEU = 26.84 60.6/34.5/22.9/19.1 (BP = 0.868 ratio = 0.876 hyp_len = 127 ref_len = 145)


##Intepreting BLEU score of Greedy

*   31.77 is the final BLEU score
*   62.4/37.0/24.4/18.1 represents the precision value for 1-4 ngram order
*   BP is the brevity penalty. This element is an exponential decay and will penalize results that are too short. With brevity penalty, the good results will match the reference translations in length, word choice, and word order
*   ratio is the ratio between hypothesis length and reference length
*   hyp_len is te total number of characters for hypothesis text
*   ref_len is the total number of characters for reference text

This intepretation applies the same for Top-K Sampling and Beam Search results.

As we can see from the results, Greedy has the highest BLEU score (31.77), followed by Beam (26.84) and lastly Top-K Sampling (4.03). According to some guidelines, having BLEU score between 30-40 means it is understandable to good translations, 20-29 means the gist is clear, but has significant grammatical errors and under 10 means it is almost useless.

For the Top-K sampling method, the result is not good on this model because it hasn't done well in realizing the most probable answer which creates a distribution array that have the Top-K probability almost uniformely distributed. This makes it indistinguishable from random sampling method.

