In [1]:
cd /kaggle/input/dataset-eng-por

/kaggle/input/dataset-eng-por


In [2]:
import pdb
import torch
import itertools
import numpy as np
import torch.nn as nn
from collections import Counter
from utils_PT import (sentences, train_dataset, val_dataset, train_loader, val_loader,
                   tokenizer_eng, tokenizer_por, masked_loss, masked_acc, ids_to_text, encode_sample, pt_lower_and_split_punct)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data preparation

In [4]:
english_sentences, portuguese_sentences = sentences

print(f"English (to translate) sentence:\n\n{english_sentences[-5]}\n")
print(f"Portuguese (translation) sentence:\n\n{portuguese_sentences[-5]}")

English (to translate) sentence:

No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.

Portuguese (translation) sentence:

Não importa o quanto você tenta convencer os outros de que chocolate é baunilha, ele ainda será chocolate, mesmo que você possa convencer a si mesmo e poucos outros de que é baunilha.


In [5]:
source = pt_lower_and_split_punct(english_sentences[-5])
print(f"source: {source}\n")

encoded = tokenizer_eng.encode(source[0]).ids
print(f"encoded: {encoded}\n")
print(f"len(encoded): {len(encoded)}")

decoded = tokenizer_eng.decode(encoded)
print(f"decoded: {decoded}\n")
print(f"len(decoded): {len(decoded)}")

source: ['[SOS] no matter how much you try to convince people that chocolate is vanilla ,  itll still be chocolate ,  even though you may manage to convince yourself and a few others that its vanilla . [EOS]']

encoded: [3, 89, 473, 49, 106, 8, 220, 7, 1060, 135, 13, 1107, 12, 4151, 24, 502, 103, 33, 1107, 24, 251, 1169, 8, 219, 2558, 7, 1060, 344, 43, 11, 365, 791, 13, 52, 4151, 4, 2]

len(encoded): 37
decoded: [SOS] no matter how much you try to convince people that chocolate is vanilla , itll still be chocolate , even though you may manage to convince yourself and a few others that its vanilla . [EOS]

len(decoded): 195


In [6]:
del portuguese_sentences
del english_sentences
del sentences

In [7]:
ten_words_eng_vocab = sorted(tokenizer_eng.get_vocab().items(), key=lambda item: item[1])[:10]
ten_words_por_vocab = sorted(tokenizer_por.get_vocab().items(), key=lambda item: item[1])[:10]
print(f"First 10 words of the english vocabulary:\n\n{[token[0] for token in ten_words_eng_vocab]}\n")
print(f"First 10 words of the portuguese vocabulary:\n\n{[token[0] for token in ten_words_por_vocab]}")

First 10 words of the english vocabulary:

['[PAD]', '[UNK]', '[EOS]', '[SOS]', '.', 'tom', 'i', 'to', 'you', 'the']

First 10 words of the portuguese vocabulary:

['[PAD]', '[UNK]', '[EOS]', '[SOS]', '.', 'tom', 'que', 'o', 'nao', 'eu']


In [8]:
# Size of the vocabulary
vocab_size_por = tokenizer_eng.get_vocab_size()
vocab_size_eng = tokenizer_eng.get_vocab_size()

print(f"Portuguese vocabulary is made up of {vocab_size_por} words")
print(f"English vocabulary is made up of {vocab_size_eng} words")

Portuguese vocabulary is made up of 12000 words
English vocabulary is made up of 12000 words


In [9]:
def word_to_id(token):
    return tokenizer_por.token_to_id(token)


def ids_to_words(id):
    return tokenizer_por.id_to_token(id)

In [10]:
unk_id = word_to_id("[UNK]")
sos_id = word_to_id("[SOS]")
eos_id = word_to_id("[EOS]")
baunilha_id = word_to_id("baunilha")

print(f"The id for the [UNK] token is {unk_id}")
print(f"The id for the [SOS] token is {sos_id}")
print(f"The id for the [EOS] token is {eos_id}")
print(f"The id for baunilha (vanilla) is {baunilha_id}")

The id for the [UNK] token is 1
The id for the [SOS] token is 3
The id for the [EOS] token is 2
The id for baunilha (vanilla) is 5242


In [11]:
(to_translate, sr_translation), translation = next(iter(train_loader))

print(f"Tokenized english sentence:\n{to_translate[0, :].numpy()}\n\n")
print(f"Tokenized portuguese sentence (shifted to the right):\n{sr_translation[0, :].numpy()}\n\n")
print(f"Tokenized portuguese sentence:\n{translation[0, :].numpy()}\n\n")

print(tokenizer_eng.decode(to_translate[0, :].numpy()))
print(tokenizer_por.decode(sr_translation[0, :].numpy()))
print(tokenizer_por.decode(translation[0, :].numpy()))

Tokenized english sentence:
[   3  173   46   66  282   66   22 2167  793    4    2    0    0    0
    0    0    0    0    0]


Tokenized portuguese sentence (shifted to the right):
[  3 103 171   6  12 744 378   4   0   0   0   0   0   0   0   0   0   0
   0]


Tokenized portuguese sentence:
[103 171   6  12 744 378   4   2   0   0   0   0   0   0   0   0   0   0
   0]


[SOS] lets go as soon as it stops raining . [EOS]
[SOS] vamos assim que a chuva parar .
vamos assim que a chuva parar . [EOS]


# Encoder

In [12]:
VOCAB_SIZE = 12000
UNITS = 256

In [13]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, units):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, units, padding_idx=0)
        self.rnn = nn.LSTM(units, units, bidirectional=True, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        # Summarizing the bidirectional RNNs to follow the TF version
        forward_output = x[:, :, :UNITS]
        backward_output = x[:, :, UNITS:]
        x = forward_output + backward_output

        return x

In [14]:
encoder = Encoder(VOCAB_SIZE, UNITS)

encoder_output = encoder(to_translate)

print(f'Tensor of sentences in english has shape: {to_translate.shape}\n')
print(f'Encoder output has shape: {encoder_output.shape}')

Tensor of sentences in english has shape: torch.Size([64, 19])

Encoder output has shape: torch.Size([64, 19, 256])


# Cross Attention

In [15]:
class CrossAttention(nn.Module):
    def __init__(self, units):
        super().__init__()

        self.mha = nn.MultiheadAttention(units, 1, batch_first=True)
        self.layernorm = nn.LayerNorm(units)

    def forward(self, context, target):
        attn_output = self.mha(query=target,key=context, value=context)
        x = target + attn_output[0] # [0] because we only need the attention output and no weights
        x = self.layernorm(x) 

        return x

In [16]:
attention_layer = CrossAttention(UNITS)

sr_translation_embed = nn.Embedding(VOCAB_SIZE, UNITS, 0)(sr_translation)

attention_result = attention_layer(encoder_output, sr_translation_embed)

print(f'Tensor of contexts has shape: {encoder_output.shape}')
print(f'Tensor of translations has shape: {sr_translation_embed.shape}')
print(f'Tensor of attention scores has shape: {attention_result.shape}')

Tensor of contexts has shape: torch.Size([64, 19, 256])
Tensor of translations has shape: torch.Size([64, 19, 256])
Tensor of attention scores has shape: torch.Size([64, 19, 256])


# Decoder

In [17]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, units):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, units, padding_idx=0)
        self.pre_attention_rnn = nn.LSTM(units, units, batch_first=True)
        self.attention = CrossAttention(units)
        self.post_attention_rnn = nn.LSTM(units, units, batch_first=True)
        self.output_layer = nn.Linear(units, vocab_size)
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, context, target_in, state=None, return_state=False):
        x = self.embedding(target_in)
        x, (hidden_state, cell_state) = self.pre_attention_rnn(x, state)
        x = self.attention(context, x)
        x, _ = self.post_attention_rnn(x)
        x = self.output_layer(x)
        logits = self.activation(x)

        if return_state:
            return logits, [hidden_state, cell_state]

        return logits

In [18]:
decoder = Decoder(VOCAB_SIZE, UNITS)

logits = decoder(encoder_output, sr_translation)

print(f'Tensor of contexts has shape: {encoder_output.shape}')
print(f'Tensor of right-shifted translations has shape: {sr_translation.shape}')
print(f'Tensor of logits has shape: {logits.shape}')

Tensor of contexts has shape: torch.Size([64, 19, 256])
Tensor of right-shifted translations has shape: torch.Size([64, 19])
Tensor of logits has shape: torch.Size([64, 19, 12000])


# Translator

In [19]:
class Translator(nn.Module):
    def __init__(self, vocab_size, units):
        super().__init__()

        self.encoder = Encoder(vocab_size, units)
        self.decoder = Decoder(vocab_size, units)

    def forward(self, inputs):
        context, targets = inputs

        encoded_context = self.encoder(context)
        logits = self.decoder(encoded_context, targets)

        return logits

In [20]:
translator = Translator(VOCAB_SIZE, UNITS).to(device)

# Loading the model
#translator.load_state_dict(torch.load('/kaggle/working/model_weights.pth', map_location=torch.device(device), weights_only=True))

logits = translator((to_translate.to(device), sr_translation.to(device)))

print(f'Tensor of sentences to translate has shape: {to_translate.shape}')
print(f'Tensor of right-shifted translations has shape: {sr_translation.shape}')
print(f'Tensor of logits has shape: {logits.shape}')

Tensor of sentences to translate has shape: torch.Size([64, 19])
Tensor of right-shifted translations has shape: torch.Size([64, 19])
Tensor of logits has shape: torch.Size([64, 19, 12000])


In [21]:
optimizer = torch.optim.Adam(params=translator.parameters())
criterion = masked_loss
acc = masked_acc

# Training

In [22]:
NUM_EPOCHS = 20
STEPS_PER_EPOCH = 500
VALIDATION_STEPS = 50
patience = 3
best_loss = float('inf')
num_batches_train = len(train_loader)
num_batches_val = len(val_loader)


for epoch in range(NUM_EPOCHS):
    translator.train()
    
    # Mini batch loss
    running_loss_train = 0.0
    running_accuracy_train = 0.0
    running_loss_val = 0.0
    running_accuracy_val = 0.0
    
    # Using itertools for fixed length iteration over non subscriptable DataLoader
    for i, data in enumerate(itertools.islice(train_loader,  STEPS_PER_EPOCH)):
        (context, target_in), target_out = data
        
        context, target_in, target_out = context.to(device), target_in.to(device), target_out.to(device)

        optimizer.zero_grad()
        outputs = translator((context, target_in))
        loss = criterion(target_out, outputs)
        accuracy = acc(target_out, outputs)
        loss.backward()
        optimizer.step()

        running_loss_train += loss.item()
        running_accuracy_train = accuracy 
        
    
    # Validation
    translator.eval()
    with torch.no_grad():
        for i, data in enumerate(itertools.islice(val_loader,  VALIDATION_STEPS)):
            (context, target_in), target_out = data

            context, target_in, target_out = context.to(device), target_in.to(device), target_out.to(device)

            outputs = translator((context, target_in))
            loss = criterion(target_out, outputs)
            accuracy = acc(target_out, outputs)

            running_loss_val += loss.item()
            running_accuracy_val = accuracy 
            
            
    # Print the data
    print(f"\n[epoch: {epoch+1}/{NUM_EPOCHS}] masked_loss: {(running_loss_train / num_batches_train):.4f}, masked_acc: {running_accuracy_train:.4f}, val_masked_loss: {(running_loss_val / num_batches_val):.4f}, val_masked_acc: {running_accuracy_val:.4f}\n")
    
    # Update the best loss if it's better than the previous one
    if running_loss_train < best_loss:
        best_loss = running_loss_train
        patience = 3

    else:
        # Losing patience
        patience -= 1

        if patience == 0:
            print("Early stopping was triggered")


[epoch: 1/20] masked_loss: 0.8821, masked_acc: 0.5078, val_masked_loss: 0.2637, val_masked_acc: 0.5397


[epoch: 2/20] masked_loss: 0.5508, masked_acc: 0.5787, val_masked_loss: 0.1937, val_masked_acc: 0.6485


[epoch: 3/20] masked_loss: 0.4207, masked_acc: 0.6913, val_masked_loss: 0.1574, val_masked_acc: 0.6841


[epoch: 4/20] masked_loss: 0.3460, masked_acc: 0.7121, val_masked_loss: 0.1379, val_masked_acc: 0.7218


[epoch: 5/20] masked_loss: 0.3029, masked_acc: 0.7320, val_masked_loss: 0.1252, val_masked_acc: 0.7490


[epoch: 6/20] masked_loss: 0.2720, masked_acc: 0.7563, val_masked_loss: 0.1156, val_masked_acc: 0.7657


[epoch: 7/20] masked_loss: 0.2447, masked_acc: 0.7901, val_masked_loss: 0.1107, val_masked_acc: 0.7531


[epoch: 8/20] masked_loss: 0.2250, masked_acc: 0.7753, val_masked_loss: 0.1050, val_masked_acc: 0.7594


[epoch: 9/20] masked_loss: 0.2137, masked_acc: 0.8035, val_masked_loss: 0.1016, val_masked_acc: 0.7699


[epoch: 10/20] masked_loss: 0.1966, masked_acc: 0.7647

In [23]:
#translator.load_state_dict(torch.load('/kaggle/working/model_state_dict.pth', weights_only=True))

# Using the model for inference

In [24]:
def generate_next_token(context, decoder, next_token, state, done, temperature=0.0):
    logits, state = decoder(context, next_token, state, return_state=True)
    logits = logits[:, -1, :]

    if temperature == 0.0:
        next_token = torch.argmax(logits, dim=-1)

    else:
        logits = torch.exp(logits)
        logits /= temperature
        next_token = torch.multinomial(logits, 1)
        logits = torch.log(logits)

    logits = torch.squeeze(logits)

    next_token = torch.squeeze(next_token)

    logit = logits[next_token].detach().numpy()

    next_token = torch.reshape(next_token, shape=(1,1))

    if next_token == eos_id:
        done = True

    return next_token, logit, state, done

In [25]:
eng_sentence = "I love languages"

context = torch.tensor(encode_sample(eng_sentence))
context = torch.unsqueeze(context, dim=0)
context = encoder(context)

next_token = torch.full((1,1), sos_id)

state = [torch.rand((1, 1, UNITS)), torch.rand((1, 1, UNITS))]
done = False

next_token, logit, state, done = generate_next_token(context, decoder, next_token, state, done, temperature=0.5)
print(f"Next token: {next_token}\nLogit: {logit:.4f}\nDone? {done}")
next_token = next_token.tolist()

Next token: tensor([[5132]])
Logit: -8.7250
Done? False


# Translate

In [26]:
def translate(model, text, max_length=50, temperature=0.0):
    model.eval()
    
    tokens, logits = [], []

    pre_text = text
    text = torch.tensor(encode_sample(pre_text))[None, :]
    context = model.encoder(text)

    next_token = torch.full((1,1), sos_id)

    state = [torch.zeros((1, 1, UNITS)), torch.zeros((1, 1, UNITS))]

    done = False
    for iteration in range(max_length):
        try:
            next_token, logit, state, done = generate_next_token(
                context=context,
                decoder=model.decoder,
                next_token=next_token,
                state=state,
                done=done,
                temperature=temperature
            )
        except:
            raise Exception("Problem generating the next token")

        if done:
            break
            
        tokens.append(next_token)
        
        logits.append(logit)

    tokens = torch.cat(tokens, dim=-1).tolist()
    
    translation = ids_to_text(tokens, tokenizer_por)

    return translation, logits[-1], tokens

In [51]:
# Running this cell multiple times should return the same output since temp is 0

temp = 0.0
original_sentence = "I love languages"

translation, logit, tokens = translate(translator.to("cpu"), original_sentence, temperature=temp)

print(f"Temperature: {temp}\n\nOriginal sentence: {original_sentence}\nTranslation: {translation}\nTranslation tokens:{tokens}\nLogit: {logit:.3f}")

Temperature: 0.0

Original sentence: I love languages
Translation: ['eu amo idiomas nelas notaram blog eu amo eu amo eu amo eu amo eu amo eu amo os eu amo eu amo eu amo eu amo eu amo eu amo amo eu amo os eu amo eu amo eu amo eu amo eu amo amo eu amo amo eu']
Translation tokens:[[9, 523, 888, 4761, 7783, 4009, 9, 523, 9, 523, 9, 523, 9, 523, 9, 523, 9, 523, 40, 9, 523, 9, 523, 9, 523, 9, 523, 9, 523, 9, 523, 523, 9, 523, 40, 9, 523, 9, 523, 9, 523, 9, 523, 9, 523, 523, 9, 523, 523, 9]]
Logit: -1.025


In [50]:
# Running this cell multiple times should return different outputs since temp is not 0
# You can try different temperatures

temp = 0.7
original_sentence = "I love languages"

translation, logit, tokens = translate(translator.to("cpu"), original_sentence, temperature=temp)

print(f"Temperature: {temp}\n\nOriginal sentence: {original_sentence}\nTranslation: {translation}\nTranslation tokens:{tokens}\nLogit: {logit:.3f}")

Temperature: 0.7

Original sentence: I love languages
Translation: ['eu amo cantame embriagada templos sofa sonho eu seu viajo meu tanto devo eu adoro arroz alteracoes eu brasil pus gordinho eu amo que pretendemos eu adoro anoitecer eu adoro gritos ridicula eu amo sensivel tarde trabalho encontrei perdi completamente desorientado esperarei eu eu amo estadunidenses destas nossa ostras boi']
Translation tokens:[[9, 523, 11801, 6204, 11065, 1065, 902, 9, 49, 5554, 43, 290, 400, 9, 537, 1524, 5580, 9, 1843, 5114, 10286, 9, 523, 6, 5923, 9, 537, 4893, 9, 537, 7641, 4562, 9, 523, 3182, 193, 143, 470, 555, 749, 7486, 3602, 9, 9, 523, 6237, 5705, 304, 6384, 2699]]
Logit: -6.641


# Minimum Bayes-Risk Decoding

In [29]:
def generate_samples(model, text, n_samples=4, temperature=0.6):
    samples, log_probs = [], []
    
    for _ in range(n_samples):
        _, log_prob, sample = translate(model, text, temperature=temperature)
        
        samples.append(sample)
        
        log_probs.append(log_prob)
        
    return samples, log_probs

In [30]:
samples, log_probs = generate_samples(translator, 'I love languages')

for s, l in zip(samples, log_probs):
    print(f"Translated tensor: {s} has logit: {l:.3f}")

Translated tensor: [[9, 523, 778, 11589, 6808, 8804, 60, 99, 7, 523, 9, 537, 8981, 9, 523, 5597, 9, 523, 5673, 9, 892, 159, 37, 7397, 1055, 9, 771, 7981, 461, 99, 5825, 9, 3607, 6576, 244, 9, 840, 5431, 1633, 155, 10716, 9, 523, 3165, 902, 257, 9, 537, 5545, 283]] has logit: -4.854
Translated tensor: [[9, 537, 5866, 4988, 319, 245, 9416, 1916, 2047, 416, 531, 49, 1, 9, 537, 7212, 1288, 5082, 9, 1145, 464, 7674, 807, 4377, 1, 2805, 229, 4698, 9, 99, 224, 9448, 183, 155, 2213, 216, 3517, 523, 9, 10439, 9, 5394, 3808, 8929, 8488, 840, 4048, 4728, 93, 3753]] has logit: -6.767
Translated tensor: [[9, 6835, 1, 5068, 248, 8646, 523, 9, 9, 523, 9, 306, 4, 1, 40, 3346, 9, 537, 7365, 9, 523, 6144, 1055, 4860, 1410, 6062, 775, 43, 11747, 9, 99, 6503, 3901, 9759, 3261, 1308, 2529, 2153, 647, 4810, 9, 416, 1882, 5756, 9, 523, 1334, 2047, 10642, 8211]] has logit: -7.727
Translated tensor: [[9, 523, 4939, 6704, 8821, 2418, 9, 523, 523, 99, 10370, 8170, 523, 268, 1397, 2441, 537, 5633, 3029, 9, 523, 4

# Comparing overlaps

In [31]:
def jaccard_similarity(candidate, reference):
    
    if (isinstance(candidate, list) and all(isinstance(i, list) for i in candidate)) and \
       (isinstance(reference, list) and all(isinstance(i, list) for i in reference)):
        candidate_set = set(candidate[0])
        reference_set = set(reference[0])

    else:
        candidate_set = set(candidate)
        reference_set = set(reference)    
    
    common_tokens = candidate_set.intersection(reference_set)
    
    all_tokens = candidate_set.union(reference_set)
    
    overlap = len(common_tokens) / len(all_tokens)
    
    return overlap

In [32]:
l1 = [1,2,3]
l2 = [1,2,3,4]

js = jaccard_similarity(l1, l2)

print(f"jaccard similarity between lists: {l1} and {l2} is {js:.3f}")

jaccard similarity between lists: [1, 2, 3] and [1, 2, 3, 4] is 0.750


# Rouge1 similarity

In [33]:
def rouge1_similarity(candidate, reference):
    candidate_word_counts = Counter(candidate)
    reference_word_counts = Counter(reference)    
    
    overlap = 0
    
    for token in candidate_word_counts.keys():
        token_count_candidate = candidate_word_counts[token]
        token_count_reference = reference_word_counts[token]        
        
        overlap += min(token_count_candidate, token_count_reference)
        
    precision = overlap / len(candidate)
    
    recall = overlap / len(reference)
    
    if precision + recall != 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
        return f1_score
    
    return 0

In [34]:
l1 = [0, 1]
l2 = [5, 5, 7, 0, 232]

r1s = rouge1_similarity(l1, l2)

print(f"rouge 1 similarity between lists: {l1} and {l2} is {r1s:.3f}")

rouge 1 similarity between lists: [0, 1] and [5, 5, 7, 0, 232] is 0.286


In [35]:
l1 = [1, 2, 3]
l2 = [1, 2, 3, 4]

r1s = rouge1_similarity(l1, l2)

print(f"rouge 1 similarity between lists: {l1} and {l2} is {r1s:.3f}")

rouge 1 similarity between lists: [1, 2, 3] and [1, 2, 3, 4] is 0.857


# Computing the overall score

# Average overlap

In [36]:
def average_overlap(samples, similarity_fn):
    
    scores = {}
    
    for index_candidate, candidate in enumerate(samples):
        overlap = 0
        
        for index_sample, sample in enumerate(samples):
            
            if index_candidate == index_sample:
                continue
                
            overlap += similarity_fn(candidate, sample)
            
        score = overlap / (len(samples) - 1)
        
        score = round(score, 3)
        
        scores[index_candidate] = score
        
    return scores

In [37]:
# Test with Jaccard similarity

l1 = [1, 2, 3]
l2 = [1, 2, 4]
l3 = [1, 2, 4, 5]

avg_ovlp = average_overlap([l1, l2, l3], jaccard_similarity)

print(f"average overlap between lists: {l1}, {l2} and {l3} using Jaccard similarity is:\n\n{avg_ovlp}")

average overlap between lists: [1, 2, 3], [1, 2, 4] and [1, 2, 4, 5] using Jaccard similarity is:

{0: 0.45, 1: 0.625, 2: 0.575}


In [38]:
# Test with Rouge1 similarity

l1 = [1, 2, 3]
l2 = [1, 4]
l3 = [1, 2, 4, 5]
l4 = [5,6]

avg_ovlp = average_overlap([l1, l2, l3, l4], rouge1_similarity)

print(f"average overlap between lists: {l1}, {l2}, {l3} and {l4} using Rouge1 similarity is:\n\n{avg_ovlp}")

average overlap between lists: [1, 2, 3], [1, 4], [1, 2, 4, 5] and [5, 6] using Rouge1 similarity is:

{0: 0.324, 1: 0.356, 2: 0.524, 3: 0.111}


In [39]:
def weighted_avg_overlap(samples, log_probs, similarity_fn):
    scores = {}
    
    for index_candidate, candidate in enumerate(samples):
        overlap, weighted_sum = 0.0, 0.0
        
        for index_sample, (sample, logprob) in enumerate(zip(samples, log_probs)):
            if index_candidate == index_sample:
                continue
                
            sample_prob = float(np.exp(logprob))
            weighted_sum += sample_prob
            
            sample_overlap = similarity_fn(candidate, sample)
            overlap += sample_overlap * sample_prob
            
        score = overlap / weighted_sum
        score = round(score, 3)
        
        scores[index_candidate] = score
        
    return scores

In [40]:
l1 = [1, 2, 3]
l2 = [1, 2, 4]
l3 = [1, 2, 4, 5]
log_probs = [0.4, 0.2, 0.5]

w_avg_ovlp = weighted_avg_overlap([l1, l2, l3], log_probs, jaccard_similarity)

print(f"weighted average overlap using Jaccard similarity is:\n\n{w_avg_ovlp}")

weighted average overlap using Jaccard similarity is:

{0: 0.443, 1: 0.631, 2: 0.558}


In [41]:
def mbr_decode(model, text, n_samples=5, temperature=0.6, similarity_fn=jaccard_similarity):
    samples, log_probs = generate_samples(model, text, n_samples=n_samples, temperature=temperature)
    
    scores = weighted_avg_overlap(samples, log_probs, similarity_fn)
    
    decoded_translations = [ids_to_text(sample,tokenizer_por) for sample in samples]
    
    max_score_key = max(scores, key=lambda k: scores[k])
    
    translation = decoded_translations[max_score_key]
    
    return translation, decoded_translations

In [42]:
english_sentence = "I love languages"

translation, candidates = mbr_decode(translator, english_sentence, n_samples=10, temperature=0.6)

print("Translation candidates:")
for c in candidates:
    print(c)

print(f"\nSelected translation: {translation}")

Translation candidates:
['eu gosto serio fiquemos gatinhos departamento altas eu eu adoro cooperacao professores treze duzentos eu amo altura eu eu tratamento tanto abrilo eu jogo domingos na dedos eu adoro perda linguas logo choro reconheco eu eu gosto velejar acelerar malucos eu amo escolha eletrica ensino sempre amo eu adoro quantidades']
['eu amo linguas garantir formularios vim discutir depois barato ontem eu adoro levado eu meu permitir altura eu amo meus droga eu adoro necessaria eu amo engracados eu realmente aprendi policiais brincar palavras cru comeco na gota principio eu jogo bancar surpresas grandes oro eu adoro meia eu amo tomates']
['eu amo espirrar dez assinado eu lhe passaros eu velho amo eu perdidos acima eu chame animais eu eu amo muito demissao perguntava parar adoro cervos amo parei eu amo tomates decepcionado acordo surpresa as pedacos tanto cinco fofocar detesto eu meia muitos a meus rindo ganho trombei tentado asilo']
['amo viagens projetado diferencas acho eu a