# Assignment 7

Train a Transformer model for Machine Translation from Russian to English.  
Dataset: http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz   
Make all source and target text to lower case.  
Use following tokenization for english:  
```
import sentencepiece as spm

...
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

...
TGT.build_vocab(..., min_freq=5)
...

```
Score: corpus-bleu `nltk.translate.bleu_score.corpus_bleu`  
Use last 1000 sentences for model evalutation (test dataset).  
Use your target sequence tokenization for BLEU score.  
Use max_len=50 for sequence prediction.  


Hint: You may consider much smaller model, than shown in the example.  

Baselines:  
[4 point] BLEU = 0.05  
[6 point] BLEU = 0.10  
[9 point] BLEU = 0.15  

[1 point] Share weights between target embeddings and output dense layer. Notice, they have the same shape.


Readings:
1. BLUE score how to https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
1. Transformer code and comments http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
!pip install tqdm
from tqdm import tqdm
from torchtext import datasets, data
!pip install sentencepiece
import sentencepiece as spm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import os
import copy
import logging
import math


DEVICE = 'cuda'



In [0]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, batch):
        src, tgt = batch.src, batch.tgt
        src_mask, tgt_mask = batch.src_mask, batch.tgt_mask
        "Take in and process masked src and target sequences."
        return self.decode(tgt, tgt_mask, self.encode(src, src_mask), src_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, tgt, tgt_mask, memory, src_mask):
        x = self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
        x = self.generator(x)
        return x


def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))


class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

    
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)


class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)


def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0


def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / np.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)


class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))


class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * np.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].clone().detach()
        return self.dropout(x)
    
    
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        nn.Linear(d_model, tgt_vocab)
    )
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, tgt=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is not None:
            self.tgt = tgt[:, :-1]
            self.tgt_y = tgt[:, 1:]
            self.tgt_mask = self.make_std_mask(self.tgt, pad)
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data).detach()
        return tgt_mask

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
!ln -s "/content/drive/My Drive/" /content/mydrive

ln: failed to create symbolic link '/content/mydrive/My Drive': Operation not supported


In [0]:
START_PATH = '/content/mydrive'

In [0]:
# tokenize english

with open(os.path.join(START_PATH, 'data', 'news-commentary-v13.ru-en.en')) as f:
    with open(os.path.join(START_PATH, 'data', 'text.en'), 'w') as out:
        out.write(f.read().lower())

spm.SentencePieceTrainer.Train(
    f'--input={os.path.join(START_PATH, "data", "text.en")} \
    --model_prefix={os.path.join(START_PATH, "data", "bpe_en")} \
    --vocab_size=32000 --character_coverage=0.98 --model_type=bpe'
)

In [0]:
# tokenize russian

with open(os.path.join(START_PATH, 'data', 'news-commentary-v13.ru-en.ru')) as f:
    with open(os.path.join(START_PATH, 'data', 'text.ru'), 'w') as out:
        out.write(f.read().lower())

spm.SentencePieceTrainer.Train(
    f'--input={os.path.join(START_PATH, "data", "text.ru")} \
    --model_prefix={os.path.join(START_PATH, "data", "bpe_ru")} \
    --vocab_size=32000 --character_coverage=0.98 --model_type=bpe'
)

In [0]:
tok_ru = spm.SentencePieceProcessor()
tok_ru.load(os.path.join(START_PATH, 'data', 'bpe_ru.model'))

tok_en = spm.SentencePieceProcessor()
tok_en.load(os.path.join(START_PATH, 'data', 'bpe_en.model'))

SRC = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_ru.encode_as_pieces(x),
    batch_first=True,
)

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

fields = (('src', SRC), ('tgt', TGT))

In [9]:
with open(os.path.join(START_PATH, 'data', 'text.ru')) as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open(os.path.join(START_PATH, 'data', 'text.en')) as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(src_snt, tgt_snt))]
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)

235159it [01:01, 3804.81it/s]


In [10]:
print('src: ' + " ".join(train.examples[100].src))
print('tgt: ' + " ".join(train.examples[100].tgt))

src: ▁лондон . ▁американское ▁чувство ▁ “ исключ ительности ” , ▁особенно ▁когда ▁оно ▁превышает ▁пределы , ▁похоже ▁на ▁цунами , ▁которого ▁следует ▁избегать .
tgt: ▁london ▁ – ▁american ▁e x ceptionalism , ▁when ▁it ▁runs ▁rampant , ▁is ▁a ▁tsunami ▁to ▁be ▁avoided .


In [11]:
len(train), len(valid), len(test)

(210743, 23416, 1000)

In [0]:
TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

In [0]:
class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
        # super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = -1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.tgt, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)


class MyCriterion(nn.Module):
    def __init__(self, pad_idx):
        super(MyCriterion, self).__init__()
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        
    def forward(self, x, target):
        x = x.contiguous().permute(0,2,1)
        ntokens = (target != self.pad_idx).data.sum()

        return self.criterion(x, target) / ntokens

In [0]:
def perhaps_convert_float(param, total):
    if isinstance(param, float):
        param = int(param * total)
    return param


class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Learning rate scheduler with exponential warmup and step decay.
    """
    def __init__(self, optimizer, iterations, warmup_steps=0,
                 remain_steps=1.0, decay_interval=None, decay_steps=4,
                 decay_factor=0.5, last_epoch=-1):
        """
        Constructor of WarmupMultiStepLR.
        Parameters: warmup_steps, remain_steps and decay_interval accept both
        integers and floats as an input. Integer input is interpreted as
        absolute index of iteration, float input is interpreted as a fraction
        of total training iterations (epochs * steps_per_epoch).
        If decay_interval is None then the decay will happen at regulary spaced
        intervals ('decay_steps' decays between iteration indices
        'remain_steps' and 'iterations').
        :param optimizer: instance of optimizer
        :param iterations: total number of training iterations
        :param warmup_steps: number of warmup iterations
        :param remain_steps: start decay at 'remain_steps' iteration
        :param decay_interval: interval between LR decay steps
        :param decay_steps: max number of decay steps
        :param decay_factor: decay factor
        :param last_epoch: the index of last iteration
        """

        # iterations before learning rate reaches base LR
        self.warmup_steps = perhaps_convert_float(warmup_steps, iterations)
        logging.info(f'Scheduler warmup steps: {self.warmup_steps}')

        # iteration at which decay starts
        self.remain_steps = perhaps_convert_float(remain_steps, iterations)
        logging.info(f'Scheduler remain steps: {self.remain_steps}')

        # number of steps between each decay
        if decay_interval is None:
            # decay at regulary spaced intervals
            decay_iterations = iterations - self.remain_steps
            self.decay_interval = decay_iterations // (decay_steps)
            self.decay_interval = max(self.decay_interval, 1)
        else:
            self.decay_interval = perhaps_convert_float(decay_interval,
                                                        iterations)
        logging.info(f'Scheduler decay interval: {self.decay_interval}')

        # multiplicative decay factor
        self.decay_factor = decay_factor
        logging.info(f'Scheduler decay factor: {self.decay_factor}')

        # max number of decay steps
        self.decay_steps = decay_steps
        logging.info(f'Scheduler max decay steps: {self.decay_steps}')

        if self.warmup_steps > self.remain_steps:
            logging.warn(f'warmup_steps should not be larger than '
                         f'remain_steps, setting warmup_steps=remain_steps')
            self.warmup_steps = self.remain_steps

        super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        if self.last_epoch <= self.warmup_steps:
            # exponential lr warmup
            if self.warmup_steps != 0:
                warmup_factor = math.exp(math.log(0.01) / self.warmup_steps)
            else:
                warmup_factor = 1.0
            inv_decay = warmup_factor ** (self.warmup_steps - self.last_epoch)
            lr = [base_lr * inv_decay for base_lr in self.base_lrs]

        elif self.last_epoch >= self.remain_steps:
            # step decay
            decay_iter = self.last_epoch - self.remain_steps
            num_decay_steps = decay_iter // self.decay_interval + 1
            num_decay_steps = min(num_decay_steps, self.decay_steps)
            lr = [
                base_lr * (self.decay_factor ** num_decay_steps)
                for base_lr in self.base_lrs
                ]
        else:
            # base lr
            lr = [base_lr for base_lr in self.base_lrs]
        return lr

In [0]:
batch_size = 128
num_epochs = 10

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    sort_key=lambda x: len(x.src),
    shuffle=True,
    device=DEVICE,
    sort_within_batch=False
)

train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

model = make_model(len(SRC.vocab), len(TGT.vocab))
model = model.to(DEVICE)

pad_idx = TGT.vocab.stoi['<pad>']
criterion = MyCriterion(pad_idx=pad_idx)
criterion = criterion.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)

steps = 25
scheduler = WarmupMultiStepLR(optimizer, steps)

# Share weights between target embeddings and output dense layer. Notice, they have the same shape.
# model.src_embed[0].lut.weight = model.tgt_embed[0].lut.weight
model.generator.weight = model.tgt_embed[0].lut.weight

In [0]:
torch.cuda.empty_cache()
for instance in list(tqdm._instances): 
    tqdm._decr_instances(instance)

In [18]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm(data_iter)
    counter = 0
    for batch in data_iter:
        optimizer.zero_grad()
        pred = model(batch)
        loss = criterion(pred, batch.tgt_y)
        loss.backward()
        curr_loss = loss.data.item()
        optimizer.step()

        total_loss += curr_loss
        data_iter.set_postfix(loss=curr_loss)
        counter +=1

    total_loss /= counter
    return total_loss


def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm(data_iter)
    counter = 0
    for batch in data_iter:
        pred = model(batch)
        curr_loss = criterion(pred, batch.tgt_y).data.item()

        total_loss += curr_loss
        data_iter.set_postfix(loss=curr_loss)
        counter += 1

    total_loss /= counter
    return total_loss


for epoch in range(num_epochs):
    model.train()
    loss = train_epoch(train_iter, model, criterion)
    print(f'\ntrain: {loss}\n')

    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, model, criterion)
        scheduler.step(loss)
        print(f'\nvalid: {loss}\n')

100%|██████████| 1647/1647 [15:56<00:00,  1.72it/s, loss=4.52]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 5.187718317880729



100%|██████████| 183/183 [00:35<00:00,  5.21it/s, loss=4.96]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 4.3222730993573135



100%|██████████| 1647/1647 [15:53<00:00,  1.73it/s, loss=3.82]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 4.138006118549748



100%|██████████| 183/183 [00:35<00:00,  5.20it/s, loss=4.64]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 3.6707735296155586



100%|██████████| 1647/1647 [15:58<00:00,  1.74it/s, loss=3.42]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 3.6640484396732425



100%|██████████| 183/183 [00:35<00:00,  5.20it/s, loss=4.43]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 3.31609748621456



100%|██████████| 1647/1647 [15:55<00:00,  1.73it/s, loss=3.37]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 3.368395047399154



100%|██████████| 183/183 [00:35<00:00,  5.20it/s, loss=4.31]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 3.119629490570944



100%|██████████| 1647/1647 [15:51<00:00,  1.74it/s, loss=2.95]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 3.1540818428660278



100%|██████████| 183/183 [00:35<00:00,  5.22it/s, loss=4.12]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 2.919658299352302



100%|██████████| 1647/1647 [15:50<00:00,  1.73it/s, loss=2.87]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 2.9456366962856717



100%|██████████| 183/183 [00:35<00:00,  5.19it/s, loss=3.93]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 2.7581621030640733



100%|██████████| 1647/1647 [15:51<00:00,  1.73it/s, loss=2.71]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 2.7891828085917303



100%|██████████| 183/183 [00:35<00:00,  5.24it/s, loss=3.81]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 2.6518022681845994



100%|██████████| 1647/1647 [15:52<00:00,  1.73it/s, loss=2.74]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 2.6559777901110393



100%|██████████| 183/183 [00:35<00:00,  5.23it/s, loss=3.74]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 2.5640838328606446



100%|██████████| 1647/1647 [15:50<00:00,  2.04it/s, loss=2.59]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 2.5374351459774163



100%|██████████| 183/183 [00:35<00:00,  5.21it/s, loss=3.66]
  0%|          | 0/1647 [00:00<?, ?it/s]


valid: 2.488338206635147



100%|██████████| 1647/1647 [15:49<00:00,  1.73it/s, loss=2.39]
  0%|          | 0/183 [00:00<?, ?it/s]


train: 2.432182786677487



100%|██████████| 183/183 [00:35<00:00,  5.20it/s, loss=3.59]


valid: 2.4265884428076405






In [0]:
hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in test_iter:
        pred = model(batch)
        sents = torch.argmax(torch.softmax(pred, dim=-1), dim=-1)
        hypotheses.extend([[TGT.vocab.itos[ix] for ix in sent] for sent in sents])
        references.extend([[[TGT.vocab.itos[ix] for ix in sent]] for sent in batch.tgt_y])

In [20]:
corpus_bleu(
    references, hypotheses, smoothing_function=SmoothingFunction().method3,
    auto_reweigh=True
)

0.1531192190510838

In [0]:
# start_token = len(TGT.vocab)
# end_token = start_token + 1

# def beam_search(model, src, src_mask, max_len=50, k=5, tau=1):
#     """
#     Generate sequence in target language from NMT model *model* conditioned on input sequence *src* in source language
#     model: NMT model
#     src: batch sequence of token ids in source language
#     src_mask: batch input sequence *src* mask
#     max_len: max generated sequence length (50 by default: "Use max_len=50 for sequence prediction.")
#     k: size of beam
#     tau: temperature
#     """

#     pred = model.encode(src, src_mask)
#     # ...
#     # beam = [([start_token], 0)]  # 0 as log(1) == 0

#     # for i in range(max_len):
#     #     candidates = []
#     #     candidates_proba = []
#     #     for snt, snt_proba in beam:
#     #         if snt[-1] == end_token:
#     #             candidates.append(snt)
#     #             candidates_proba.append(snt_proba)
#     #         else:
#     #             # probability vector of the next token
#     #             model.encode(src, src_mask)
#     #             if len(snt) == 1:
#     #                 proba = lm.infer(start_token, snt[-1], tau)
#     #             else:
#     #                 proba = lm.infer(*snt[-2:], tau)

#     #             # top-k most probable (token ids, token proba) pairs
#     #             best_k_pairs = sorted(list(enumerate(proba)), key=lambda elem: elem[1], reverse=True)[:k]

#     #             # TODO update candidates' sequences and corresponding probabilities
#     #             for token_id, token_proba in best_k_pairs:
#     #                 candidates.append(snt + [token_id])
#     #                 candidates_proba.append(snt_proba + math.log(token_proba))

In [0]:
# model.eval()
# with torch.no_grad():
#     for i, batch in enumerate(valid_iter):
#         src = batch.src[:1]
#         src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
#         beam = beam_search(model, src, src_key_padding_mask)
        
#         seq = []
#         for i in range(1, src.size(1)):
#             sym = SRC.vocab.itos[src[0, i]]
#             if sym == "</s>": break
#             seq.append(sym)
#         seq = tok_ru.decode_pieces(seq)
#         print("\nSource:", seq)
        
#         print("Translation:")
#         for pred, pred_proba in beam:                
#             seq = []
#             for i in range(1, pred.size(1)):
#                 sym = TGT.vocab.itos[pred[0, i]]
#                 if sym == "</s>": break
#                 seq.append(sym)
#             seq = tok_en.decode_pieces(seq)
#             print(f"pred {pred_proba:.2f}:", seq)

#         seq = []
#         for i in range(1, batch.tgt.size(1)):
#             sym = TGT.vocab.itos[batch.tgt[0, i]]
#             if sym == "</s>": break
#             seq.append(sym)
#         seq = tok_en.decode_pieces(seq)
#         print("Target:", seq)
#         break