In [1]:
%matplotlib inline

from tqdm.notebook import tqdm
import os

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

try:
    import youtokentome as yttm
except:
    # Install YouTokenToMe for tokenization
    !pip install youtokentome
    import youtokentome as yttm

vocab_size = 2 ** 15 # TODO: maybe use power of two?
PADDING_TOKEN = 0
UNK_TOKEN = 1
BOS_TOKEN = 2
EOS_TOKEN = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 16

# Choose dataset
data_path = 'data'
# path = f'{data_path}/paracrawl-clean'
path = f'{data_path}/yandex'
# Tokenizer model path.
tokenizer_path = f'{path}_v{vocab_size}.tokenizer'

# Model save path.
save_path = 'models'
sub_name = '_1'
save_model_name = f'test_model{sub_name}.pth'
model_save_path = os.path.join(save_path, save_model_name)

if not os.path.exists(save_path):
    os.makedirs(save_path)

def load_files(path):
    res = ([], [])
    for i, ext in enumerate(['.en', '.ru']):
        with open(path + ext, encoding='utf8') as in_file:
            res[i].extend(in_file.readlines())
    return res

In [2]:
BATCH_SIZE = 32

## Preparation
Update/Install packages and download datasets

In [42]:
data_en, data_ru = load_files(path)

raw_data = {'English' : [line for line in data_en], 'Russian': [line for line in data_ru]}

df = pd.DataFrame(raw_data, columns=list(raw_data.keys()))
df.shape

(1000000, 2)

In [43]:
df['en_len'] = df['English'].str.count(' ')
df['ru_len'] = df['Russian'].str.count(' ')
# df = df.query('ru_len < 30 & en_len < 30')
# df = df.query('ru_len < en_len * 1.5 & ru_len * 1.5 > en_len')
# df.sort_values(['ru_len', 'en_len'], ascending=[True, True], inplace=True)
del df['en_len'], df['ru_len']
df.shape

(1000000, 2)

In [44]:
df = df[:200000]
# import nltk
# df['English'] = df.English.apply(lambda x: ' '.join(nltk.word_tokenize(x)))
# df['Russian'] = df.Russian.apply(lambda x: ' '.join(nltk.word_tokenize(x)))
df.shape

(200000, 2)

In [45]:
# Create train, test, val sets.
train, test = train_test_split(df, test_size=0.2)
test, val = train_test_split(test, test_size=0.5)
train.to_csv(os.path.join(data_path, 'train.csv'), index=False)
test.to_csv(os.path.join(data_path, 'test.csv'), index=False)
val.to_csv(os.path.join(data_path, 'val.csv'), index=False)

## Load data

In [3]:
if os.path.exists(tokenizer_path):
    tokenizer = yttm.BPE(model=tokenizer_path)
else:
    # Create temp file with data to train tokenizer.
    data_en, data_ru = load_files(path)
    temp_file_path = 'tokenizer_text.temp'
    with open(temp_file_path, 'w', encoding='utf8') as out_file:
        out_file.write('\n'.join(map(str.lower, data_en)))
        out_file.write('\n'.join(map(str.lower, data_ru)))
    # Train tokenizer.
    tokenizer = yttm.BPE.train(data=temp_file_path, vocab_size=vocab_size, model=tokenizer_path)
    # Delete temp file.
    os.remove(temp_file_path)

In [47]:
class TextDataset(torch.utils.data.Dataset):

    __output_types = { 'id': yttm.OutputType.ID,
                       'subword':yttm.OutputType.SUBWORD }

    def __init__(self, csv_file, tokenizer, max_len=50, max_len_ratio=1.5):
        self.tokenizer = tokenizer
        df = pd.read_csv(csv_file)
        # Tokenize sentences using tokenizer.
        # TODO: Improve code by rewriting lambdas to smth else.
        tokenize_lambda = lambda x: self.tokenize(x.lower().strip(), 'subword')
        df['eng_enc'] = df.English.apply(tokenize_lambda)
        df['rus_enc'] = df.Russian.apply(tokenize_lambda)
        # Delete sentences that exceed the max length and max length ratio.
        df['en_len'] = df['eng_enc'].str.len()
        df['ru_len'] = df['rus_enc'].str.len()
        df.query(f'ru_len < {max_len} & en_len < {max_len}', inplace=True)
        df.query(f'ru_len < en_len * {max_len_ratio} & ru_len * {max_len_ratio} > en_len', inplace=True)
        # Sort the values for less padding in batching.
        df.sort_values(['ru_len', 'en_len'], ascending=[False, False], inplace=True)
        # TODO: better unpacking
        raw_src, raw_tgt = zip(df[['Russian', 'English']].T.values)
        src, tgt = zip(df[['rus_enc', 'eng_enc']].T.values)
        self.tgt, self.src = tgt[0], src[0]
        self.raw_src, self.raw_tgt = raw_src[0], raw_tgt[0]
        

    def tokenize(self, s, output_type='id'):
        """Tokenize the sentence.
        :param s: the sentence to tokenize
        :param output_type: either 'id' or 'subword' for corresponding output
        :return: tokenized sentence"""
        return self.tokenizer.encode(s, output_type=self.__output_types[output_type],
                                bos=True, eos=True)
    def decode(self, tokens):
        return self.tokenizer.id_to_subword(tokens)

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        src = self.src[idx]
        src = [self.tokenizer.subword_to_id(token) for token in src]
        tgt = self.tgt[idx]
        tgt = [self.tokenizer.subword_to_id(token) for token in tgt]
        return src, tgt

def load_datasets(path, tokenizer, ext='.csv'):
    res = []
    for name in  ['train', 'val', 'test']:
        dataset_path = os.path.join(path, name + ext)
        res.append(TextDataset(dataset_path, tokenizer))
    return res

train_data, val_data, test_data = load_datasets(data_path, tokenizer)
print('Train:', len(train_data),
      '\nVal:', len(val_data),
      '\nTest:', len(test_data))

Train: 133329 
Val: 16692 
Test: 16736


## Dataloaders

In [48]:
def my_collate(batch):
    src, tgt = zip(*batch)
    src = [Tensor(s) for s in src]
    tgt = [Tensor(t) for t in tgt]
    # TODO: Generalize padding value
    src = pad_sequence(src, batch_first=True, padding_value=PADDING_TOKEN).long()
    tgt = pad_sequence(tgt, batch_first=True, padding_value=PADDING_TOKEN).long()
    return src.t(), tgt.t()

def make_dataloaders(datasets, batch_size, num_workers=0):
    res = []
    for dataset in datasets:
        res.append(DataLoader(dataset, batch_size=batch_size,
                        shuffle=False, num_workers=num_workers, collate_fn=my_collate))
    return res

(train_iterator,
 val_iterator,
 test_iterator) = make_dataloaders([train_data, val_data, test_data],
                                   batch_size=BATCH_SIZE,
                                   num_workers=0)

data_iterators = {
    'train': train_iterator,
    'val': val_iterator,
    'test': test_iterator,
}

In [49]:
# Test dataset
print('Raw input:', train_data.raw_src[0], train_data.raw_tgt[0])
print('Tokenized input:', train_data.src[0], train_data.tgt[0])
# Test iterator
for _, (src, tgt) in enumerate(train_iterator):
    print('src shape:', src.shape)
    print('tgt shape:', tgt.shape)
    for s, t in zip(src.t()[:2], tgt.t()[:2]):
        print([tokenizer.id_to_subword(token) for token in s])
        print([tokenizer.id_to_subword(token) for token in t])
    break

Raw input: Она неутомимо и страстно атакует соперницу, стараясь решительно сокрушить ее (из 47 побед, 31 Кристи завершила нокаутом) - Кристи называет себя "бойцом концентрированного действия".
 She tirelessly and passionately batters her opponent trying to decidedly trounce her (she ended her fights with 31 knockouts out of 47 wins) - Christy called herself an "action-packed fighter".

Tokenized input: ['<BOS>', '▁она', '▁неу', 'то', 'ми', 'мо', '▁и', '▁стра', 'стно', '▁ата', 'ку', 'ет', '▁сопер', 'ницу', ',', '▁ста', 'раясь', '▁решительно', '▁со', 'кру', 'шить', '▁ее', '▁(из', '▁47', '▁побе', 'д,', '▁31', '▁кри', 'сти', '▁заверши', 'ла', '▁но', 'ка', 'у', 'том', ')', '▁-', '▁кри', 'сти', '▁называет', '▁себя', '▁"', 'бой', 'цом', '▁концентри', 'рованного', '▁действия', '".', '<EOS>'] ['<BOS>', '▁she', '▁t', 'ire', 'lessly', '▁and', '▁passion', 'ately', '▁bat', 'ters', '▁her', '▁opponent', '▁trying', '▁to', '▁decided', 'ly', '▁tr', 'ounce', '▁her', '▁(s', 'he', '▁ended', '▁her', '▁fight

Defining our ``nn.Module`` and ``Optimizer``
----------------


In [7]:
class PositionalEncoding(nn.Module):
    # From https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    def __init__(self, d_model, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
    
    def create_pe(self, seq_len):
        pe = torch.zeros(seq_len, self.d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        return pe

    def forward(self, x):
        pe = self.create_pe(x.size(0))
        x = x + pe.to(x.device)
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntokens_src, ntokens_tgt, ninp, nhead, dim_feedforward, nlayers, pad_token, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import Transformer
        self.model_type = 'Transformer'
        self.ninp = ninp
        self.pad_token = pad_token
        self.masks = {
            'src': None,
            'tgt': None,
            'memory': None,
        }
        # Token Encoders
        self.src_encoder = nn.Embedding(ntokens_src, ninp)
        self.tgt_encoder = nn.Embedding(ntokens_tgt, ninp)
        # Positional Encoding
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # Transformer
        self.transformer = Transformer(
            d_model=ninp,
            nhead=nhead,
            num_encoder_layers=nlayers,
            num_decoder_layers=nlayers,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
        )
        self.out = nn.Linear(ninp, ntokens_tgt)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sx, sy=None):
        """Generate matrix for seqential reveal of tokens."""
        sy = sy or sx
        mask = (torch.triu(torch.ones((sx, sy))) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        self.transformer._reset_parameters()
    
    def preprocess(self, x, x_type):
        # Create masks
        padding_mask = (x == self.pad_token).bool().t()
        if self.masks[x_type] is None or self.masks[x_type].size(0) != len(x):
            self.masks[x_type] = self._generate_square_subsequent_mask(len(x), len(x)).to(x.device)
        
        x_enc = self.src_encoder(x) if x_type == 'src' else self.tgt_encoder(x)
        x_enc *= math.sqrt(self.ninp) # TODO: * or / or remove?
        x_enc = self.pos_encoder(x_enc)
        
        return x_enc, self.masks[x_type], padding_mask
        
    def forward(self, src, tgt):

        # TODO: Do we need memory mask?
        if (    self.masks['memory'] is None or
                self.masks['src'].size(0) != len(src) or
                self.masks['tgt'].size(0) != len(tgt)):
            self.masks['memory'] = self._generate_square_subsequent_mask(len(src), len(tgt)).to(src.device)
        
        src_enc, _, src_key_padding_mask = self.preprocess(src, 'src')
        tgt_enc, _, tgt_key_padding_mask = self.preprocess(tgt, 'tgt')
        memory_key_padding_mask = src_key_padding_mask.clone().detach()
        
        output = self.transformer(src_enc, tgt_enc,
#                                   src_mask=self.masks['src'],
                                  tgt_mask=self.masks['tgt'],
#                                   memory_mask=self.masks['memory'],
                                  src_key_padding_mask=src_key_padding_mask,
                                  tgt_key_padding_mask=tgt_key_padding_mask,
                                  memory_key_padding_mask=memory_key_padding_mask,
                                  )
        output = self.out(output)
        return output

In [50]:
def run_model(model, criterion, optimizer, data_iterator, is_train_phase, n_words=1, desc=''):
    """Run one epoch of a model with given data.
    
    :param model: model to run on
    :param criterion: critetion to use
    :param optimizer: optimizer to use
    :param data_iterator: iterator of (x, y) data tuples
    :param is_train_phase: True if you want to train
    :param n_words: number of words to predict, the bigger the longer it takes to run
    :param desc: description for tqdm bar
    :return: epoch loss
    """
    if is_train_phase:
        model.train() # Turn on the train mode
    else:
        model.eval()
    total_loss = 0.0
    pbar = tqdm(total=len(data_iterator), desc=desc, position=0, leave=True)
    for i, (src, tgt) in enumerate(data_iterator):
        src, tgt = src.to(device), tgt.to(device)
        
        tgt_losses = 0.0
        # Predict `n_words` last words.
        for j in range(max(1, len(tgt) - n_words), len(tgt)):
            optimizer.zero_grad()
            tgt_in = tgt[:j, :]
            tgt_out = tgt[1:j+1, :]
            
            with torch.set_grad_enabled(is_train_phase):
                output = model(src, tgt_in).transpose(1, 2)
                loss = criterion(output, tgt_out)

                if is_train_phase:
                    loss.backward()
                    # Clip gradient to deal with gradient explosion
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
                    optimizer.step()
            tgt_losses += loss.item()
        total_loss += tgt_losses / j
        pbar.update(1)
        pbar.set_description(desc + f'- loss: {total_loss / (i+1):7.4}')
        if i % 2000 == 0:
            translate(model, 'Машинное обучение это здорово!', verbose=True)
            rand_ind = np.random.randint(0, len(test_data))
            translate(model, test_data.raw_src[rand_ind], verbose=True)
    return total_loss / (i+1)

def train_model(model, n_epochs, data_iterators,
                criterion, optimizer, n_words=1, scheduler=None, model_save_path=None):
    stats = {'train':{'loss':[]},
             'val':{'loss':[]}}
    best_loss = None
    
    for epoch in range(n_epochs):
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        print(f'------------ Epoch {epoch}; lr: {lr:.5f} ------------')
        for phase in ['train', 'val']:
            desc = f'{phase.title()} Epoch #{epoch} '
            epoch_loss = run_model(model, criterion, optimizer,
                                   data_iterators[phase], phase == 'train',
                                   n_words, desc)
            stats[phase]['loss'].append(epoch_loss)
            print_hist = lambda l: ' -> '.join(map(lambda x:f"{x:.4}", l[-2:]))
            tqdm.write(f'{phase.title()} Loss: ' + print_hist(stats[phase]['loss']))
        if best_loss == None or stats['val']['loss'][-1] < best_loss:
            best_loss = stats['val']['loss'][-1]
            print('Smallest val loss')
            print('Saving model...')
            if model_save_path:
                try:
                    torch.save(model, model_save_path)
                    print('Saved successfully')
                except FileNotFoundError:
                    print('Error during saving!')
        try:
            translate(model, 'Машинное обучение это здорово!', verbose=True)
            rand_ind = np.random.randint(0, len(test_data))
            translate(model, test_data.raw_src[rand_ind], verbose=True)
        except:
            print('Error while translation.')
        if scheduler:
            scheduler.step()
    return stats

In [51]:
ntokens_src = tokenizer.vocab_size() # the size of vocabulary
ntokens_tgt = tokenizer.vocab_size() # the size of vocabulary
pad_token = PADDING_TOKEN
emsize = 256 # embedding dimension
nhid = 256 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4 # the number of heads in the multiheadattention models
dropout = 0.1 # the dropout value
# model = TransformerModel(ntokens_src, ntokens_tgt, emsize, nhead, nhid, nlayers, pad_token, dropout).to(device)

In [52]:
# Model save path.
save_path = 'models'
sub_name = '_1_2'
save_model_name = f'test_model{sub_name}.pth'
model_save_path = os.path.join(save_path, save_model_name)

In [22]:
# torch.save(model, model_save_path)
model = torch.load(model_save_path)

In [94]:
# Ignore padding index during the loss computation.
PAD_IDX = PADDING_TOKEN
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, reduction='mean')
lr = 0.1 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.9)

In [None]:
torch.cuda.empty_cache()
n_epochs = 3
n_words = 1
stats = train_model(model, n_epochs, data_iterators,
                    criterion, optimizer, n_words, scheduler, model_save_path)

n_epochs = 1
n_words = 2
stats = train_model(model, n_epochs, data_iterators,
                    criterion, optimizer, n_words, scheduler, model_save_path)

n_epochs = 5
n_words = 1
stats = train_model(model, n_epochs, data_iterators,
                    criterion, optimizer, n_words, scheduler, model_save_path)
# 1.444

------------ Epoch 0; lr: 0.10000 ------------


HBox(children=(FloatProgress(value=0.0, description='Train Epoch #0 ', max=4167.0, style=ProgressStyle(descrip…

------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁the': 0.11004267632961273, '▁this': 0.07904434949159622, '▁it': 0.059231411665678024}
  1 {'▁way': 0.01204590406268835, '▁d': 0.0060770632699131966, '▁second': 0.005667828489094973}
  2 {'▁to': 0.1039687916636467, '▁of': 0.09857677668333054, '▁it': 0.071696437895298}
translation: <BOS> the way to make it a great way to make it a great way to make it a great way to understand the way!<EOS>
------------ Translation ------------
Input: У меня уже есть трудовой опыт с немцами и итальянцами и могу констатировать, что с чехами у меня большее взаимопонимание.

Output weights:
  0 {'▁the': 0.11565890163183212, '▁it': 0.05007937550544739, '▁i': 0.029847433790564537}
  1 {'▁second': 0.011151408776640892, '▁idea': 0.009973309934139252, '▁decision': 0.008864009752869606}
  2 {'▁reason': 0.037126556038856506, '▁thing': 0.02684709243476391, '▁stage': 0.013499141670763493}
translation: <BOS> the second 

HBox(children=(FloatProgress(value=0.0, description='Val Epoch #0 ', max=522.0, style=ProgressStyle(descriptio…

------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁let': 0.04806356132030487, "▁it's": 0.040228333324193954, '▁the': 0.024042978882789612}
  1 {'▁us': 0.4976542294025421, '▁me': 0.18254466354846954, '▁the': 0.0315854549407959}
  2 {'▁know': 0.037218738347291946, '▁see': 0.034714680165052414, '▁understand': 0.031841401010751724}
translation: <BOS> let us know<EOS>
------------ Translation ------------
Input: ArcSight ESM контролирует все события по всему предприятию и применяет мощные инструменты для анализа и корреляции с целью выявления деловых и технологических угроз.

Output weights:
  0 {'▁the': 0.059592097997665405, '▁a': 0.010768134146928787, '▁special': 0.007886426523327827}
  1 {'▁main': 0.01938093639910221, '▁best': 0.012697727419435978, '▁key': 0.00845651887357235}
  2 {'▁features': 0.04411599412560463, '▁thing': 0.024239061400294304, '▁feature': 0.021930957213044167}
translation: <BOS> the main features<EOS>
Val Loss: 0.4337
Sm

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Saved successfully
------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁let': 0.04806356132030487, "▁it's": 0.040228333324193954, '▁the': 0.024042978882789612}
  1 {'▁us': 0.4976542294025421, '▁me': 0.18254466354846954, '▁the': 0.0315854549407959}
  2 {'▁know': 0.037218738347291946, '▁see': 0.034714680165052414, '▁understand': 0.031841401010751724}
translation: <BOS> let us know<EOS>
------------ Translation ------------
Input: Учитывая глобальный характер Интернета, пользование Сайтом для сбора и обработки личной информации неизбежно подразумевает передачу данных на международной основе.

Output weights:
  0 {'▁the': 0.04498930275440216, '▁a': 0.01855502650141716, '▁special': 0.00816279835999012}
  1 {'▁main': 0.012413336895406246, '▁author': 0.00936434231698513, '▁current': 0.00710848206654191}
  2 {'▁features': 0.032591190189123154, '▁element': 0.025745755061507225, '▁topic': 0.023528005927801132}
translation: <BOS> the main features<EOS

HBox(children=(FloatProgress(value=0.0, description='Train Epoch #1 ', max=4167.0, style=ProgressStyle(descrip…

------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁let': 0.04936174303293228, "▁it's": 0.04003230854868889, '▁the': 0.024674812331795692}
  1 {'▁us': 0.4916578233242035, '▁me': 0.18118657171726227, '▁the': 0.03361755982041359}
  2 {'▁know': 0.037714261561632156, '▁see': 0.03622036799788475, '▁understand': 0.03315741941332817}
translation: <BOS> let us know<EOS>
------------ Translation ------------
Input: 310 на первом этапе Принципиальное соглашение с МФ и 316 на втором о поэтапном (два этапа) отказе от этапеc) БМ к концу 2008 года 127 203?

Output weights:
  0 {'▁the': 0.07105135917663574, '▁a': 0.015260101296007633, '▁special': 0.008429670706391335}
  1 {'▁main': 0.017750665545463562, '▁author': 0.01018123421818018, '▁current': 0.009392685256898403}
  2 {'▁thing': 0.024097789078950882, '▁topic': 0.019400086253881454, '▁event': 0.014818182215094566}
translation: <BOS> the main thing<EOS>


In [55]:
def subword_to_str(tokens):
    return ''.join(tokens).replace('▁', ' ')

def tokens_to_str(tokens):
    return subword_to_str([tokenizer.id_to_subword(ix) for ix in tokens])

def translate(model, text, max_len=80, verbose=False):
    model.eval()
    
    if verbose:
        print('------------ Translation ------------')
        print('Input:', text)
    # Prepare text
    src = tokenizer.encode(text, output_type=yttm.OutputType.ID,
                           bos=True, eos=True)
    src = Tensor(src).long().to(device)
    # Run encoder
    src_enc, src_mask, _ = model.preprocess(src, 'src')
    e_outputs = model.transformer.encoder(src_enc, 
                                          src_mask,
                                          )
    
    # Prepare tensor for answers
    outputs = torch.zeros(max_len).type_as(src.data)
    # Set the first token as '<sos>'
    outputs[0] = torch.LongTensor([BOS_TOKEN])
    vals = []
    for i in range(1, max_len):
        outputs_enc, tgt_mask, _ = model.preprocess(outputs[:i].unsqueeze(1), 'tgt')
#         memory_mask = model._generate_square_subsequent_mask(len(src), i+1).to(src.device)
        d_out = model.transformer.decoder(outputs_enc, e_outputs,
                                          tgt_mask=tgt_mask,
#                                           memory_mask=memory_mask,
                                          )
        out = model.out(d_out)
        out = F.softmax(out, dim=-1)
        val, ix = out.data.topk(3, dim=-1)
        outputs[i] = ix[-1][0][0]
        if outputs[i] == EOS_TOKEN:
            break
    result = tokens_to_str(outputs[:i+1])
    if verbose:
        print('Output weights:')
        for j in range(min(3, i)):
            print(f'  {j}', {tokenizer.id_to_subword(k):v.item()
                             for k, v in zip(ix[j][0], val[j][0])})
        print('translation:', result)
    return result

In [56]:
translate(model, 'Машинное обучение это здорово!', verbose=True)

------------ Translation ------------
Input: Машинное обучение это здорово!
Output weights:
  0 {'▁the': 0.10451338440179825, '▁this': 0.08264591544866562, '▁it': 0.06017918884754181}
  1 {'▁way': 0.013550149276852608, '▁best': 0.00585717149078846, '▁d': 0.005779571365565062}
  2 {'▁to': 0.10261761397123337, '▁of': 0.08953908085823059, '▁it': 0.07421953231096268}
translation: <BOS> the way to make it easy to understand the way of the brain is to understand the way of the universe!<EOS>


'<BOS> the way to make it easy to understand the way of the brain is to understand the way of the universe!<EOS>'

In [89]:
index = np.random.randint(0, 50)
translate(model, train_data.raw_src[index], verbose=True)
print('--------------')
print('real:', train_data.raw_tgt[index])

------------ Translation ------------
Input: При участии астронавтки Хайди Стефанишин-Пайпер и всемирно известного писателя Пауло Коэльо состоялось награждение победителей стипендиальной программы Фонда Виктора Пинчука "Завтра.

Output weights:
  0 {'▁the': 0.16819918155670166, '▁a': 0.019818376749753952, '▁8.': 0.009165264666080475}
  1 {'▁main': 0.007805072236806154, '▁author': 0.0066763185895979404, '▁second': 0.006010976620018482}
  2 {'▁idea': 0.004995526280254126, '▁award': 0.00440758652985096, '▁material': 0.004048016853630543}
translation: <BOS> the main idea of the kel's office is to be the main part of the kitten.<EOS>
--------------
real: The astronaut Heidi Stefanyshyn-Piper and the world famous writer Paulo Coelho attended the ceremony of rewarding winners of the scholarship programme of Victor Pinchuk Fund ZAVTRA.UA



In [82]:
def translate_beam(model, text, max_len=10, beam_capacity=3, verbose=False):
    """
    Algorithm: https://www.youtube.com/watch?v=RLWuzLLSIgw
    """
    model.eval()
    if verbose:
        print('------------ Translation ------------')
        print('Input:', text)
    # Prepare text
    src = tokenizer.encode(text, output_type=yttm.OutputType.ID,
                           bos=True, eos=True)
    src = Tensor(src).long().to(device)
    # Run encoder
    src_enc, src_mask, _ = model.preprocess(src, 'src')
    e_outputs = model.transformer.encoder(src_enc, 
#                                           src_mask,
                                          )

    # Prepare tensor for answers
    basic_vec = torch.zeros(max_len).type_as(src.data)
    basic_vec[0] = torch.LongTensor([BOS_TOKEN])

    beam_pool = [(basic_vec, 1.0)]

    def beam_filter(pool, top_k=beam_capacity):
        return sorted(pool, key=lambda x: x[1], reverse=True)[:top_k]

    for i in range(1, max_len):
        if verbose:
            print("Beam epoch: ", i)
        new_pool = []
        # For each candidate path:
        for beam, old_prob in beam_pool:
            outputs_enc, tgt_mask, _ = model.preprocess(beam[:i].unsqueeze(1), 'tgt')
            d_out = model.transformer.decoder(outputs_enc, e_outputs,
                                              tgt_mask=tgt_mask,
                                              )
            out = model.out(d_out).cpu().detach()
            out = F.softmax(out, dim=-1)
            probs, ixs = out[-1, :].topk(beam_capacity)
            for prob, token_id in zip(probs.squeeze(), ixs.squeeze()):
                tmp_beam = beam.clone()
                tmp_beam[i] = token_id.item()
                new_pool.append((tmp_beam, prob * old_prob))
        beam_pool = beam_filter(new_pool)
        if verbose:
            for beam, old_prob in beam_pool:
                print("Candidate '{}' with prob: {:.7f}".format(
                    tokens_to_str(beam[1:i + 1]), prob * old_prob
                ))
        # Stop if EOS_TOKEN
        if beam_pool[0][0][i] == EOS_TOKEN:
            break
    the_best = beam_filter(beam_pool, 1)[0][0]
    # Cut by EOS_TOKEN
#     if EOS_TOKEN in the_best:
#         i = (the_best == EOS_TOKEN).nonzero()[0]
    result = tokens_to_str(the_best[:i+1])
    return result

In [90]:
translate_beam(model, train_data.raw_src[index], verbose=False, beam_capacity=4, max_len=50)

'<BOS> the author of the kings of the russian federation of the ussr, and the general assembly of the ussr.<EOS>'

### Test BLEU score

In [93]:
from nltk.translate.bleu_score import corpus_bleu
references = []
candidates = []
pbar = tqdm(total=len(test_data), desc='Test BLEU score')
# TODO: batch translation.
for raw_src, raw_tgt in tqdm(zip(test_data.raw_src[:30], test_data.raw_tgt), total=len(test_data)):
    references.append([raw_tgt])
    candidate = translate_beam(model, raw_src, beam_capacity=4, max_len=50)
    candidate = candidate.replace('<BOS>', '').replace('<EOS>', '')
    candidates.append(candidate)
    pbar.update(1)
score = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0))
print(f'Test BLEU score - {score:.4f}')

HBox(children=(FloatProgress(value=0.0, description='Test BLEU score', max=16736.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, max=16736.0), HTML(value='')))


Test BLEU score - 0.2212


In [92]:
score = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0))
print(f'Test BLEU score - {score:.4f}')

Test BLEU score - 0.2564


In [34]:
with open('data/truth.txt', encoding='utf8') as in_file:
    sentences = in_file.readlines()
    sentences = list(map(lambda s: s.lower().strip(), sentences))
#     encode = lambda s: tokenizer.encode(s, output_type=yttm.OutputType.ID, bos=True, eos=True)
#     truth_data = [encode(s) for s in sentences]

In [35]:
sentences[0]

'исполнительный совет'

In [39]:
pred = []
for sentence in tqdm(sentences):
    pred.append(translate(model, sentence, max_len=150))

HBox(children=(FloatProgress(value=0.0, max=7768.0), HTML(value='')))

KeyboardInterrupt: 

In [40]:
with open('answer.txt', 'w') as out_file:
    out_file.writelines('\n'.join(pred))