In [0]:
%matplotlib inline

import time
import tqdm
import os

import torch
from torch import nn, Tensor
import torch.nn.functional as F

import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from torchtext import datasets
from torchtext.data import Field, BucketIterator, TabularDataset

# Choose dataset
# path = '/content/paracrawl-release1.en-ru.zipporah0-dedup-clean'
path = 'corpus.en_ru.1m'

def load_files(path):
    res = ([], [])
    for i, ext in enumerate(['.en', '.ru']):
        with open(path + ext) as in_file:
            res[i].extend(in_file.readlines())
    return res

In [2]:
# Connect to drive for yandex dataset downloading.
from google.colab import drive
drive.mount('/content/drive')

path_to_yandex_dataset = '/content/drive/My\ Drive/To\ delete/1mcorpus.zip'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preparation
Update/Install packages and download datasets

In [10]:
# Update torchtext
!pip install torchtext -U
# Install YouTokenToMe for tokenization
!pip install youtokentome

Requirement already up-to-date: torchtext in /usr/local/lib/python3.6/dist-packages (0.5.0)
Collecting youtokentome
[?25l  Downloading https://files.pythonhosted.org/packages/a3/65/4a86cf99da3f680497ae132329025b291e2fda22327e8da6a9476e51acb1/youtokentome-1.0.6-cp36-cp36m-manylinux2010_x86_64.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 7.2MB/s 
Installing collected packages: youtokentome
Successfully installed youtokentome-1.0.6


In [4]:
# Load Datasets

# ParaCrawl dataset
!curl https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz --output paracrawl.tgz
!tar -xvzf paracrawl.tgz
# Yandex dataset
os.system(f'cp {path_to_yandex_dataset} yandex.zip')
!unzip yandex.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  637M  100  637M    0     0  33.9M      0  0:00:18  0:00:18 --:--:-- 36.5M
paracrawl-release1.en-ru.zipporah0-dedup-clean.en
paracrawl-release1.en-ru.zipporah0-dedup-clean.ru
Archive:  yandex.zip
  inflating: corpus.en_ru.1m.en      
  inflating: corpus.en_ru.1m.ru      


In [5]:
data_en, data_ru = load_files(path)

raw_data = {'English' : [line for line in data_en], 'Russian': [line for line in data_ru]}

df = pd.DataFrame(raw_data, columns=list(raw_data.keys()))
df.shape

(1000000, 2)

In [6]:
df['en_len'] = df['English'].str.count(' ')
df['ru_len'] = df['Russian'].str.count(' ')
df = df.query('ru_len < 80 & en_len < 80')
df = df.query('ru_len < en_len * 1.5 & ru_len * 1.5 > en_len')
df.shape

(789101, 4)

In [7]:
df = df[:50000]
df.shape

(50000, 4)

In [0]:
# Create train, test, val sets.
train, test = train_test_split(df, test_size=0.2)
test, val = train_test_split(test, test_size=0.5)
train.to_csv('yandex_train.csv', index=False)
test.to_csv('yandex_test.csv', index=False)
val.to_csv('yandex_val.csv', index=False)

## Load data

In [0]:
import youtokentome as yttm
vocab_size = 30000

model_path = f'{path[:6]}_v{vocab_size}.tokenizer'
if os.path.exists(model_path):
    tokenizer = yttm.BPE(model=model_path)
else:
    data_en, data_ru = load_files(path)
    temp_file_path = 'tokenizer_text.temp'
    with open(temp_file_path, 'w') as out_file:
        out_file.write('\n'.join(map(str.lower, data_en)))
        out_file.write('\n'.join(map(str.lower, data_ru)))
    tokenizer = yttm.BPE.train(data=temp_file_path, vocab_size=vocab_size, model=model_path)
    # TODO: Delete temp file

In [3]:
def return_tokenizer_encoder(tokenizer, output_type='id'):
    output_types = {'id':yttm.OutputType.ID, 'subword':yttm.OutputType.SUBWORD}
    return lambda x: tokenizer.encode(x, output_type=output_types[output_type], bos=True, eos=True)

SRC = Field(tokenize=return_tokenizer_encoder(tokenizer, 'subword'))
TGT = Field(tokenize=return_tokenizer_encoder(tokenizer, 'subword'))

data_fields = [('tgt', TGT), ('src', SRC)]
(train_data,
 val_data,
 test_data) = TabularDataset.splits(path='./',
                                    train='yandex_train.csv',
                                    test='yandex_test.csv',
                                    validation='yandex_val.csv',
                                    format='csv', fields=data_fields,
                                    skip_header = True)

print(len(train_data), len(val_data), len(test_data))

40000 5000 5000


In [4]:
print(f"src: {vars(train_data.examples[0])['src']}")
print(f"tgt: {vars(train_data.examples[0])['tgt']}")

src: ['<BOS>', '▁-', '▁', 'Я', '▁си', 'роди', 'лец', '▁из', '?', '<EOS>']
tgt: ['<BOS>', '▁"', 'I', "'m", '▁a', '▁', 'C', 'y', 'ro', 'di', 'il', '▁from', '?', '<EOS>']


In [5]:
# TODO: Move to usage of `tokenizer` vocabulary
SRC.build_vocab(train_data)
TGT.build_vocab(train_data)
print(len(SRC.vocab.stoi), len(TGT.vocab.stoi))

22770 15967


## Dataloaders

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32

# FIX: iterator batches sentences of different lengths
train_iterator, val_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data),
    sort_key=lambda x: (len(x.src), len(x.tgt)),
    batch_size = BATCH_SIZE,
    device = device)

data_iterators = {
    'train': train_iterator,
    'val': val_iterator,
    'test': test_iterator,
}

In [14]:
# Test iterator
for _, batch in enumerate(train_iterator):
    src = batch.src
    tgt = batch.tgt
    print('src shape:',src.shape)
    print('tgt shape:', tgt.shape)
    if (src > 4).any():
        for s, t in zip(src.t()[:2], tgt.t()[:2]):
            print([SRC.vocab.itos[token] for token in s])
            print([TGT.vocab.itos[token] for token in t])
    else:
        print('Something is wrong')
    break

src shape: torch.Size([78, 32])
tgt shape: torch.Size([67, 32])
['<BOS>', '▁', 'В', '▁следующем', '▁бло', 'ке', '▁мы', '▁указыва', 'ем', '▁как', '▁мы', '▁пере', 'писы', 'вались', '▁с', '▁бан', 'ком.', '<EOS>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<BOS>', '▁', 'I', 'n', '▁the', '▁next', '▁block', '▁we', '▁specify', '▁how', '▁we', '▁correspond', 'ed', '▁with', '▁the', '▁bank.', '<EOS>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

Defining our ``nn.Module`` and ``Optimizer``
----------------


In [0]:
# From https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntokens_src, ntokens_tgt, ninp, nhead, dim_feedforward, nlayers, pad_token, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import Transformer
        self.model_type = 'Transformer'
        self.ninp = ninp
        self.pad_token = pad_token
        self.src_mask = None
        # Token Encoders
        self.src_encoder = nn.Embedding(ntokens_src, ninp)
        self.tgt_encoder = nn.Embedding(ntokens_tgt, ninp)
        # Positional Encoding
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # Transformer
        self.transformer = Transformer(
            d_model=ninp,
            nhead=nhead,
            num_encoder_layers=nlayers,
            num_decoder_layers=nlayers,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
        )
        self.out = nn.Linear(ninp, ntokens_tgt)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Generate matrix for seqential reveal of tokens."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        self.transformer._reset_parameters()

    def forward(self, src, tgt):

        src_key_padding_mask = (src == self.pad_token).bool().t()
        tgt_key_padding_mask = (tgt == self.pad_token).bool().t()
        memory_key_padding_mask = src_key_padding_mask.clone().detach()

        if self.src_mask is None or self.src_mask.size(0) != len(src):
            self.src_mask = self._generate_square_subsequent_mask(len(src)).to(src.device)

        src_enc = self.src_encoder(src) * math.sqrt(self.ninp)
        src_enc = self.pos_encoder(src_enc)
        tgt_enc = self.tgt_encoder(tgt) * math.sqrt(self.ninp)
        tgt_enc = self.pos_encoder(tgt_enc)
        output = self.transformer(src_enc, tgt_enc,
                                  src_mask=self.src_mask,
                                  src_key_padding_mask=src_key_padding_mask,
                                  tgt_key_padding_mask=tgt_key_padding_mask,
                                  memory_key_padding_mask=memory_key_padding_mask,
                                  )
        output = self.out(output)
        del src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask, src_enc, tgt_enc
        return output

In [0]:
ntokens_src = len(SRC.vocab.stoi) # the size of vocabulary
ntokens_tgt = len(TGT.vocab.stoi) # the size of vocabulary
pad_token = SRC.vocab.stoi['<pad>']
emsize = 256 # embedding dimension
nhid = 128 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8 # the number of heads in the multiheadattention models
dropout = 0.5 # the dropout value
model = TransformerModel(ntokens_src, ntokens_tgt, emsize, nhead, nhid, nlayers, pad_token, dropout).to(device)

In [0]:
# Ignore padding index during the loss computation.
PAD_IDX = TGT.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
lr = 1.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5.0, gamma=0.95)

In [0]:
def run_model(model, criterion, optimizer, data_iterator, is_train_phase, desc):
    if is_train_phase:
        model.train() # Turn on the train mode
    else:
        model.eval()
    total_loss = 0.0
    pbar = tqdm.tqdm(total=len(data_iterator), desc=desc, position=0, leave=True)
    for i, batch in enumerate(data_iterator):
        src, tgt = batch.src, batch.tgt
        optimizer.zero_grad()
        with torch.set_grad_enabled(is_train_phase):
            output = model(src, tgt).transpose(1, 2)
            loss = criterion(output, tgt)

            if is_train_phase:
                loss.backward()
                # Clip gradient to deal with gradient explosion
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                optimizer.step()

        total_loss += loss.item()
        pbar.update(1)
        pbar.set_description(desc + f'- loss: {total_loss / (i+1):7.4}')
        # Delete all new vars due to memory leak
        del batch, src, tgt, output, loss
    return model

def train_model(model, n_epochs, data_iterators, criterion, optimizer, scheduler):
    for epoch in range(n_epochs):
        print(f'------------ Epoch {epoch} ------------')
        for phase in ['train', 'val']:
            desc = f'{phase.title()} Epoch #{epoch} '
            run_model(model, criterion, optimizer,
                      data_iterators[phase], phase == 'train',
                      desc)
        scheduler.step()
    return model

In [0]:
n_epochs = 3
model = train_model(model, n_epochs, data_iterators,
                    criterion, optimizer, scheduler)

Train Epoch #0 :   0%|          | 0/1250 [00:00<?, ?it/s]

------------ Epoch 0 ------------


Train Epoch #0 - loss: 0.007051:   2%|▏         | 27/1250 [00:10<07:33,  2.69it/s]

In [0]:
def translate(model, text, max_len=80, custom_string=False):
    model.eval()
    # Prepare text
    src = SRC.preprocess(text)
    print(len(src))
    print([token for token in src])
    src = SRC.process([src], device=device)
    input_pad = SRC.vocab.stoi['<pad>']
    src_mask = (src != input_pad)
    # Run encoder
    src_enc = model.src_encoder(src) * math.sqrt(model.ninp)
    src_enc = model.pos_encoder(src_enc)
    e_outputs = model.transformer.encoder(src_enc, 
                                          src_mask
                                          )
    
    # Prepare tensor for answers
    outputs = torch.zeros(max_len).type_as(src.data)
    # Set the first token as '<sos>'
    outputs[0] = torch.LongTensor([TGT.vocab.stoi['<BOS>']])
    for i in range(1, max_len):
        tgt_mask = model._generate_square_subsequent_mask(i).to(device)
        outputs_enc = outputs[:i].unsqueeze(1)
        outputs_enc = model.tgt_encoder(outputs_enc)
        outputs_enc = model.pos_encoder(outputs_enc)

        d_out = model.transformer.decoder(outputs_enc, e_outputs,
                                          tgt_mask=tgt_mask,
                                          )
        out = model.out(d_out)
        out = F.softmax(out, dim=-1)
        _, ix = out[:, -1].data.topk(1)
        
        outputs[i] = ix[0][0]
        # if ix[0][0] == TGT.vocab.stoi['<EOS>']:
        #     break
    return ' '.join([TGT.vocab.itos[ix] for ix in outputs[1:i]])

In [38]:
translate(model, train_data[1].__dict__['src'])

11
['<BOS>', '▁', 'С', 'мер', 'тная', '▁каз', 'нь', '▁за', '▁измен', 'у.', '<EOS>']


'<BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS>'

In [39]:
translate(model, 'Машинное обучение это здорово!')

12
['<BOS>', '▁', 'М', 'а', 'ши', 'нное', '▁обучение', '▁это', '▁здоро', 'во', '!', '<EOS>']


'<BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS>'