# Assignment 7

Train a Transformer model for Machine Translation from Russian to English.  
Dataset: http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz   
Make all source and target text to lower case.  
Use following tokenization for english:  
```
import sentencepiece as spm

...
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

...
TGT.build_vocab(..., min_freq=5)
...

```
Score: corpus-bleu `nltk.translate.bleu_score.corpus_bleu`  
Use last 1000 sentences for model evalutation (test dataset).  
Use your target sequence tokenization for BLEU score.  
Use max_len=50 for sequence prediction.  


Hint: You may consider much smaller model, than shown in the example.  

Baselines:  
[4 point] BLEU = 0.05  
[6 point] BLEU = 0.10  
[9 point] BLEU = 0.15  

[1 point] Share weights between target embeddings and output dense layer. Notice, they have the same shape.


Readings:
1. BLUE score how to https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
1. Transformer code and comments http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [1]:
!wget http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz
!tar -xzvf training-parallel-nc-v13.tgz
!mv training-parallel-nc-v13 data

--2020-03-02 09:01:33--  http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz
Resolving data.statmt.org (data.statmt.org)... 129.215.197.184
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113157482 (108M) [application/x-gzip]
Saving to: ‘training-parallel-nc-v13.tgz’


2020-03-02 09:01:46 (9.13 MB/s) - ‘training-parallel-nc-v13.tgz’ saved [113157482/113157482]

training-parallel-nc-v13/
training-parallel-nc-v13/news-commentary-v13.ru-en.ru
training-parallel-nc-v13/news-commentary-v13.cs-en.en
training-parallel-nc-v13/news-commentary-v13.de-en.de
training-parallel-nc-v13/news-commentary-v13.ru-en.en
training-parallel-nc-v13/news-commentary-v13.zh-en.zh
training-parallel-nc-v13/news-commentary-v13.zh-en.en
training-parallel-nc-v13/news-commentary-v13.cs-en.cs
training-parallel-nc-v13/news-commentary-v13.de-en.en


In [2]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |▎                               | 10kB 19.0MB/s eta 0:00:01[K     |▋                               | 20kB 1.8MB/s eta 0:00:01[K     |█                               | 30kB 2.6MB/s eta 0:00:01[K     |█▎                              | 40kB 3.4MB/s eta 0:00:01[K     |█▋                              | 51kB 2.2MB/s eta 0:00:01[K     |██                              | 61kB 2.6MB/s eta 0:00:01[K     |██▏                             | 71kB 2.9MB/s eta 0:00:01[K     |██▌                             | 81kB 3.3MB/s eta 0:00:01[K     |██▉                             | 92kB 3.7MB/s eta 0:00:01[K     |███▏                            | 102kB 2.8MB/s eta 0:00:01[K     |███▌                            | 112kB 2.8MB/s eta 0:00:01[K     |███▉                     

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import tqdm
from torchtext import datasets, data
import sentencepiece as spm


DEVICE = 'cuda'

In [4]:
!ls data

news-commentary-v13.cs-en.cs  news-commentary-v13.ru-en.en
news-commentary-v13.cs-en.en  news-commentary-v13.ru-en.ru
news-commentary-v13.de-en.de  news-commentary-v13.zh-en.en
news-commentary-v13.de-en.en  news-commentary-v13.zh-en.zh


In [5]:
# tokenize english 
with open('data/news-commentary-v13.ru-en.en') as f:
    with open('data/text.en', 'w') as out:
            out.write(f.read().lower())
        
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

True

In [6]:
# tokenize russian

with open('data/news-commentary-v13.ru-en.ru') as f:
    with open('data/text.ru', 'w') as out:
            out.write(f.read().lower())
        
spm.SentencePieceTrainer.Train('--input=data/text.ru --model_prefix=bpe_ru --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

True

In [0]:
tok_ru = spm.SentencePieceProcessor()
tok_ru.load('bpe_ru.model')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

SRC = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_ru.encode_as_pieces(x),
    batch_first=True,
)

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

fields = (('src', SRC), ('tgt', TGT))

In [8]:
with open('data/text.ru') as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open('data/text.en') as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm.tqdm_notebook(zip(src_snt, tgt_snt))]
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [9]:
print('src: ' + " ".join(train.examples[100].src))
print('tgt: ' + " ".join(train.examples[100].tgt))

src: ▁скорее , ▁она ▁по ходит ▁на ▁корей скую ▁маргарет ▁т э тчер ▁ – ▁леди , ▁не ▁склон ную ▁к ▁разво ро там , ▁следуя ▁известной ▁ ф раз е ▁т э тчер , ▁на ▁человека ▁с ▁чет кими , ▁продуман ными ▁политическими ▁принципами , ▁которые ▁ожи вля ют ▁ее ▁поступки .
tgt: ▁indeed , ▁she ▁looks ▁more ▁like ▁a ▁korean ▁margaret ▁thatcher ▁ – ▁a ▁lady ▁not ▁for ▁turning , ▁in ▁thatcher ’ s ▁famous ▁phrase , ▁and ▁with ▁clearly ▁thought - through ▁political ▁principles ▁animating ▁her ▁actions .


In [10]:
len(train), len(valid), len(test)

(210743, 23416, 1000)

In [0]:
TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

In [14]:
!wget https://raw.githubusercontent.com/thedenaas/hse_seminars/master/2019/seminar_10/transformer.py

--2020-03-02 09:04:45--  https://raw.githubusercontent.com/thedenaas/hse_seminars/master/2019/seminar_10/transformer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9478 (9.3K) [text/plain]
Saving to: ‘transformer.py.1’


2020-03-02 09:04:45 (104 MB/s) - ‘transformer.py.1’ saved [9478/9478]



In [0]:
from transformer import make_model, Batch

    
class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
#         super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.tgt, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)
    
class MyCriterion(nn.Module):
    def __init__(self, pad_idx):
        super(MyCriterion, self).__init__()
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        
    def forward(self, x, target):
        x = x.contiguous().permute(0,2,1)
        ntokens = (target != self.pad_idx).data.sum()
        
        return self.criterion(x, target) / ntokens

In [0]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [0]:
torch.cuda.empty_cache()

batch_size = 128
num_epochs = 4

train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                              batch_sizes=(batch_size, batch_size, batch_size), 
                                  sort_key=lambda x: len(x.src),
                                  shuffle=True,
                                  device=DEVICE,
                                  sort_within_batch=False)
                                  
train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
model = model.to(DEVICE)
criterion = MyCriterion(TGT.vocab.stoi['<pad>'])
#criterion = criterion.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
#scheduler = <TODO>
#NoamOpt(model.src_embed[0].d_model, 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

# share weights
#<TODO>

# Train


In [20]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm.tqdm_notebook(data_iter)
    counter = 0
    for batch in data_iter:
        optimizer.zero_grad()
        epoch_losses = []
        
        pred = model.forward(batch)
        loss = criterion.forward(pred, batch.tgt_y)
        loss.backward()
        optimizer.step()
        
        curr_loss = loss.data.detach().item()
        total_loss += curr_loss
        epoch_losses.append(curr_loss)
        general_loss = np.mean(epoch_losses)
        data_iter.set_postfix(loss = general_loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm.tqdm_notebook(data_iter)
    counter = 0
    for batch in data_iter:
        epoch_losses = []
        
        pred = model.forward(batch)
        loss = criterion.forward(pred, batch.tgt_y)
        
        curr_loss = loss.data.detach().item()
        total_loss += curr_loss
        epoch_losses.append(curr_loss)
        general_loss = np.mean(epoch_losses)
        data_iter.set_postfix(loss = general_loss)
        counter +=1
        
    total_loss /= counter
    return total_loss


for epoch in range(num_epochs-1):
    model.train()
    loss = train_epoch(train_iter, model, criterion)
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, model, criterion)
        #scheduler.step(loss)
        print('valid', loss)

HBox(children=(IntProgress(value=0, max=1647), HTML(value='')))

KeyboardInterrupt: ignored

In [13]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm.tqdm_notebook(data_iter)
    counter = 0
    for batch in data_iter:
        optimizer.zero_grad()
        epoch_losses = []
        
        pred = model.forward(batch)
        loss = criterion.forward(pred, batch.tgt_y)
        loss.backward()
        optimizer.step()
        
        curr_loss = loss.data.detach().item()
        total_loss += curr_loss
        epoch_losses.append(curr_loss)
        general_loss = np.mean(epoch_losses)
        data_iter.set_postfix(loss = general_loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm.tqdm_notebook(data_iter)
    counter = 0
    for batch in data_iter:
        epoch_losses = []
        
        pred = model.forward(batch)
        loss = criterion.forward(pred, batch.tgt_y)
        
        curr_loss = loss.data.detach().item()
        total_loss += curr_loss
        epoch_losses.append(curr_loss)
        general_loss = np.mean(epoch_losses)
        data_iter.set_postfix(loss = general_loss)
        counter +=1
        
    total_loss /= counter
    return total_loss


for epoch in range(num_epochs-1):
    model.train()
    loss = train_epoch(train_iter, model, criterion)
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, model, criterion)
        #scheduler.step(loss)
        print('valid', loss)

NameError: ignored

In [0]:
print('valid', loss)

valid tensor(6.1940, device='cuda:0')


In [0]:
torch.save(model.state_dict(), 'iwslt.pt')

In [0]:
!ls -lAh iwslt.pt

-rw-r--r-- 1 root root 363M Feb 29 15:14 iwslt.pt


In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
2+1

3

In [0]:
!cp iwslt.pt /content/drive/My\ Drive

In [0]:
!cp /content/drive/My\ Drive/iwslt.pt .

In [0]:
def beam_search(model, src, src_mask, max_len=10, k=5):
    <TODO>

In [0]:
model.eval()
with torch.no_grad():
    for i, batch in enumerate(valid_iter):
        src = batch.src[:1]
        src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
        beam = beam_search(model, src, src_key_padding_mask)
        
        seq = []
        for i in range(1, src.size(1)):
            sym = SRC.vocab.itos[src[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_ru.decode_pieces(seq)
        print("\nSource:", seq)
        
        print("Translation:")
        for pred, pred_proba in beam:                
            seq = []
            for i in range(1, pred.size(1)):
                sym = TGT.vocab.itos[pred[0, i]]
                if sym == "</s>": break
                seq.append(sym)
            seq = tok_en.decode_pieces(seq)
            print(f"pred {pred_proba:.2f}:", seq)
                
        seq = []
        for i in range(1, batch.tgt.size(1)):
            sym = TGT.vocab.itos[batch.tgt[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_en.decode_pieces(seq)
        print("Target:", seq)
        break


Source: рост
Translation:
pred -1.31: growth
pred -2.03: growth growth
pred -3.63: rising growth
pred -3.89: growth in growth
pred -4.38: growth growth growth
Target: inflation



Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



In [0]:
from nltk.translate.bleu_score import corpus_bleu

In [0]:
hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in test_iter:
        <TODO>

In [0]:
corpus_bleu(references, hypotheses, 
            smoothing_function=translate.bleu_score.SmoothingFunction().method3,
            auto_reweigh=True
           )

0.22829332685417014

# Eval

In [16]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!cp /content/drive/My\ Drive/iwslt.pt .

In [49]:
model_loaded = make_model(32709, 28276, N=6)
model_loaded.load_state_dict(torch.load('iwslt.pt', map_location='cpu'))
model_loaded = model_loaded.to(DEVICE)
model_loaded.eval()

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (norm): LayerNorm()

In [0]:
model = model_loaded

In [0]:
model = model.to(DEVICE)

In [0]:
hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in test_iter:
        pred = model.forward(batch)
        sents = torch.argmax(torch.softmax(pred, dim=-1), dim=-1)
        hypotheses.extend([[TGT.vocab.itos[ix] for ix in sent] for sent in sents])
        references.extend([[[TGT.vocab.itos[ix] for ix in sent]] for sent in batch.tgt_y])

In [0]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [27]:
corpus_bleu(
    references, hypotheses, smoothing_function=SmoothingFunction().method3,
    auto_reweigh=True
)

0.0015378640393985269

In [0]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [0]:
#from transformer import subsequent_mask

def beam_search(model, src, src_mask, max_len=10, k=5):
    memory = model.encode(src, src_mask)
    start_token = TGT.vocab.stoi["<s>"]
    end_token = TGT.vocab.stoi["</s>"]
    ys = torch.ones(1, 1).fill_(start_token).type_as(src.data)
    beam = [(ys, 0)]
    for i in range(max_len):
        candidates= []
        candidates_proba = []
        prev_prob = None
        for snt, snt_proba in beam:
            if snt[0][-1] == end_token:
                candidates.append(snt)
                candidates_proba.append(snt_proba)
            else:
                proba = model.decode(memory, src_mask, snt,
                                     subsequent_mask(snt.size(1)).type_as(src.data))
                proba = proba[0][i]
                best_k = torch.argsort(-proba)[:k].tolist()
                proba = proba.tolist()
                prev_prob = proba
                for tok in best_k:
                    candidates.append(torch.cat([snt, torch.ones(1, 1).type_as(src.data).fill_(tok)], dim=1))
                    candidates_proba.append(snt_proba + np.log(proba[tok])) 
         
        best_candidates = np.argsort(-np.array(candidates_proba))[:k]
        beam = [(candidates[j], candidates_proba[j]) for j in best_candidates]
    return beam

In [0]:
eos = '</s>'
pad = "<pad>"
len_test = len(list(iter(test_iter)))

In [45]:
type(src)

torch.Tensor

In [48]:
model

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (norm): LayerNorm()

In [57]:
model.eval()
with torch.no_grad():
    for i, batch in enumerate(valid_iter):
        src = batch.src[:1]
        src_key_padding_mask = src != TGT.vocab.stoi['<pad>']
        beam = beam_search(model, src, src_key_padding_mask, max_len=8, k=5)
        
        seq = []
        for i in range(1, src.size(1)):
            sym = SRC.vocab.itos[src[0, i]]
            if sym == eos: break
            seq.append(sym)
        seq = tok_ru.decode_pieces(seq)
        print("\nSource:", seq)
        
        print("Translation:")
        for pred, pred_proba in beam:                
            seq = []
            for i in range(1, pred.size(1)):
                sym = TGT.vocab.itos[pred[0, i]]
                if sym == eos: break
                seq.append(sym)
            seq = tok_en.decode_pieces(seq)
            print(f"pred {pred_proba:.2f}:", seq)
                
        seq = []
        for i in range(1, batch.tgt.size(1)):
            sym = TGT.vocab.itos[batch.tgt[0, i]]
            if sym == eos: break
            seq.append(sym)
        seq = tok_en.decode_pieces(seq)
        print("Target:", seq)
        break

RuntimeError: ignored

In [0]:
'''
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
for epoch in range(10):
    model_par.train()
    run_epoch((rebatch(pad_idx, b) for b in train_iter), 
              model_par, 
              MultiGPULossCompute(model.generator, criterion, 
                                  devices=devices, opt=model_opt))
    model_par.eval()
    loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), 
                      model_par, 
                      MultiGPULossCompute(model.generator, criterion, 
                      devices=devices, opt=None))
    print(loss)
'''