### TensorBoard in Paperspace

Docs: https://docs.paperspace.com/gradient/notebooks/tensorboard

Run in console:

```
tensorboard --logdir . --bind_all
```

Open the URL printed by the following cell:

In [None]:
# Identifier of the notebook, interpolated into the TensorBoard host name.
NOTEBOOK = "nolw4352wz"

# Build the address first, then show it so it can be opened from the output.
tensorboard_url = f"https://tensorboard-{NOTEBOOK}.clg07azjl.paperspacegradient.com"
print(tensorboard_url)

### Training the model

In [None]:
import src.lightning as l

import torch
import torch.utils.data as data
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, Callback

import os
import pickle

In [None]:
from torchtext.datasets import IWSLT2017

# On-disk cache for the vocabulary, so it is only trained once.
vocab_path = 'huggingface_vocab.bin'

if not os.path.exists(vocab_path):
    # No cache yet: create the default vocabulary object, train it on the
    # IWSLT2017 en-de training split, and persist it for later runs.
    vocab = l.HuggingfaceTranslationVocab.default_dokenizer()
    vocab.train(IWSLT2017(split='train', language_pair=('en', 'de')))
    vocab.save(vocab_path)
else:
    # Reuse the vocabulary saved by a previous run.
    vocab = l.HuggingfaceTranslationVocab.load(vocab_path)

In [None]:
# On-disk cache holding the numericalized (train, val, test) splits.
indices_path = 'huggingface_iwslt.bin'

if not os.path.exists(indices_path):
    # No cache yet: numericalize each IWSLT2017 en-de split with the
    # vocabulary, in the order train, valid, test, then store all three.
    train, val, test = (
        vocab.numericalize(IWSLT2017(split=split_name, language_pair=('en', 'de')))
        for split_name in ('train', 'valid', 'test')
    )
    with open(indices_path, "wb") as cache_file:
        pickle.dump((train, val, test), cache_file)
else:
    # The cache is a file this notebook wrote itself; pickle must not be
    # pointed at files from untrusted sources.
    with open(indices_path, "rb") as cache_file:
        train, val, test = pickle.load(cache_file)

# Wrap the raw splits for training; the meaning of 4000 is defined by
# l.wrap_data (presumably a batching budget -- TODO confirm).
train, val, test = l.wrap_data(4000, train, val, test)

In [None]:
# Directory used as the trainer's default_root_dir.
checkpoint_path = 'checkpoints/iwslt2017'
# Checkpoint file to resume from. The trailing '...' looks like a placeholder
# to be replaced with a concrete checkpoint file name -- TODO confirm.
ckpt_path = 'checkpoints/iwslt2017/...'

if os.path.exists(ckpt_path):
    # Restore the model from the checkpoint. The vocabulary size comes from
    # the shared `vocab` object, exactly as in the fresh-model branch below;
    # the previous `len(en)` referenced a name that does not exist at notebook
    # scope and raised NameError as soon as a checkpoint was present.
    # NOTE(review): confirm `src_vocab` is still the constructor's keyword.
    model = l.LightningSeq2Seq.load_from_checkpoint(ckpt_path, src_vocab=len(vocab))
else:
    # No checkpoint: build a new model sized to the vocabulary.
    model = l.LightningSeq2Seq(len(vocab), warmup=4000, N=6, factor=1)

In [None]:
# Keep the single best checkpoint (lowest val_loss) plus the most recent one.
# save_weights_only=False, so full checkpoints are written, not weights alone.
checkpoint_callback = ModelCheckpoint(
    save_weights_only=False,
    save_top_k=1,
    save_last=True,
    mode="min",
    monitor="val_loss",
    every_n_epochs=1,
)

# Record the learning rate with a "step" logging interval.
lr_monitor = LearningRateMonitor("step")

trainer = pl.Trainer(
    default_root_dir=checkpoint_path,
    gpus=1,
    max_epochs=50,
    callbacks=[checkpoint_callback, lr_monitor, l.LogDistributions()],
    fast_dev_run=False,
    log_every_n_steps=25,
    # Gradient accumulation schedule keyed by epoch: 1 batch from epoch 0,
    # 5 from epoch 2, 10 from epoch 8.
    accumulate_grad_batches={0: 1, 2: 5, 8: 10},
)

In [None]:
# Resume from the checkpoint file only when it is actually present on disk;
# otherwise pass None so that no checkpoint is loaded.
resume_from = ckpt_path if os.path.exists(ckpt_path) else None
trainer.fit(model, train, val, ckpt_path=resume_from)

In [None]:
import torch.nn as nn

def greedy_decode(seq2seq, src, start_symbol=l.BOS_IDX, max_len=50, padding_idx=0, repeat_allowed=False):
    """Greedily decode one target sequence for a single source sequence.

    The source is encoded once; the decoder is then run repeatedly, each time
    appending the highest-scoring next token, until ``l.EOS_IDX`` is produced
    or the output reaches ``max_len`` tokens.

    Args:
        seq2seq: Wrapper exposing ``.model`` (with ``transformer``,
            ``src_embed``, ``trg_embed`` and ``generator``) and ``.device``.
        src: Integer tensor of source token ids. Only batch entry 0 is
            decoded; assumed to have shape (1, src_len) -- TODO confirm.
        start_symbol: Token id used as the first output token.
        max_len: Maximum number of tokens in the output, including
            ``start_symbol``.
        padding_idx: Padding id passed to ``l.paddingmask.padding_mask`` to
            build the encoder's key padding mask.
        repeat_allowed: When False, the token emitted in the previous step is
            excluded from the current step.

    Returns:
        Integer tensor of shape (1, T) with ``T <= max_len``, starting with
        ``start_symbol``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model = seq2seq.model
    model.eval()
    device = seq2seq.device

    # Pure inference: without no_grad every decoding step would record an
    # autograd graph that is never used, costing memory and time.
    with torch.no_grad():
        memory = model.transformer.encoder(
            model.src_embed(src),
            src_key_padding_mask=l.paddingmask.padding_mask(src, padding_idx),
        )

        ys = torch.full((1, 1), start_symbol, dtype=src.dtype, device=device)
        for _ in range(max_len - 1):
            # Causal mask sized to the tokens generated so far.
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(ys.size(-1)).to(device)
            out = model.transformer.decoder(model.trg_embed(ys), memory,
                                            tgt_mask=tgt_mask,
                                            tgt_key_padding_mask=None)
            # Scores for the last position only.
            prob = model.generator(out[:, -1])
            if not repeat_allowed:
                # Do not return the same word as the last word in the input.
                prob[0, ys[0][-1]] = -100

            next_word = torch.argmax(prob, dim=1)[0].item()
            # Create the new token with ys's dtype and device so the
            # concatenation cannot fail on a device mismatch.
            next_token = torch.full((1, 1), next_word, dtype=ys.dtype, device=ys.device)
            ys = torch.cat([ys, next_token], dim=1)
            if next_word == l.EOS_IDX:
                break
    return ys

def greedy_translate(seq2seq, text, vocabs, tokenizers):
    """Translate ``text`` with greedy decoding and return the output tokens.

    Args:
        seq2seq: Model wrapper passed on to ``greedy_decode``; its ``device``
            is where the source tensor is placed.
        text: Source sentence as a string.
        vocabs: Pair ``(source_vocab, target_vocab)``. The source vocab is
            called on the token list to get ids; the target vocab provides
            ``lookup_tokens``.
        tokenizers: Pair of tokenizers; only the first (source) one is used.

    Returns:
        The decoded target tokens joined with single spaces.
    """
    source_tokenizer, _ = tokenizers
    source_vocab, target_vocab = vocabs

    # Frame the source ids with BOS/EOS and add a leading batch dimension.
    source_ids = source_vocab(source_tokenizer(text))
    framed_ids = [l.BOS_IDX] + source_ids + [l.EOS_IDX]
    src = torch.LongTensor(framed_ids).unsqueeze(0).to(seq2seq.device)

    decoded = greedy_decode(seq2seq, src)
    decoded_tokens = target_vocab.lookup_tokens(decoded[0].tolist())
    return ' '.join(decoded_tokens)