<a
href="https://colab.research.google.com/github/wingated/cs474_labs_f2019/blob/master/DL_Lab7.ipynb"
  target="_parent">
  <img
    src="https://colab.research.google.com/assets/colab-badge.svg"
    alt="Open In Colab"/>
</a>

## Setup


In [1]:
! pip install opustools

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
import opustools
from torchmetrics.text import CHRFScore
seaborn.set_context(context="talk")
%matplotlib inline

device = torch.device('cuda')

# Download english and spanish vocab and general conference texts
!pip install torchtext==0.6.0
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm
# !wget  -O ./spanish "https://raw.githubusercontent.com/nickwalton/translation/master/gc_2010-2017_conglomerated_20171009_es.txt"
# !wget -O ./english "https://raw.githubusercontent.com/nickwalton/translation/master/gc_2010-2017_conglomerated_20171009_en.txt"
!cd french
!opus_express -s en -t fr -c Books -q
!cd ../finnish
!opus_express -s en -t fi -c Books -q
!cd ..

Collecting opustools
  Downloading opustools-1.6.1-py3-none-any.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ruamel.yaml (from opustools)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml->opustools)
  Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.7/526.7 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ruamel.yaml.clib, ruamel.yaml, opustools
Successfully installed opustools-1.6.1 ruamel.yaml-0.18.6 ruamel.yaml.clib-0.2.8
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [

# Model

## Model Helpers


In [2]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: linear -> ReLU -> dropout -> linear.

    Args:
        d_model: size of the input and output feature dimension.
        d_ff: size of the hidden (inner) feature dimension.
        dropout: dropout probability applied to the hidden activations.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to d_ff, apply ReLU and dropout, then project back to d_model."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

class Embeddings(nn.Module):
    """Token embedding lookup scaled by sqrt(d_model).

    Args:
        d_model: embedding dimension.
        vocab: number of entries in the embedding table.
    """
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for the indices in `x` and rescale them."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

class Generator(nn.Module):
    """Output head: linear projection to the vocabulary followed by log-softmax.

    Args:
        d_model: size of the incoming feature dimension.
        vocab: number of output classes (vocabulary size).
    """
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the vocabulary along the last dimension."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learned gain and bias.

    Args:
        features: size of the last dimension being normalised.
        eps: constant added to the standard deviation to avoid division by zero.
    """
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        """Centre and scale `x` along its last axis, then apply gain and bias."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

class SublayerConnection(nn.Module):
    """
    Pre-norm residual wrapper: the input is normalised, passed through the
    sublayer, dropped out, and then added back onto the un-normalised input.

    Args:
        size: feature dimension handed to the layer norm.
        dropout: dropout probability applied to the sublayer output.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return `x + dropout(sublayer(norm(x)))`; `sublayer` is any callable."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

## Encoder

The encoder is composed of a stack of $N=6$ identical layers.

In [3]:
class Encoder(nn.Module):
    """Stack of N copies of an encoder layer followed by a final layer norm.

    Args:
        layer: prototype layer; it is deep-copied N times and must expose `.size`.
        N: number of layers in the stack.
    """
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Run `x` (with `mask`) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)



class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each in a residual wrapper.

    Args:
        size: model feature dimension (exposed as `.size` for the enclosing stack).
        self_attn: attention module called as `self_attn(q, k, v, mask, viz=...)`.
        feed_forward: position-wise feed-forward module.
        dropout: dropout probability for both residual wrappers.
    """
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size
        # When True, the flag is forwarded to the attention module as `viz`.
        self.visualizing = False

    def forward(self, x, mask):
        """Apply self-attention, then the feed-forward network."""
        def attend(normed):
            return self.self_attn(normed, normed, normed, mask, viz=self.visualizing)

        attn_wrapper, ff_wrapper = self.sublayer[0], self.sublayer[1]
        x = attn_wrapper(x, attend)
        return ff_wrapper(x, self.feed_forward)

## Decoder

The decoder is also composed of a stack of $N=6$ identical layers.  


In [4]:
class Decoder(nn.Module):
    """Stack of N copies of a decoder layer followed by a final layer norm.

    Args:
        layer: prototype layer; it is deep-copied N times and must expose `.size`.
        N: number of layers in the stack.
    """
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run `x` through every layer, passing `memory` and both masks to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, attention over `memory`, then feed-forward.

    Args:
        size: model feature dimension (exposed as `.size` for the enclosing stack).
        self_attn: attention module applied to the decoder input with `tgt_mask`.
        src_attn: attention module whose keys/values are `memory`, with `src_mask`.
        feed_forward: position-wise feed-forward module.
        dropout: dropout probability for the three residual wrappers.
    """
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sublayers in order, each inside its residual wrapper."""
        def self_attend(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def memory_attend(normed):
            return self.src_attn(normed, memory, memory, src_mask)

        x = self.sublayer[0](x, self_attend)
        x = self.sublayer[1](x, memory_attend)
        return self.sublayer[2](x, self.feed_forward)

def subsequent_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Position i may attend to positions 0..i; entries above the diagonal are False.
    """
    above_diagonal = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    return torch.from_numpy(above_diagonal) == 0

## Implement Attention


Reference: [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf)
                                                                                                                                                                     

In [5]:
# Collected (head index, raw score array) pairs; filled only when viz=True.
ALL_SCORES = []

def attention(query, key, value, mask, viz=False, h=-1):
    """Scaled dot-product attention for 3-D (batched) inputs.

    Args:
        query, key, value: 3-D tensors; `torch.bmm` requires matching batch sizes.
        mask: tensor broadcastable to the score matrix, or None. Positions where
            `mask == 0` are filled with -1e9 before the softmax.
        viz: when True, the scaled scores (taken *before* masking) are appended
            to the module-level `ALL_SCORES` list as a NumPy array.
        h: label stored next to the recorded scores (the caller's head index).

    Returns:
        softmax(Q K^T / sqrt(d_k)) V
    """
    d_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d_k)

    if viz:
        ALL_SCORES.append((h, scores.cpu().detach().numpy()))

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

# query = torch.empty(33, 26, 256).uniform_(-1, 1)
# key = torch.empty(33, 26, 256).uniform_(-1, 1)
# value = torch.empty(33, 26, 256).uniform_(-1, 1)
# mask = torch.empty(33, 1, 26).uniform_(-1, 1)

# a = attention(query, key, value, mask)
# print(a.shape)

In [6]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from `h` independent (W_Q, W_K, W_V) projection triples.

    Args:
        h: number of attention heads; must divide `d_model`.
        d_model: model feature dimension (input and output size).
        dropout: accepted for interface compatibility; no dropout is applied
            inside this module.
        d_v: per-head size of the value projection. Defaults to `d_model // h`.
    """
    def __init__(self, h, d_model, dropout=0.1, d_v=None):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_model = d_model
        self.d_k = d_model // h
        self.h = h
        self.d_v = self.d_k if d_v is None else d_v

        # Each head gets freshly constructed linear layers, so the heads start
        # from independent initialisations rather than identical deep copies.
        self.heads = nn.ModuleList([
            nn.ModuleList([
                nn.Linear(d_model, self.d_k),  # W_Q
                nn.Linear(d_model, self.d_k),  # W_K
                # The value projection must emit d_v features so that the
                # concatenated heads match linear_out's h * d_v input size.
                nn.Linear(d_model, self.d_v),  # W_V
            ]) for _ in range(h)
        ])
        self.linear_out = nn.Linear(h * self.d_v, d_model)

    def forward(self, query, key, value, mask, viz=False):
        """Project the inputs per head, attend, concatenate the heads, and mix.

        `viz` and the head index are forwarded to `attention` so that it can
        record the raw scores of each head.
        """
        per_head = [
            attention(WQ(query), WK(key), WV(value), mask, viz, head)
            for head, (WQ, WK, WV) in enumerate(self.heads)
        ]
        # Heads are concatenated along the feature axis (dim 2 of a 3-D tensor).
        return self.linear_out(torch.cat(per_head, dim=2))



## Positional Encoding                                                                                                                             
Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence.  To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks.  The positional encodings have the same dimension $d_{\text{model}}$ as the embeddings, so that the two can be summed.   There are many choices of positional encodings, learned and fixed [(cite)](https://arxiv.org/pdf/1705.03122.pdf).

In this work, we use sine and cosine functions of different frequencies:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
$$PE_{(pos,2i)} = \sin(pos / 10000^{2i/d_{\text{model}}})$$

$$PE_{(pos,2i+1)} = \cos(pos / 10000^{2i/d_{\text{model}}})$$
where $pos$ is the position and $i$ is the dimension.  That is, each dimension of the positional encoding corresponds to a sinusoid.  The wavelengths form a geometric progression from $2\pi$ to $10000 \cdot 2\pi$.  We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$.

In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks.  For the base model, we use a rate of $P_{drop}=0.1$.
                                                                                                                                                                                                                                                    


In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: feature dimension of the encodings.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions precomputed in the lookup table.
    """
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute the whole table once: even columns hold the sine and odd
        # columns the cosine of position / 10000^(2i / d_model).
        positions = torch.arange(0, max_len).unsqueeze(1).float()
        inv_freq = 1 / (10000 ** (torch.arange(0., d_model, 2) / d_model))
        angles = positions * inv_freq
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Stored as a buffer (not a parameter) with a leading batch axis of 1.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Add the first `x.size(1)` rows of the table to `x`, then apply dropout."""
        seq_len = x.size(1)
        encoded = x + Variable(self.pe[:, :seq_len], requires_grad=False)
        return self.dropout(encoded)

## Full Model

In [9]:
class TransformerModel(nn.Module):
    """
    Transformer with one shared encoder and `dec` language pathways.

    Each pathway owns a source embedding, a target embedding, a decoder and a
    generator. `self.index` selects the active pathway and is 1-based
    (1 = first pathway), matching how the training and dissection cells set it.

    Args:
        src_vocab: source vocabulary size(s): either a sequence with one entry
            per pathway, or a single int.
        tgt_vocab: target vocabulary size(s), same conventions as `src_vocab`.
        N: number of layers in the encoder and in each decoder.
        d_model: model feature dimension.
        d_ff: hidden size of the feed-forward sublayers.
        h: number of attention heads.
        dropout: dropout probability.
        dec: number of pathways.
        decvs: optional extra source vocabulary size(s) appended after an int
            `src_vocab` (sizes of the second and later pathways).
        decvt: optional extra target vocabulary size(s), same role as `decvs`.

    Raises:
        ValueError: if fewer than `dec` vocabulary sizes can be derived.
    """
    def __init__(self, src_vocab, tgt_vocab, N=6, d_model=256, d_ff=1024, h=4, dropout=0.1, dec=1,
                 decvs=None, decvt=None):
        super(TransformerModel, self).__init__()

        src_sizes = self._vocab_sizes(src_vocab, decvs, dec, "src_vocab")
        tgt_sizes = self._vocab_sizes(tgt_vocab, decvt, dec, "tgt_vocab")

        attn = MultiHeadedAttention(h, d_model)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        c = copy.deepcopy

        self.encoder = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N)

        # nn.ModuleList (not a plain list) registers the sub-modules, so they
        # are included in .parameters(), moved by .to(device), saved in the
        # state dict, and reached by the initialisation loop below.
        self.decoders = nn.ModuleList(
            [Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N) for _ in range(dec)])
        self.src_embeds = nn.ModuleList(
            [nn.Sequential(Embeddings(d_model, size), c(position)) for size in src_sizes])
        self.tgt_embeds = nn.ModuleList(
            [nn.Sequential(Embeddings(d_model, size), c(position)) for size in tgt_sizes])
        self.generators = nn.ModuleList([Generator(d_model, size) for size in tgt_sizes])

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # Active pathway, 1-based.
        self.index = 1

    @staticmethod
    def _vocab_sizes(first, extra, dec, label):
        "Normalise int / sequence vocabulary arguments into a list of `dec` sizes."
        def as_list(value):
            return [value] if isinstance(value, int) else list(value)

        sizes = as_list(first)
        if extra is not None:
            sizes += as_list(extra)
        if len(sizes) == 1:
            # A single size is shared by every pathway.
            sizes = sizes * dec
        if len(sizes) < dec:
            raise ValueError("%s provides %d vocabulary size(s) but dec=%d"
                             % (label, len(sizes), dec))
        return sizes[:dec]

    def _slot(self):
        "Translate the 1-based `self.index` into a list position, with a range check."
        slot = self.index - 1
        if not 0 <= slot < len(self.decoders):
            raise IndexError("model.index=%r is outside 1..%d"
                             % (self.index, len(self.decoders)))
        return slot

    @property
    def generator(self):
        "Generator of the currently selected pathway."
        return self.generators[self._slot()]

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)

    def encode(self, src, src_mask):
        "Embed `src` with the active pathway's source embedding and run the shared encoder."
        return self.encoder(self.src_embeds[self._slot()](src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed `tgt` and run the active pathway's decoder over `memory`."
        slot = self._slot()
        return self.decoders[slot](self.tgt_embeds[slot](tgt), memory, src_mask, tgt_mask)

# Training


## Batches and Masking

In [10]:
class Batch:
    """Container for one batch: source, shifted targets, and their attention masks.

    Args:
        src: source index tensor.
        trg: target index tensor, or None when only the source is needed.
        pad: padding index; positions equal to it are masked out.
    """
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is None:
            return

        # The decoder input drops the last token; the prediction target drops the first.
        decoder_input = trg[:, :-1]
        decoder_target = trg[:, 1:]
        self.trg = decoder_input
        self.trg_y = decoder_target
        self.trg_mask = self.make_std_mask(decoder_input, pad)
        # Number of non-padding target tokens.
        self.ntokens = (decoder_target != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        "Combine the padding mask with the subsequent-position mask."
        not_pad = (tgt != pad).unsqueeze(-2)
        causal = subsequent_mask(tgt.size(-1)).type_as(not_pad.data)
        return not_pad & Variable(causal)


# Running maxima shared across successive batch_size_fn calls.
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    """Return the padded token count of the batch after adding `new`.

    Tracks the longest source and the longest target (+2) seen since the batch
    was started (`count == 1` resets the maxima) and returns the larger of
    `count * longest_src` and `count * longest_tgt`. `sofar` is unused.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = max_tgt_in_batch = 0
    src_len = len(new.src)
    tgt_len = len(new.trg) + 2
    max_src_in_batch = max(max_src_in_batch, src_len)
    max_tgt_in_batch = max(max_tgt_in_batch, tgt_len)
    return max(count * max_src_in_batch, count * max_tgt_in_batch)

## Label Smoothing

During training, we employed label smoothing of value $\epsilon_{ls}=0.1$ [(cite)](https://arxiv.org/abs/1512.00567).  This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.  

In [11]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    Args:
        size: number of classes (must equal `x.size(1)` in `forward`).
        padding_idx: class index treated as padding; it receives zero mass and
            rows whose target is padding are zeroed entirely.
        smoothing: probability mass spread over the non-target classes.
    """
    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        # Last smoothed target distribution, kept for inspection.
        self.true_dist = None

    def forward(self, x, target):
        """Build the smoothed targets for `target` and return the summed KL loss."""
        assert x.size(1) == self.size
        # Spread the smoothing mass over all classes except the target and padding.
        off_value = self.smoothing / (self.size - 2)
        smoothed = x.data.clone()
        smoothed.fill_(off_value)
        smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
        smoothed[:, self.padding_idx] = 0
        pad_rows = torch.nonzero(target.data == self.padding_idx)
        if pad_rows.dim() > 0:
            smoothed.index_fill_(0, pad_rows.squeeze(), 0.0)
        self.true_dist = smoothed
        return self.criterion(x, Variable(smoothed, requires_grad=False))

## Data Loading


In [12]:
from torchtext import data, datasets
import torchtext
import spacy

# Load spacy tokenizers.
# spacy_es = spacy.load('es_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('fr_core_news_sm')
# The Setup cell downloads no Finnish pipeline; a blank pipeline still carries
# the Finnish tokenizer rules and needs no download.
spacy_fi = spacy.blank('fi')

# def tokenize_es(text):
#     return [tok.text for tok in spacy_es.tokenizer(text)]

def tokenize_fr(text):
    "Split French text into token strings."
    return [tok.text for tok in spacy_fr.tokenizer(text)]

def tokenize_fi(text):
    "Split Finnish text into token strings."
    return [tok.text for tok in spacy_fi.tokenizer(text)]

def tokenize_en(text):
    "Split English text into token strings."
    return [tok.text for tok in spacy_en.tokenizer(text)]

def read_lines(path):
    "Return the lines of a UTF-8 text file as a list (newlines included)."
    with open(path, 'r', encoding='utf-8') as handle:
        return list(handle)

BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
# Pathway 1: English -> French.
SRC1 = data.Field(tokenize=tokenize_en, pad_token=BLANK_WORD)
TGT1 = data.Field(tokenize=tokenize_fr, init_token = BOS_WORD,
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)
# Pathway 2: English -> Finnish.
SRC2 = data.Field(tokenize=tokenize_en, pad_token=BLANK_WORD)
TGT2 = data.Field(tokenize=tokenize_fi, init_token = BOS_WORD,
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)

print("Loading Dataset")
english_lines1 = read_lines('./french/train.en')
french_lines1 = read_lines('./french/train.fr')
english_lines2 = read_lines('./finnish/train.en')
# The en-fi download writes its Finnish side with the .fi extension.
finnish_lines2 = read_lines('./finnish/train.fi')

assert len(english_lines1) == len(french_lines1), "en/fr files are not line-aligned"
assert len(english_lines2) == len(finnish_lines2), "en/fi files are not line-aligned"

# Each pathway needs its own fields so that each builds its own vocabulary.
fields1 = (["src", SRC1], ["trg", TGT1])
fields2 = (["src", SRC2], ["trg", TGT2])
examples1 = [data.Example.fromlist(pair, fields1) for pair in zip(english_lines1, french_lines1)]
examples2 = [data.Example.fromlist(pair, fields2) for pair in zip(english_lines2, finnish_lines2)]

MAX_LEN = 200

def within_max_len(example):
    "Keep only examples whose source and target both have at most MAX_LEN tokens."
    return len(vars(example)['src']) <= MAX_LEN and len(vars(example)['trg']) <= MAX_LEN

train1, val1 = data.Dataset(examples1, fields=fields1, filter_pred=within_max_len).split()
train2, val2 = data.Dataset(examples2, fields=fields2, filter_pred=within_max_len).split()

MIN_FREQ = 1
SRC1.build_vocab(train1.src, min_freq=MIN_FREQ)
TGT1.build_vocab(train1.trg, min_freq=MIN_FREQ)
SRC2.build_vocab(train2.src, min_freq=MIN_FREQ)
TGT2.build_vocab(train2.trg, min_freq=MIN_FREQ)

Loading Dataset


## Training Code

In [13]:
class LossFunction:
    """Callable that computes the loss, backpropagates, and optionally steps an optimizer.

    Args:
        generator: callable mapping decoder output to per-token predictions.
        criterion: callable `(flat_predictions, flat_targets) -> loss`.
        opt: optimizer with `step()` / `zero_grad()`, or None to skip the update.
    """
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Return the un-normalised loss (`loss.data * norm`) for one batch.

        The loss is divided by `norm` before `backward()` is called.
        """
        predictions = self.generator(x)
        flat_predictions = predictions.contiguous().view(-1, predictions.size(-1))
        flat_targets = y.contiguous().view(-1)
        loss = self.criterion(flat_predictions, flat_targets) / norm
        loss.backward()
        optimizer = self.opt
        if optimizer is not None:
            optimizer.step()
            optimizer.zero_grad()
        return loss.data * norm

class DataIterator(data.Iterator):
    """Iterator that groups examples of similar length into the same batch."""
    def create_batches(self):
        """Populate `self.batches`.

        Evaluation: batches are taken in data order and each one is sorted by
        `sort_key`. Training: pools of `batch_size * 100` are sorted by
        `sort_key`, cut into batches, and yielded in shuffled order.
        """
        if not self.train:
            self.batches = [
                sorted(chunk, key=self.sort_key)
                for chunk in data.batch(self.data(), self.batch_size,
                                        self.batch_size_fn)
            ]
            return

        def pool(examples, random_shuffler):
            for pooled in data.batch(examples, self.batch_size * 100):
                length_sorted = sorted(pooled, key=self.sort_key)
                minibatches = list(data.batch(length_sorted,
                                              self.batch_size, self.batch_size_fn))
                yield from random_shuffler(minibatches)

        self.batches = pool(self.data(), self.random_shuffler)

def rebatch(pad_idx, batch):
    """Wrap a torchtext batch in a `Batch`.

    The first two axes of `batch.src` and `batch.trg` are swapped and the
    tensors are moved to `device`.
    """
    src = batch.src.transpose(0, 1).to(device)
    trg = batch.trg.transpose(0, 1).to(device)
    return Batch(src, trg, pad_idx)


def run_epoch(data_iter1, dataiter2, model, loss_compute):
    """Run one epoch, alternating between the two pathways at every step.

    The two iterators are consumed in lockstep (stopping at the shorter one).
    For each pair, `model.index` is set to 1 for the batch from `data_iter1`
    and to 2 for the batch from `dataiter2` before the forward pass.

    Args:
        data_iter1: iterable of batches for pathway 1.
        dataiter2: iterable of batches for pathway 2.
        model: object with a settable `index` and a `forward(src, trg, src_mask, trg_mask)`.
        loss_compute: callable `(out, trg_y, ntokens) -> loss`; it can read
            `model.index` to tell which pathway produced `out`.

    Returns:
        Total loss divided by the total number of target tokens. Raises
        ZeroDivisionError when no batch was processed.
    """
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    # The second parameter is spelled `dataiter2`; use that exact name here.
    for i, batches in enumerate(zip(data_iter1, dataiter2)):
        for j, batch in enumerate(batches):
            model.index = j + 1

            out = model.forward(batch.src, batch.trg,
                                batch.src_mask, batch.trg_mask)
            loss = loss_compute(out, batch.trg_y, batch.ntokens)
            total_loss += loss
            total_tokens += batch.ntokens
            tokens += batch.ntokens
            if i % 50 == 1:
                # Guard against a zero interval on coarse clocks.
                elapsed = max(time.time() - start, 1e-9)
                print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                        (i, loss / batch.ntokens, tokens / elapsed))
                start = time.time()
                tokens = 0
    return total_loss / total_tokens


## Train

In [None]:
import gc
gc.collect()

pad_idx1 = TGT1.vocab.stoi["<blank>"]
pad_idx2 = TGT2.vocab.stoi["<blank>"]
# One vocabulary size per pathway: the constructor indexes these sequences by
# pathway position.
model = TransformerModel([len(SRC1.vocab), len(SRC2.vocab)],
                         [len(TGT1.vocab), len(TGT2.vocab)],
                         N=2, dec=2).to(device)
n_epochs = 3

def scope():
    "Build the iterators, losses and optimizer, then train for n_epochs."
    criterion1 = LabelSmoothing(size=len(TGT1.vocab), padding_idx=pad_idx1, smoothing=0.1).to(device)
    criterion2 = LabelSmoothing(size=len(TGT2.vocab), padding_idx=pad_idx2, smoothing=0.1).to(device)
    BATCH_SIZE = 1000
    train_iter1 = DataIterator(train1, batch_size=BATCH_SIZE, device=device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    train_iter2 = DataIterator(train2, batch_size=BATCH_SIZE, device=device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)

    model_opt = torch.optim.Adam(model.parameters(), lr=5e-4)

    # Each pathway has its own generator and criterion (the target vocabularies
    # differ). run_epoch sets model.index to 1 or 2 before calling the loss, so
    # dispatch on it.
    pathway_losses = {
        1: LossFunction(model.generators[0], criterion1, model_opt),
        2: LossFunction(model.generators[1], criterion2, model_opt),
    }

    def loss_compute(out, y, norm):
        return pathway_losses[model.index](out, y, norm)

    for epoch in range(n_epochs):
        model.train()
        # run_epoch takes the two batch streams as separate arguments and
        # zips them itself.
        run_epoch((rebatch(pad_idx1, b) for b in train_iter1),
                  (rebatch(pad_idx2, b) for b in train_iter2),
                  model,
                  loss_compute)
        model.eval()
scope()

## Translate

In [17]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Decode one sequence greedily, picking the arg-max token at every step.

    The source is encoded once. Starting from `start_symbol`, exactly
    `max_len - 1` tokens are appended; decoding does not stop early at an
    end-of-sentence token. Returns a (1, max_len) tensor of token indices
    with the same type as `src`.
    """
    memory = model.encode(src, src_mask)
    decoded = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        causal_mask = Variable(subsequent_mask(decoded.size(1)).type_as(src.data))
        hidden = model.decode(memory, src_mask, Variable(decoded), causal_mask)
        # Only the last position is needed to predict the next token.
        log_probs = model.generator(hidden[:, -1])
        best_token = torch.max(log_probs, dim=1)[1].data[0]
        appended = torch.ones(1, 1).type_as(src.data).fill_(best_token)
        decoded = torch.cat([decoded, appended], dim=1)
    return decoded

In [None]:
BATCH_SIZE = 1000
n_train_iters = len(train1) / BATCH_SIZE

def print_tokens(header, header_end, itos, token_ids):
    "Print `header`, then the tokens for `token_ids`, stopping at the first '</s>'."
    print(header, end=header_end)
    for token_id in token_ids:
        sym = itos[token_id]
        if sym == "</s>": break
        print(sym, end =" ")
    print()

def show_translations(val_set, src_field, tgt_field, index, max_examples=42):
    """Greedy-decode the first sentence of up to `max_examples` validation batches.

    `index` is the 1-based pathway assigned to `model.index` before decoding;
    `src_field` / `tgt_field` are the fields whose vocabularies belong to it.
    """
    model.index = index
    valid_iter = DataIterator(val_set, batch_size=BATCH_SIZE, device=device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)

    for outside_i, batch in enumerate(valid_iter):
        if outside_i >= max_examples:
            break
        # Only the first sentence of each batch is translated.
        src = batch.src.transpose(0, 1)[:1].to(device)
        src_mask = (src != src_field.vocab.stoi["<blank>"]).unsqueeze(-2).to(device)
        out = greedy_decode(model, src, src_mask,
                            max_len=60, start_symbol=tgt_field.vocab.stoi["<s>"])
        print_tokens("Original:", "\t", src_field.vocab.itos, src[0])
        # Position 0 holds the start symbol, so output and reference start at 1.
        print_tokens("Translation:", "\t", tgt_field.vocab.itos, out[0, 1:])
        print_tokens("Target:\t", "\t", tgt_field.vocab.itos, batch.trg.data[1:, 0])
        print()

print("== English -> French ==")
show_translations(val1, SRC1, TGT1, index=1)
print("== English -> Finnish ==")
show_translations(val2, SRC2, TGT2, index=2)

### ChrF

## Dissection

In [15]:
# Serialise the whole model object (not just a state dict) to ./saved-model.
torch.save(model, "./saved-model")

In [None]:
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code — only load checkpoints produced by this notebook. The model
# classes defined above must already be in scope when loading.
model = torch.load("./saved-model")

In [None]:
# Ask both encoder layers to record their raw attention scores in ALL_SCORES.
model.encoder.layers[0].visualizing = True
model.encoder.layers[1].visualizing = True

sentences = (
    "He had three children .",
    "He will have three friends .",
    "His friends will have two parties .",
    "Her name was on the list of people at the ball ."
)

# Number of heat-maps per row of the score figure.
N_PLOT_COLS = 4

for raw_sentence in sentences:

    # Start with an empty score log for every sentence.
    ALL_SCORES = []
    sentence = raw_sentence.split(" ")

    # Pathway 1 (French).
    src1 = torch.tensor([SRC1.vocab.stoi[word] for word in sentence]).unsqueeze(-2).to(device)
    src_mask1 = (src1 != SRC1.vocab.stoi["<blank>"]).unsqueeze(-2).to(device)
    model.index = 1
    out1 = greedy_decode(model, src1, src_mask1,
                            max_len=60, start_symbol=TGT1.vocab.stoi["<s>"])

    # Pathway 2 (Finnish).
    src2 = torch.tensor([SRC2.vocab.stoi[word] for word in sentence]).unsqueeze(-2).to(device)
    src_mask2 = (src2 != SRC2.vocab.stoi["<blank>"]).unsqueeze(-2).to(device)
    model.index = 2
    out2 = greedy_decode(model, src2, src_mask2,
                            max_len=60, start_symbol=TGT2.vocab.stoi["<s>"])

    print("Original:", end="\t\t")
    for i in range(0, src1.size(1)):
        sym = SRC1.vocab.itos[src1[0, i]]
        if sym == "</s>": break
        print(sym, end =" ")
    print("\nFrench:", end="\t")
    for i in range(1, out1.size(1)):
        sym = TGT1.vocab.itos[out1[0, i]]
        if sym == "</s>": break
        print(sym, end =" ")
    print("\nFinnish:", end="\t")
    for i in range(1, out2.size(1)):
        sym = TGT2.vocab.itos[out2[0, i]]
        if sym == "</s>": break
        print(sym, end =" ")
    print()

    # Both decode calls run the encoder once, and every visualised layer logs
    # one map per head, so the map count can exceed a fixed 2x4 grid. Size the
    # grid from the data instead.
    n_rows = max(1, math.ceil(len(ALL_SCORES) / N_PLOT_COLS))
    fig = plt.figure(figsize=(10, 2.5 * n_rows))
    for i, (h, ten) in enumerate(ALL_SCORES):
        plt.subplot(n_rows, N_PLOT_COLS, i+1)
        # plt.axis('off')
        plt.imshow(ten[0], cmap='viridis')
        plt.title("head %d" % h, size='10')
        plt.xticks(range(len(sentence)), sentence, size='10', rotation=90)
        plt.yticks(range(len(sentence)), sentence, size='10', rotation=0)

    plt.tight_layout()
    # Save before show(): the inline backend closes the figure on show, so a
    # later savefig would write an empty canvas.
    fig.savefig("test.svg")
    plt.show()