### Implementing a Transformer from Scratch
Based on `The Annotated Transformer` by Harvard NLP. Let's do this ...

In [2]:
import os
from os.path import exists
import math
import spacy
import copy
import time
import GPUtil
import warnings

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import torchtext.datasets as datasets

from torch.nn.functional import log_softmax, pad
from torch.optim.lr_scheduler import LambdaLR
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP

import pandas as pd
import altair as alt

In [3]:
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [4]:
# some helper functions
def is_interactive_notebook():
    """Report whether this code runs as the top-level program.

    Returns:
        bool: True when the enclosing module's ``__name__`` equals
        ``'__main__'`` (i.e. it is executed directly rather than imported),
        False otherwise.
    """
    # Compare the module variable, not the string literal '__name__': two
    # different literals can never be equal, so the quoted form is always False.
    return __name__ == '__main__'

def show_example(fn, args=()):
    """Run an example and hand back its result, if examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an
            immutable empty tuple so no default object is shared and mutable
            across calls.

    Returns:
        Whatever ``fn(*args)`` returns when this module runs as ``__main__``
        and ``RUN_EXAMPLES`` is truthy; otherwise ``None``.
    """
    if __name__ == '__main__' and RUN_EXAMPLES:
        return fn(*args)
    return None
    
def execute_example(fn, args=()):
    """Run an example purely for its side effects, if examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an
            immutable empty tuple so no default object is shared and mutable
            across calls.

    Returns:
        None. The result of ``fn`` is discarded.
    """
    if __name__ == '__main__' and RUN_EXAMPLES:
        fn(*args)

class DummyOptimizer(torch.optim.Optimizer):
    """No-op stand-in for an optimizer.

    Exposes a single parameter group whose learning rate is 0 and ignores
    every ``step`` / ``zero_grad`` request. The parent constructor is not
    invoked, so no real parameters are attached.
    """

    def __init__(self):
        self.param_groups = [dict(lr=0)]

    def step(self):
        """Do nothing and return None."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing and return None; ``set_to_none`` is accepted but unused."""
        return None

class DummyScheduler:
    """No-op stand-in for a learning-rate scheduler."""

    def step(self):
        """Do nothing and return None."""
        return None

#### Model Architecture
The transformer follows an encoder-decoder architecture

<center>
    <img src='transformer.png' width='400'>
</center>

In [5]:
class EncoderDecoder(nn.Module):
    """Generic encoder-decoder wrapper.

    Holds five collaborators (encoder, decoder, source embedding, target
    embedding, generator) and wires them together. The generator is only
    stored here; ``forward`` returns the decoder output without applying it.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder with ``src_mask``."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder against the encoder ``memory``."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode the masked source, then decode the masked target against it."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

In [6]:
class Generator(nn.Module):
    """Output head: a linear projection followed by log-softmax.

    Projects the last dimension from ``d_model`` to ``vocab`` and returns
    log-probabilities over that last dimension.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities computed over the last axis of ``proj(x)``."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

#### Encoder and Decoder Stacks
##### Encoder
Composed of a stack of $N=6$ identical layers

In [7]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = []
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return nn.ModuleList(replicas)

In [8]:
class Encoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final layer norm."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # NOTE(review): this uses torch's built-in nn.LayerNorm, whereas the
        # Decoder in this file uses the hand-written LayerNorm class; confirm
        # the asymmetry is intended.
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

There is a residual connection around each of the two sub-layers in the encoder, followed by layer normalisation. The normalised input is given by:

$$\hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$$

First, we implement a layer normalisation module.

In [9]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last axis with a learnable gain and bias.

    ``a_2`` (initialised to ones) scales and ``b_2`` (initialised to zeros)
    shifts the standardised input. ``eps`` is added to the standard deviation
    before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x: torch.Tensor):
        """Standardise ``x`` along its last axis, then apply gain and bias."""
        centered = x - x.mean(-1, keepdim=True)
        scale = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / scale + self.b_2

The output of each sublayer is $\mathbf{LayerNorm}(x + \mathbf{Sublayer}(x))$, where $\mathbf{Sublayer}(x)$ is the function implemented by the sub-layer itself. We then apply dropout to the output of each sublayer before it is added to the sub-layer input and normalised.

To facilitate residual connections, all sub-layers in the model, including embedding layers, produce outputs of dimension $d_{model}=512$.

In [10]:
class SublayerConnection(nn.Module):
    """Residual wrapper around an arbitrary sub-layer.

    The input is normalised first, passed through the sub-layer, run through
    dropout, and finally added back to the untouched input (norm-first
    ordering).
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x`` plus the dropped-out output of ``sublayer`` on normalised ``x``."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

Each encoder layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a position-wise fully connected feed-forward network.

In [11]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Each of the two stages is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """Apply masked self-attention, then the feed-forward block."""

        def attend(hidden):
            # Query, key and value are all the same tensor for self-attention.
            return self.self_attn(hidden, hidden, hidden, mask)

        attn_block, ff_block = self.sublayer
        x = attn_block(x, attend)
        return ff_block(x, self.feed_forward)

##### Decoder
Composed of a stack of $N=6$ identical layers

In [12]:
class Decoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Feed ``x`` through every layer with ``memory`` and both masks, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

Apart from the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. There are also residual connections around each sub-layer, followed by layer normalisation.

In [13]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    Each of the three stages is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the three stages in order and return the final result."""

        def attend_self(hidden):
            # Target positions attend to each other under tgt_mask.
            return self.self_attn(hidden, hidden, hidden, tgt_mask)

        def attend_source(hidden):
            # Queries come from the decoder; keys and values from `memory`.
            return self.src_attn(hidden, memory, memory, src_mask)

        self_block, src_block, ff_block = self.sublayer
        x = self_block(x, attend_self)
        x = src_block(x, attend_source)
        return ff_block(x, self.feed_forward)

We need to ensure that the predictions for position $i$ can depend only on the known outputs at positions less than $i$. To achieve this, the self-attention sub-layer in the decoder stack is modified to prevent positions from attending to subsequent positions. This is also aided by the fact that the output embeddings are offset by one position.

In [14]:
def subsequent_mask(size):
    """Build a boolean mask that hides later positions.

    Returns a ``(1, size, size)`` boolean tensor that is True on and below the
    main diagonal and False strictly above it.
    """
    strictly_upper = torch.triu(torch.ones(1, size, size), diagonal=1)
    strictly_upper = strictly_upper.type(torch.uint8)
    # Entries that are zero in the strict upper triangle are the allowed ones.
    return strictly_upper == 0

During training, words are blocked from attending to future words.
Let's see an illustration of the masking ...

In [15]:
def example_mask():
    """Build an Altair heat-map of the 20x20 subsequent-position mask.

    One single-row DataFrame is created per (Window, Masking) cell and the
    frames are concatenated before charting.

    Returns:
        The interactive Altair chart.
    """
    size = 20
    # Build the mask once; it is identical for every cell, so recomputing it
    # inside the comprehension would repeat the same work size * size times.
    mask = subsequent_mask(size)[0]

    frames = [
        pd.DataFrame(
            {
                'Subsequent Mask': mask[x, y].flatten(),
                'Window': y,
                'Masking': x,
            }
        )
        for y in range(size)
        for x in range(size)
    ]
    LS_data = pd.concat(frames)

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X('Window:O'),
            alt.Y('Masking:O'),
            alt.Color('Subsequent Mask:Q', scale=alt.Scale(scheme='viridis')),
        )
        .interactive()
    )

show_example(example_mask)

##### Attention
An attention function maps a query and a set of key-value pairs to an output, where the query, keys, and values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

It is implemented as a **Scaled Dot-Product Attention** where the input consists of queries and keys of dimension $d_k$, values of dimension $d_v$. We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.

In practice, attention is computed on a set of queries, packed together into a matrix $Q$, simultaneously. The keys and values are also packed together into matrices $K$ and $V$. The output is calculated as:

\begin{equation}
    \mathbf{Attention}(Q,K,V) = \text{softmax} \left(\frac{QK^T}{\sqrt{d_k}} \right)V
\end{equation}

In [16]:
def attention(query:torch.Tensor, key:torch.Tensor, value:torch.Tensor, mask=None, dropout=None):
    """Scaled dot-product attention.

    The last axis of ``query`` is taken as ``d_k``. Scores are the product of
    ``query`` with ``key`` transposed over its last two axes, divided by
    ``sqrt(d_k)``. Positions where ``mask == 0`` are set to ``-1e9`` before the
    softmax over the last axis.

    Args:
        query, key, value: Tensors whose last two axes are multiplied as
            matrices.
        mask: Optional tensor; entries equal to 0 mark positions to hide.
        dropout: Optional callable applied to the attention weights.

    Returns:
        A tuple ``(output, weights)``: the weighted sum of ``value`` and the
        attention weights (after dropout, when one is given).
    """
    d_k = query.size(-1)
    scaled_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # A large negative score becomes (numerically) zero after softmax.
        scaled_scores = scaled_scores.masked_fill(mask == 0, -1e9)

    weights = scaled_scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    output = torch.matmul(weights, value)
    return output, weights

Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. When using a single attention head, averaging inhibits this.

\begin{align}
    \mathbf{MultiHead}(Q, K, V) &= \mathbf{Concat}(head_1, \cdots, head_h)W^O \\
    \mathbf{where}\ head_i &= \mathbf{Attention}(QW^Q_i, KW^K_i, VW^V_i)
\end{align}

where the projections are parameter matrices $W^Q_i \in \mathbb{R}^{d_{model} \times d_k}$, $W^K_i \in \mathbb{R}^{d_{model} \times d_k}$, $W^V_i \in \mathbb{R}^{d_{model} \times d_v}$, and $W^O \in \mathbb{R}^{hd_v \times d_{model}}$.

In the transformer, we employ $h=8$ parallel attention layers/heads. For each of these, we use $d_k = d_v = d_{model}/h=64$. Due to this reduced dimensionality of each head, the total computational cost is similar to that of single-head attention with full dimensionality.

<center>
    <img src='multihead_attention.jpg' width=500>
</center>

In the following implementation of multi-head attention:
* Each of the $h$ learned $\mathbf{W_i^{Q/K/V}}$ matrices in the paper are implemented as single linear layers of dimensions $d_{model} \times d_{model}$ rather than $h$ different matrices of shape $d_{model} \times d_k$. The single matrix mirrors the concatenation of $h$ different matrices.
* Each linear layer implements the transformation $x\cdot A^T + b$, which differs from just multiplying the query by the matrix $W^Q_i$ because there is a bias vector added. However, the presence of the bias terms makes this implementation more robust during the training process.
* Each query, key, and value matrix is of dimensions $(n \times \text{seq\_len} \times d_{model})$ where $n$ is the number of sequences in the batch and `seq_len` is the number of tokens in each sequence.
* The queries (and keys and values) are transformed at once using the $d_{model} \times d_{model}$ linear layer before being reshaped into tensors of dimensions ($\text{nbatches}, \text{seq\_len}, h, d_k$) which mimics having the queries being transformed by $h$ different matrices. A query of dimensions $(n \times \text{seq\_len} \times d_{model})$ is transformed by the linear layer to a tensor of shape $(n \times \text{seq\_len} \times d_{model})$ i.e. it retains the same shape, but has been projected to a new subspace.

In [17]:
class MultiheadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model x d_model`` linear layers.

    The first three linears project query, key and value; the last one is
    applied to the concatenated head outputs. ``d_model`` must be divisible
    by ``h``; the per-head width is ``d_k = d_model // h`` (also used for the
    values).
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Store the head count and per-head width and create the projections."""
        super().__init__()
        assert d_model % h == 0
        # The value width is taken to be the same as the key width.
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query:torch.Tensor, key:torch.Tensor, value:torch.Tensor, mask:torch.Tensor=None):
        """Project the inputs, attend per head, merge the heads, project again.

        Shape walk-through (as established by the ``view``/``transpose`` calls):
        - after the linear + ``view``: ``(nbatches, -1, h, d_k)``
        - after ``transpose(1, 2)``:   ``(nbatches, h, -1, d_k)``
        - after merging the heads:     ``(nbatches, -1, h * d_k)``

        The attention weights of the latest call are kept in ``self.attn``.
        """
        if mask is not None:
            # Insert a head axis so one mask is broadcast over all h heads.
            mask = mask.unsqueeze(1)
        batch = query.size(0)

        def split_heads(projection, tensor):
            # d_model -> (h, d_k), then move the head axis in front of the sequence axis.
            return projection(tensor).view(batch, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Project query, key and value with the first three linear layers.
        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        # 2) Attend over all heads at once.
        context, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) Undo the head split: swap the axes back and flatten (h, d_k).
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        del query, key, value

        # Final output projection (the fourth linear layer).
        return self.linears[-1](merged)


##### Position-wise Feed-Forward Networks
Each of the layers in our encoder and decoder also contains a fully-connected feedforward network which is applied to each position separately and identically. It comprises two linear transformations with a ReLU activation between them.

$$FFN(x) = \mathbf{max} (0, xW_1 + b_1)W_2 + b_2$$

The linear transformations are the same across different positions. That is, the same weights and biases are used for every position/token. However, as you move to the next layer, the parameters change and so do the transformations of the tokens.

The input to the network is the output of the multi-head attention which has dimensionality $d_{model}=512$. The output (second layer) of the FFN also has this dimensionality. The first (inner) layer of the FFN has dimensionality $d_{ff}=2048$, effectively expanding the token's representation to a vector of length 2048, which helps to capture more complex patterns.

In [18]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: linear, ReLU, dropout, linear.

    ``w_1`` maps ``d_model -> d_ff`` and ``w_2`` maps ``d_ff -> d_model``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand with ``w_1``, rectify, apply dropout, and project back with ``w_2``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [19]:
p = PositionwiseFeedForward(512, 2048)
x = torch.randn(4, 6, 512)
p(x).shape

torch.Size([4, 6, 512])

##### Embeddings and Softmax
The transformer uses learned embeddings to convert input tokens and output tokens to vectors of dimension $d_{model}$. Learned linear transformations and softmax function are also used to convert the decoder output to predicted next-token probabilities. For the model implemented in the paper, the same weight matrix is shared between the two embedding layers and the pre-softmax linear transformation. For the embedding layers, the weights are multiplied by $\sqrt{d_{model}}$

In [20]:
class Embeddings(nn.Module):
    """Embedding lookup table whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the indices in ``x`` and multiply the vectors by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

##### Positional Encoding
For the model to make use of the order of the sequence, we need to inject some information about the relative or absolute position of the tokens in the sequence. We do this by adding "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks. These encodings have the same dimensionality $d_{model}$ as the embeddings to make their addition possible.

The transformer uses sine and cosine functions of different frequencies:
\begin{align}
    PE_{pos, 2i} &= \sin \left( \frac{pos}{10000^{\frac{2i}{d_{model}}}} \right) \\
    PE_{pos, 2i+1} &= \cos \left( \frac{pos}{10000^{\frac{2i}{d_{model}}}} \right)
\end{align}

where $pos$ is the position of the token in the input sequence and $i$ indexes the sine/cosine pairs of the embedding vector. In short, if the sequence is 10 tokens long, then $pos$ varies between $0$ and $9$ while $i$ varies between $0$ and $d_{model}/2 - 1$, so that the dimensions $2i$ and $2i+1$ together cover all $d_{model}$ components of the embedding. Each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from $2\pi$ to $10000\cdot 2\pi$. 

The authors hypothesized that the sinusoidal function would allow the model to easily learn to attend by relative positions because, for any fixed offset $k$, $PE_{pos+k}$ can be represented as a **linear function** of $PE_{pos}$. That is, the encoding of a position $pos + k$ can be expressed in terms of the encoding of position $pos$. Let's derive this mathematically:

1. For even $i$:
$$PE_{pos+k, i} = \sin \left(  \frac{pos + k}{10000^{\frac{2i}{d}}} \right)$$

2. For odd $i$:
$$PE_{pos+k, i} = \cos \left(  \frac{pos + k}{10000^{\frac{2i}{d}}} \right)$$

Using trigonometric identities, these expressions can be rewritten in terms of $PE_{pos,i}$:

1. For even $i$:
$$ \sin \left(  \frac{pos + k}{10000^{\frac{2i}{d}}} \right) = \sin \left(  \frac{pos}{10000^{\frac{2i}{d}}} \right) \cos \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) + \cos \left(  \frac{pos}{10000^{\frac{2i}{d}}} \right) \sin \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) $$

2. For odd $i$:
$$ \cos \left(  \frac{pos + k}{10000^{\frac{2i}{d}}} \right) = \cos \left(  \frac{pos}{10000^{\frac{2i}{d}}} \right) \cos \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) - \sin \left(  \frac{pos}{10000^{\frac{2i}{d}}} \right) \sin \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) $$

**How is this a Linear Transformation?**

Let's rewrite the above expansions in terms of $PE_{pos, i}$:

1. For even $i$:
$$ PE_{pos + k, i} = PE_{pos, i} \cos \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) + PE_{pos, i + 1} \sin \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) $$

2. For odd $i$:
$$ PE_{pos + k, i} = PE_{pos, i} \cos \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) - PE_{pos, i - 1} \sin \left(  \frac{k}{10000^{\frac{2i}{d}}} \right) $$

Observe that 
* $PE_{pos, i}$ and $PE_{pos, i+1}$ (or $PE_{pos, i-1}$) are the positional encodings for dimension $i$ and its neighbouring dimension.
* The terms $\cos \left( \frac{k}{10000^{\frac{2i}{d}}} \right)$ and $\sin \left( \frac{k}{10000^{\frac{2i}{d}}} \right)$ are constants that depend only on the offset $k$ and the dimension $i$.

This means that the encoding for $pos + k$ is a **linear combination** of the encoding for $pos$ and its neighbouring dimensions, with coefficients that depend only on $k$ and $i$.

This property is important because it allows the model to generalise to relative positions. 
* If the model learns to attend to a token at position $pos$, it can easily attend to a token at position $pos + k$ by applying a linear transformation to the positional encoding of $pos$.
* It makes it easier for the model to capture patterns like "the next word", "the previous word", or "two words ahead" without needing to explicitly learn these relationships from scratch.

**Example**
Consider a sequence with two tokens and dimensionality of 4 (i.e., $i=0,1,2,3$). The positional encodings are:
* $ PE_{0,i} = [\sin (0), \cos (0), \sin (0), \cos (0)] = [0, 1, 0, 1] $
* $ PE_{1,i} = [\sin (1), \cos (0.01), \sin (0.0001), \cos (0.000001)] = [0.84147, 0.99995, 0.0001, 1] $

Now, we need to express $PE_1$ as a **linear combination** of $PE_0$.
* For even $i$:
$$ PE_{1,i} = PE_{0,i} \cdot \cos \left(\frac{k}{10000^{\frac{2i}{d}}}\right) + PE_{0,i+1} \cdot \sin \left(\frac{k}{10000^{\frac{2i}{d}}}\right) $$
* For odd $i$:
$$ PE_{1,i} = PE_{0,i} \cdot \cos \left(\frac{k}{10000^{\frac{2i}{d}}}\right) - PE_{0,i-1} \cdot \sin \left(\frac{k}{10000^{\frac{2i}{d}}}\right) $$

where $k=1$ and $i$ is the dimension index. 

Using these two relations, we can compute the elements of $PE_{1,i}$ from those of $PE_{0,i}$.

In [21]:
import numpy as np

# Rebuild the encoding at position 1 from the encoding at position 0 by
# applying the angle-addition formulas with offset k = 1.
# NOTE(review): the exponent below uses the raw dimension index i (2*i/d),
# whereas the PositionalEncoding class in this file shares one frequency per
# sin/cos pair; confirm which convention this illustration should follow.
pe_0 = [0, 1, 0, 1]
d = 4
pe_1 = []
for i, current in enumerate(pe_0):
    angle = 1 / (10000 ** (2 * i / d))
    if i % 2:
        # Odd slot: combine with the preceding slot, minus sign.
        ans = current * np.cos(angle) - pe_0[i - 1] * np.sin(angle)
    else:
        # Even slot: combine with the following slot, plus sign.
        ans = current * np.cos(angle) + pe_0[i + 1] * np.sin(angle)
    pe_1.append(ans)

print(pe_1)

[0.8414709848078965, 0.9999500004166653, 9.999999983333334e-05, 0.9999999999995]


In [22]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    A table of shape ``(1, max_len, d_model)`` is precomputed and registered
    as the non-trainable buffer ``pe``. Even columns hold sines and odd
    columns hold cosines; each (even, odd) pair shares one frequency.

    Args:
        d_model: Width of the encoding (last axis of the input). Odd values
            are supported: the final unpaired column is a sine.
        dropout: Dropout probability applied after the addition.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -math.log(10000.0) / d_model)
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)  # even indices
        # With an odd d_model there is one fewer odd column than even ones, so
        # only the first d_model // 2 frequencies get a cosine partner. For an
        # even d_model the slice selects every column (unchanged behaviour).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd indices
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` rows of the table to ``x`` and apply dropout."""
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [23]:
def example_positional():
    """Chart positional-encoding values for dimensions 4-7 over 100 positions.

    A ``PositionalEncoding(20, 0)`` is applied to an all-zero input so the
    output is the encoding itself; one DataFrame per dimension is built and
    the frames are concatenated before charting.

    Returns:
        The interactive Altair line chart.
    """
    encoder = PositionalEncoding(20, 0)
    y = encoder(torch.zeros(1, 100, 20))

    frames = []
    for dim in [4, 5, 6, 7]:
        frames.append(
            pd.DataFrame(
                {
                    "embedding": y[0, :, dim],
                    "dimension": dim,
                    "position": list(range(100)),
                }
            )
        )
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()

example_positional()

Another benefit of sinusoidal functions is that they may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.

#### Full Model

In [24]:
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Construct a full encoder-decoder model from hyperparameters.

    Args:
        src_vocab: Size of the source vocabulary (rows of the source embedding).
        tgt_vocab: Size of the target vocabulary (target embedding rows and
            generator output width).
        N: Number of layers in both the encoder and the decoder stack.
        d_model: Model width.
        d_ff: Inner width of the position-wise feed-forward blocks.
        h: Number of attention heads.
        dropout: Dropout probability used by the attention, feed-forward,
            positional-encoding and residual sub-layer modules.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        have been initialised with Xavier/Glorot uniform.
    """
    c = copy.deepcopy
    # Forward `dropout` to the attention module as well; otherwise it would
    # silently stay at its own default of 0.1 whatever the caller requested.
    attn = MultiheadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    # Each layer gets its own deep copy of the prototype modules so that no
    # parameters are shared between them.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # Initialize parameters with Glorot / fan_avg
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model

#### Inference
We make a forward pass to generate a prediction of the model.

In [46]:
def inference_test():
    """Greedy-decode nine tokens from a freshly initialised 2-layer model.

    The model is built with vocabulary size 11 on both sides, put in eval
    mode, and fed the source sequence 1..10. Decoding starts from token 0 and
    at each step appends the highest-scoring token. The resulting sequence is
    printed; nothing is returned.
    """
    test_model = make_model(11, 11, 2)
    test_model.eval()

    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    src_mask = torch.ones(1, 1, 10)
    memory = test_model.encode(src, src_mask)

    # Start the target sequence with a single 0 token, matching src's dtype.
    ys = torch.zeros(1, 1).type_as(src.data)

    for _ in range(9):
        tgt_mask = subsequent_mask(ys.size(1)).type_as(src.data)
        out = test_model.decode(memory, src_mask, ys, tgt_mask)
        # Only the last position is needed to pick the next token.
        prob = test_model.generator(out[:, -1])
        _, best = torch.max(prob, dim=1)
        next_word = best.data[0]
        appended = torch.empty(1, 1).type_as(src.data).fill_(next_word)
        ys = torch.cat([ys, appended], dim=1)

    print("Prediction: ", ys)

In [47]:
def run_tests():
    """Call ``inference_test`` ten times in a row."""
    remaining = 10
    while remaining > 0:
        inference_test()
        remaining -= 1

run_tests()

Prediction:  tensor([[ 0,  4,  4, 10,  4, 10,  4, 10,  4,  4]])
Prediction:  tensor([[0, 6, 1, 1, 1, 1, 1, 1, 1, 1]])
Prediction:  tensor([[ 0,  8,  9,  0,  2, 10,  7,  4,  1,  0]])
Prediction:  tensor([[ 0,  1,  7, 10,  5,  2,  5,  2,  0,  0]])
Prediction:  tensor([[0, 2, 5, 7, 9, 6, 7, 5, 7, 5]])
Prediction:  tensor([[0, 4, 0, 0, 0, 0, 4, 0, 0, 0]])
Prediction:  tensor([[0, 5, 0, 5, 0, 5, 0, 2, 5, 0]])
Prediction:  tensor([[0, 5, 7, 2, 1, 2, 1, 7, 2, 1]])
Prediction:  tensor([[0, 3, 3, 3, 3, 3, 3, 3, 3, 3]])
Prediction:  tensor([[0, 8, 2, 3, 5, 5, 5, 5, 5, 5]])


In [None]:
nn.Linear()