## Annotated Transformer

* http://nlp.seas.harvard.edu/2018/04/03/attention.html

![](http://nlp.seas.harvard.edu/images/the-annotated-transformer_14_0.png)

In [3]:
from IPython.display import Image
import math, copy
import mxnet as mx
from mxnet import autograd, gluon, nd
from mxnet.gluon import nn, utils, loss
from mxnet.ndarray import linalg_gemm2 as gemm2
import numpy as np
import pandas as pd

ctx = mx.gpu()

In [4]:
# Just calculate attention context and attention
def attention(query, key, value, mask = None, dropout = None):
    """
    Compute scaled dot product attention
    """
    d_k = query.shape[-1]
    #print('d_k ={}'.format(d_k)) # 1
    #print('query ={}'.format(query.shape)) # 1
    #print('key ={}'.format(key.shape)) # 1
    scores = gemm2(query, key, transpose_b = True) / math.sqrt(d_k) 
    #print('scores ={}'.format(scores.shape)) # 5,1,3,3,1,5
    #if mask is not None:
    #    scores[scores == 0] = -1e9
    p_attn = nd.softmax(scores, axis = -1)
    if dropout is not None:
        p_atten = dropout(p_attn)
    return gemm2(p_attn, value), p_attn

In [5]:
batch = 32
in_seq_len = 10
d_model = 20
embedding_dim = d_model
# Data
value = [nd.ones((batch, in_seq_len, embedding_dim), ctx = ctx)] * 3
print([x.shape for x in value])

# network
res_1, res_2 = attention(*value)

print('Context: {}, Attention: {}'.format(res_1.shape, res_2.shape))

[(32, 10, 20), (32, 10, 20), (32, 10, 20)]
Context: (32, 10, 20), Attention: (32, 10, 10)


In [6]:
class MultiHeadedAttention(nn.Block):
    def __init__(self, h, d_model, dropout = .1):
        """
        Take in model size and number of heads.
        h: number of heads
        d_model: size of latent space for multihead self attention
        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        #print('d_k = {}'.format(self.d_k))
        self.h = h
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        #print('h = {}'.format(h))
        with self.name_scope():
            self.attn = []
            self.linear_q = nn.Dense(d_model, in_units = d_model, flatten = False)
            self.linear_k = nn.Dense(d_model, in_units = d_model, flatten = False)
            self.linear_v = nn.Dense(d_model, in_units = d_model, flatten = False)
            self.linear_o = nn.Dense(d_model, in_units = d_model, flatten = False)
        
    def forward(self, query, key, value, mask = None):
        '''
        query: nd.array of size (batch, in_seq_len, embedding_dim)
        key: nd.array of size (batch, in_seq_len, embedding_dim)
        value: nd.array of size (batch, in_seq_len, embedding_dim)
        '''
        if mask is not None:
            # Same mask applied to all h heads
            mask = mask.expand_dims(axis = 1)
        batch = query.shape[0]

        # 1) Do all the linear projections in batch from d_model => h x d_k
        # gluon does not accept more than two dim for Dense layer
        # We have to reshape to two dim and reshape back to the original
        query = self.linear_q(query)  
        key = self.linear_k(key)
        value = self.linear_v(value.reshape((-1, self.d_model)))
        # Transform back to (Batch size * H * Embedding * Filters in each head)
        # Source and target sequences may differ in length. 
        # If we fix in_seq_len, there will be an error in Attention layer
        query = query.reshape((batch, self.h, -1, self.d_k))
        key = key.reshape((batch, self.h, -1, self.d_k))
        value = value.reshape((batch, self.h, -1, self.d_k))
        
        # 2) Apply attention on all the projected vectors in batch.
        x = []
        for i in range(self.h):
            _x, _attn = attention(query[:, i, :, :]
                                     , key[:, i, :, :]
                                     , value[:, i, :, :]
                                     , mask = mask
                                     , dropout = self.dropout)
            x.append(_x)
            self.attn.append(_attn)
            
        # 3) "Concat"  
        x = nd.concat(*x, dim = 2)
        # Inflate batch
        x = self.linear_o(x.reshape(-1, self.d_model))
        # Transform back to the original
        x = x.reshape((batch, -1, self.h * self.d_k)) # Batch size *  * 
        return x

In [7]:
batch = 32
in_seq_len = 10
d_model = 20
embedding_dim = d_model

h = 2 # Number of heads

# Data
value = [nd.ones((batch, in_seq_len, embedding_dim), ctx = ctx)] * 3

# Network
mlh = MultiHeadedAttention(h, d_model)
mlh.collect_params().initialize(mx.init.Xavier(), ctx = ctx)
res = mlh(*value)
res.shape

(32, 10, 20)

## Generator

In [8]:
class Generator(nn.Block):
    """
    Define standard linear + softmax generation step.
    Apply generator per time step
    """
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.vocab = vocab
        with self.name_scope():
            self.proj = nn.Dense(vocab, in_units = d_model)
        
    def forward(self, x):
        return nd.softmax(self.proj(x))

In [7]:
d_model = 20
vocab = 100
batch = 32
in_seq_len = 10
    
gen = Generator(d_model, vocab)
gen.collect_params().initialize(mx.init.Xavier(), ctx = ctx)
data = nd.array(np.random.choice(range(vocab), (batch, d_model)), ctx = ctx)

print(res.shape)
los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = gen(data)
    _los = los_fn(res, nd.random.normal(shape = res.shape, ctx = ctx))
    _los.backward()

(32, 10, 20)


## LayerNorm

In [41]:
class LayerNorm(nn.HybridBlock):
    """
    Constuct a layernorm module
    features: tuple (batch * in_seq_len * embedding_dim)
    """
    def __init__(self, d_model, eps = 1e-6): # Number of layers
        super(LayerNorm, self).__init__()
        self.eps = eps
        with self.name_scope():
            self.a = self.params.get('a', shape = (1, d_model), allow_deferred_init = False)
            self.b = self.params.get('b', shape = (1, d_model), allow_deferred_init = False)
        
    def hybrid_forward(self, F, x, a, b):
        #print('x size = {}'.format(x))
        mean = x.mean(axis = -1) # batch * _in_seq_len
        #print('mean size = {}'.format(mean.shape))
        _mean = nd.repeat(mean.expand_dims(axis = -1), repeats = x.shape[-1], axis = -1) # batch * _in_seq_len * embedding_dim
        #print('_mean size = {}'.format(_mean))
        std = nd.sqrt(nd.sum(nd.power((x - _mean), 2), axis = -1) / x.shape[1]) # batch * _in_seq_len
        _std = nd.repeat(std.expand_dims(axis = -1), repeats = x.shape[-1], axis = -1) # batch * _in_seq_len * embedding_dim
        #print('_std size = {}'.format(_std))
        #print('self.a size = {}'.format(self.a.shape))
        #print('a = {}'.format(a))
        #print('multi = {}'.format((x - _mean)))
        #print('b = {}'.format(F.multiply((x - _mean), a.reshape((1,3)))))
        #print('c = {}'.format((_std  + self.eps)))
        
        return F.elemwise_div(F.multiply((x - _mean), a), (_std  + self.eps)) + b

In [42]:
d_model = 3
vocab = 5
batch = 1
in_seq_len = 6

layer_norm = LayerNorm(d_model)
print(layer_norm.collect_params())
layer_norm.collect_params().initialize(mx.init.Xavier(), ctx = ctx)
data = nd.array([[[1, 2, 3],[1, 2, 3],[1, 2, 3],[1, 2, 3],[1, 2, 3],[4, 5, 6]]], ctx = ctx)
print('data = {}'.format(data.shape))
res = layer_norm(data)
print(res.shape)
print(res)
los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = layer_norm(data)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
_los.backward()

layernorm18_ (
  Parameter layernorm18_a (shape=(1, 3), dtype=<class 'numpy.float32'>)
  Parameter layernorm18_b (shape=(1, 3), dtype=<class 'numpy.float32'>)
)
data = (1, 6, 3)
x size = 
[[[1. 2. 3.]
  [1. 2. 3.]
  [1. 2. 3.]
  [1. 2. 3.]
  [1. 2. 3.]
  [4. 5. 6.]]]
<NDArray 1x6x3 @gpu(0)>
_mean size = 
[[[2. 2. 2.]
  [2. 2. 2.]
  [2. 2. 2.]
  [2. 2. 2.]
  [2. 2. 2.]
  [5. 5. 5.]]]
<NDArray 1x6x3 @gpu(0)>
_std size = 
[[[0.57735026 0.57735026 0.57735026]
  [0.57735026 0.57735026 0.57735026]
  [0.57735026 0.57735026 0.57735026]
  [0.57735026 0.57735026 0.57735026]
  [0.57735026 0.57735026 0.57735026]
  [0.57735026 0.57735026 0.57735026]]]
<NDArray 1x6x3 @gpu(0)>
a = 
[[-1.179376  -0.805915   0.7194096]]
<NDArray 1x3 @gpu(0)>
multi = 
[[[-1.  0.  1.]
  [-1.  0.  1.]
  [-1.  0.  1.]
  [-1.  0.  1.]
  [-1.  0.  1.]
  [-1.  0.  1.]]]
<NDArray 1x6x3 @gpu(0)>
b = 
[[[ 1.179376  -0.         0.7194096]
  [ 1.179376  -0.         0.7194096]
  [ 1.179376  -0.         0.7194096]
  [ 1.179376  -0. 

## Encoder and Decoder Stacks

  * The encoder is composed of a stack of $N=6$ identical layers
  * No moduleList in gluon mxnet

In [47]:
class SublayerConnection(nn.Block):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    size: tuple (batch * in_seq_len * embedding_dim)
    """
    
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.dropout = nn.Dropout(dropout)
        with self.name_scope():
            self.norm = LayerNorm(size)
            

    def forward(self, x, sublayer):
        """
        Apply residual connection to any sublayer with the same size.
        x: nd.array of size (batch * in_seq_len * embedding_dim)
        """
        #print('aa = {}'.format(self.norm(x).shape))
        #print('bb = {}'.format(sublayer(self.norm(x)).shape))
        #print('x after sublayer = {}'.format(sublayer(self.norm(x)).shape))
        return x + self.dropout(sublayer(self.norm(x)))

In [11]:
d_model = 512
vocab = 100
batch = 1
in_seq_len = 10

sublayer_connection = SublayerConnection(d_model, 0.5)
sublayer_connection.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

# Data
data = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)
    
# Define sublayer
sublayer = nn.Dense(d_model, flatten = False)
sublayer.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

#network
res = sublayer_connection(data, sublayer)
print(res.shape)

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = sublayer_connection(data, sublayer)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
(1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = 

## Position-wise Feed-Forward Networks

* In addition to attention sub-layers, each of the layers in our encoder and decoder contains a **fully connected feed-forward network**, which is **applied to each position separately and identically**. This consists of two linear transformations with a ReLU activation in between.

$$FFN(x)=\max(0,xW_1+b_1)W_2+b_2$$

* While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. 

* The dimensionality of input and output is $d_{model} = 512$, and the inner-layer has dimensionality $d_{ff} = 2048$

In [12]:
class PositionwiseFeedForward(nn.Block):
    """
    Implements FFN equation
    d_model: size of latent space for multihead self attention
    hidden_dim: Hidden space for feedforward network
    """
    
    def __init__(self, d_model, hidden_dim, dropout = .1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Dense(hidden_dim, in_units= d_model, flatten = False)
        self.w_2 = nn.Dense(d_model, in_units = hidden_dim, flatten = False)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # Point-wise feed-forward network does not have in_seq dimension
        # We have to apply the same network for each time sequence
        # In this case use Dense without flatten since Dense(flatten = True) is default
        #print('x shape in PFF = {}'.format(x.shape))
        #print('x-1 shape = {}'.format(self.w_1(x).shape))
        #print('w_1 param = {}'.format(self.w_1.weight))
        return self.w_2(self.dropout(nd.relu(self.w_1(x))))

In [13]:
d_model = 20
vocab = 100
batch = 32
in_seq_len = 10
d_hidden_feed_forward = 5

positionwise_feed_forward = PositionwiseFeedForward(d_model, d_hidden_feed_forward)        
positionwise_feed_forward.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

# Data
data = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)

#network
res = positionwise_feed_forward(data)
print(res.shape)

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = positionwise_feed_forward(data)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

(32, 10, 20)
los = 
[-10.387033]
<NDArray 1 @gpu(0)>


In [14]:
positionwise_feed_forward

PositionwiseFeedForward(
  (w_1): Dense(20 -> 5, linear)
  (w_2): Dense(5 -> 20, linear)
  (dropout): Dropout(p = 0.1, axes=())
)

## Positional Encoding

* Since our model contains **no recurrence and no convolution**, in order for the model to make use of the order of the sequence, **we must inject some information about the relative or absolute position of the tokens in the sequence.** To this end, we add **“positional encodings”** to the input embeddings at the bottoms of the encoder and decoder stacks. 

* The positional encodings have the same dimension $d_{model}$ as the embeddings, so that the two can be summed. 

* There are many choices of positional encodings, learned and fixed (cite).

* In this work, we use sine and cosine functions of different frequencies: 
  - $PE_{(pos,2i)} = \sin(pos/10000^{2i/dmodel})$
  - $PE_{(pos,2i+1)} = \cos(pos/10000^{2i/dmodel})$, where pos is the position and $i$ is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. 
  
 * The wavelengths form a geometric progression from $2\pi$ to $10000\cdot 2 \pi$
* We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$.

* In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of $P_{drop}=0.1$.

In [15]:
class PositionalEncoding(nn.Block):
    """
    Implement the PE function
    단순히 위치만 표현하고 싶음. 원래의 데이터 + 그 데이터가 해당하는 위치까지의 각 차원에서의 sine값과 cosine값
    d_model: size of latent space for multihead self attention
    """
    def __init__(self, d_model, dropout, max_len = 5000, ctx = mx.cpu()):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Compute the positional encodings once in log space.
        self.pe = nd.zeros((max_len, d_model), ctx = ctx)
        position = nd.arange(0, max_len).expand_dims(axis = 1)
        div_term = nd.exp(nd.arange(0, d_model, 2) * -(nd.log(nd.array([10000.0])) / d_model))
        self.pe[:, 0::2] = nd.sin(position * div_term) # 0부터 2번째마다. 그러니깐 0부터 1칸씩 떼서 한줄씩
        self.pe[:, 1::2] = nd.cos(position * div_term) # 1부터 2번째마다. 그러니깐 1부터 1칸씩 떼서 한줄씩
        self.pe = self.pe.expand_dims(axis = 0)
        
    def forward(self, x):
        #print('x shape = {}'.format(x.shape))
        #print('pe shape = {}'.format(self.pe.shape))
        x = x + self.pe[:, :x.shape[1]]
        return self.dropout(x)

In [16]:
pea = nd.zeros((5000, 512))
pea[:, 0::4].shape

import matplotlib.pyplot as plt
import numpy as np
plt.figure(figsize = (15 ,5))
pe = PositionalEncoding(20, 0, ctx = ctx)
x = nd.zeros((1, 100, 20), ctx = ctx)
y = pe(x)
plt.plot(np.arange(100), y[0, :, 4:8].asnumpy())
plt.legend(['dim %d'%p for p in [4, 5, 6, 7]])
None

## Encoder Layer
* Each layer has two sub-layers. 
* The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed- forward network.

In [17]:
class EncoderLayer(nn.Block):
    """
    Encoder is made up of self-attention nad feed forward
    size: size for sublayer connectioh tuple: (batch * in_seq_len * embedding_dim)
    self_attn: self attention layer
    feed_forward: network
    dropout: dropout rate float
    """
    def __init__(self, size, self_attn, feed_forward, dropout = .1):
        super(EncoderLayer, self).__init__()
        with self.name_scope():
            self.self_attn = self_attn
            self.feed_forward = feed_forward
            # Size is necessary for layer_normalization
            self.sublayer_0 = SublayerConnection(size, dropout)
            self.sublayer_1 = SublayerConnection(size, dropout)
            self.size = size
        
    def forward(self, x, mask = None):
        """
        Follow figure 1 for connections
        x: ndarray tuple (n_batch, in_seq_len, embedding_dim)
        """
        x = self.sublayer_0(x, lambda x: self.self_attn(x, x, x, mask))
        #print('x shape = {}'.format(x.shape))
        # Apply the same position-wise feed-forward network
        # Reshape (Batch size * in_seq_len * d_model) into (-1, d_model)
        return self.sublayer_1(x, self.feed_forward)

In [18]:
d_model = 512
vocab = 100
batch = 1

in_seq_len = 10
d_hidden_feed_forward = 5


# Input layers
positionwise_feed_forward = PositionwiseFeedForward(d_model, d_hidden_feed_forward)
mlh = MultiHeadedAttention(h, d_model)

# network
encoder_layer = EncoderLayer(d_model, mlh, positionwise_feed_forward)
encoder_layer.collect_params().initialize(mx.init.Xavier(), ctx = ctx)
# Data
data = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)

#network
res = encoder_layer(data)
print(res.shape)

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = encoder_layer(data)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)


In [19]:
# We have to use sequential instead of 'list of layers' for parameter initialization
class Encoder(nn.Block):
    """
    Core encoder is a stack of N layers
    """
    def __init__(self, in_seq_len, d_model, h, d_hidden_feed_forward , N):
        super(Encoder, self).__init__()
        with self.name_scope():
            self.layers = nn.Sequential()
            self.norm = LayerNorm(d_model)
            for i in range(N): # stack of N layers
                self.layers.add(EncoderLayer(d_model
                          , MultiHeadedAttention(h, d_model)
                          , PositionwiseFeedForward(d_model, d_hidden_feed_forward)
                          , dropout
                          )
                )
            
    def forward(self, x, mask = None):
        """
        Pass the input (and mask) through each layer in turn.
        """
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [20]:
d_model = 512
vocab = 100
batch = 1
in_seq_len = 10
dropout = .1
h = 2
d_hidden_feed_forward = 20

# input layers
#positionwise_feed_forward = PositionwiseFeedForward(d_model, d_hidden_feed_forward)
#mlh = MultiHeadedAttention(h, embedding_dim)

enc = Encoder(in_seq_len, d_model, h, d_hidden_feed_forward, 6)
enc.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

data = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)
res = enc(data)
print(res.shape)

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = enc(data)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)


c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std siz

## Embedding and Softmax

* Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{model}$. 

* We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. 

* In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation. 

* In the embedding layers, we multiply those weights by $\sqrt{d_{model}}$

In [21]:
class Embeddings(nn.Block):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        with self.name_scope():
            self.emb = nn.Embedding(vocab, d_model)
            
        
    def forward(self, x):
        # nd.array of size (1, 1) => np.sqrt not nd.sqrt
        return self.emb(x) * np.sqrt(self.d_model)

In [22]:
d_model = 512
vocab = 100
batch = 1
in_seq_len = 10

data = nd.array(np.random.choice(range(vocab), (batch, in_seq_len)))

embeddings = Embeddings(d_model, vocab)
embeddings.collect_params().initialize(mx.init.Xavier())
res = embeddings(data)
print(res.shape)

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = embeddings(data)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape)))
    print('los = {}'.format(_los))
    _los.backward()

(1, 10, 512)
los = 
[-0.22586372]
<NDArray 1 @cpu(0)>


## Decoder Layer

* The decoder is also composed of a stack of $N=6$ identical layers
* In addition to the two sub-layers in each encoder, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack.
* SImilar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization.

In [23]:
class DecoderLayer(nn.Block):
    """
    Decoder is made of self-attn, src-attn, and feed forward (defined below)
    """
    
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer_0 = SublayerConnection(size, dropout)
        self.sublayer_1 = SublayerConnection(size, dropout)
        self.sublayer_2 = SublayerConnection(size, dropout)
        
    def forward(self, x, memory, src_mask = None, trg_mask = None):
        """
        Follow Figure 1 (right) for connections.
        x: ndarray tuple (n_batch, in_seq_len, embedding_dim)
        memory: attention
        """
        m = memory
        x = self.sublayer_0(x, lambda x: self.self_attn(x, x, x, trg_mask))
        x = self.sublayer_1(x, lambda x: self.src_attn(x, m, m, src_mask)) # Concat information from input & target
        return self.sublayer_2(x, self.feed_forward)
        

In [24]:
d_model = 512
vocab = 100
batch = 1
in_seq_len = 10
d_hidden_feed_forward = 5
dropout = .1

# Input layers
positionwise_feed_forward = PositionwiseFeedForward(d_model, d_hidden_feed_forward)
trg_attn = MultiHeadedAttention(h, d_model)
masked_trg_attn = MultiHeadedAttention(h, d_model)
# network
decoder_layer = DecoderLayer(d_model, trg_attn, masked_trg_attn, positionwise_feed_forward, dropout)
decoder_layer.collect_params().initialize(mx.init.Xavier(), ctx = ctx)
# Data
x = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)
memory = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)
#network
res = decoder_layer(x, memory)
res.shape

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = decoder_layer(x, memory)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)


In [25]:
class Decoder(nn.Block):
    """
    Generic N layer decoder with masking.
    """
    def __init__(self, in_seq_len, d_model, h, d_hidden_feed_forward, N):
        super(Decoder, self).__init__()
        with self.name_scope():
            self.norm = LayerNorm(d_model)
            self.layers = nn.Sequential()
            for i in range(N):
                self.layers.add(DecoderLayer(d_model
                              , MultiHeadedAttention(h, d_model)
                              , MultiHeadedAttention(h, d_model)
                              , PositionwiseFeedForward(d_model, d_hidden_feed_forward)
                              , dropout))
            
    def forward(self, x, memory, src_mask = None, trg_mask = None):
        for layer in self.layers:
            x = layer(x, memory, src_mask, trg_mask)
        return self.norm(x)

In [26]:
d_model = 512
vocab = 100
batch = 1
in_seq_len = 10
dropout = .1
h = 2
d_hidden_feed_forward = 20

# input layers cannot be passed after initialization
# we have to initiate them inside class
# Pytorch: deep copy layer and put them in modulelist

enc = Encoder(in_seq_len, d_model, h, d_hidden_feed_forward, 6)
enc.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

dec = Decoder(in_seq_len, d_model, h, d_hidden_feed_forward, 6)
dec.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

src_data = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)
trg_data = nd.random.normal(shape = (batch, in_seq_len, d_model), ctx = ctx)

tmp = enc(src_data)
res = dec(enc(src_data), memory)
print(res.shape)

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = dec(enc(src_data), memory)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)


_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10,

x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)


bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 

a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
aa = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)
_mean size = (1, 10, 512)
_std size = (1, 10, 512)
self.a size = (1, 512)
a = (1, 10, 512)
b = (1, 10, 512)
c = (1, 10, 512)
bb = (1, 10, 512)
x size = (1, 10, 512)
mean size = (1, 10)


## Subsequent mask
* We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. 
* This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position $i$ can depend only on the known outputs at positions less than $i$

In [63]:
def subsequent_mask(size):
    """
    Maks out subsequent positions.
    """
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k = 1)#.astype('uint8')
    return nd.array(subsequent_mask) == 0

In [28]:
subsequent_mask(10)


[[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
  [1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
  [1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
  [1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
  [1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
  [1. 1. 1. 1. 1. 1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]]
<NDArray 1x10x10 @cpu(0)>

In [52]:
class EncoderDecoder(nn.Block):
    """
    A standard Encoder-Decoder architecture.
    """
    def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.generator = generator
        
    def forward(self, src, trg, src_mask = None, trg_mask = None):
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, trg, trg_mask)
    
    def encode(self, src, src_mask):
        #print('src_mask = {}'.format(type(src_mask)))
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, trg, trg_mask):
        x = self.decoder(self.trg_embed(trg), memory, src_mask, trg_mask)
        # Transform into two dimensional array
        #print(x.shape)
        _x = x.reshape(-1, x.shape[-1])
        _x = self.generator(_x)
        #print('_x shape = {}, {}'.format(x.shape[0], x.shape[1]))
        _x = _x.reshape(x.shape[0], x.shape[1], self.generator.vocab)
        return _x

In [75]:
d_model = 512
vocab = 10
batch = 1
in_seq_len = 10

src_emb = Embeddings(d_model, vocab)
trg_emb = Embeddings(d_model, vocab)

enc = Encoder(in_seq_len, d_model, h, d_hidden_feed_forward, 6)
dec = Decoder(in_seq_len, d_model, h, d_hidden_feed_forward, 6)

gen = Generator(d_model, vocab)

enc_dec = EncoderDecoder(enc, dec, src_emb, trg_emb, gen)
enc_dec.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

src_data = nd.array(np.random.choice(range(vocab), (batch, in_seq_len)), ctx = ctx)
trg_data = nd.array(np.random.choice(range(vocab), (batch, in_seq_len)), ctx = ctx)

res = enc_dec(src_data, trg_data)
print('res = {}'.format(res.shape))

los_fn = gluon.loss.KLDivLoss()
with autograd.record():
    res = enc_dec(src_data, trg_data)
    _los = nd.sum(los_fn(res, nd.random.uniform(shape = res.shape, ctx = ctx)))
    print('los = {}'.format(_los))
    _los.backward()

res = (1, 10, 10)
los = 
[-0.31559086]
<NDArray 1 @gpu(0)>


## Full Model

In [136]:
def make_model(src_vocab, trg_vocab, in_seq_len, out_seq_len, N = 6, d_model = 512, d_ff = 2048, h = 8, dropout = .1, ctx = mx.cpu()):
    """
    Helper: Construct a model from hyperparameters.
    """
    enc = Encoder(in_seq_len, d_model, h, d_ff, N)
    dec = Decoder(out_seq_len, d_model, h, d_ff, N)

    gen = Generator(d_model, vocab)
    
    src_emb = nn.Sequential()
    src_emb.add(Embeddings(d_model, src_vocab))
    src_emb.add(PositionalEncoding(d_model, dropout, ctx = ctx))
    
    trg_emb = nn.Sequential()
    trg_emb.add(Embeddings(d_model, trg_vocab))
    trg_emb.add(PositionalEncoding(d_model, dropout, ctx = ctx))
    
    model = EncoderDecoder(enc, dec, src_emb, trg_emb, gen)
    return model

In [137]:
src_vocab = 10# Small example model.
trg_vocab = 10
batch = 1
in_seq_len = 10
out_seq_len = 10
N = 6
d_model = 512
d_ff = 2048
h = 8
dropout = .1

model = make_model(src_vocab, trg_vocab, in_seq_len, out_seq_len, N, d_model, d_ff, h, dropout, ctx = ctx)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

# Data
src_data = nd.array(np.random.choice(range(vocab), (batch, in_seq_len)), ctx = ctx)
trg_data = nd.array(np.random.choice(range(vocab), (batch, in_seq_len)), ctx = ctx)

res = model(src_data, trg_data)
print(res.shape)

(1, 10, 10)


## Batches and Masking
*This section describes the training regime for our models.
> We stop for a quick interlude to introduce some of the tools needed to train a standard encoder decoder model. <br>
> First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.

In [138]:
class Batch:
    """
    Object for holding a batch of data with mask during training
    src: 
    """
    
    def __init__(self, src, trg = None, pad = 0, ctx = mx.cpu()):
        self.src = src
        self.src_mask = (src != pad).expand_dims(axis = -2)
        self.ctx = ctx
        if trg is not None:
            self.trg = trg[:, :-1]
            #print('trg shape= {}'.format(self.trg.shape))
            #print('trg = {}'.format(self.trg))
            self.trg_y = trg[:, 1:]
            #print('trg_y shape= {}'.format(self.trg_y.shape))
            #print('trg_y = {}'.format(self.trg_y))
            self.trg_mask = self.make_std_mask(self.trg, pad, self.ctx)
            #print('trg_mask shape= {}'.format(self.trg_mask.shape))
            #print('trg_mask = {}'.format(self.trg_mask))
            self.ntokens = nd.sum(self.trg_y != pad)
            
    @staticmethod
    def make_std_mask(trg, pad, ctx):
        """
        Create a mask to hide padding ad future words.
        Compare each element of trg_mask and sub_mask. (1, 1) -> 1 o.w. -> 0
        There is no bitwise operator for mxnet
        """
        trg_mask = (trg != pad).expand_dims(axis = -2)
        trg_mask = nd.repeat(trg_mask, repeats = trg_mask.shape[-1], axis = -2)
        sub_mask = subsequent_mask(trg.shape[-1])
        sub_mask = nd.repeat(sub_mask, repeats = trg_mask.shape[0], axis = 0)
        #print('trg_mask shape = {}'.format(trg_mask.shape))
        #print(trg_mask)
        #print(sub_mask)
        #print((trg != pad).expand_dims(-2).shape)
        trg_mask = nd.multiply(trg_mask, sub_mask.as_in_context(ctx))
        #print(trg_mask.shape)
        return trg_mask

### Synthetic data

In [139]:
def data_gen(V, batch, nbatches, in_seq_len, ctx = mx.cpu()):
    "Generate random data for a src-trg copy task."
    for i in range(nbatches):
        data = nd.array(np.random.randint(1, V, size=(batch, in_seq_len)), ctx =ctx)
        data[:, 0] = 1
        src, trg = data, data
        yield Batch(src, trg, 0, ctx)

In [140]:
V = 11
data = data_gen(V, 30, 50, 10)

## Training Loop

In [175]:
def run_epoch(data_iter, model, trainer, loss_fn, ctx = mx.cpu()):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    tokens = 0
    total_loss = 0

    for i, batch in enumerate(data_iter):        
        #print('out = {}'.format(out[0, :, :].shape))
        #print('out sum= {}'.format(nd.sum(out[0, :, :], axis = 1)))
        #print('trg_y = {}'.format(batch.trg_y.shape))
        src = batch.src
        trg = batch.trg
        src_mask = batch.src_mask
        trg_mask = batch.trg_mask
        trg_y = batch.trg_y
        ntokens = batch.ntokens
        with autograd.record():
            out = model(src, trg, src_mask, trg_mask)
            _out = out.reshape(-1, out.shape[-1])
            # One-hot
            _cols = list(batch.trg_y.reshape(-1).asnumpy())
            _rows = list(range(_out.shape[0]))
            _idx = nd.array([_rows, _cols], ctx = ctx)
            #print('_ones = {}'.format(nd.ones_like(trg_y.reshape(-1))))
            #print('_idx = {}'.format(_idx.as_in_context(ctx)))
            _trg = nd.scatter_nd(nd.ones_like(trg_y.reshape(-1)), _idx, _out.shape)
            #print('trg shape= {}'.format(batch.trg_y.reshape(-1)[:5]))
            #print('_out shape= {}'.format(_out.shape))
            #print('trg shape= {}'.format(_trg.shape))
            #print('trg shape= {}'.format(_trg[:5, :]))
            #print('_rows = {}'.format(_cols))
            #print('_idx shape = {}'.format(_idx.shape))
            #_trg = nd.array([self.confidence - (self.smoothing / (self.size - 2))] * target.shape[0])
            #print('_idx = {}'.format(_idx))
            loss = nd.sum(loss_fn(_out, _trg))
            #print('loss = {}'.format(loss.asnumpy()[0]))
            #print('batch = {}'.format(i))
            loss.backward()
        trainer.step(out.shape[0])
        total_loss += loss.asnumpy()[0]
        total_tokens += ntokens.asnumpy()[0]
        tokens += ntokens.asnumpy()[0]
        #print('total tokens = {}'.format(total_tokens))
        if i % 50 == 0:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" % (i, loss.asnumpy()[0] / ntokens.asnumpy()[0], tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss #/ total_tokens
    

In [176]:
# Task: copy 10 input integers
V = 10
batch = 30
n_batch = 20
in_seq_len = 10
out_seq_len = 9
data = data_gen(V, batch, n_batch, in_seq_len)
model = make_model(V, V, in_seq_len, out_seq_len, d_model = 100, h = 5, d_ff = 50, N=2, ctx = ctx)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 1e-4})
loss = gluon.loss.KLDivLoss(from_logits = False)

In [177]:
import time

for epoch in range(500):
    run_epoch(data_gen(V, batch, n_batch, in_seq_len, ctx = ctx), model, trainer, loss, ctx = ctx)

Epoch Step: 0 Loss: 0.230418 Tokens per Sec: 2681.633993
Epoch Step: 0 Loss: 0.230201 Tokens per Sec: 3602.505710
Epoch Step: 0 Loss: 0.230445 Tokens per Sec: 3587.850892
Epoch Step: 0 Loss: 0.230105 Tokens per Sec: 3579.719239
Epoch Step: 0 Loss: 0.229815 Tokens per Sec: 3536.810861
Epoch Step: 0 Loss: 0.230063 Tokens per Sec: 3593.771476
Epoch Step: 0 Loss: 0.229978 Tokens per Sec: 3542.231634
Epoch Step: 0 Loss: 0.229801 Tokens per Sec: 3543.739122
Epoch Step: 0 Loss: 0.229744 Tokens per Sec: 3551.017459
Epoch Step: 0 Loss: 0.229653 Tokens per Sec: 3507.355876
Epoch Step: 0 Loss: 0.229430 Tokens per Sec: 3558.974353
Epoch Step: 0 Loss: 0.229606 Tokens per Sec: 3515.805604
Epoch Step: 0 Loss: 0.229503 Tokens per Sec: 3469.308474
Epoch Step: 0 Loss: 0.229621 Tokens per Sec: 3560.372995
Epoch Step: 0 Loss: 0.229788 Tokens per Sec: 3208.089699
Epoch Step: 0 Loss: 0.229620 Tokens per Sec: 3295.834973
Epoch Step: 0 Loss: 0.229848 Tokens per Sec: 3538.435350
Epoch Step: 0 Loss: 0.229616 To

Epoch Step: 0 Loss: 0.200841 Tokens per Sec: 3517.629365
Epoch Step: 0 Loss: 0.199867 Tokens per Sec: 3555.309534
Epoch Step: 0 Loss: 0.200682 Tokens per Sec: 3470.871532
Epoch Step: 0 Loss: 0.199417 Tokens per Sec: 3575.334198
Epoch Step: 0 Loss: 0.200353 Tokens per Sec: 3594.512923
Epoch Step: 0 Loss: 0.203508 Tokens per Sec: 3569.654181
Epoch Step: 0 Loss: 0.200496 Tokens per Sec: 3596.647717
Epoch Step: 0 Loss: 0.198468 Tokens per Sec: 3581.258811
Epoch Step: 0 Loss: 0.196992 Tokens per Sec: 3548.013610
Epoch Step: 0 Loss: 0.198430 Tokens per Sec: 3545.514281
Epoch Step: 0 Loss: 0.197686 Tokens per Sec: 3540.105597
Epoch Step: 0 Loss: 0.196563 Tokens per Sec: 3584.841170
Epoch Step: 0 Loss: 0.196620 Tokens per Sec: 3539.895347
Epoch Step: 0 Loss: 0.195261 Tokens per Sec: 3269.402221
Epoch Step: 0 Loss: 0.197456 Tokens per Sec: 1489.871926
Epoch Step: 0 Loss: 0.196101 Tokens per Sec: 3551.340398
Epoch Step: 0 Loss: 0.195929 Tokens per Sec: 3582.097076
Epoch Step: 0 Loss: 0.197118 To

Epoch Step: 0 Loss: 0.151327 Tokens per Sec: 3584.364620
Epoch Step: 0 Loss: 0.150692 Tokens per Sec: 3594.569970
Epoch Step: 0 Loss: 0.151436 Tokens per Sec: 3586.033097
Epoch Step: 0 Loss: 0.150181 Tokens per Sec: 3531.726035
Epoch Step: 0 Loss: 0.150290 Tokens per Sec: 3567.236646
Epoch Step: 0 Loss: 0.150456 Tokens per Sec: 3516.176756
Epoch Step: 0 Loss: 0.149804 Tokens per Sec: 3544.571006
Epoch Step: 0 Loss: 0.150579 Tokens per Sec: 3560.160330
Epoch Step: 0 Loss: 0.150950 Tokens per Sec: 3547.613480
Epoch Step: 0 Loss: 0.151066 Tokens per Sec: 3456.833404
Epoch Step: 0 Loss: 0.149927 Tokens per Sec: 3546.802384
Epoch Step: 0 Loss: 0.149823 Tokens per Sec: 3488.996830
Epoch Step: 0 Loss: 0.149314 Tokens per Sec: 3551.875045
Epoch Step: 0 Loss: 0.150984 Tokens per Sec: 3591.138960
Epoch Step: 0 Loss: 0.149848 Tokens per Sec: 3584.205799
Epoch Step: 0 Loss: 0.151359 Tokens per Sec: 3566.023491
Epoch Step: 0 Loss: 0.149292 Tokens per Sec: 3540.769709
Epoch Step: 0 Loss: 0.149215 To

Epoch Step: 0 Loss: 0.146900 Tokens per Sec: 3570.667146
Epoch Step: 0 Loss: 0.146654 Tokens per Sec: 3551.061999
Epoch Step: 0 Loss: 0.146659 Tokens per Sec: 3527.062209
Epoch Step: 0 Loss: 0.147025 Tokens per Sec: 3539.574487
Epoch Step: 0 Loss: 0.146437 Tokens per Sec: 3521.852012
Epoch Step: 0 Loss: 0.146854 Tokens per Sec: 3551.908466
Epoch Step: 0 Loss: 0.146733 Tokens per Sec: 3554.383353
Epoch Step: 0 Loss: 0.146554 Tokens per Sec: 3558.191593
Epoch Step: 0 Loss: 0.146522 Tokens per Sec: 3510.791838
Epoch Step: 0 Loss: 0.146415 Tokens per Sec: 3589.795700
Epoch Step: 0 Loss: 0.146611 Tokens per Sec: 3566.169475
Epoch Step: 0 Loss: 0.146456 Tokens per Sec: 3556.079860
Epoch Step: 0 Loss: 0.146623 Tokens per Sec: 3501.965434
Epoch Step: 0 Loss: 0.146619 Tokens per Sec: 3543.550616
Epoch Step: 0 Loss: 0.146539 Tokens per Sec: 3572.503202
Epoch Step: 0 Loss: 0.146439 Tokens per Sec: 3527.831331
Epoch Step: 0 Loss: 0.146679 Tokens per Sec: 3569.586671
Epoch Step: 0 Loss: 0.146448 To

In [165]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = nd.array([[start_symbol]], ctx = ctx)
    for i in range(max_len-1):
        out = model.decode(memory, src_mask, ys, subsequent_mask(ys.shape[1]))
        #print('prob = {}'.format(out))
        next_word = nd.argmax(out, axis = 2)[:,-1]
        #print('i = {}, next_word = {}, last word = {}'.format(i, next_word.shape, next_word[:,-1].expand_dims(axis = 1).shape))
        #print('i = {}, ys = {}'.format(i, ys.shape))
        ys = nd.concat(ys, next_word.expand_dims(axis = 1), dim = 1)
        print('out = {}'.format(nd.argmax(out, axis = 2).asnumpy()))
        print('tgt = {}'.format(ys.asnumpy()))
        #ys = next_word
    return ys, out

src = nd.array([[1,2,3,4,5,6,7,8,9,10]], ctx = ctx)
print('src = {}'.format(src))
src_mask = nd.ones_like(src)
with autograd.predict_mode():
    res, out = greedy_decode(model, src, src_mask, max_len=10, start_symbol=1)
print('tgt = {}'.format(res))

src = 
[[ 1.00  2.00  3.00  4.00  5.00  6.00  7.00  8.00  9.00  10.00]]
<NDArray 1x10 @gpu(0)>
out = [[ 7.00]]
tgt = [[ 1.00  7.00]]
out = [[ 2.00  3.00]]
tgt = [[ 1.00  7.00  3.00]]
out = [[ 4.00  3.00  3.00]]
tgt = [[ 1.00  7.00  3.00  3.00]]
out = [[ 2.00  2.00  6.00  3.00]]
tgt = [[ 1.00  7.00  3.00  3.00  3.00]]
out = [[ 7.00  6.00  2.00  8.00  3.00]]
tgt = [[ 1.00  7.00  3.00  3.00  3.00  3.00]]
out = [[ 2.00  3.00  3.00  3.00  7.00  2.00]]
tgt = [[ 1.00  7.00  3.00  3.00  3.00  3.00  2.00]]
out = [[ 2.00  4.00  8.00  5.00  5.00  2.00  4.00]]
tgt = [[ 1.00  7.00  3.00  3.00  3.00  3.00  2.00  4.00]]
out = [[ 2.00  8.00  6.00  2.00  4.00  5.00  4.00  1.00]]
tgt = [[ 1.00  7.00  3.00  3.00  3.00  3.00  2.00  4.00  1.00]]
out = [[ 2.00  3.00  4.00  5.00  5.00  7.00  8.00  9.00  9.00]]
tgt = [[ 1.00  7.00  3.00  3.00  3.00  3.00  2.00  4.00  1.00  9.00]]
tgt = 
[[ 1.00  7.00  3.00  3.00  3.00  3.00  2.00  4.00  1.00  9.00]]
<NDArray 1x10 @gpu(0)>


In [161]:
np.set_printoptions(formatter={'float': '{: 0.2f}'.format})
print(nd.argmax(out, axis = 2))


[[ 2.00  3.00  4.00  5.00  5.00  7.00  8.00  9.00  9.00]]
<NDArray 1x9 @gpu(0)>


## Training Data and Batching

## Optimizer

In [None]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

In [None]:
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [None]:
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        return loss.data[0] * norm

In [None]:
opts = [NoamOpt(512, 1, 4000, None), 
        NoamOpt(512, 1, 8000, None),
        NoamOpt(256, 1, 4000, None)]
plt.plot(np.arange(1, 20000), [[opt.rate(i) for opt in opts] for i in range(1, 20000)])
plt.legend(["512:4000", "512:8000", "256:4000"])
None

## Label Smoothing

During training, we employed label smoothing of value $\varepsilon_{ls}=0.1$ (cite). This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.

We implement label smoothing using the KL div loss. Instead of using a one-hot target distribution, we create a distribution that has confidence of the correct word and the rest of the smoothing mass distributed throughout the vocabulary.

In [None]:
global max_src_in_batch, max_trg_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_trg_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_trg_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.src))
    max_trg_in_batch = max(max_trg_in_batch,  len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    trg_elements = count * max_trg_in_batch
    return max(src_elements, trg_elements)

In [None]:
import numpy as np
class LabelSmoothing(nn.Block):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = gluon.loss.KLDivLoss()
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.shape[1] == self.size
        true_dist = nd.ones_like(x)
        true_dist = true_dist * (self.smoothing / (self.size - 2))      
        _cols = list(target.asnumpy())
        _rows = list(range(x.shape[0]))
        _idx = nd.array([_rows, _cols])
        _trg = nd.array([self.confidence - (self.smoothing / (self.size - 2))] * target.shape[0])
        #print('_idx = {}'.format(_idx))
        true_dist = nd.scatter_nd(_trg, _idx, x.shape) + true_dist
        #print('true dist after scatter ={}'.format(true_dist))
        #print('self.confidence ={}'.format(self.confidence))
        #print('target ={}'.format(target))
        true_dist[:, self.padding_idx] = 0
        #print('true dist after padding ={}'.format(true_dist))
        
        # allow padding_idx as scalar
        if self.padding_idx in target:
            mask = np.argmax([1 if x == self.padding_idx else 0 for x in target])[0]
        else:
            mask = None
            
        #print('mask = {}'.format(mask))
        if mask is not None:
            #true_dist.index_fill_(0, mask.squeeze(), 0.0) 
            true_dist[mask, :] = 0
            #print('true dist after masking ={}'.format(true_dist))
            
        self.true_dist = true_dist
        #print('x.shape= {}'.format(x))
        #print('true_dist.shape= {}'.format(true_dist))
        return nd.sum(self.criterion(x, true_dist)).asnumpy()[0]


In [None]:
crit = LabelSmoothing(5, 0, 0.1)
def loss(x):
    d = x + 3 * 1
    predict = nd.array([[0, x / d, 1 / d, 1 / d, 1 / d],])
    #print(predict)
    #print(nd.array([1]))
    crit(nd.array(predict), nd.array([1]))
    return crit(nd.array(predict), nd.array([1]))

x = range(1, 100)
y = [loss(x) for x in range(1, 100)]
print('y = {}'.format(y))
plt.plot(x, y)
None

In [None]:
a = gluon.loss.KLDivLoss()

In [None]:
xx = nd.array([[1,2,3], [1,2,3]])
yy = nd.array([[10,20,30], [10,20,30]])
a(xx, yy)

In [None]:
# Example of label smoothing.
crit = LabelSmoothing(5, 0, 0.4)
predict = nd.array([[0, 0.2, 0.7, 0.1, 0],
                             [0, 0.2, 0.7, 0.1, 0], 
                             [0, 0.2, 0.7, 0.1, 0]])
v = crit(predict, nd.array([2, 1, 0]))

# Show the target distributions expected by the system.
plt.imshow(crit.true_dist.asnumpy())
None

In [None]:
V = 11
data = data_gen(V, 30, 20, 10)

In [None]:
d_trg

In [None]:
d_trg_y

### Greedy Decoding

In [None]:
import time

# Train the simple copy task.
V = 11
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)

model.collect_params().initialize(mx.init.Xavier())
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 1e-3})

for epoch in range(10):
    run_epoch(data_gen(V, 30, 20), model, 
              SimpleLossCompute(model.generator, criterion, trainer))
    model.eval()
    print(run_epoch(data_gen(V, 30, 5), model, 
                    SimpleLossCompute(model.generator, criterion, None)))