# Attention is all you need

I will be implementing the <a href="https://arxiv.org/pdf/1706.03762.pdf">**Attention is all you need**</a> paper following the wonderful work in <a href="http://nlp.seas.harvard.edu/2018/04/03/attention.html">**The Annotated Transformer**</a> by Harvard NLP.

This is mainly for understanding the transformer architecture clearly.

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

# Encoder-Decoder architecture

This is the basis of many sequence-to-sequence models.

In [15]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    forward() embeds the source and encodes it, embeds the target, and
    decodes the embedded target against the encoder output. The generator
    is only stored on the module; forward() does not apply it, so callers
    receive the raw decoder output.
    """
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        super().__init__()
        # Assigned left to right, so sub-module registration order is
        # encoder, decoder, source_embed, target_embed, generator.
        self.encoder, self.decoder = encoder, decoder
        self.source_embed, self.target_embed = source_embed, target_embed
        self.generator = generator

    def forward(self, source, target, source_mask, target_mask):
        # Encoder side: embed, then encode under the source mask.
        source_vectors = self.source_embed(source)
        memory = self.encoder(source_vectors, source_mask)
        # Decoder side: embed the target and decode against the memory.
        target_vectors = self.target_embed(target)
        return self.decoder(target_vectors, memory, source_mask, target_mask)

In [17]:
class Generator(nn.Module):
    """
    Output head: one linear layer mapping model_dim features to vocab
    logits, followed by a log-softmax over the last dimension.
    """
    def __init__(self, vocab, model_dim):
        super().__init__()
        self.projection = nn.Linear(in_features=model_dim, out_features=vocab)

    def forward(self, x):
        logits = self.projection(x)
        # Normalise over the last axis to get log-probabilities.
        return F.log_softmax(logits, dim=-1)

# Encoder-Decoder stacks
## Encoder

In [33]:
def clones(module, N):
    """
    Return an nn.ModuleList holding N independent deep copies of `module`.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every replica its own parameters.
        replicas.append(copy.deepcopy(module))
    return replicas

In [35]:
class Encoder(nn.Module):
    """
    Stack of N copies of a layer followed by a final layer normalization.

    Args:
        module: the layer to replicate. It is copied N times with `clones`
            and must expose a `size` attribute, which is passed to
            LayerNorm as the feature size.
        N: number of copies of `module` to stack.
    """
    def __init__(self, module, N):
        super(Encoder, self).__init__()
        self.layers = clones(module, N)
        self.layer_norm = LayerNorm(module.size)

    def forward(self, x, mask):
        """
        Pass `x` through each layer in order (each called as
        `layer(x, mask)`) and return the layer-normalized result.
        """
        for layer in self.layers:
            x = layer(x, mask)
        # The norm is registered as `layer_norm` in __init__; the former
        # `self.norm` lookup raised AttributeError on every forward pass.
        return self.layer_norm(x)

Here I apply layer normalization:

<img src="https://miro.medium.com/max/498/1*VSYtYThHQtxq1dNkVu6leQ.png">

In [36]:
class LayerNorm(nn.Module):
    """
    Layer normalization over the last dimension with a learnable gain and
    shift: a_2 * (x - mean) / (std + epsilon) + b_2.

    Args:
        module_size: size of the last (feature) dimension; both learnable
            parameters have this length.
        epsilon: small constant added to the standard deviation to avoid
            division by zero.
    """
    def __init__(self, module_size, epsilon=1e-6):
        super(LayerNorm, self).__init__()
        # Two learnable parameters: gain (a_2) starts at one and shift (b_2)
        # starts at zero, so a freshly built layer is a pure normalization.
        # (b_2 was previously initialised to ones, which offset every
        # output by +1 before any training.)
        self.a_2 = nn.Parameter(torch.ones(module_size))
        self.b_2 = nn.Parameter(torch.zeros(module_size))
        self.epsilon = epsilon

    def forward(self, x):
        """Normalize `x` along its last dimension, then scale and shift."""
        # keepdim=True keeps the reduced axis so the statistics broadcast
        # against x. `keepdim` is PyTorch's canonical spelling of the
        # numpy-style `keepdims` alias.
        mean = x.mean(dim=-1, keepdim=True)
        # Tensor.std uses its default (Bessel-corrected) estimator here.
        std = x.std(dim=-1, keepdim=True)
        # epsilon is added to std itself, not to the variance under a sqrt.
        return (self.a_2 * (x - mean) / (std + self.epsilon)) + self.b_2