In [1]:
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext import data, datasets
import torch.nn.functional as F

import spacy
import math, copy, time
import matplotlib.pyplot as plt
import seaborn


  import pandas.util.testing as tm


In [2]:
def clones(module, N):  # N = 6 in the paper
    """Build an ``nn.ModuleList`` containing N independent deep copies of ``module``.

    Each entry is produced with ``copy.deepcopy``, so the copies start with
    identical parameter values but do not share parameter storage.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

def attention(q, k, v, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    Args:
        q: Query tensor; the size of its last dimension is used as ``d_k``.
        k: Key tensor; its last two dimensions are transposed before being
            multiplied with ``q``.
        v: Value tensor that is mixed using the attention weights.
        mask: Optional tensor broadcastable to the score tensor. Positions
            where ``mask == 0`` are filled with ``-1e9`` before the softmax so
            they receive (numerically) zero weight.
        dropout: Optional callable (for example an ``nn.Dropout`` instance)
            applied to the attention weights before they are multiplied with
            ``v``. ``None`` disables it.

    Returns:
        A tuple ``(output, att)`` where ``output`` is ``att @ v`` and ``att``
        holds the attention weights (after dropout, when one is given).
    """
    d_k = q.size(-1)
    # Scale by sqrt(d_k) so the dot products do not grow with the key size.
    sc = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        sc = sc.masked_fill(mask == 0, -1e9)

    att = F.softmax(sc, dim=-1)
    if dropout is not None:
        att = dropout(att)
    return torch.matmul(att, v), att


In [3]:
class MultiHeadAtt(nn.Module):
    """Multi-head attention built on top of the module-level ``attention`` helper.

    The inputs are linearly projected, split into ``h`` heads of width
    ``d_k = d_model // h``, attended per head, re-joined and passed through a
    final linear projection.

    Attributes:
        h: Number of attention heads.
        d_k: Width of each head.
        l_transforms: Four ``nn.Linear(d_model, d_model)`` layers; the first
            three project query/key/value, the last one projects the output.
        att: Attention weights returned by the most recent forward pass
            (``None`` before the first call).
    """

    def __init__(self, reduction_factor, d_model, dropout=0.1):      # h = 8 as per paper
        """
        Args:
            reduction_factor: Number of heads ``h``; must divide ``d_model``.
            d_model: Width of the model (input and output feature size).
            dropout: Dropout probability handed to ``attention``.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``reduction_factor``.
        """
        super(MultiHeadAtt, self).__init__()
        if d_model % reduction_factor != 0:
            raise ValueError(
                "d_model (%d) must be divisible by the number of heads (%d)"
                % (d_model, reduction_factor))
        self.h = reduction_factor
        self.d_k = d_model // reduction_factor
        self.dropout = nn.Dropout(dropout)
        self.l_transforms = clones(nn.Linear(d_model, d_model), 4)
        self.att = None

    def forward(self, q, k, v, mask=None):
        """Run multi-head attention.

        Args:
            q, k, v: Query, key and value tensors. The first dimension is
                treated as the batch and the last one must be ``d_model``.
            mask: Optional mask; a head axis is inserted at dimension 1 so the
                same mask is applied to every head.

        Returns:
            Tensor reshaped to ``(batch, -1, h * d_k)`` and passed through the
            output projection.
        """
        batch_size = q.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1)

        # Project, then reshape to (batch, h, seq, d_k). zip stops after the
        # three inputs, leaving the fourth linear layer for the output.
        query, key, value = [
            linear(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
            for linear, x in zip(self.l_transforms, (q, k, v))
        ]

        # Apply attention on all the projected vectors in batch.
        x, self.att = attention(query, key, value, mask=mask,
                                dropout=self.dropout)

        # "Concat" the heads using a view and apply the final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.l_transforms[-1](x)


In [4]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable gain and bias.

    Attributes:
        ones: Learnable scale, initialised to ones, of length ``f``.
        zeros: Learnable shift, initialised to zeros, of length ``f``.
        eps: Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, f):
        """
        Args:
            f: Size of the last (feature) dimension of the inputs.
        """
        super(LayerNorm, self).__init__()
        self.ones = nn.Parameter(torch.ones(f))
        self.zeros = nn.Parameter(torch.zeros(f))
        self.eps = 1e-6

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift it."""
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        return self.ones * (x - mu) / (sigma + self.eps) + self.zeros


In [5]:
class ApplyNorm(nn.Module):
    """Residual wrapper around a sub-layer.

    The input is normalised, fed to the sub-layer, passed through dropout and
    finally added back to the un-normalised input.
    """

    def __init__(self, size, dropout):
        """
        Args:
            size: Feature size handed to ``LayerNorm``.
            dropout: Dropout probability applied to the sub-layer output.
        """
        super().__init__()
        self.norm = LayerNorm(size)
        self.regularizer = nn.Dropout(dropout)

    def forward(self, x, sub_layer):
        """Return ``x`` plus the regularised output of ``sub_layer`` on normalised ``x``."""
        normalised = self.norm(x)
        transformed = sub_layer(normalised)
        residual = self.regularizer(transformed)
        return x + residual



In [6]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each of the two stages is wrapped in its own ``ApplyNorm`` instance.
    """

    def __init__(self, size, layer1, layer2, dropout):
        """
        Args:
            size: Feature size, passed to ``ApplyNorm`` and stored as ``self.size``.
            layer1: Self-attention module, called as ``layer1(x, x, x, mask)``.
            layer2: Feed-forward module, called as ``layer2(x)``.
            dropout: Dropout probability for the ``ApplyNorm`` wrappers.
        """
        super().__init__()
        self.self_attention = layer1
        self.fnn = layer2
        self.sublayer = clones(ApplyNorm(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply self-attention with ``mask`` and then the feed-forward network."""
        attention_wrapper, feed_forward_wrapper = self.sublayer

        def self_attend(normed):
            # Query, key and value are all the same normalised input.
            return self.self_attention(normed, normed, normed, mask)

        attended = attention_wrapper(x, self_attend)
        return feed_forward_wrapper(attended, self.fnn)


In [7]:
class Encoder(nn.Module):
    """Stack of N copies of an encoder layer followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        """
        Args:
            layer: Layer to replicate; it must expose a ``size`` attribute,
                which is used to size the final normalisation.
            N: Number of copies in the stack.
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order (each receives ``mask``), then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        normalised = self.norm(hidden)
        return normalised


In [8]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each of the three stages is wrapped in its own ``ApplyNorm`` instance.
    """

    def __init__(self, size, layer1, layer2, layer3, dropout):
        """
        Args:
            size: Feature size, passed to ``ApplyNorm`` and stored as ``self.size``.
            layer1: Self-attention module.
            layer2: Encoder-decoder attention module.
            layer3: Feed-forward module.
            dropout: Dropout probability for the ``ApplyNorm`` wrappers.
        """
        super().__init__()
        self.self_attention = layer1
        self.ed_attention = layer2
        self.fnn = layer3
        self.sublayer = clones(ApplyNorm(size, dropout), 3)
        self.size = size

    def forward(self, x, prev, mask1, mask2):
        """Run the three stages in order.

        Args:
            x: Decoder input.
            prev: Tensor used as key and value of the encoder-decoder attention.
            mask1: Mask given to the encoder-decoder attention.
            mask2: Mask given to the self-attention.
        """
        self_wrapper, cross_wrapper, feed_forward_wrapper = self.sublayer

        def self_attend(normed):
            return self.self_attention(normed, normed, normed, mask2)

        def cross_attend(normed):
            return self.ed_attention(normed, prev, prev, mask1)

        after_self = self_wrapper(x, self_attend)
        after_cross = cross_wrapper(after_self, cross_attend)
        return feed_forward_wrapper(after_cross, self.fnn)
        

In [9]:
class Decoder(nn.Module):
    """Stack of N copies of a decoder layer followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        """
        Args:
            layer: Layer to replicate; it must expose a ``size`` attribute,
                which is used to size the final normalisation.
            N: Number of copies in the stack.
        """
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, prev, mask1, mask2):
        """Feed ``x`` through every layer, then normalise the result.

        ``prev``, ``mask1`` and ``mask2`` are forwarded unchanged to each layer.
        """
        for layer in self.layers:
            x = layer(x, prev, mask1, mask2)

        # The layers normalise their *input* only, so the output of the last
        # layer still has to go through the final norm (as Encoder does).
        return self.norm(x)


In [10]:
class Positional_FNN(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, d_fnn, dropout=0.1):
        """
        Args:
            d_model: Input and output feature size.
            d_fnn: Hidden feature size.
            dropout: Dropout probability applied to the hidden activations.
        """
        super(Positional_FNN, self).__init__()
        self.layer1 = nn.Linear(d_model, d_fnn)
        self.layer2 = nn.Linear(d_fnn, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        x = self.layer1(x)
        x = F.relu(x)
        # Regularise the hidden activations; a no-op in eval mode.
        x = self.dropout(x)
        return self.layer2(x)
    

In [11]:
class Embedd(nn.Module):
    """Token embedding lookup scaled by ``sqrt(d_model)``."""

    def __init__(self, vocabulary, d_model):  # d_model = 512 in Paper
        """
        Args:
            vocabulary: Number of entries in the embedding table.
            d_model: Size of each embedding vector.
        """
        # Zero-argument super() avoids naming the class (the previous call
        # referenced a non-existent ``Embed`` and raised NameError).
        super().__init__()
        self.vector = nn.Embedding(vocabulary, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for the indices in ``x`` and scale them."""
        return self.vector(x) * math.sqrt(self.d_model)


In [12]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table is computed once in ``__init__`` and stored in the non-trainable
    buffer ``pe`` of shape ``(1, max_len, d_model)``.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        """
        Args:
            d_model: Feature size of the inputs.
            dropout: Dropout probability applied after adding the encodings.
            max_len: Number of positions for which encodings are precomputed.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space. The aranges are
        # created as floats explicitly instead of relying on type promotion.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine table to fit.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        # Buffers never require grad, so no Variable wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [13]:
class Model(nn.Module):
    """Encoder-decoder container.

    Holds the encoder, decoder, both embedding modules and the final
    projection. ``forward`` returns the decoder output; the projection stored
    in ``self.final`` is not applied here.
    """

    def __init__(self, enc, dec, src_emb, tgt_emb, gen):
        """
        Args:
            enc: Encoder, called as ``enc(embedded_source, source_mask)``.
            dec: Decoder, called as
                ``dec(embedded_target, memory, source_mask, target_mask)``.
            src_emb: Module that embeds the source input.
            tgt_emb: Module that embeds the target input.
            gen: Final projection module, stored as ``self.final``.
        """
        super().__init__()
        self.encoder = enc
        self.decoder = dec
        self.source_embedding = src_emb
        self.target_embedding = tgt_emb
        self.final = gen

    def compute_encoding(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        return self.encoder(self.source_embedding(src), src_mask)

    def compute_decoding(self, m, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and decode it against the encoder output ``m``."""
        return self.decoder(self.target_embedding(tgt), m, src_mask, tgt_mask)

    def forward(self, source, target, source_mask, target_mask):
        """Encode ``source`` and then decode ``target`` against that encoding."""
        memory = self.compute_encoding(source, source_mask)
        decoded = self.compute_decoding(memory, source_mask, target, target_mask)
        return decoded



In [14]:
class Final_Proj(nn.Module):
    """Linear projection to vocabulary size followed by a log-softmax."""

    def __init__(self, d_model, vocab_size):
        """
        Args:
            d_model: Feature size of the inputs.
            vocab_size: Number of output classes.
        """
        super().__init__()
        self.projection = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return log-probabilities over the last dimension."""
        logits = self.projection(x)
        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs


In [15]:
def Transformer(source_voc, target_voc, N=6, d_model=512, d_fnn=2048, h=8,
                dropout=0.1):
    """Assemble a full encoder-decoder ``Model``.

    The defaults are the hyperparameters from the paper.

    Args:
        source_voc: Size of the source vocabulary (embedding table size).
        target_voc: Size of the target vocabulary (embedding table size and
            output size of the final projection).
        N: Number of encoder layers and of decoder layers.
        d_model: Model width.
        d_fnn: Hidden width of the feed-forward networks.
        h: Number of attention heads.
        dropout: Dropout probability for the residual wrappers and the
            positional encoding. The attention and feed-forward modules are
            built without a dropout argument and therefore use their own rate.

    Returns:
        A ``Model`` whose parameters with more than one dimension have been
        initialised with Xavier-uniform.
    """
    att = MultiHeadAtt(reduction_factor=h, d_model=d_model)
    fnn = Positional_FNN(d_model=d_model, d_fnn=d_fnn)
    posn = PositionalEncoding(d_model=d_model, dropout=dropout)
    src_embeddings = Embedd(source_voc, d_model=d_model)
    tgt_embeddings = Embedd(target_voc, d_model=d_model)

    # Every place a prototype module is used gets its own deep copy so that no
    # parameters are shared between layers.
    c = copy.deepcopy
    encoder_layer = EncoderLayer(d_model, c(att), c(fnn), dropout)
    encoder = Encoder(layer=encoder_layer, N=N)
    decoder_layer = DecoderLayer(d_model, c(att), c(att), c(fnn), dropout)
    decoder = Decoder(layer=decoder_layer, N=N)

    transformer_model = Model(encoder, decoder,
                              nn.Sequential(src_embeddings, c(posn)),
                              nn.Sequential(tgt_embeddings, c(posn)),
                              Final_Proj(d_model, target_voc))

    # Re-initialise all weight matrices; 1-D parameters (biases, norm gains)
    # keep their default initialisation.
    for param in transformer_model.parameters():
        if param.dim() > 1:
            # In-place variant; the un-suffixed xavier_uniform is deprecated.
            nn.init.xavier_uniform_(param)

    return transformer_model
