References:

[Original Paper](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)

[Medium Article](https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec)

[Analytics Vidhya Article](https://www.analyticsvidhya.com/blog/2019/06/understanding-transformers-nlp-state-of-the-art-models/)



In [0]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import io
import math

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, Dropout,Concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import copy
from torch.utils.data import Dataset, DataLoader

import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Setting Parameters

BATCH_SIZE = 32            # Batch size for the training set. After each BATCH_SIZE the weights will be updated
EPOCHS = 100               # Number of times we will train the model
VOCAB_SIZE = 20000         # Vocab size for the dataset
EMBEDDING_DIM = 512        # Embedding units to represent a single word in text for both language

# PREPROCESSING

In [0]:
"""
Preprocessing class for all the sequences.
A single instance is created for all the different languages and all the functions will be performed using this class

"""
class Preprocessing():
  
  """
  Args:
    
    VOCAB_SIZE = Maximum number of words in the sentence.
    length = Size of the longest sentence
    
  """
  def __init__(self,VOCAB_SIZE):
    self.VOCAB_SIZE = VOCAB_SIZE
    self.length = 0
  
  """
  Functions:
  
  Tokenizer_text: It tokenizes the corpus passed to it and then return the sequences, the word to index mapping, length of the largest sentence and the index to word mapping.
  total_words: Returns the total number of words in the vocabulary
  padding: Returns a padded sequence with the maxlen equal to self.length
  
  """
  def Tokenizer_text(self,text,VOCAB_SIZE,filters = None):
  
    if filter is not None:
    
      self.tokenizer_lang = Tokenizer(num_words=VOCAB_SIZE,filters = '')
      self.sentences = self.tokenizer_lang.fit_on_texts(text)
      self.sequences = self.tokenizer_lang.texts_to_sequences(text)
      self.word2idx = self.tokenizer_lang.word_index                           # Indexing happen in decreasing order of the frequency of the word. Most frequent word is indexed first.
  
    else:
      
      self.tokenizer_lang = Tokenizer(num_words=VOCAB_SIZE)
      self.sentences = self.tokenizer_lang.fit_on_texts(text)
      self.sequences = self.tokenizer_lang.texts_to_sequences(text)
      self.word2idx = self.tokenizer_lang.word_index
      
    self.length = max(len(s) for s in self.sequences)
    self.idx2word = {v:k for k,v in self.word2idx.items()}
    
    return self.sequences,self.word2idx,self.length,self.idx2word
  
  """
  eng = Preprocessing(VOCAB_SIZE)
  text = ['How are you','I am good thankyou']
  eng.Tokenizer_text(text,VOCAB_SIZE)
  
  OUTPUT 
  
  SEQUENCES =   ([[1, 2, 3], [4, 5, 6, 7]],
  WORD2INDEX =  {'am': 5, 'are': 2, 'good': 6, 'how': 1, 'i': 4, 'thankyou': 7, 'you': 3},
  MAX SEQUENCE LENGTH =  4,
  INDEX2WORD =  {1: 'how', 2: 'are', 3: 'you', 4: 'i', 5: 'am', 6: 'good', 7: 'thankyou'})
  
  """
  
  def total_words(self):
    return len(self.word2idx)
    
  """
  eng.total_words()
  
  OUTPUT
  
  7
  
  """
  def padding(self):
    return pad_sequences(self.sequences,maxlen = self.length,padding = 'post')
  
  """
  eng.padding()
  
  OUTPUT
  
  array([[1, 2, 3, 0],
         [4, 5, 6, 7]], dtype=int32)
  
  """
  


# EMBEDDING LAYER

## Word Embedding

In [0]:
"""
Embedding Class.
The embedding of each vector will be learned during the model execution and will be learned as the parameters of the model
We call also use a predefined embedding like the one provided in GloVe

"""
class Embedder(nn.Module):
    
    """
    Args
    num_words is the maximum number of words present in the vocabulary. Is equal to the value returned by total_words from the Preprocessing cards. 
    
    """
    def __init__(self, num_words, embed_dim):
      super().__init__()
      self.embed_dim = embed_dim
      self.embed = nn.Embedding(num_words, self.embed_dim)
    
    """
    Args
    x is the input i.e. the sentence with the words in the number format
    
    """
    def forward(self, x):
      x =  self.embed(x)
      return x
    
    """
    c = Embedder(7,4)   # We take num_words to be 7 because we have 6 numbers from 1-6 and 0 for padding so total 7
    text = torch.LongTensor([[1,2,3],[4,5,6]]) 
    c(text)
    
    OUTPUT
    
    tensor([[[-0.1336,  0.6301,  0.7810, -0.8898],
             [ 0.3597,  1.9011,  0.6907, -0.9391],
             [-0.8484, -0.1596,  1.4215,  1.9027]],

            [[ 0.3623, -0.6805, -0.7152, -0.3026],
             [-1.3681, -0.0394,  0.2179, -0.5563],
             [ 1.1576, -0.8182,  0.4053,  0.2688]]], grad_fn=<EmbeddingBackward>)
             
    """

## Position Embedding

In [0]:
"""
Position Embedding.
Since we are not using a RNN to create this transformer. The network would have no idea about the position of the word.
Hence we add some information related to the position to this embedding layer.

"""

class PositionEmbedding(nn.Module):
  
  
  """
  Args
  embed_dim is the dimension length for each of the words generated by the Embedding Layer
  max_seq_len is the max length of the sequence. This is equal to the length variable for each of the preprocessed text
    
  """
  
  def __init__(self,embed_dim,max_seq_len,dropout = 0.1):
    super().__init__()
    self.embed_dim = embed_dim
    self.max_seq_len = max_seq_len
    self.dropout = nn.Dropout(dropout)
    
    self.positionencoding = torch.zeros((self.max_seq_len,self.embed_dim))  # So that we have a structure for each position in relation to each dimension column
    
    """
    We first start with each position followed by each dimension column and then generate a value for that column which would then be added to the embedding output.
    """
     
    for position_in_sequence in range(self.max_seq_len):              
      for position_in_dimension in range(0, self.embed_dim, 2):
        self.positionencoding[position_in_sequence, position_in_dimension] = math.sin(position_in_sequence / (10000 ** ((2 * position_in_dimension)/self.embed_dim)))
        self.positionencoding[position_in_sequence, position_in_dimension + 1] = math.cos(position_in_sequence / (10000 ** ((2 * (position_in_dimension + 1))/self.embed_dim)))
                
    self.positionencoding = self.positionencoding.unsqueeze(0)   
    # So that we can have the nnumber of dimensions same as the input data. the first dimensions would overlap the batchsize so the next two dimensions overlap seq_len and embed_dim
    
  """
  Args
  x is the input i.e. the sentence with the words in the embeded format
  """      
  
  def forward(self,x):   # The x is the embedding output
    
    x = x * math.sqrt(self.embed_dim)   # Giving a higher value to the embedding output
    
    seq_len = x.size(1)          # Get the sequence length as the shape of x would be (batch_size,seq_len,embed_dim)
    self.positionencoding = Variable(self.positionencoding[:,:seq_len],requires_grad=False)  # Adding the Position
    x = x + self.positionencoding
    x = self.dropout(x)
    return x
    
    """
    SELF NOTE
    
    All of this gives the same output
    
     self.positioneoncodeing[:,:seq_len]
     self.positioneoncodeing[:,:,:]
     self.positioneoncodeing[:,:self.seq_len]
     self.positioneoncodeing[:,:self.seq_len,:self.embed_dim]
     
    Assuming the embed_dim and max_seq_len we gave has the same values as the values of the dimension of positionencoding  
    
    OUTPUTS
    
    x = torch.zeros(2,3,4)
    layer = PositionEmbedding(4,3)
    c = layer(x)
    c                   # For each batch size
    
    tensor([[[0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00],
             [8.4147e-01, 9.9995e-01, 1.0000e-04, 1.0000e+00],
             [9.0930e-01, 9.9980e-01, 2.0000e-04, 1.0000e+00]],

           [[0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00],
            [8.4147e-01, 9.9995e-01, 1.0000e-04, 1.0000e+00],
            [9.0930e-01, 9.9980e-01, 2.0000e-04, 1.0000e+00]]])
    
    c[0]               # For each input whose seq length is 3 and embed_dim is 4
    
    tensor([[0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00],
        [8.4147e-01, 9.9995e-01, 1.0000e-04, 1.0000e+00],
        [9.0930e-01, 9.9980e-01, 2.0000e-04, 1.0000e+00]])
        
    c[0,1]             # For each input and word at time 1 we get the followind embed_dim values
    
    tensor([8.4147e-01, 9.9995e-01, 1.0000e-04, 1.0000e+00])
    """
    

# MASKING

In [0]:
"""
Masking

This will mask all the padded values in the sequence for source and target so that they do not contribute to the information of the next word
"""

class Masking():
  
  """
  Args
  sequencetype to see if its a source or target since both of them have different maskings
  paddingvalue = 0 to provide the integer for padding
  
  """
  def __init__(self,sequencetype = 'Source',paddingvalue = 0):
    
    self.sequencetype = sequencetype
    self.paddingvalue = paddingvalue
  
  def create_mask(self,sequence):  # Sequence is the input sequence

    if self.sequencetype == 'Source':   # Encoder Mapping or input Mapping
      
      self.mask = (sequence != self.paddingvalue).unsqueeze(-2)   # We unsqueeze the data so that for each input we have a mask of size (1,seq_len) 
      return self.mask  
    
      """
    
        c = Masking(sequencetype = 'Target',paddingvalue = 0)
        text = torch.Tensor([[1,2,3,4,0,0],[1,2,3,4,0,0]])
        c.create_mask(text)
      
        OUTPUT WHEN mask is printed
       
        tensor([[1, 1, 1, 1, 0, 0],
               [1, 1, 1, 1, 0, 0]], dtype=torch.uint8)
       
      """   
    
    else:
      self.mask = (sequence != self.paddingvalue).unsqueeze(-2)  # This is done to avoid padding
      
      # We also need to make sure that at no point in time, there is a leftward movement of information in the decoder. i.e. No word at time greater than t is able to impact the decision at time t.
      
      size = sequence.size(1)   # Since the shape of the sequence is batch_size,seq_len,embed_dim
      
      self.forwardmask = np.ones((1,size,size))
      self.forwardmask = np.triu(self.forwardmask,k=1).astype('uint8')   # This will make all the values below the kth diagonal zero 
      
      self.nopeak_mask = Variable(torch.from_numpy(self.forwardmask) == 0)   # This will basically reverse the matrix with 1 becoming zero and zero becoming one. Hence we get our required masking for the sequences,
      
      """
      c = Masking(sequencetype = 'Source',paddingvalue = 0)
      text = torch.Tensor([[1,2,3,4,0,0],[1,2,3,4,0,0]])
      c.create_mask(text)
      
      OUTPUT WHEN nopeak_mask is printed

         tensor([[[1, 0, 0, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0],
                  [1, 1, 1, 0, 0, 0],
                  [1, 1, 1, 1, 0, 0],
                  [1, 1, 1, 1, 1, 0],
                  [1, 1, 1, 1, 1, 1]]], dtype=torch.uint8)

      As we can see for t = 1 all the values in the future is masked. For t = 2 only value at t = 1 is unmasked and so on.
      
      But we can see that we get this output only once but we need this masking matrix for all the inputs. We also need the masking for the padding values.
      Hence the next step.
      
      """
      
      self.final_mask = self.mask & self.nopeak_mask 
      
      """
    
       c = Masking(sequencetype = 'Target',paddingvalue = 0)
       text = torch.Tensor([[1,2,3,4,0,0],[1,2,3,4,0,0]])
       c.create_mask(text)
      
       OUTPUT WHEN final_mask is printed

         tensor([[[1, 0, 0, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0],
                  [1, 1, 1, 0, 0, 0],
                  [1, 1, 1, 1, 0, 0],
                  [1, 1, 1, 1, 0, 0],
                  [1, 1, 1, 1, 0, 0]],
                  
                  
                 [[1, 0, 0, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0],
                  [1, 1, 1, 0, 0, 0],
                  [1, 1, 1, 1, 0, 0],
                  [1, 1, 1, 1, 0, 0],
                  [1, 1, 1, 1, 0, 0]]], dtype=torch.uint8)

      Now we get output for all the inputs. 
      
      """
      
      return self.final_mask
      
      
      

# ATTENTION LAYER

## Multi Headed Attention

In [0]:
"""
Multi Headed Attention Layer

Here we will implement the Multi Headed Attention class which will calculate the self attention for the encoder and the decoder in the architecture. To learn more about how this works look at the references mentioned above.

"""

class MultiHeadedAttention(nn.Module):
  
  """
  Args
  heads is the number of heads we want to create as per the paper
  embed_dim is the embedding dimension
  
  """
  def __init__(self,heads,embed_dim,dropout = 0.1): # You can change the dropout value to include regularization
    super().__init__()
    
    self.heads = heads
    self.embed_dim = embed_dim
    self.matrixsize = self.embed_dim // self.heads  # So that size is in int and not in float
    
    self.q_linear = nn.Linear(self.embed_dim, self.embed_dim)
    self.k_linear = nn.Linear(self.embed_dim, self.embed_dim)
    self.v_linear = nn.Linear(self.embed_dim, self.embed_dim)
    

    self.dropout = nn.Dropout(dropout)
    
    self.out = nn.Linear(self.embed_dim, self.embed_dim)
    
  """
  Args
  q_vector,k_vector,v_vector are the vector provided for the multi headed attention
  mask is the mask for the vectors
  
  """
  def forward(self, q_vector, k_vector, v_vector, mask=None):
      
    self.batch_size = q_vector.size(0)
      
    self.q_vector = self.q_linear(q_vector).view(self.batch_size, -1, self.heads, self.matrixsize)                # Here we calculate different values of matrix Q,K,V
    self.k_vector = self.k_linear(k_vector).view(self.batch_size, -1, self.heads, self.matrixsize)
    self.v_vector = self.v_linear(v_vector).view(self.batch_size, -1, self.heads, self.matrixsize)
    
    """
    q = torch.zeros(2,20,20)
    c = MultiHeadedAttention(5,20)
    c(q,q,q).size()
    
    When the .view lines and the transpose line below were commented and the size of q_vector was printed.
    
    OUTPUT
    
    torch.Size([2, 20, 20]) which would have values different from the ones passed as a linear function was applied
    
    q = torch.zeros(2,20,20)
    c = MultiHeadedAttention(5,20)
    c(q,q,q).size()
    
    When the transpose line below were commented and the size of q_vector was printed.
    
    OUTPUT
    
    Size of the q vector
    
    torch.Size([2, 20, 5, 4])
    
    The third dimension in the input is the embed_dim (20 in our example). Now we have split that dimension into 5 heads each with a size of 4. We need to transpose this into a form which we can use for multiplication.
    Hence the transpose layer
    
    """
       
    self.q_vector = self.q_vector.transpose(1,2)            
    self.k_vector = self.k_vector.transpose(1,2)
    self.v_vector = self.v_vector.transpose(1,2)
    
    """
    q = torch.zeros(2,20,20)
    c = MultiHeadedAttention(5,20)
    c(q,q,q).size()
    
    When the .view lines and the transpose line below were commented and the size of q_vector was printed.
    
    OUTPUT
    
    Size of the q vector
    
    torch.Size([2, 5, 20, 4])
       
    """
    
      
    # Now we will calculate individual attentions
    scores = single_attention(self.q_vector, self.k_vector, self.v_vector, self.matrixsize, mask, self.dropout)
    
    """
    We get the scores shape as (2,5,20,4)
    
    We have done the matrix multiplication now we need to normalize and concat the values
    """
    
    # This is the concat layer for each of the attentions
    concat = scores.transpose(1,2).contiguous().view(self.batch_size, -1, self.embed_dim)
        
    output = self.out(concat)    # This is the final Linear Layer
    
    """
    Shape of the output
    
    torch.Size([2, 20, 20])
    
    """
    return output


## Single Layer Attention

In [0]:
"""
Single attention Layer.

This calculate a single attention for the dataset.
It calculate the function

(q_vector*k_vector)*v_vector/sqrt

"""
"""
Args

q_vector,k_vector,v_vector are passed from the above Multi headed attention class
embed_dim is the embedding dimesion
mask is the mask for the input to single attetion

"""
def single_attention(q_vector, k_vector, v_vector, matrixsize, mask=None, dropout=None):
  
  scores = torch.matmul(q_vector, k_vector.transpose(-2, -1)) /  math.sqrt(matrixsize)   # Size of the k_vector.transpose is [2,5,4,20]
  
  """
  Size of the scores
  
  OUTPUT
  torch.Size([2, 5, 20, 20])
  
  """
  if mask is not None:
    mask = mask.unsqueeze(1)                        
    
    """
    Scores are of size (2,5,20,20)
    But if we would have applied the mask function for the same values it would be of size (2,20,20) since heads are created as a part of the multiheaded attention function
    
    So we need to unsqueeze it at dim = 1 so the new shape of the mask is (2,1,20,20). Hence we get a mask for each of the values
    
    """
    scores = scores.masked_fill(mask == 0, -1e9)
  
  scores = F.softmax(scores, dim=-1) # We apply the softmax on the last dim
    
  if dropout is not None:
    scores = dropout(scores)
        
  output = torch.matmul(scores, v_vector)
  
  """
  
  Shape of scores is (2,5,20,20)
  Shape of v_vector is (2,5,20,4)
  
  Hence after matrix multiplication
  Shape of output is (2,5,20,4)
  """
  
  return output


# FEED FORWARD LAYER

In [0]:
"""
Feed Forward Network

As per the paper each layer in the encoder and decoder contains a fully connected feedforward layer with a relu function in between. This clas is use to model this layer.

"""

class FeedForward(nn.Module):
  
  """
  Args
  embed_dim is the embedding dimension
  ffhd is the feed forward network hidden layer size
  """
  def __init__(self,embed_dim,ffhd = 2048,dropout = 0.1):  # ffhd stands for Feed Forward Hidden Dimension 
    super().__init__()
    
    self.linear_1 = nn.Linear(embed_dim,ffhd)
    self.dropout = nn.Dropout(dropout)
    self.linear_2 = nn.Linear(ffhd,embed_dim)
    
  """
  Args
  x is the input
  """
  def forward(self,x):
    
    x = F.relu(self.linear_1(x))
    
    """
    c = FeedForward(20)
    t = torch.ones(2,20,20)
    c(t).size()
    
    Shape before first linear layer : (2,20,20) # As per what was calculated in the MultiHeaded Attention class
    Shape after first linear layer : (2,20,2048)
    """
    x = self.dropout(x)
    
    x = self.linear_2(x)
    
    """
    c = FeedForward(20)
    t = torch.ones(2,20,20)
    c(t).size()
    
    Shape before second linear layer : (2,20,2048) 
    Shape after second linear layer : (2,20,20)
    """
    
    return x
    
    
    

# NORMALIZATION LAYER

In [0]:
"""
Normalization Layer

Each of the output from the feed forward is followed by a normalization layer so that the outputs do not go out of hand and stay with a certain level

"""

class Normalization(nn.Module):
  """
  Args
  embed_dim is the emebedding size
  """
  def __init__(self, embed_dim, eps = 1e-6):
    super().__init__()
    self.embed_dim = embed_dim

    # create two learnable parameters to calibrate normalisation
    
    self.alpha = nn.Parameter(torch.ones(self.embed_dim))
    self.bias = nn.Parameter(torch.zeros(self.embed_dim))
    self.eps = eps
  
  def forward(self,x):
    
    norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
    return norm
  
  

# PUTTING IT ALL TOGETHER

As we see in the paper the encoder and the decoder have a particular architecture comprising of many layers. And this layers have a particular structure. We will create this strcuture here

## Encoder Layer Architecture

In [0]:
"""
Encoder Structure

Each encoder has a single Multi Headed Attention layer and a Feed Forward Layer

"""

class EncoderStructure(nn.Module):
  
  """
  Args
  embed_dim is the embedding dimesion
  heads is the number of heads for the multiheaded attention layer
  """
  def __init__(self,embed_dim,heads,dropout = 0.1):
    super().__init__()
    
    # This layer are in order as shown in the image in the paper
    
    self.normalize_1 = Normalization(embed_dim = embed_dim)
    self.mhattention = MultiHeadedAttention(embed_dim = embed_dim, heads = heads)
    self.dropout_1 = nn.Dropout(dropout)
    self.normalize_2 = Normalization(embed_dim = embed_dim)
    self.feedforward = FeedForward(embed_dim=embed_dim)
    self.dropout_2 = nn.Dropout(dropout)
    
  """
  Args
  x is the input size
  mask is the encoder mask
  """
  def forward(self,x,mask):
    
    x2 = self.normalize_1(x)                                 # This is like the calculation of q,k,v which are equal
    x = x + self.dropout_1(self.mhattention(x2,x2,x2,mask))  # This calculates the multi headed attention and then perform the add and norm layer as per the paper
    x2 = self.normalize_1(x)                                 # This applies a normalization before the feedforward layer
    x = x2 + self.dropout_2(self.feedforward(x2))            # This is the feedforward layer followed by the add and norm layer as per the paper
    
    return x
    
    """
    c = EncoderStructure(20,5)
    t = torch.ones(2,20,20)
    c(t,None).size()
    
    Where None is mentioned we provide the mask for the dataset
    
    OUTPUT
    
    torch.Size([2, 20, 20])
    
    The shape is same as the input but all the functions of the encoder structure are applied to it.
    
    """
  
    
    
    

## Decoder Layer Architecture

In [0]:
"""
Decoder Structure

Each decoder has two Multi Headed Attention layer and a Feed Forward Layer

"""

class DecoderStructure(nn.Module):
  
  """
  Args
  embed_dim is the embedding dimesion
  heads is the number of heads for the multiheaded attention layer
  """
  def __init__(self,embed_dim,heads,dropout = 0.1):
    super().__init__()
  
    # This layer are in order as shown in the image in the paper
  
    self.normalize_1 = Normalization(embed_dim = embed_dim)
    self.attention_1 = MultiHeadedAttention(embed_dim = embed_dim,heads = heads)
    self.dropout_1 = nn.Dropout(dropout)
  
    self.normalize_2 = Normalization(embed_dim = embed_dim)
    self.attention_2 = MultiHeadedAttention(embed_dim = embed_dim, heads = heads)
    self.dropout_2 = nn.Dropout(dropout)
  
    self.normalize_3 = Normalization(embed_dim = embed_dim)
    self.feedforward = FeedForward(embed_dim = embed_dim)
    self.dropout_3 = nn.Dropout(dropout)
  
  """
  Args
  x is the input size
  encoder_output is the encoder output
  source_mask is the source mask
  target_mask is the target_mask
  """  
  def forward(self,x,encoder_output,source_mask,target_mask):
     
    x2 = self.normalize_1(x)                                              # This is like the calculation of q,k,v which are equal
    x = x + self.dropout_1(self.attention_1(x2,x2,x2,target_mask))       # This calculates the multi headed attention and then perform the add and norm layer as per the paper and uses the target mask
    
    x2 = self.normalize_2(x)                                              # This is like the calculation of q,k,v which are equal
    x = x + self.dropout_1(self.attention_2(x2,encoder_output,encoder_output,source_mask))
    
    # This layer is the second multi headed layer from the image. It has two inputs k and v coming from the encoder layer and one input coming from the previous multi headed attention.
    # This makes use of the source mask as we have inputs coming from the encoder
    
    x2 = self.normalize_3(x)
    x = x + self.dropout_3(self.feedforward(x2))
    
    return x
  
    """
    c = DecoderStructure(20,5)
    t = torch.ones(2,20,20)
    c(t,t,None,None).size()       
    
    Passing the same t inplace of the encoder output just to check the shapes
    
    Where None is mentioned we provide the mask for the dataset
    
    OUTPUT
    
    torch.Size([2, 20, 20])
    
    The shape is same as the input but all the functions of the encoder structure are applied to it.
    
    """
    
    
  

## Getting Layer Copies

In [0]:
def clones(module,number):
  return nn.ModuleList([copy.deepcopy(module) for i in range(number)])

##  Encoder 

In [0]:
"""
Encoder

In the above classes we just build one Encoder Layer structure. This class will create the entire structure for the Encoder using the Layer architecture created above.

"""

class Encoder(nn.Module):
  
  """
  Args
  num_words is the total word size for the vacab
  embed_dim is the embedding dimension
  number is the number of encoder layers
  heads is the number of heads for the attention layer
  max_seq_len is the maximum length of the input to encoder
  """
  def __init__(self,num_words,embed_dim,number,heads,max_seq_len):
    super().__init__()
    
    self.number = number                 # Number of Encoder Layers
    self.embedded_layer = Embedder(embed_dim=embed_dim,num_words=num_words)                                # First creating Word Embedding the input 
    self.PositionEmbedding = PositionEmbedding(embed_dim=embed_dim,max_seq_len=max_seq_len)                # Creating the Position Embedding
    self.encoderlayers = clones(module = EncoderStructure(embed_dim=embed_dim,heads=heads),number=number)  # Creating the Encoder Layers
    self.normalization = Normalization(embed_dim=embed_dim)                                                # Normalization LAyer
    
  
  """
  Args
  source_input is the input to encoder
  source_mask is the mask for the input
  """
  def forward(self,source_input,source_mask):
    
    x = self.embedded_layer(source_input)
    x = self.PositionEmbedding(x)
    
    for i in range(self.number):  
      x = self.encoderlayers[i](x,source_mask)
    
    x = self.normalization(x)
    
    return x

  """
  c = Encoder(20,20,5,5,5)
  m = Masking(sequencetype='Source',paddingvalue=0)
  t = torch.LongTensor([[1,2,3,4,5],[6,7,8,9,10]])
  print('Input Shape',t.size())
  a = m.create_mask(t)
  c(t,a).size()
  print('Output Shape',c(t,a).size())

  We created an encoder structure with num_words = 20, embed_Dim = 20,number of encoder layers = 5,heads = 5,max_seq_len = 5
  We then create a masking for the input t which basically represents a sentence and its 5 words 
  Then we pass it to the Encoder layer as it is the input it expects
  
  OUTPUT
  
  Input Shape torch.Size([2, 5])
  Output Shape torch.Size([2, 5, 20])
  
  As we can see that we are able to create an embedding of 20 for each words and perform all the tasks for the encoder.
  
  """
    
    
    
  
  

## Decoder

In [0]:
"""
Decoder

In the above classes we just build one Decoder Layer structure. This class will create the entire structure for the Decoder using the Layer architecture created above.

"""

class Decoder(nn.Module):
  
  """
  Args
  num_words is the total word size for the vacab
  embed_dim is the embedding dimension
  number is the number of decoder layers
  heads is the number of heads for the attention layer
  max_seq_len is the maximum length of the input to decoder
  """
  def __init__(self,num_words,embed_dim,number,heads,max_seq_len):
    super().__init__()
    
    self.number = number                 # Number of Encoder Layers
    self.embedded_layer = Embedder(embed_dim=embed_dim,num_words=num_words)                                # First creating Word Embedding the input 
    self.PositionEmbedding = PositionEmbedding(embed_dim=embed_dim,max_seq_len=max_seq_len)                # Creating the Position Embedding
    self.decoderlayers = clones(module = DecoderStructure(embed_dim=embed_dim,heads=heads),number=number)  # Creating the Encoder Layers
    self.normalization = Normalization(embed_dim=embed_dim)                                                # Normalization LAyer
    
  """
  Args
  target_input is the input to decoder
  encoder_input is the output of the encoder
  source_mask is the mask for the encoder
  target_mask is the mask for the decoder
  """
  def forward(self,target_input,encoder_input,source_mask,target_mask):
    
    x = self.embedded_layer(target_input)
    x = self.PositionEmbedding(x)
    
    for i in range(self.number):  
      x = self.decoderlayers[i](x,encoder_input,source_mask,target_mask)
    
    x = self.normalization(x)
    
    return x

  """
  c = Encoder(20,20,5,5,5)
  m = Masking(sequencetype='Source',paddingvalue=0)
  t = torch.LongTensor([[1,2,3,4,5],[6,7,8,9,10]])
  print('Input Shape',t.size())
  a = m.create_mask(t)
  ei = c(t,a) 
  print('Output Shape Encoder',ei.size())
  d = Decoder(20,20,5,5,5)
  x = d(t,ei,a,a)
  print('Output Shape Decoder',x.size())

  We created an encoder structure with num_words = 20, embed_Dim = 20,number of encoder layers = 5,heads = 5,max_seq_len = 5
  We then create a masking for the input t which basically represents a sentence and its 5 words 
  Then we pass it to the Encoder layer as it is the input it expects
  
  We created an decoder structure with num_words = 20, embed_Dim = 20,number of encoder layers = 5,heads = 5,max_seq_len = 5
  For understanding purposes we are using the same mask for source and target but they will be different in the actual set
  We then call the Decoder forward function and pass the necessary inputs
  
  OUTPUT
  
  Input Shape torch.Size([2, 5])
  Output Shape Encoder torch.Size([2, 5, 20])
  Output Shape Decoder torch.Size([2, 5, 20])
  
  As we can see that we are able to create an embedding of 20 for each words and perform all the tasks for the encoder.
  
  """
    
    
    
  
  

## Transformer

In [0]:
"""
Transformer

After creating the Encoder and Decoder Layers. They still are not related as they are individual classes. In this class we will create a sequential flow of this layers

"""

class Transformer(nn.Module):
  
  """
  Args
  
  source_vocab is the total words for the source language
  target_vocab is the total words for the target language
  embed_dim is the embedding dimension
  number is the number of encoder and decoder layers
  heads is the number of head for the multi attention layer
  max_seq_length_source is the maximum length for the source language
  max_seq_length_target is the maximum length for the target language
  """
  
  def __init__(self,source_vocab,target_vocab,embed_dim,number,heads,max_seq_length_source,max_seq_length_target):
    super().__init__()
    
    self.Encoder = Encoder(embed_dim=embed_dim,number=number,num_words=source_vocab,max_seq_len=max_seq_length_source,heads=heads)     # Encoder Layer
    self.Decoder = Decoder(embed_dim=embed_dim,number=number,num_words=target_vocab,max_seq_len=max_seq_length_target,heads=heads)    # Decoder Layer
    self.outputlayer = nn.Linear(embed_dim,target_vocab)  # Convert an input from embed_dim length to target_vocab length
    
  """
  source is the source language sequences
  target is the target language sequences
  source_mask is the mask for the source
  target_mask is the mask for the target
  
  """
  def forward(self,source,target,source_mask,target_mask):
    
    encoder_output = self.Encoder(source,source_mask)
    decoder_output = self.Decoder(target,encoder_output,source_mask,target_mask)
    output = self.outputlayer(decoder_output)
    
    return output
  
  """
  source_masking = Masking(sequencetype='Source',paddingvalue=0)
  source = torch.LongTensor([[1,2,3,4,5],[6,7,8,9,10]])
  print('Input Shape',source.size())
  source_masked = source_masking.create_mask(source)
  c = Transformer(20,25,20,5,5,5,7)
  target = torch.LongTensor([[1,2,3,4,5,6,7],[6,7,8,9,10,13,14]])
  target_masking = Masking(sequencetype='Target',paddingvalue=0)
  target_masked = target_masking.create_mask(target)
  c(source,target,source_masked,target_masked).size()
  
  First we will create two masks one for source and one for target
  We will mask our source and target
  Then we will run the transformer
  
  OUTPUT:
  
  Input Shape torch.Size([2, 5])
  torch.Size([2, 7, 25])
  
  As we can see that the input is successfully giving an output as expected. We said that the max sequence length for target is 7
  and it has 25 vocab size. Hence for each input we get the required output.
  
  """

Now as all the strutures are connected and we are able to run on this small inputs. Its time we take the actual inputs 

# DATA GENERATION

In [0]:
import io
#imdbdata = pd.read_csv('fra.txt',encoding = 'latin-1')
df = pd.read_table('/content/drive/My Drive/Colab Notebooks/Dataset/fra.txt',header = None)

In [47]:
df.columns = ['English','French']
df.head()

Unnamed: 0,English,French
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [0]:
english_text_input = df['English'][:25000]    # Input for the training Encoder
french_text_input = df['French'][:25000].apply(lambda x: '<sos> ' + x + ' <eos>')      # Input for the training Decoder

In [0]:
#Preprocessing for English Text

en_preprocessor = Preprocessing(VOCAB_SIZE=VOCAB_SIZE) 
english_sequences,english_word2idx,english_length,english_idx2word = en_preprocessor.Tokenizer_text(english_text_input,VOCAB_SIZE) 
english_sequences = en_preprocessor.padding()
english_totalwords = len(english_word2idx)


In [0]:
#Preprocessing for French Text

fra_preprocessor = Preprocessing(VOCAB_SIZE=VOCAB_SIZE) 
french_sequences,french_word2idx,french_length,french_idx2word = fra_preprocessor.Tokenizer_text(french_text_input,VOCAB_SIZE) 
french_sequences = fra_preprocessor.padding()
french_totalwords = len(french_word2idx)

# DATA LOADER

In [0]:
"""
TranslatorDataset creates a dataset for easy batch creation
"""
class TranslatorDataset(Dataset):
  
  """
  Args
  english_sequences is the source language sequences
  french_sequences is the target language sequences
  """
  def __init__(self,english_sequences,french_sequences):
    self.english_sequences = english_sequences
    self.french_sequences = french_sequences
    
  def __len__(self):
    return len(self.english_sequences)
  
  def __getitem__(self,index):
    
    english = self.english_sequences[index]
    french = self.french_sequences[index]
    
    return english,french

In [0]:
data = TranslatorDataset(english_sequences,french_sequences)

# MODEL CREATION

In [0]:
model = Transformer(embed_dim=EMBEDDING_DIM,heads=8,number=6,max_seq_length_source=english_length,max_seq_length_target=french_length,source_vocab=english_totalwords+1,target_vocab=french_totalwords+1)

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
        

optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# TRAINING

In [0]:
def train(epochs,print_timestep = 100):
  
  model.train()
  
  start = time.time()
  temp = start
  
  total_loss = 0
  
  # Creating the Dataloader for the dataset
  
  dataloader = DataLoader(data, batch_size=BATCH_SIZE,shuffle=False)
  
  # Appliyng the masking
  source_masking = Masking(sequencetype='Source',paddingvalue=0)
  target_masking = Masking(sequencetype='Target',paddingvalue=0)
  
  # Rrunning epochs
  for epoch in range(epochs):
    
    # Extracting batches
    for i_batch, sample_batched in enumerate(dataloader):
      
      # Extracting the source and target and converting into tensor
      
      source = torch.tensor(sample_batched[0]).to(torch.int64)
      target = torch.tensor(sample_batched[1]).to(torch.int64)
      target_input = target[:,:-1]
      target_output = target[:,1:].contiguous().view(-1)
      
      #masking
      source_mask = source_masking.create_mask(source)
      target_mask = target_masking.create_mask(target_input)
      
#       print('Source',source.size())
#       print('Source',target_input.size())
#       print('Source',source_mask.size())
#       print('Source',target_mask.size())

      # Predicting
      preds = model(source,target_input,source_mask,target_mask)
      optim.zero_grad()

      # Calculating Loss
      loss = F.cross_entropy(preds.view(-1, preds.size(-1)),target_output, ignore_index=0)
      
      # Updating weights
      loss.backward()
      optim.step()
      
      total_loss += loss.item()
      if (i_batch + 1) % print_timestep == 0:
        loss_avg = total_loss / print_timestep
        print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % ((time.time() - start) // 60,epoch + 1, i_batch + 1, loss_avg,time.time() - temp,print_timestep))
        total_loss = 0
        temp = time.time()
        

In [119]:
train(1)


time = 2m, epoch 1, iter = 100, loss = 6.446, 136s per 100 iters
time = 4m, epoch 1, iter = 200, loss = 5.617, 135s per 100 iters
time = 6m, epoch 1, iter = 300, loss = 5.509, 132s per 100 iters
time = 8m, epoch 1, iter = 400, loss = 5.518, 133s per 100 iters
time = 11m, epoch 1, iter = 500, loss = 5.489, 132s per 100 iters
time = 13m, epoch 1, iter = 600, loss = 5.446, 131s per 100 iters
time = 15m, epoch 1, iter = 700, loss = 5.388, 131s per 100 iters


# TESTING

In [165]:
# Generating a random input
i = np.random.choice(len(english_text_input[:25000]))

model.eval()

# Preprocessing taks
input_sequence = english_sequences[i:i+1]
input_sequence = torch.from_numpy(input_sequence).to(torch.int64)

source_masking = Masking(sequencetype='Source',paddingvalue=0)
target_masking = Masking(sequencetype='Target',paddingvalue=0)

source_mask = source_masking.create_mask(input_sequence)

# Getting the encoder output to pass to the input
encoder_output = model.Encoder(input_sequence,source_mask)

# creating an output tensor array with starting as <sos>

outputs = torch.zeros(13).type_as(input_sequence.data)
outputs[0] = torch.LongTensor([french_word2idx['<sos>']])

# Running for the maximum seq length
for j in range(1,french_length-1):
  
  # Target Mask
  trg_mask = np.triu(np.ones((1, j, j)),k=1).astype('uint8')
  trg_mask= Variable(torch.from_numpy(trg_mask) == 0)
  
  # Calculating output
  out = model.outputlayer(model.Decoder(outputs[:j].unsqueeze(0),encoder_output, source_mask, trg_mask))  # unsqueezing it to match the expected size
  out = F.softmax(out, dim=-1)
  
#  print(out[:,-1].data.topk(1))
  # Extracting index with maximum probability
  
  val, ix = out[:, -1].data.topk(1)
  
  outputs[j] = ix[0][0]
  
  if ix[0][0] == french_word2idx['<eos>']:
    break
  
  

print('English: ',english_text_input[i])
print('French: ',french_text_input[i])
print('French Predicted: '," ".join(french_idx2word[int(index.numpy())] for index in outputs[1:j]))

English:  I'm calm.
French:  <sos> Je suis calme. <eos>
French Predicted:  je suis ne ne ne ne ne ne ne ne ne


#  SAVING THE MODEL

In [0]:
torch.save(model.state_dict(), '/content/drive/My Drive/Colab Notebooks/Dataset/model.pth')

# LOADING THE MODEL

In [55]:
model2 = Transformer(embed_dim=EMBEDDING_DIM,heads=8,number=6,max_seq_length_source=english_length,max_seq_length_target=french_length,source_vocab=english_totalwords+1,target_vocab=french_totalwords+1)
model2.load_state_dict(torch.load('/content/drive/My Drive/Colab Notebooks/Dataset/model.pth'))
model2.eval()

Transformer(
  (Encoder): Encoder(
    (embedded_layer): Embedder(
      (embed): Embedding(9494, 512)
    )
    (PositionEmbedding): PositionEmbedding()
    (encoderlayers): ModuleList(
      (0): EncoderStructure(
        (normalize_1): Normalization()
        (mhattention): MultiHeadedAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (dropout_1): Dropout(p=0.1)
        (normalize_2): Normalization()
        (feedforward): FeedForward(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout_2): Dropout(p=0.1)
      )
      