In [3]:
!pip install -U torch==1.10.0 torchtext==0.11.0

# Reload environment
exit()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.10.0
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.2 MB/s eta 0:00:39tcmalloc: large alloc 1147494400 bytes == 0x65890000 @  0x7f9c84e8a615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 17 kB/s 
[?25hCollecting torchtext==0.11.0
  Downloading torchtext-0.11.0-cp37-cp37m-manylinux1_x86_64.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 9.6 MB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu1

In [1]:
import random
import re
import math
import pandas as pd
import spacy
import torch
import torch.optim as optim
import torch.nn as nn
from torch import Tensor
from torchtext.legacy import data

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
g_path = "/content/drive/My Drive/"
data_path = g_path + "ml-data/"
code_path = g_path + "pytorch-nlp/"

Mounted at /content/drive


In [3]:
data_fl = 'data/eng_ben_small.csv'

In [4]:
#reproducing the same result
SEED = 2021
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fa987c1d6b0>

In [7]:
def tokenizer(text):
    return [tok for tok in text.split()]

ENG_TEXT = data.Field(tokenize=tokenizer,batch_first=True,include_lengths=True)
BEN_TEXT = data.Field(tokenize=tokenizer,batch_first=True,include_lengths=True)
fields = [('eng',ENG_TEXT),('ben', BEN_TEXT)]

In [8]:
#loading the entire data
def load_data():
  eng_ben_data = data.TabularDataset(path = data_path+data_fl,format = 'csv', fields = fields, skip_header = True)
  return eng_ben_data

eng_ben_data = load_data() 
print(vars(eng_ben_data.examples[0]))
print(eng_ben_data.examples[0].eng, eng_ben_data.examples[0].ben)

{'eng': ['Go.'], 'ben': ['যাও।']}
['Go.'] ['যাও।']


In [9]:
#splitting the data into training and validation dataset
def split_data(eng_ben_data):
  train_data, valid_data = eng_ben_data.split(split_ratio=0.7, random_state = random.seed(SEED))
  return train_data, valid_data

train_data, valid_data = split_data(eng_ben_data)

In [11]:
#generate vocabulary
ENG_TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
BEN_TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d") 

#No. of unique tokens in text
print("Size of ENG_TEXT vocabulary:",len(ENG_TEXT.vocab))
#No. of unique tokens in label
print("Size of BEN_TEXT vocabulary:",len(BEN_TEXT.vocab))

print("unk:- ", ENG_TEXT.vocab.stoi["<unk>"])
print("pad:- ", ENG_TEXT.vocab.stoi["<pad>"])
print("sos:- ", ENG_TEXT.vocab.stoi["<sos>"]) #not present in dictionary
print("The first word in vocab is ", ENG_TEXT.vocab.itos[0])
print("The second word in vocab is ", ENG_TEXT.vocab.itos[1])
print("The third word in vocab is ", ENG_TEXT.vocab.itos[2])
print("The last word in vocab is ", ENG_TEXT.vocab.itos[len(ENG_TEXT.vocab)-1])


print("unk:- ", BEN_TEXT.vocab.stoi["<unk>"])
print("pad:- ", BEN_TEXT.vocab.stoi["<pad>"])
print("sos:- ", BEN_TEXT.vocab.stoi["<sos>"]) #not present in dictionary
print("The first word in vocab is ", BEN_TEXT.vocab.itos[0])
print("The second word in vocab is ", BEN_TEXT.vocab.itos[1])
print("The third word in vocab is ", BEN_TEXT.vocab.itos[2])
print("The last word in vocab is ", BEN_TEXT.vocab.itos[len(BEN_TEXT.vocab)-1])

Size of ENG_TEXT vocabulary: 168
Size of BEN_TEXT vocabulary: 162
unk:-  0
pad:-  1
sos:-  0
The first word in vocab is  <unk>
The second word in vocab is  <pad>
The third word in vocab is  I
The last word in vocab is  won't
unk:-  0
pad:-  1
sos:-  0
The first word in vocab is  <unk>
The second word in vocab is  <pad>
The third word in vocab is  আমি
The last word in vocab is  হচ্ছে।


In [12]:
#preparing batches for training the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 5

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort=True, # Sort all examples in data using `sort_key`.
    sort_key = lambda x: len(x.text),
    sort_within_batch=True, # Use `sort_key` to sort examples in each batch.
    device = device)

# Transformer Encoder Model

In [2]:
class MulHeadEncAttention(nn.Module):

    def __init__(self, name: str = '', conf: dict = {},):
        super(MulHeadEncAttention, self).__init__()

        self.embed_dim = conf['trans_enc']["embed_dim"]
        self.heads = conf['trans_enc']["heads"] #number of heads
        self.head_dim = conf['trans_enc']["embed_dim"] // conf['trans_enc']["heads"]

        assert (
            self.head_dim * self.heads == conf['trans_enc']["embed_dim"]
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

    def forward(self, value, key, query, mask):
        
        '''
        value ~ # [batch, seq_len, embed_dim]
        key ~ # [batch, seq_len, embed_dim]
        query ~ # [batch, seq_len, embed_dim]
        mask ~ # [batch, 1, 1, seq_len]
        '''

        # Get number of training examples
        batch = query.shape[0]

        '''
        value_len ~ the max seq_len in values 
        key_len ~ the max seq_len in keys 
        query_len ~ the max seq_len in query 
        here all of them are same
        '''
        value_len, key_len, query_len = value.shape[1], key.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, value_len, heads, head_dim]
        value = value.reshape(batch, value_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, key_len, heads, head_dim]
        key = key.reshape(batch, key_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, query_len, heads, head_dim]
        query = query.reshape(batch, query_len, self.heads, self.head_dim)

        # [batch, value_len, heads, head_dim]
        values = self.values(value  # [batch, value_len, heads, head_dim]
                             )
        # [batch, key_len, heads, head_dim]
        keys = self.keys(key  # [batch, key_len, heads, head_dim]
                         )
        # [batch, query_len, heads, head_dim]
        queries = self.queries(query  # [batch, query_len, heads, head_dim]
                               )
        
        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        # [batch, head, query_len, key_len] 
        # ~ we can say, query_len is our target and 
        # key_len is our source and 
        # energy is how much attention to pay on each word in key to predict query
        energy = torch.einsum("nqhd,nkhd->nhqk", 
                              [queries, # [batch, query_len, heads, head_dim]
                               keys # [batch, key_len, heads, head_dim]
                               ])

        # Mask padded indices so their weights become 0
        if mask is not None:
          energy = energy.masked_fill(mask == 0, float("-1e20"))
            # energy = energy.masked_fill(mask == 0, float("0"))

        # print("ENC energy:- \n",energy/ (self.embed_dim ** (1 / 2)))
        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability ~ sqrt(d_k)
        # attention shape: [batch, heads, query_len, key_len]
        attention = torch.softmax(energy / (self.embed_dim ** (1 / 2)), dim=3)

        print("ENC attention:- \n",attention)
        '''
        here key_len and value_len are same ~ denoted by l 
        '''
        # [batch, query_len, heads, head_dim]
        out = torch.einsum("nhql,nlhd->nqhd", [attention,  # [batch, heads, query_len, key_len]
                                               values  # [batch, value_len, heads, head_dim]
                                               ])

        print("ENC values:- \n",values)
        print("ENC out:- \n",out)

        if mask is not None:
          out = out.masked_fill(values == 0, float("0"))

        # print("ENC after out:- \n", out)
        # [batch, query_len, heads, head_dim] --converted--> [batch, query_len, embed_dim]
        out = out.reshape(batch, query_len, self.heads * self.head_dim)

        return out

In [3]:
class TransformerEncBlock(nn.Module):
    def __init__(self, name: str = '', conf: dict = {},):
        super(TransformerEncBlock, self).__init__()

        self.attention = MulHeadEncAttention(name='MulHeadEncAttention', conf=conf)
        
        embed_dim = conf["trans_enc"]["embed_dim"]
        self.norm1 = nn.LayerNorm(embed_dim)
        
        forward_expansion = conf["trans_enc"]["forward_expansion"]
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, forward_expansion * embed_dim),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_dim, embed_dim),
        )

        self.norm2 = nn.LayerNorm(embed_dim)

        dropout = conf['trans_enc']["dropout"]
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):

      '''
      value ~ # [batch, seq_len, embed_dim]
      key ~ # [batch, seq_len, embed_dim]
      query ~ # [batch, seq_len, embed_dim]
      mask ~ # [batch, 1, 1, seq_len]
      '''

      '''
      Step 1: passing the value, key, and query to self attention layer
      '''
      # [batch, seq_len, embed_dim] 
      attention = self.attention(value,  # [batch, seq_len, embed_dim]
                                 key,  # [batch, seq_len, embed_dim]
                                 query,  # [batch, seq_len, embed_dim]
                                 mask  # [batch, 1, 1, seq_len]
                                 )
      # print("ENC query:- \n", query)
      # print("ENC attention:- \n", attention)
      '''
      Step 2: normalizing the output
      '''
      # [batch, seq_len, embed_dim]   
      # Add skip connection, run through normalization and finally dropout
      x = self.dropout(self.norm1(attention + query))
      forward = self.feed_forward(x)
      out = self.dropout(self.norm2(forward + x))

      return out

In [4]:
class PositionalEncodingEncoder(nn.Module):

    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super(PositionalEncodingEncoder, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def pos_mask(self, src, conf):
      # Here src_pad_idx is always 0
      # because here src is embedding and embedding is 0 for padding_idx~1
      src_pad_idx = 0 
      src_mask = (src != src_pad_idx)
      # [batch, seq_len, embed_dim]
      # the values are True or False
      return src_mask

    def forward(self, x: Tensor, conf: dict) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        pos_mask_val = self.pos_mask(x, conf)

        x = x + self.pe[:x.size(0)]
        x = x.masked_fill(pos_mask_val == 0, float("0"))

        return self.dropout(x)

In [5]:
class TransformerEncoder(nn.Module):
    def __init__(self, src_vocab_size: int, name: str = '', conf: dict = {},):

        super(TransformerEncoder, self).__init__()
        
        self.conf = conf
        embed_dim = conf['trans_enc']['embed_dim']
        self.word_embedding = nn.Embedding(num_embeddings = src_vocab_size, 
                                           embedding_dim = embed_dim,
                                           padding_idx=1)
        
        # max_length is the max length of sentence in the entire input / batch
        max_length = conf['trans_enc']['max_length']
        max_length = conf['trans_enc']['max_length']
        self.position_embedding = PositionalEncodingEncoder(embed_dim,
                                                      max_length)

        num_layers = conf['trans_enc']["num_Tlayers"]
        self.layers = nn.ModuleList(
            [
                TransformerEncBlock(name='TransformerEncBlock', conf=conf)
                for _ in range(num_layers)
            ]
        )

        dropout = conf['trans_enc']["dropout"]
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):

        '''
        x ~ # [batch, seq_len]
        mask ~ # [batch, 1, 1, seq_len]
        '''

        '''
        Step 1: pass through the embedding layer to convert text into vectors
        '''
        # x_embed ~ [batch, seq_len, embed_dim] 
        x_embed = self.word_embedding(x # [batch, seq_len]
                                      )  
        
        '''
        Step 2: position_embedding incorporates the position information
        '''
        if self.conf['trans_enc']["ps_embed_enc_ind"]:
          x_embed_with_position = self.position_embedding(x_embed, conf)
          x_embed = x_embed_with_position

        # out ~ [batch, seq_len, embed_dim] 
        out = self.dropout(x_embed # [batch, seq_len, embed_dim] 
                           )

        '''
        Step 3: passing the embeddings to the transformer block
        '''
        # In the Encoder the query, key, value are all the same
        for t_layer in self.layers:
          # [batch, seq_len, embed_dim]
          out = t_layer(out,  # [batch, seq_len, embed_dim]
                        out,  # [batch, seq_len, embed_dim]
                        out,  # [batch, seq_len, embed_dim]
                        mask  # [batch, 1, 1, seq_len]
                      )

        return out

# Transformer Decoder Model

In [69]:
class MulHeadDecAttention2(nn.Module):

    def __init__(self, conf: dict = {},):
        super(MulHeadDecAttention2, self).__init__()

        self.embed_dim = conf['trans_dec']["embed_dim"]
        self.heads = conf['trans_dec']["heads"] #number of heads
        self.head_dim = conf['trans_dec']["embed_dim"] // conf['trans_dec']["heads"]

        assert (
            self.head_dim * self.heads == conf['trans_dec']["embed_dim"]
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

    def forward(self, value, key, query, mask):
        
        '''
        value ~ # [batch, seq_len, embed_dim]
        key ~ # [batch, seq_len, embed_dim]
        query ~ # [batch, seq_len, embed_dim]
        mask ~ # [batch, 1, 1, seq_len]
        '''

        # Get number of training examples
        batch = query.shape[0]

        '''
        value_len ~ the max seq_len in values 
        key_len ~ the max seq_len in keys 
        query_len ~ the max seq_len in query 
        here all of them are same
        '''
        value_len, key_len, query_len = value.shape[1], key.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, value_len, heads, head_dim]
        value = value.reshape(batch, value_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, key_len, heads, head_dim]
        key = key.reshape(batch, key_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, query_len, heads, head_dim]
        query = query.reshape(batch, query_len, self.heads, self.head_dim)

        # [batch, value_len, heads, head_dim]
        values = self.values(value  # [batch, value_len, heads, head_dim]
                             )
        # [batch, key_len, heads, head_dim]
        keys = self.keys(key  # [batch, key_len, heads, head_dim]
                         )
        # [batch, query_len, heads, head_dim]
        queries = self.queries(query  # [batch, query_len, heads, head_dim]
                               )
        
        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        # [batch, head, query_len, key_len] 
        # ~ we can say, query_len is our target and 
        # key_len is our source and 
        # energy is how much attention to pay on each word in key to predict query
        energy = torch.einsum("nqhd,nkhd->nhqk", 
                              [queries, # [batch, query_len, heads, head_dim]
                               keys # [batch, key_len, heads, head_dim]
                               ])

        # Mask padded indices so their weights become 0
        if mask is not None:
          energy = energy.masked_fill(mask == 0, float("-1e20"))
            # energy = energy.masked_fill(mask == 0, float("0"))

        # print("energy:- \n",energy/ (self.embed_dim ** (1 / 2)))
        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability ~ sqrt(d_k)
        # attention shape: [batch, heads, query_len, key_len]
        attention = torch.softmax(energy / (self.embed_dim ** (1 / 2)), dim=3)

        # print("attention:- \n",attention)
        '''
        here key_len and value_len are same ~ denoted by l 
        '''
        # [batch, query_len, heads, head_dim]
        out = torch.einsum("nhql,nlhd->nqhd", [attention,  # [batch, heads, query_len, key_len]
                                               values  # [batch, value_len, heads, head_dim]
                                               ])

        # print("values:- \n",values)
        # print("out:- \n",out)

        if mask is not None:
          out = out.masked_fill(values == 0, float("0"))

        # print("after out:- \n", out)
        # [batch, query_len, heads, head_dim] --converted--> [batch, query_len, embed_dim]
        out = out.reshape(batch, query_len, self.heads * self.head_dim)

        return out

In [71]:
class MulHeadDecAttention1(nn.Module):

    def __init__(self, conf: dict = {},):
        super(MulHeadDecAttention1, self).__init__()

        self.embed_dim = conf['trans_dec']["embed_dim"]
        self.heads = conf['trans_dec']["heads"] #number of heads
        self.head_dim = conf['trans_dec']["embed_dim"] // conf['trans_dec']["heads"]

        assert (
            self.head_dim * self.heads == conf['trans_dec']["embed_dim"]
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

    def forward(self, value, key, query, mask):
        
        '''
        value ~ # [batch, seq_len, embed_dim]
        key ~ # [batch, seq_len, embed_dim]
        query ~ # [batch, seq_len, embed_dim]
        mask ~ # [batch, 1, 1, seq_len]
        '''

        # Get number of training examples
        batch = query.shape[0]

        '''
        value_len ~ the max seq_len in values 
        key_len ~ the max seq_len in keys 
        query_len ~ the max seq_len in query 
        here all of them are same
        '''
        value_len, key_len, query_len = value.shape[1], key.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, value_len, heads, head_dim]
        value = value.reshape(batch, value_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, key_len, heads, head_dim]
        key = key.reshape(batch, key_len, self.heads, self.head_dim)

        # [batch, seq_len, embed_dim] --converted--> [batch, seq_len, heads, head_dim]
        # [batch, query_len, heads, head_dim]
        query = query.reshape(batch, query_len, self.heads, self.head_dim)

        # [batch, value_len, heads, head_dim]
        values = self.values(value  # [batch, value_len, heads, head_dim]
                             )
        # [batch, key_len, heads, head_dim]
        keys = self.keys(key  # [batch, key_len, heads, head_dim]
                         )
        # [batch, query_len, heads, head_dim]
        queries = self.queries(query  # [batch, query_len, heads, head_dim]
                               )
        
        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm
        # [batch, head, query_len, key_len] 
        # ~ we can say, query_len is our target and 
        # key_len is our source and 
        # energy is how much attention to pay on each word in key to predict query
        energy = torch.einsum("nqhd,nkhd->nhqk", 
                              [queries, # [batch, query_len, heads, head_dim]
                               keys # [batch, key_len, heads, head_dim]
                               ])

        print("DEC1 energy:-\n",energy)
        # Mask padded indices so their weights become 0
        if mask is not None:
          energy = energy.masked_fill(mask == 0, float("-1e20"))
            # energy = energy.masked_fill(mask == 0, float("0"))

        # print("energy:- \n",energy/ (self.embed_dim ** (1 / 2)))
        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability ~ sqrt(d_k)
        # attention shape: [batch, heads, query_len, key_len]
        attention = torch.softmax(energy / (self.embed_dim ** (1 / 2)), dim=3)

        # print("attention:- \n",attention)
        '''
        here key_len and value_len are same ~ denoted by l 
        '''
        # [batch, query_len, heads, head_dim]
        out = torch.einsum("nhql,nlhd->nqhd", [attention,  # [batch, heads, query_len, key_len]
                                               values  # [batch, value_len, heads, head_dim]
                                               ])

        print("DEC1 values:-\n",values)
        print("DEC1 out:-\n",out)

        # if mask is not None:
        #   out = out.masked_fill(values == 0, float("0"))

        # print("after out:- \n", out)
        # [batch, query_len, heads, head_dim] --converted--> [batch, query_len, embed_dim]
        out = out.reshape(batch, query_len, self.heads * self.head_dim)

        return out

In [72]:
class TransformerDecBlock(nn.Module):
    def __init__(self, conf):
        super(TransformerDecBlock, self).__init__()

        self.attention_1 = MulHeadDecAttention1(conf) # with mask
        
        embed_dim = conf["trans_dec"]["embed_dim"]
        self.norm1 = nn.LayerNorm(embed_dim)

        self.attention_2 = MulHeadDecAttention2(conf) # without mask

        self.norm2 = nn.LayerNorm(embed_dim)

        forward_expansion = conf["trans_enc"]["forward_expansion"]
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, forward_expansion * embed_dim),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_dim, embed_dim),
        )

        self.norm3 = nn.LayerNorm(embed_dim)

        dropout = conf['trans_enc']["dropout"]
        self.dropout = nn.Dropout(dropout)

    def forward(self, trgt_value, trgt_key, trgt_query, trgt_mask, src_value, src_key, src_mask):

      '''
      trgt_value ~ # [batch, trgt_len, embed_dim]
      trgt_key ~ # [batch, trgt_len, embed_dim]
      trgt_query ~ # [batch, trgt_len, embed_dim]
      trgt_mask ~ # [batch, 1, trgt_len, trgt_len],
      src_value ~ # [batch, src_len, embed_dim]
      src_key ~ # [batch, src_len, embed_dim]
      src_mask # [batch, 1, 1, src_len],
      '''

      '''
      Step 1: passing the value, key, and query to self attention layer
      '''
      # 
      attention_1 = self.attention_1(trgt_value,  # [batch, trgt_len, embed_dim]
                                 trgt_key,  # [batch, trgt_len, embed_dim]
                                 trgt_query,  # [batch, trgt_len, embed_dim]
                                 trgt_mask  # [batch, 1, trgt_len, trgt_len]
                                 )

      '''
      Step 2: normalizing the output
      '''
      # [batch, seq_len, embed_dim]   
      # Add skip connection, run through normalization and finally dropout
      x = self.dropout(self.norm1(attention_1 + trgt_query))


      # [batch, seq_len, embed_dim] 
      attention_2 = self.attention_1(src_value,  # [batch, seq_len, embed_dim]
                                 src_key,  # [batch, seq_len, embed_dim]
                                 x,  # [batch, seq_len, embed_dim]
                                 src_mask  # [batch, 1, 1, seq_len]
                                 )
      
      x = self.dropout(self.norm1(attention_2 + x))

      forward = self.feed_forward(x)
      out = self.dropout(self.norm3(forward + x))

      return out

In [73]:
class PositionalEncodingDecoder(nn.Module):

    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def pos_mask(self, trgt, conf):
      trgt_pad_idx = conf["trans_dec"]["trgt_pad_idx"]
      trgt_mask = (trgt != trgt_pad_idx)
      # [batch, seq_len, embed_dim]
      # the values are True or False
      return trgt_mask

    def forward(self, x: Tensor, conf: dict) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        pos_mask_val = self.pos_mask(x, conf)

        x = x + self.pe[:x.size(0)]
        x = x.masked_fill(pos_mask_val == 0, float("0"))

        return self.dropout(x)

In [74]:
class TransformerDecoder(nn.Module):
    def __init__(self, trgt_vocab_size, conf):

        super(TransformerDecoder, self).__init__()
        
        self.conf = conf
        embed_dim = conf['trans_dec']['embed_dim']
        padding_idx = conf['trans_dec']['trgt_pad_idx']
        self.word_embedding = nn.Embedding(num_embeddings = trgt_vocab_size, 
                                           embedding_dim = embed_dim,
                                           padding_idx=1)
        
        # max_length is the max length of sentence in the entire input / batch
        max_length = conf['trans_dec']['max_length']
        self.position_embedding = PositionalEncodingDecoder(embed_dim,
                                                      max_length)

        num_layers = conf['trans_dec']["num_Tlayers"]
        self.layers = nn.ModuleList(
            [
                TransformerDecBlock(conf)
                for _ in range(num_layers)
            ]
        )

        dropout = conf['trans_dec']["dropout"]
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(embed_dim, trgt_vocab_size)

    def forward(self, tgt, trgt_mask, src_enc, src_mask):

        '''
        tgt ~ # [batch, trgt_len]
        trgt_mask ~ # [batch, 1, trgt_len, trgt_len]
        src_enc ~ # [batch, src_len]
        src_mask ~ # [batch, 1, 1, src_len]
        '''

        '''
        Step 1: pass through the embedding layer to convert text into vectors
        '''
        # tgt_embed ~ [batch, trgt_len, embed_dim] 
        tgt_embed = self.word_embedding(tgt # [batch, trgt_len]
                                      )  
        
        '''
        Step 2: position_embedding incorporates the position information
        '''
        if self.conf['trans_dec']["ps_embed_enc_ind"]:
          trgt_embed_with_position = self.position_embedding(tgt_embed, conf)
          trgt_embed = trgt_embed_with_position

        out = self.dropout(trgt_embed)

        '''
        Step 3: passing the embeddings to the transformer block
        '''
        # In the Encoder the query, key, value are all the same
        for t_layer in self.layers:
          # [batch, seq_len, embed_dim]
          out = t_layer(out,  # [batch, trgt_len, embed_dim]
                        out,  # [batch, trgt_len, embed_dim]
                        out,  # [batch, trgt_len, embed_dim]
                        trgt_mask,  # [batch, 1, trgt_len, trgt_len],
                        src_enc, # [batch, src_len, embed_dim]
                        src_enc, # [batch, src_len, embed_dim]
                        src_mask # [batch, 1, 1, src_len],
                      )
          
        out = self.fc(out)

        return out

# Transformer Model

In [6]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size: int, trgt_vocab_size: int, conf: dict = {},):
        super().__init__()

        self.src_pad_idx = 1
        self.trgt_pad_idx = 1
        self.device = conf["device"]
        self.encoder = TransformerEncoder(src_vocab_size=src_vocab_size, conf=conf)    
        # self.decoder = TransformerDecoder(trgt_vocab_size=trgt_vocab_size, conf=conf)

    
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trgt_mask(self, trgt):
      batch, trgt_len = trgt.shape
      trgt_mask = torch.tril(torch.ones((trgt_len, trgt_len))).expand(batch, 1, trgt_len, trgt_len)
      return trgt_mask.to(self.device)
    
    def forward(self, src: Tensor, trgt: Tensor) -> Tensor:
      '''
      Step 1: create mask for the source text
      '''
      # [batch, 1, 1, src_len]
      # the values are True or False
      # all the ids (=src_pad_idx) will be False and rest will True
      src_mask = self.make_src_mask(src # [batch, src_len]
                                    )
      
      '''
      Step 2: encoding the source text
      '''
      # [batch, src_len, embed_dim]
      src_enc = self.encoder(src, # [batch, src_len]
                             src_mask # [batch, 1, 1, src_len]
                             )

      # '''
      # Step 3: create mask for the target task
      # '''
      # # [batch, 1, trgt_len, trgt_len]
      # # triangular matrix
      # trgt_mask = self.make_trgt_mask(trgt # [batch, trgt_len]
      #                                 )

      # '''
      # Step 4: encoding the target text
      # src_enc (source encoding) is used to the encoding the target
      # '''  
      # out = self.decoder(trgt, # [batch, trgt_len]
      #                    trgt_mask, # [batch, 1, trgt_len, trgt_len]
      #                    src_enc, # [batch, src_len]
      #                    src_mask # [batch, 1, 1, src_len]
      #                    )

      return out

In [7]:
### toy example
conf = {
    "device" : torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "trans_enc": {
      "src_pad_idx" : 0,
      "embed_dim" : 6,
      "max_length" : 20,
      "ps_embed_enc_ind": True,
      "num_Tlayers" : 1,
      "heads" : 2,
      "dropout" : 0.1,
      "forward_expansion" : 4
    },
    "trans_dec": {
      "trgt_pad_idx" : 0,
      "embed_dim" : 6,
      "max_length" : 20,
      "ps_embed_enc_ind": True,
      "num_Tlayers" : 1,
      "heads" : 2,
      "dropout" : 0.1,
      "forward_expansion" : 4
    }
}

# 1 padding
x = torch.tensor([
                  [5,2,1],
                  [6,7,2]
]).to(conf['device'])
y = torch.tensor([
                  [7,4,3,5,9,2,1],
                  [5,6,2,4,7,6,2]
]).to(conf['device'])

# print(y[:,:-1])
# print("x shape:- ", x.shape)
# print("y shape:-", y.shape)
# print("y[:,:-1] shape:-", y[:,:-1].shape)

src_vocab_size = 10
trgt_vocab_size = 10

# instantiate the model
model = Transformer(src_vocab_size, trgt_vocab_size, conf)
# output
out = model(x, y)

ENC attention:- 
 tensor([[[[0.6410, 0.3590, 0.0000],
          [0.5928, 0.4072, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.6649, 0.3351, 0.0000],
          [0.5128, 0.4872, 0.0000],
          [0.5000, 0.5000, 0.0000]]],


        [[[0.3879, 0.3269, 0.2852],
          [0.3314, 0.3327, 0.3359],
          [0.3005, 0.3295, 0.3699]],

         [[0.4808, 0.3156, 0.2036],
          [0.2113, 0.2346, 0.5541],
          [0.3815, 0.2295, 0.3890]]]], grad_fn=<SoftmaxBackward0>)
ENC values:- 
 tensor([[[[-0.5293,  0.4282, -0.8814],
          [ 0.7321,  1.3328,  0.5176]],

         [[-0.4071, -1.0329, -0.6063],
          [ 0.1062,  0.1323,  0.1116]],

         [[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[-0.4772,  0.9794, -1.0128],
          [ 0.1031,  1.8358, -0.4967]],

         [[ 0.0455, -0.1329,  0.0962],
          [ 0.9042,  0.2113,  1.4982]],

         [[ 0.6774, -0.7517,  1.1889],
          [ 0.4709, -0.9083,  0.8839]]]], grad_fn=<Unsa

NameError: ignored

In [None]:
#define hyperparameters
eng_vocab_size = len(ENG_TEXT.vocab)
ben_vocab_size = len(BEN_TEXT.vocab)

conf = {
    "device" : torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "trans_enc": {
      "src_pad_idx" : 1,
      "embed_dim" : 6,
      "max_length" : 20,
      "ps_embed_enc_ind": True,
      "num_Tlayers" : 2,
      "heads" : 2,
      "dropout" : 0.1,
      "forward_expansion" : 4
    },
    "trans_dec": {
      "trgt_pad_idx" : 1,
      "embed_dim" : 6,
      "max_length" : 20,
      "ps_embed_enc_ind": True,
      "num_Tlayers" : 2,
      "heads" : 2,
      "dropout" : 0.1,
      "forward_expansion" : 4
    }
}

#instantiate the model
train_model = Transformer(vocab_size, conf)
train_model = train_model.to(device)

In [15]:
"""
A from scratch implementation of Transformer network,
following the paper Attention is all you need with a
few minor differences. I tried to make it as clear as
possible to understand and also went through the code
on my youtube channel!
"""

import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        query = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)  # (N, value_len, heads, head_dim)
        keys = self.keys(keys)  # (N, key_len, heads, head_dim)
        queries = self.queries(query)  # (N, query_len, heads, heads_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            print("energy:- ", energy.shape)
            print("mask:- ", mask.shape)
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out


class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out


class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out


class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, trg_mask, enc_dec_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, enc_dec_mask)
        return out


class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, trg_mask, enc_dec_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, trg_mask, enc_dec_mask)

        out = self.fc_out(x)

        return out


class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=15,
        num_layers=1,
        forward_expansion=4,
        heads=3,
        dropout=0,
        device="cpu",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
      trgt_mask = (trg != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
      # (N, 1, 1, trgt_len)
      return trgt_mask.to(self.device)

    def make_enc_dec_mask(self, src, trg):
        N, trg_len = trg.shape
        N, src_len = src.shape
        trg_mask = torch.tril(torch.ones((trg_len, src_len))).expand(
            N, 1, trg_len, src_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_dec_mask = self.make_enc_dec_mask(src, trg)
        print(enc_dec_mask)
        enc_src = self.encoder(src, src_mask)
        print(enc_src.shape)
        out = self.decoder(trg, enc_src, trg_mask, enc_dec_mask)
        return out


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(
        device
    )
    trg = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)

    src_pad_idx = 0
    trg_pad_idx = 0
    src_vocab_size = 10
    trg_vocab_size = 10
    model = Transformer(src_vocab_size, 
                        trg_vocab_size, 
                        src_pad_idx, 
                        trg_pad_idx, 
                        device=device).to(
        device
    )
    out = model(x, trg[:, :])
    print(out.shape)

cpu
tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0.]]],


        [[[1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0.]]]])
energy:-  torch.Size([2, 3, 9, 9])
mask:-  torch.Size([2, 1, 1, 9])
torch.Size([2, 9, 15])
energy:-  torch.Size([2, 3, 8, 8])
mask:-  torch.Size([2, 1, 1, 8])
energy:-  torch.Size([2, 3, 8, 9])
mask:-  torch.Size([2, 1, 8

In [19]:
attn = torch.tensor(
        [
         [[[0.6410, 0.3590, 0.0000],
          [0.5928, 0.4072, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.6649, 0.3351, 0.0000],
          [0.5128, 0.4872, 0.0000],
          [0.5000, 0.5000, 0.0000]]],


        [[[0.3879, 0.3269, 0.2852],
          [0.3314, 0.3327, 0.3359],
          [0.3005, 0.3295, 0.3699]],

         [[0.4808, 0.3156, 0.2036],
          [0.2113, 0.2346, 0.5541],
          [0.3815, 0.2295, 0.3890]]]])

val = torch.tensor([[[[-0.5293,  0.4282, -0.8814],
          [ 0.7321,  1.3328,  0.5176]],

         [[-0.4071, -1.0329, -0.6063],
          [ 0.1062,  0.1323,  0.1116]],

         [[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[-0.4772,  0.9794, -1.0128],
          [ 0.1031,  1.8358, -0.4967]],

         [[ 0.0455, -0.1329,  0.0962],
          [ 0.9042,  0.2113,  1.4982]],

         [[ 0.6774, -0.7517,  1.1889],
          [ 0.4709, -0.9083,  0.8839]]]])

c = torch.einsum('nhql,nlhd->nqhd', [attn, val])
c = c.masked_fill(val == 0, float("0"))
print(c)

tensor([[[[-0.4854, -0.0963, -0.7826],
          [ 0.5224,  0.9305,  0.3815]],

         [[-0.4795, -0.1668, -0.7694],
          [ 0.4272,  0.7479,  0.3198]],

         [[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[ 0.0230,  0.1221, -0.0223],
          [ 0.4308,  0.7644,  0.4140]],

         [[ 0.0845,  0.0279,  0.0957],
          [ 0.4948, -0.0658,  0.7363]],

         [[ 0.1222, -0.0275,  0.1671],
          [ 0.4300,  0.3955,  0.4982]]]])


In [18]:
import torch.nn.functional as F
scores = F.softmax(attn, dim=-1)
print(scores.shape)
print(val.shape)

scores = torch.matmul(scores, val)
print(scores)

torch.Size([2, 2, 3, 3])
torch.Size([2, 3, 2, 3])


RuntimeError: ignored