In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [8]:
# Hyperparameters for the encoder, declared as module-level globals.
# NOTE(review): in the cells shown here the model classes read their settings
# from a ParameteresConfig instance, not from these globals; note also that the
# global is called `number_heads` while the config attribute is `num_heads`.
batch_size = 30   # number of sequences per batch
max_sequence_len = 200  # maximum number of positions in one sequence
d_model = 512 # embedding size of each position, so one sequence is 200 x 512
number_heads = 8  # number of attention heads
fnn_hidden = 2048 # hidden width of the position-wise feed-forward layer
drop_prob = 0.1 # dropout probability
num_layer = 5 # number of stacked encoder layers

In [9]:
class ParameteresConfig():
    """Plain attribute container for the encoder hyperparameters.

    Every default below becomes an instance attribute. Any keyword argument
    passed to the constructor is also set as an attribute, replacing the
    default of the same name or adding a new attribute if the name is new.
    """

    def __init__(self,**kwargs):
        defaults = {
            "batch_size": 30,
            "max_sequence_len": 200,
            "d_model": 512,
            "num_heads": 8,
            "fnn_hidden": 2048,
            "drop_prob": 0.1,
            "num_layer": 5,
        }
        # Merging keeps the default ordering and appends unknown keys last,
        # which is the order display() later walks through.
        merged = {**defaults, **kwargs}
        for name, value in merged.items():
            setattr(self, name, value)

    def display(self):
        """Print a header line followed by one ``name = value`` line per attribute."""
        report = ["parameters are:"]
        report.extend(f"{name} = {value}" for name, value in vars(self).items())
        print("\n".join(report))

In [10]:
# Build the configuration with every hyperparameter left at its default value.
config = ParameteresConfig()

In [11]:
# Print every attribute currently stored on the config object.
config.display()

parameters are:
batch_size = 30
max_sequence_len = 200
d_model = 512
num_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5


In [12]:
# Random sample batch of shape (batch_size, max_sequence_len, d_model).
# The shape is taken from `config` rather than hard-coded so the sample input
# stays in sync with the dimensions the model layers are built with.
sen = torch.rand(config.batch_size, config.max_sequence_len, config.d_model)

In [13]:
def scalarproduct(q,k,v,mask=None):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q, k, v: tensors whose last two dimensions are (positions, features);
            ``d_k`` is taken from the last dimension of ``q``.
        mask: optional additive mask tensor, added to the attention scores
            before the softmax (use ``-inf`` to block a position). ``None`` and
            the legacy value ``False`` both mean "no mask".

    Returns:
        The attention-weighted combination of ``v``.
    """
    d_k = q.size()[-1]
    dot_prod = (q @ k.transpose(-1,-2)) / math.sqrt(d_k)
    print(f"size of dot_pod of q and k: {dot_prod.size()}")
    # Identity checks instead of `if mask:`: truth-testing a multi-element
    # tensor raises "Boolean value of Tensor ... is ambiguous".
    if mask is not None and mask is not False:
        print(f"Shape of mask in: {mask.size()}")
        # Out-of-place add so the mask may broadcast freely and the scores
        # tensor is never modified in place.
        dot_prod = dot_prod + mask
    soft = F.softmax(dot_prod,dim=-1)
    out = soft @ v
    return out

In [14]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over an input of shape (B, T, d_model).

    A single linear layer produces the concatenated q/k/v projections, which
    are split into ``num_heads`` heads of size ``head_dim``, passed through
    ``scalarproduct``, merged back and projected by a final linear layer.

    Args:
        config: object exposing ``d_model`` and ``num_heads``; ``d_model`` must
            be divisible by ``num_heads``.
    """
    def __init__(self,config):
        super().__init__()
        assert config.d_model % config.num_heads == 0, "d_model must be divisible by num_heads"
        # Integer division: head_dim is used as a tensor dimension, and true
        # division would store a float (e.g. 64.0) that .view() rejects.
        self.head_dim = config.d_model // config.num_heads
        self.num_head = config.num_heads
        self.d_model = config.d_model
        self.in_linear = nn.Linear(self.d_model,3*self.d_model)
        self.out_linear = nn.Linear(self.d_model,self.d_model)


    def forward(self,x):
        """Apply self-attention to ``x`` and return a tensor of the same shape."""
        B,T,C = x.size() #--> batch_size,max_sequence_len,d_model i.e embeddings
        print(x.shape)
        # One projection yields q, k and v side by side on the last dimension.
        q,k,v = self.in_linear(x).split(self.d_model,dim=-1)
        print(f"Before --> q.shape:{q.shape}, k.shape:{k.shape}, v.shape: {v.shape}")
        # (B, T, C) -> (B, num_head, T, head_dim)
        q = q.view(B,T,self.num_head,self.head_dim).transpose(1,2)
        k = k.view(B,T,self.num_head,self.head_dim).transpose(1,2)
        v = v.view(B,T,self.num_head,self.head_dim).transpose(1,2)
        # NOTE(review): this label still says "Before" although it reports the
        # shapes after the head split; left unchanged to match recorded output.
        print(f"Before --> q.shape:{q.shape}, k.shape:{k.shape}, v.shape: {v.shape}")
        attention = scalarproduct(q,k,v)
        print(f"Dot product shape: {attention.shape}")
        # (B, num_head, T, head_dim) -> (B, T, C): concatenate the heads again.
        attention = attention.transpose(1,2).reshape(B,T,C)
        out = self.out_linear(attention)
        print(f"final Output shape: {out.shape}")
        return out


In [15]:
# multi = MultiHeadAttention(config)
# new = multi.forward(sen)

In [16]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        config: object exposing ``d_model``, the size of the last dimension.
        elp: small constant added to the variance before the square root, to
            avoid division by zero.
    """
    def __init__(self,config,elp=1e-5):
        super().__init__()
        self.parameter_size = config.d_model
        self.elp = elp
        self.gamma = nn.Parameter(torch.ones(self.parameter_size))   # scale
        self.beta = nn.Parameter(torch.zeros(self.parameter_size))   # shift

    def forward(self,x):
        """Normalize ``x`` along its last dimension, then apply gamma and beta.

        Returns a tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1,keepdim=True)
        print(f"Mean ({mean.size()})")
        centered = x - mean
        # Biased (population) variance, i.e. the mean of squared deviations.
        var = (centered**2).mean(dim=-1,keepdim=True)
        print(f"varience size: {var.shape}")
        std = (var+self.elp).sqrt()
        print(f"Standard Deviation  ({std.size()})")
        # Centre with the mean. The earlier code subtracted the variance here,
        # which does not give a zero-mean, unit-variance result.
        y = centered/std
        print(f"Y size: {y.size()}")
        out = y*self.gamma + self.beta
        print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        print(f"out: {out.size()}")
        return out


In [17]:
# class LayerNormalization(nn.Module):
#     def __init__(self,config,epl=1e-5):
#         super().__init__()
#         self.epl = epl
#         self.gamma = nn.Parameter(torch.ones(config.d_model))
#         self.beta = nn.Parameter(torch.zeros(config.d_model))
#     def forward(self,x):
#         out = F.layer_norm(x,self.gamma.shape,self.gamma,self.beta,self.epl)
#         return out

In [18]:
# layer_norm = LayerNormalization(config)
# new =layer_norm.forward(sen)
# new.shape

In [19]:
# Print the config again for reference before defining the feed-forward block.
config.display()

parameters are:
batch_size = 30
max_sequence_len = 200
d_model = 512
num_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5


In [20]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: linear -> ReLU -> dropout -> linear.

    Args:
        config: object exposing ``d_model`` (input/output width),
            ``fnn_hidden`` (hidden width) and ``drop_prob`` (dropout rate).
    """

    def __init__(self,config):
        super().__init__()
        model_width, hidden_width = config.d_model, config.fnn_hidden
        self.in_linear = nn.Linear(model_width, hidden_width)
        self.out_linear = nn.Linear(hidden_width, model_width)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(config.drop_prob)

    def forward(self,x):
        """Run ``x`` through each stage in turn, printing the shape after each one."""
        pipeline = (
            ("first linear layer", self.in_linear),
            ("relu", self.relu),
            ("dropout", self.dropout),
            ("last linear layer", self.out_linear),
        )
        hidden = x
        for label, stage in pipeline:
            hidden = stage(hidden)
            print(f"input after {label}: {hidden.shape}")
        return hidden

In [21]:
# Smoke-test the feed-forward block on the sample batch.
pos_fnn = PositionwiseFeedForward(config)
# NOTE(review): as the cell's last expression this displays the whole output
# tensor; consider ending with `.shape` to keep the output short.
pos_fnn(sen)

input after first linear layer: torch.Size([30, 200, 2048])
input after relu: torch.Size([30, 200, 2048])
input after dropout: torch.Size([30, 200, 2048])
input after last linear layer: torch.Size([30, 200, 512])


tensor([[[-1.1934e-02,  8.6727e-02, -1.1665e-01,  ..., -1.0056e-01,
          -8.2042e-02, -7.0911e-02],
         [ 2.7607e-02, -1.2417e-01, -9.7045e-02,  ..., -1.0569e-01,
           1.2329e-01,  9.2798e-02],
         [-2.0217e-02,  6.9337e-02, -1.9854e-01,  ..., -1.0679e-01,
           1.1816e-01,  2.0627e-01],
         ...,
         [ 9.6069e-02, -1.1769e-01,  7.1897e-02,  ..., -2.3883e-01,
          -3.7176e-02,  5.6822e-02],
         [ 6.9802e-02, -8.7005e-02, -8.0719e-02,  ..., -2.8943e-02,
           5.8761e-02,  1.3279e-01],
         [ 1.7715e-01,  5.4293e-02, -8.7211e-02,  ...,  2.2315e-03,
           1.4968e-01,  3.1983e-03]],

        [[-5.6912e-02, -6.6315e-02, -2.0896e-01,  ..., -2.6100e-02,
           9.3751e-02, -1.0334e-01],
         [-5.5294e-02,  7.8071e-02, -9.2510e-02,  ..., -5.6710e-02,
           1.1387e-01,  4.6866e-02],
         [ 4.1873e-02,  4.3941e-02, -1.2474e-01,  ..., -1.5462e-01,
          -7.8897e-03,  1.2410e-01],
         ...,
         [ 2.7240e-02, -8

In [22]:
# Print the config again for reference before defining the encoder layer.
config.display()

parameters are:
batch_size = 30
max_sequence_len = 200
d_model = 512
num_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5


In [23]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual add and layer normalization.

    Args:
        config: configuration object handed to every sub-module; ``drop_prob``
            is read here for the two dropout layers.
    """

    def __init__(self,config):
        super().__init__()
        p_drop = config.drop_prob
        # Attention sub-layer and its dropout / normalization.
        self.attention = MultiHeadAttention(config)
        self.layernorm1 = LayerNormalization(config)
        self.dropout1 = nn.Dropout(p_drop)
        # Feed-forward sub-layer and its dropout / normalization.
        self.pos_fnn = PositionwiseFeedForward(config)
        self.layernorm2 = LayerNormalization(config)
        self.dropout2 = nn.Dropout(p_drop)

    def forward(self,x):
        """Apply both sub-layers to ``x``, printing a banner before each stage."""
        print("-----------------ATTENTION------------------")
        attended = self.attention(x)
        print("-----------------DROPOUT 1-------------------")
        attended = self.dropout1(attended)
        print(f"After dropout shape: {attended.shape}")
        print("-----------------ADD & LAYER NORMALIZATION 1-------------------")
        # Residual connection around the attention sub-layer, then normalize.
        normed = self.layernorm1(attended + x)
        print("-----------------FEED FORWARD NETWORK-------------------")
        fed = self.pos_fnn(normed)
        print("-----------------DROPOUT 2-------------------")
        fed = self.dropout2(fed)
        print(f"After dropout shape: {fed.shape}")
        print("-----------------ADD & LAYER NORMALIZATOIN 2-------------------")
        # Residual connection around the feed-forward sub-layer, then normalize.
        return self.layernorm2(fed + normed)


In [24]:
# Smoke-test a single encoder layer on the sample batch.
Enc_layer = EncoderLayer(config)
# The trailing semicolon hides the returned tensor; only the prints are shown.
Enc_layer(sen);

-----------------ATTENTION------------------
torch.Size([30, 200, 512])
Before --> q.shape:torch.Size([30, 200, 512]), k.shape:torch.Size([30, 200, 512]), v.shape: torch.Size([30, 200, 512])
Before --> q.shape:torch.Size([30, 8, 200, 64]), k.shape:torch.Size([30, 8, 200, 64]), v.shape: torch.Size([30, 8, 200, 64])
size of dot_pod of q and k: torch.Size([30, 8, 200, 200])
Dot product shape: torch.Size([30, 8, 200, 64])
final Output shape: torch.Size([30, 200, 512])
-----------------DROPOUT 1-------------------
After dropout shape: torch.Size([30, 200, 512])
-----------------ADD & LAYER NORMALIZATION 1-------------------
Mean (torch.Size([30, 200, 1]))
varience size: torch.Size([30, 200, 1])
Standard Deviation  (torch.Size([30, 200, 1]))
Y size: torch.Size([30, 200, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([30, 200, 512])
-----------------FEED FORWARD NETWORK-------------------
input after first linear layer: torch.Size([30, 200, 2048])
input afte

In [25]:
class Encoder(nn.Module):
    """Stack of ``config.num_layer`` EncoderLayer modules applied in sequence.

    Args:
        config: configuration object; ``num_layer`` sets the stack depth and
            the object is passed on to every EncoderLayer.
    """

    def __init__(self,config):
        super().__init__()
        stack = []
        for _ in range(config.num_layer):
            stack.append(EncoderLayer(config))
        self.layers = nn.Sequential(*stack)

    def forward(self,x):
        """Feed ``x`` through every layer in order and return the final output."""
        return self.layers(x)


In [26]:
# Build the full encoder stack and run the sample batch through it.
encod = Encoder(config)
final_encoder_output = encod(sen)

-----------------ATTENTION------------------
torch.Size([30, 200, 512])
Before --> q.shape:torch.Size([30, 200, 512]), k.shape:torch.Size([30, 200, 512]), v.shape: torch.Size([30, 200, 512])
Before --> q.shape:torch.Size([30, 8, 200, 64]), k.shape:torch.Size([30, 8, 200, 64]), v.shape: torch.Size([30, 8, 200, 64])
size of dot_pod of q and k: torch.Size([30, 8, 200, 200])
Dot product shape: torch.Size([30, 8, 200, 64])
final Output shape: torch.Size([30, 200, 512])
-----------------DROPOUT 1-------------------
After dropout shape: torch.Size([30, 200, 512])
-----------------ADD & LAYER NORMALIZATION 1-------------------
Mean (torch.Size([30, 200, 1]))
varience size: torch.Size([30, 200, 1])
Standard Deviation  (torch.Size([30, 200, 1]))
Y size: torch.Size([30, 200, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([30, 200, 512])
-----------------FEED FORWARD NETWORK-------------------
input after first linear layer: torch.Size([30, 200, 2048])
input afte