In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
batch_size = 30   # Each batch will contain 30 sentences
max_sequence_len = 200  # max sentence len will be 200
d_model = 512 # Dimensionality of each char in sequence i.e 200 x 512
number_heads = 8  # Number of attention heads
fnn_hidden = 2048 # Feedforward layer dim
drop_prob = 0.1 # Dropout 
num_layer = 5 #number of layers  of encoder 

In [3]:
class ParametersConfig():
    def __init__(self,**kwargs):
        self.batch_size = 30   
        self.max_sequence_len = 200  
        self.d_model = 512 
        self.number_heads = 8  
        self.fnn_hidden = 2048 
        self.drop_prob = 0.1 
        self.num_layer = 5

        for key,val in kwargs.items():
            setattr(self,key,val)

    def display(self):
        print("parameters are: ")
        for key,val in vars(self).items():
            print(f"{key} = {val}")

In [4]:
config = ParametersConfig()
config.display()

parameters are: 
batch_size = 30
max_sequence_len = 200
d_model = 512
number_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5


In [5]:
sen = torch.rand(batch_size,max_sequence_len,d_model)
sen.shape

torch.Size([30, 200, 512])

In [123]:
def scalardot(q,k,v,mask=None):
    d_k = q.size()[-1]
    scalar = (q@k.transpose(-2,-1))/math.sqrt(d_k)
    if mask is not None:
        scalar+=mask
    soft = F.softmax(scalar,dim=-1)
    out = soft@v
    return out


In [83]:
class MultiHeadAttention(nn.Module):
    def __init__(self,config):
        super().__init__()
        assert config.d_model % config.number_heads == 0
        self.in_linear = nn.Linear(config.d_model,config.d_model*3)
        self.out_linear = nn.Linear(config.d_model,config.d_model)
        self.d_model = config.d_model
        self.number_heads = config.number_heads

    def forward(self,x):
        B,T,C = x.size()
        q,k,v = self.in_linear(x).split(C,dim=-1)
        q = q.view(B,T,self.number_heads,C//self.number_heads).transpose(1,2)
        k = k.view(B,T,self.number_heads,C//self.number_heads).transpose(1,2)
        v = v.view(B,T,self.number_heads,C//self.number_heads).transpose(1,2)
        attention = scalardot(q,k,v,mask=None)
        x = attention.transpose(1,2).reshape(B,T,C)
        out = self.out_linear(x)
        return out


In [85]:
# multi = MultHeadAttention(config)
# new = multi(sen)
# new.shape

In [9]:
# mask = torch.full([10, 10] , float('-inf'))
# mask = torch.triu(mask, diagonal=1)
# mask

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:
config.display()

parameters are: 
batch_size = 30
max_sequence_len = 200
d_model = 512
number_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5


In [23]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.in_linear = nn.Linear(config.d_model,config.fnn_hidden)
        self.out_linear = nn.Linear(config.fnn_hidden,config.d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(config.drop_prob)

    def forward(self,x):
        x = self.in_linear(x)
        print(f"input after first linear layer: {x.shape}")
        x = self.relu(x)
        x = self.dropout(x)
        out = self.out_linear(x)

        return out

In [25]:
fnn_layer = PositionwiseFeedForward(config)
fnn_layer(sen).shape

input after first linear layer: torch.Size([30, 200, 2048])


torch.Size([30, 200, 512])

In [31]:
class LayerNormalization(nn.Module):
    def __init__(self,config,elp=1e-5):
        super().__init__()
        self.parameter_shape = config.d_model
        self.elp = elp
        self.gamma = nn.Parameter(torch.ones(self.parameter_shape))
        self.beta = nn.Parameter(torch.zeros(self.parameter_shape))
        
    def forward(self,x):
        mean = x.mean(dim=-1,keepdim=True)
        var = ((x-mean)**2).mean(dim=-1,keepdim=True)
        std = (var+self.elp).sqrt()
        x = (x-var)/std
        out = x*self.gamma + self.beta

        return out

In [34]:
# layer_norm = LayerNormalization(config)
# layer_norm(sen).shape


torch.Size([30, 200, 512])

In [99]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.vk_linear = nn.Linear(config.d_model,config.d_model*2)
        self.q_linear = nn.Linear(config.d_model,config.d_model)
        self.out_linear = nn.Linear(config.d_model,config.d_model)
        self.num_head = config.number_heads
        self.d_model = config.d_model

    def forward(self,x,y,mask): #--> y = query , x = value,key
        B,T,C = x.size()
        k,v = self.vk_linear(x).split(self.d_model,dim=-1)
        q = self.q_linear(y)
        q = q.view(B,T,self.num_head,C//self.num_head).transpose(1,2)
        k = k.view(B,T,self.num_head,C//self.num_head).transpose(1,2)
        v = v.view(B,T,self.num_head,C//self.num_head).transpose(1,2)
        attention = scalardot(q,k,v,mask)
        value = attention.transpose(1,2).reshape(B,T,C)
        out = self.out_linear(value)

        return out


In [150]:
mask = torch.full([config.max_sequence_len,config.max_sequence_len],float(-1e9))
mask = torch.triu(mask,diagonal=1)


tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09,  ..., -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])

In [154]:
cross_att = MultiHeadCrossAttention(config)
new = cross_att(sen,sen,mask)
new.shape,new[0][0]

(torch.Size([30, 200, 512]),
 tensor([-3.4942e-01, -4.0296e-01, -9.0418e-02, -2.6804e-02,  3.6759e-01,
         -2.4123e-01, -2.8574e-01,  1.0380e-01,  5.5429e-02, -3.3417e-01,
         -3.6635e-02, -3.2199e-01,  1.4780e-01, -2.1409e-02,  2.3340e-01,
          1.3015e-02,  3.6251e-02,  1.5289e-02, -4.9355e-02, -1.3778e-01,
         -1.0361e-01, -2.0159e-01,  1.9469e-01,  1.4734e-01, -3.9139e-02,
         -8.1272e-02,  9.2873e-02,  3.1316e-02, -1.4936e-01, -3.3456e-02,
          3.2958e-02, -9.7513e-02,  6.1522e-02,  2.8603e-01, -8.2449e-02,
         -5.1152e-01, -7.9984e-02,  1.6377e-02, -4.1719e-02, -2.4613e-01,
          2.9034e-02,  1.6266e-01, -1.2981e-01, -2.4003e-02, -1.1398e-01,
          1.6889e-02, -2.0202e-02,  1.3650e-01, -2.1261e-01,  2.5188e-02,
          1.6057e-01,  1.4460e-01, -1.7118e-02,  2.0981e-01,  8.5745e-02,
          4.6990e-02,  2.1997e-01,  8.2009e-03,  1.9615e-01, -1.1128e-01,
          3.1003e-01, -2.8893e-01, -8.6054e-02, -2.5118e-01,  8.7434e-03,
         

In [183]:
class LayerDecoder(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.attention = MultiHeadAttention(config)
        self.norm1 = LayerNormalization(config)
        self.drop1 = nn.Dropout(config.drop_prob)
        self.cross_attention = MultiHeadCrossAttention(config)
        self.norm2 = LayerNormalization(config)
        self.drop2 = nn.Dropout(config.drop_prob)
        self.fnn = PositionwiseFeedForward(config)
        self.norm3 = LayerNormalization(config)
        self.drop3 = nn.Dropout(config.drop_prob)

    def forward(self,x,y,mask):
        y_residual = y
        print("-----------------SELF ATTENTION HEAD----------------")
        y = self.attention(y)
        print("-----------------DROPOUT 1----------------")
        y = self.drop1(y)
        print("-----------------ADD & NORMALIZE 1----------------")
        y = self.norm1(y+y_residual)

        y_residual = y
        print("-----------------CROSS ATTENTION HEAD----------------")
        y = self.cross_attention(x,y,mask)
        print("-----------------DROPOUT 2----------------")
        y = self.drop2(y)
        print("-----------------ADD & NORMALIZE 2----------------")
        y = self.norm2(y+y_residual)

        y_residual = y
        print("-----------------FEED FORWARD NETWORK----------------")
        y = self.fnn(y)
        print("-----------------DROPOUT 3----------------")
        y = self.drop3(y)
        print("-----------------ADD & NORMALIZE 3----------------")
        y = self.norm3(y+y_residual)
        print(y.shape)

        return y
                


In [184]:
# ly_de = LayerDecoder(config)
# out = ly_de(sen,sen,mask)
# out.shape,out

In [185]:
class DecoderSequence(nn.Sequential):
    def forward(self,*inputs):
        x,y,mask = inputs
        for module in self._modules.values():
            y = module(x,y,mask)
        return y 


In [186]:
config.display()

parameters are: 
batch_size = 30
max_sequence_len = 200
d_model = 512
number_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5


In [187]:
class Decoder(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.decoder = DecoderSequence(*[LayerDecoder(config) for _ in range(config.num_layer)])

    def forward(self,x,y,mask):
        y = self.decoder(x,y,mask)

        return y

In [188]:
decoder = Decoder(config)
decoder(sen,sen,mask)

-----------------SELF ATTENTION HEAD----------------
-----------------DROPOUT 1----------------
-----------------ADD & NORMALIZE 1----------------
-----------------CROSS ATTENTION HEAD----------------
-----------------DROPOUT 2----------------
-----------------ADD & NORMALIZE 2----------------
-----------------FEED FORWARD NETWORK----------------
input after first linear layer: torch.Size([30, 200, 2048])
-----------------DROPOUT 3----------------
-----------------ADD & NORMALIZE 3----------------
torch.Size([30, 200, 512])
-----------------SELF ATTENTION HEAD----------------
-----------------DROPOUT 1----------------
-----------------ADD & NORMALIZE 1----------------
-----------------CROSS ATTENTION HEAD----------------
-----------------DROPOUT 2----------------
-----------------ADD & NORMALIZE 2----------------
-----------------FEED FORWARD NETWORK----------------
input after first linear layer: torch.Size([30, 200, 2048])
-----------------DROPOUT 3----------------
-----------------A

tensor([[[-4.2079, -4.2685, -6.2530,  ..., -3.9700, -5.2043, -5.3958],
         [-4.1949, -4.9931, -7.0118,  ..., -3.4569, -5.0053, -5.1095],
         [-4.4100, -4.6248, -6.3096,  ..., -3.8732, -4.8843, -5.6217],
         ...,
         [-4.1440, -4.7730, -6.1866,  ..., -4.0500, -4.8594, -5.5557],
         [-4.4866, -4.3544, -6.5214,  ..., -3.7832, -4.6537, -5.7370],
         [-4.2844, -4.2821, -6.3834,  ..., -3.5718, -4.9871, -6.0664]],

        [[-4.5385, -4.9991, -6.4217,  ..., -4.0357, -4.9835, -4.9178],
         [-4.4212, -4.5834, -6.5642,  ..., -3.8333, -4.8735, -5.5341],
         [-4.4895, -4.5858, -6.5536,  ..., -3.7502, -4.6002, -5.1254],
         ...,
         [-4.3236, -4.8324, -6.6020,  ..., -3.6323, -4.5887, -5.5378],
         [-4.2908, -4.6537, -5.9806,  ..., -4.1228, -4.9306, -5.5533],
         [-4.1521, -4.9556, -6.3038,  ..., -4.1038, -4.3058, -5.3521]],

        [[-4.0493, -4.2307, -6.6386,  ..., -4.9814, -4.9126, -5.6578],
         [-4.5263, -4.4942, -6.8180,  ..., -3