References:
- https://www.youtube.com/watch?v=g3sEsBGkLU0

In [1]:
import math

import torch
from torch import nn
import torch.nn.functional as F

In [2]:
def scaled_dot_product(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Evaluates ``softmax(q @ k^T / sqrt(d_k)) @ v`` where ``d_k`` is the size
    of the last dimension of ``q``.

    Args:
        q: Query tensor; the last two dimensions are (sequence, d_k).
        k: Key tensor whose last two dimensions are compatible with ``q``.
        v: Value tensor whose second-to-last dimension matches the key
            sequence dimension.
        mask: Optional additive mask. It is added to the attention scores
            before the softmax, so it only has to be broadcastable against
            them (use ``-inf`` to block a position).

    Returns:
        Tuple ``(values, attention)``: the attended values and the attention
        weights (softmax over the last dimension).
    """
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    print(f"scaled.size() : {scaled.size()}")
    if mask is not None:
        print(f"-- ADDING MASK of shape {mask.size()} --")
        # Out-of-place broadcasting add: unlike ``scaled += mask`` this also
        # works when the mask broadcasts to a larger shape than the scores.
        scaled = scaled + mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    Args:
        d_model: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # forward() reshapes 3 * d_model features into
            # num_heads * 3 * head_dim, which is impossible otherwise.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # A single linear layer produces q, k and v for every head at once.
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch_size, max_sequence_length, d_model).
            mask: Optional additive mask forwarded to
                ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch_size, max_sequence_length, d_model).
        """
        batch_size, max_sequence_length, d_model = x.size()
        print(f"x.size(): {x.size()}")
        qkv = self.qkv_layer(x)
        print(f"qkv.size(): {qkv.size()}")
        # Split the feature axis per head: (batch, seq, heads, 3 * head_dim).
        qkv = qkv.reshape(batch_size, max_sequence_length,
                          self.num_heads, 3 * self.head_dim)
        print(f"qkv.size(): {qkv.size()}")
        # Move heads in front of the sequence axis so attention runs per head.
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.size(): {qkv.size()}")
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q size: {q.size()}, k size: {k.size()}, v size: {v.size()}, ")
        values, attention = scaled_dot_product(q, k, v, mask)
        print(f"""values.size(): {values.size()}, 
        attention.size:{ attention.size()} """)
        # values is (batch, heads, seq, head_dim). Bring the sequence axis
        # back in front of the heads BEFORE flattening; reshaping directly
        # would interleave different positions and heads in the output rows.
        values = values.permute(0, 2, 1, 3).reshape(
            batch_size, max_sequence_length, self.num_heads * self.head_dim)
        print(f"values.size(): {values.size()}")
        out = self.linear_layer(values)
        print(f"out.size(): {out.size()}")
        return out


class LayerNormalization(nn.Module):
    """Normalise inputs over their trailing dimensions, then scale and shift.

    Args:
        parameters_shape: Sequence giving the shape of the trailing
            dimensions to normalise over; also the shape of the learnable
            ``gamma`` (initialised to ones) and ``beta`` (initialised to
            zeros).
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Return ``gamma * (inputs - mean) / sqrt(var + eps) + beta``."""
        # One negative index per normalised dimension: -1, -2, ...
        n_trailing = len(self.parameters_shape)
        dims = list(range(-1, -n_trailing - 1, -1))

        mean = inputs.mean(dim=dims, keepdim=True)
        print(f"Mean ({mean.size()})")

        centered = inputs - mean
        # Mean of squared deviations over the same dimensions.
        var = centered.pow(2).mean(dim=dims, keepdim=True)
        std = torch.sqrt(var + self.eps)
        print(f"Standard Deviation  ({std.size()})")

        y = centered / std
        print(f"y: {y.size()}")

        out = self.gamma * y + self.beta
        print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        print(f"out: {out.size()}")
        return out

  
class PositionwiseFeedForward(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Size of the last dimension of the input and output.
        hidden: Output size of the first linear layer.
        drop_prob: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Apply linear1 -> ReLU -> dropout -> linear2, printing each shape."""
        # Each stage is applied in order and its output shape is reported.
        stages = (
            (self.linear1, "x after first linear layer"),
            (self.relu, "x after activation"),
            (self.dropout, "x after dropout"),
            (self.linear2, "x after 2nd linear layer"),
        )
        for stage, label in stages:
            x = stage(x)
            print(f"{label}: {x.size()}")
        return x


class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and the sum is layer-normalised.

    Args:
        d_model: Size of the last dimension of the input and output.
        ffn_hidden: Hidden size of the feed-forward sub-layer.
        num_heads: Number of attention heads.
        drop_prob: Dropout probability used in both sub-layers.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model,
                                            num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model,
                                           hidden=ffn_hidden,
                                           drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, mask=None):
        """Run ``x`` through the attention and feed-forward sub-layers.

        Args:
            x: Input tensor; passed to the attention module and added back
                as the residual.
            mask: Optional additive attention mask handed to the attention
                module. Defaults to ``None`` (no masking), which is what a
                single-argument call such as ``nn.Sequential`` produces.

        Returns:
            Tensor produced by the second add-and-normalise step.
        """
        residual_x = x
        print("------- ATTENTION 1 ------")
        x = self.attention(x, mask=mask)
        print("------- DROPOUT 1 ------")
        x = self.dropout1(x)
        print("------- ADD AND LAYER NORMALIZATION 1 ------")
        x = self.norm1(x + residual_x)
        residual_x = x
        # NOTE: the banner below says "ATTENTION 2" but this step is the
        # feed-forward sub-layer; the text is kept unchanged so the printed
        # trace stays identical to previously recorded output.
        print("------- ATTENTION 2 ------")
        x = self.ffn(x)
        print("------- DROPOUT 2 ------")
        x = self.dropout2(x)
        print("------- ADD AND LAYER NORMALIZATION 2 ------")
        x = self.norm2(x + residual_x)
        return x

class Encoder(nn.Module):
    """Stack of ``num_layers`` EncoderLayer modules applied one after another.

    Args:
        d_model: Size of the last dimension of the input and output.
        ffn_hidden: Hidden size of each layer's feed-forward sub-layer.
        num_heads: Number of attention heads in each layer.
        drop_prob: Dropout probability used in each layer.
        num_layers: How many EncoderLayer instances to stack.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
        super().__init__()

        # Build the individual layers first, then hand them to nn.Sequential,
        # which calls them in order, feeding each layer's output to the next.
        stack = []
        for _ in range(num_layers):
            stack.append(
                EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            )
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through every encoder layer in sequence."""
        return self.layers(x)

In [3]:
d_model = 512 # the size of every single vector throughout the encoder
'''
Every vector that flows through the encoder architecture has 512 dimensions
(or a size derived from 512), which is why it is defined once as a
parameter here.
'''

num_heads = 8 # used in the concept of multi-headed attention
'''
In the Transformer neural network architecture we see multi-headed
attention units. Essentially, when we perform attention, we actually
perform it eight times in parallel.

You can consider this the number of parallelized attention operations that
we perform within the encoder.
'''

drop_prob = 0.1 # a Dropout
'''
Dropout randomly turns off certain neurons, which forces the neural network
to learn along different paths and make weight updates accordingly. This
helps the network generalize to new data instead of accidentally memorizing
specific examples, so it effectively acts as a regularizer (if you've heard
of the concept of regularization for neural networks). It is particularly
useful for very deep networks with a lot of connections and parameters.
The probability is set to 0.1 here, which means there is a 10% chance that
a given neuron will be turned off at a given step.

We can adjust this value to be anywhere between zero and one.
'''

batch_size = 30
'''
When we train a neural network, we typically pass in multiple examples at
the same time; those multiple examples constitute a batch.

There are a couple of reasons to do this: first, training is faster, and
second, training is more stable.

A good middle ground for many machine learning and deep learning problems
is to use mini-batches: we batch some arbitrary number of examples together
in order to learn effectively and quickly, and also in a way that is more
stable, so that the loss decreases on average as more examples are seen.

Here we look at 30 examples at a time, say 30 sentences in English. Only
then does the batch propagate through the entire network, that is, through
the encoder and the decoder. A loss function is then computed, the
gradients are calculated in the reverse direction, and all the parameters
are updated only after seeing those 30 examples. In other words, we are
performing mini-batch gradient descent.
'''

max_sequence_length = 200
'''
The largest number of words that we can pass through the encoder at a time.

In practice, this is always the number of tokens that we pass in through
the encoder, because shorter sentences are filled up with padding tokens.
For example, if the maximum sequence length is 200 words and the sentence
is only four words long, 196 padding tokens are passed in along with it.

This is always the case: we take a sentence and then add padding tokens
until the maximum sequence length is reached.

So there is always a fixed-length input for any sentence that we decide to
input to our Transformer encoder.
'''

ffn_hidden = 2048
'''
Feed-forward network hidden size.
Throughout almost the entire encoder (and almost the entire decoder) we
work with 512-dimensional vectors, as mentioned before. Only in the
feed-forward step do we expand the number of neurons to 2048 before
eventually bringing it back down to 512. This is simply to learn additional
information, as any feed-forward layer is designed to do.
'''

num_layers = 5
'''
Number of Transformer encoder units (layers) that we want to include in our
architecture.

This can vary depending on the complexity of the problem: use a higher
number if you have a lot more data, so the model can pick up more complex
patterns; otherwise you can keep it fairly low.
'''

# create an encoder object
encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers)

In [4]:
# Random stand-in for a batch of inputs of shape
# (batch_size, max_sequence_length, d_model); it is meant to represent
# embeddings that already include the positional encoding.
x = torch.randn( (batch_size, max_sequence_length, d_model) )
out = encoder(x)

------- ATTENTION 1 ------
x.size(): torch.Size([30, 200, 512])
qkv.size(): torch.Size([30, 200, 1536])
qkv.size(): torch.Size([30, 200, 8, 192])
qkv.size(): torch.Size([30, 8, 200, 192])
q size: torch.Size([30, 8, 200, 64]), k size: torch.Size([30, 8, 200, 64]), v size: torch.Size([30, 8, 200, 64]), 
scaled.size() : torch.Size([30, 8, 200, 200])
values.size(): torch.Size([30, 8, 200, 64]), 
        attention.size:torch.Size([30, 8, 200, 200]) 
values.size(): torch.Size([30, 200, 512])
out.size(): torch.Size([30, 200, 512])
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean (torch.Size([30, 200, 1]))
Standard Deviation  (torch.Size([30, 200, 1]))
y: torch.Size([30, 200, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([30, 200, 512])
------- ATTENTION 2 ------
x after first linear layer: torch.Size([30, 200, 2048])
x after activation: torch.Size([30, 200, 2048])
x after dropout: torch.Size([30, 200, 2048])
x after 2nd linear laye

In [5]:
!pip install session-info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import session_info

session_info.show()