In [0]:
import torch
import numpy as np

In [0]:
# vector of embeddings
# let's say the number of the tokens/words or for the sake of simplicity the length of sentence is V = 3
# with each V_i (embedding) be of size (1 x 2)
# so [1, 6, 10]

torch.Tensor([[0.2, 0.3], [0.3, 0.4], [0.7, 0.5]]).unsqueeze(0).shape

torch.Size([1, 3, 2])

In [0]:
torch.manual_seed(33)
q = torch.randn(1, 3, 2)
k = torch.randn(1, 3, 2)
print(f"Query: \n{q} \n\n Key: \n{k}")

Query: 
tensor([[[ 1.1982, -0.3998],
         [-0.3476, -0.2759],
         [-2.3094, -1.0931]]]) 

 Key: 
tensor([[[-0.0808,  0.7721],
         [-1.1370, -0.4773],
         [-1.0679,  1.0688]]])


In [0]:
q, k = q.reshape(1, -1), k.view(1, -1)

In [0]:
print(q.size(), k.size())
q.view(-1, 1).size()

torch.Size([1, 6]) torch.Size([1, 6])


torch.Size([6, 1])

In [0]:
torch.matmul(q.view(-1, 1), k)

tensor([[-0.0968,  0.9251, -1.3624, -0.5718, -1.2796,  1.2806],
        [ 0.0323, -0.3087,  0.4546,  0.1908,  0.4270, -0.4273],
        [ 0.0281, -0.2684,  0.3953,  0.1659,  0.3712, -0.3715],
        [ 0.0223, -0.2130,  0.3137,  0.1317,  0.2946, -0.2948],
        [ 0.1866, -1.7830,  2.6259,  1.1022,  2.4662, -2.4682],
        [ 0.0883, -0.8440,  1.2430,  0.5217,  1.1674, -1.1683]])

In [0]:
# Softmax(similarity_measure)
# The matrix below is the outer product of the flattened q (6 x 1) and the
# flattened k (1 x 6): row i comes from q[i], column j comes from k[j].
# NOTE(review): dim=0 normalises down each column (the printed columns sum
# to 1), i.e. over the axis that comes from q. Attention weights are usually
# normalised over the key axis (dim=-1) -- confirm which one is intended.
softmax = torch.nn.Softmax(dim=0)
softmax(torch.matmul(q.view(-1, 1), k))

tensor([[0.1443, 0.4647, 0.0117, 0.0642, 0.0142, 0.5918],
        [0.1642, 0.1353, 0.0717, 0.1377, 0.0782, 0.1073],
        [0.1635, 0.1409, 0.0676, 0.1343, 0.0740, 0.1134],
        [0.1626, 0.1489, 0.0623, 0.1298, 0.0685, 0.1225],
        [0.1916, 0.0310, 0.6290, 0.3425, 0.6011, 0.0139],
        [0.1737, 0.0792, 0.1578, 0.1917, 0.1640, 0.0511]])

In [0]:
# Multihead Attention Layer
# embed_dim is split evenly across the heads (6 / 3 = 2 features per head).
embedding_size = 6
num_heads = 3
dropout = 0.5
sequence_length = 10
batch_size = 4



self_attention = torch.nn.MultiheadAttention(embed_dim=embedding_size, 
                                             num_heads=num_heads, 
                                             dropout= dropout, 
                                             bias=False)

# [BATCH_SIZE, SEQUENCE_LENGTH, EMBED_SIZE]
src = torch.randn(batch_size, sequence_length, embedding_size)
# nn.MultiheadAttention is built here with its default layout, which is
# [SEQUENCE_LENGTH, BATCH_SIZE, EMBED_SIZE], so swap the first two axes.
src = src.permute(1, 0, 2)

# query = key = value for self_attention
# attn_output keeps the input layout [SEQUENCE_LENGTH, BATCH_SIZE, EMBED_SIZE];
# attn_output_weights is [BATCH_SIZE, SEQUENCE_LENGTH, SEQUENCE_LENGTH].
attn_output, attn_output_weights = self_attention(src, src, src)
attn_output.size(), attn_output_weights.size()

(torch.Size([10, 4, 6]), torch.Size([4, 10, 10]))

In [0]:
# src was already permuted above, so src.size() = [SEQUENCE_LEN, BATCH_SIZE, EMBEDDING_DIM]
# permuting again gives back [BATCH_SIZE, SEQUENCE_LEN, EMBEDDING_DIM]
src.size(), src.permute(1, 0, 2).size()

(torch.Size([10, 4, 6]), torch.Size([4, 10, 6]))

In [0]:
# Encoder and Decoder Stack
import torch.nn as nn


# - Encoder layer has two sub-layers 
#       - first : Multihead Attention Layer
#       - second : fc layer
# - After each sub-layer we add a residual connection followed by layer normalization

'''
class Encoder(nn.Module):
    def __init__(self, 
                 multi_head_attn_layer,
                 feed_forward_layer,
                 num_embeddings,
                 embedding_dim):
        super(Encoder, self).__init__()

        self.multi_head_attn_layer = multi_head_attn_layer
        self.feed_forward_layer = feed_forward_layer

        self.embed = nn.Embedding(num_embeddings, embedding_dim)


    def layer_norm(self, out):
        layer_norm = nn.LayerNorm(out.size()[1:])
        return layer_norm


    def forward(self, src):
        # TODO: Add Residual Connections
        # TODO: Fix the sizes of the input/output tensors
        # TODO: Add layer Norm

        embed = self.embed(src)
        attn_output, _ = self.multi_head_attn_layer(embed, embed, embed)
        out = torch.sum(embed, attn_output)
        layer_norm = nn.LayerNorm(out.size()[1:])

        out = layer_norm(out)
        out = self.feed_forward_layer(out)
        out = layer_norm(out)
        return out



class MultiheadAttentionLayer(nn.Module):
    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout,):
        super(MultiheadAttentionLayer, self).__init__()

        self.multi_head_attn_layer = nn.MultiheadAttention(embed_dim=embed_dim,
                                                           num_heads=num_heads,
                                                           dropout=dropout,
                                                           bias=False)
    def forward(self, src):
        attn_output, _ = self.multi_head_attn_layer(src, src, src)
        return attn_output


class FeedForwardLayer(nn.Module):
    def __init__(self,
                 input_size,
                 output_size,
                 dropout):
        supper(FeedforwarLayer, self).__init__()

        self.input_size = input_size
        self.output_size = output_size
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(in_features=self.input_size,
                            out_features=self.output_size)

    def forward(self, x):
        out = self.dropout(nn.ReLU(self.fc(x)))
        return out


class Decoder(nn.Module):
    
    # In addition to the two layers present in the encoder the decoder has one
    # more multihead attention layer that performs attention over the output of
    # the encoder stack.
    
    def __init__(self, 
                 multihead_attn_layer, 
                 feed_forward_layer,
                 dropout):
        super(Decoder, self).__init__()

        self.multihead_attn_layer = multihead_attn_layer
        self.feed_forward_layer = feed_forward_layer
        self.dropout = nn.Dropout(dropout)


    def forward(self):
        return

'''
print()




In [0]:
class Encoder(nn.Module):
    '''
    First-draft encoder block: embedding -> self-attention -> add & norm ->
    feed-forward -> add & norm.

    Args:
        multi_head_attn_layer: attention module that is called as
            ``module(query, key, value)`` and returns a tuple whose first
            item is the attention output (the calling convention of
            ``nn.MultiheadAttention``).
        feed_forward_layer: module applied to the normalised attention
            output. It must keep the last dimension equal to
            ``embedding_dim`` so that its output can be added back to its
            input.
        num_embeddings: number of rows in the token embedding table.
        embedding_dim: size of each embedding vector.
    '''
    def __init__(self,
                 multi_head_attn_layer,
                 feed_forward_layer,
                 num_embeddings,
                 embedding_dim):
        super(Encoder, self).__init__()

        self.multi_head_attn_layer = multi_head_attn_layer
        self.feed_forward_layer = feed_forward_layer

        self.embed = nn.Embedding(num_embeddings, embedding_dim)

        # The norms are registered once here so that their affine parameters
        # are part of the module and get trained. Building a new LayerNorm
        # inside forward() would reset them on every call.
        self.attn_norm = nn.LayerNorm(embedding_dim)
        self.ff_norm = nn.LayerNorm(embedding_dim)


    def layer_norm(self, out):
        '''
        Return a freshly constructed ``nn.LayerNorm`` sized to every
        dimension of ``out`` except the first. The module is returned, not
        applied; forward() does not use this helper.
        '''
        layer_norm = nn.LayerNorm(out.size()[1:])
        return layer_norm


    def forward(self, src):
        '''
        Args:
            src: integer tensor of token ids. Its layout must be the one
                ``multi_head_attn_layer`` expects once embedded -- for a
                default ``nn.MultiheadAttention`` that is
                (seq_len, batch); TODO confirm against the caller.

        Returns:
            Tensor with the shape of ``src`` plus a trailing
            ``embedding_dim`` axis.
        '''
        embed = self.embed(src)
        attn_output, _ = self.multi_head_attn_layer(embed, embed, embed)

        # Residual connection is an element-wise addition; torch.sum(a, b)
        # is a reduction and rejects a tensor as its second argument.
        out = self.attn_norm(embed + attn_output)

        # Second sub-layer with its own residual connection and norm.
        out = self.ff_norm(out + self.feed_forward_layer(out))
        return out

#multi_head_attn_layer = MultiheadAttentionLayer(embed_dim=12, num_heads=3, dropout=0.5)
#feed_forward_layer = FeedForwardLayer()
#encoder = Encoder(multi_head_attn_layer=multi_head_attn_layer, feed_forward_layer=feed_forward_layer)

In [0]:
from time import time, perf_counter

start = perf_counter()


class MultiheadAttentionLayer(nn.Module):
    '''
    Self-attention wrapper around nn.MultiheadAttention.

    The same tensor is used as query, key and value, and only the attention
    output is returned (the attention weights are discarded).

    The wrapped module is built with its default layout, so the input is
    read as (seq_len, batch, embed_dim).
    '''
    def __init__(self, embed_dim, num_heads, dropout):
        super().__init__()
        attn_config = dict(embed_dim=embed_dim,
                           num_heads=num_heads,
                           dropout=dropout,
                           bias=True)
        self.multi_head_attn_layer = nn.MultiheadAttention(**attn_config)

    def forward(self, src):
        # Index 0 is the attention output, index 1 the attention weights.
        return self.multi_head_attn_layer(src, src, src)[0]

# Create the MultiheadAttentionLayer object
multihead = MultiheadAttentionLayer(embed_dim=512, num_heads=8, dropout=0.0)


# REMEMBER: Pad every sentence in the batch so that they all have the same length
seq_len = 12
bs = 4
edim = 512
# A batch of equal sized sentences
#src = torch.randn(seq_len, bs, edim)
src = torch.randn(4, 512).unsqueeze(0)

# Output of multihead attention layer
attn_output = multihead(src)

print(attn_output.size())

# For calculating time
end = perf_counter()
time_taken = end - start
print(time_taken)

''' COMMENT: MultiheadAttentionLayer sublayer works. '''

torch.Size([1, 4, 512])
0.012960660999993934


' COMMENT: MultiheadAttentionLayer sublayer works. '

In [0]:
fc = nn.Linear(16, 32)
fc(torch.randn(4, 2, 16)).size()

torch.Size([4, 2, 32])

In [0]:
'''
    Here the focus is input the output of multi-head attention layer to 
    LayerNorm and then Feed Forward Network
'''

'\n    Here the focus is input the output of multi-head attention layer to \n    LayerNorm and then Feed Forward Network\n'

In [0]:
print(attn_output.size())

attn_output = attn_output.permute(1, 0, 2)
print(attn_output.reshape(4,-1).size())

torch.Size([1, 4, 512])
torch.Size([4, 512])


In [0]:
norm = nn.LayerNorm(512)
lnorm = norm(attn_output)
lnorm.size()

torch.Size([4, 1, 512])

In [0]:
class FeedForwardLayer(nn.Module):
    '''
    Position-wise feed-forward sub-layer:
    Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout.

    Both linear layers act on the last dimension only, so the input may have
    any number of leading dimensions.

    Args:
        input_size: size of the last dimension of the input. The output has
            the same last-dimension size, so it can be added back to the
            input in a residual connection.
        dropout: dropout probability applied after each activation.
        hidden_size: width of the inner layer. Defaults to 2048, the value
            that used to be hard-coded.
    '''
    def __init__(self, input_size, dropout, hidden_size=2048):
        super(FeedForwardLayer, self).__init__()

        self.dropout = nn.Dropout(dropout)

        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)

        # Project back to input_size rather than a fixed 512; identical for
        # the 512-wide model and correct for every other width.
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=input_size)

        self.relu = nn.ReLU()


    def forward(self, input):
        '''
        Args:
            input: tensor whose last dimension is ``input_size``.

        Returns:
            Tensor with the same shape as ``input``.
        '''
        # No flattening is needed: nn.Linear works on the last dimension.
        # The previous unused ``input.view(...)`` raised on non-contiguous
        # tensors (e.g. after permute/transpose).
        out = self.dropout(self.relu(self.fc1(input)))
        # NOTE(review): the ReLU after the second projection is kept so that
        # existing results do not change, but it forces the sub-layer output
        # to be non-negative -- confirm this is intended.
        out = self.dropout(self.relu(self.fc2(out)))
        print(f"{'-'*10} SUCCESS {'-'*10}\n")
        return out


feed_forward_layer = FeedForwardLayer(input_size=512, dropout=0.5)

fc_out = feed_forward_layer(torch.randn(4, 12, 512))
print(fc_out.size())

---------- SUCCESS ----------

torch.Size([4, 12, 512])


In [0]:
class EncoderLayer(nn.Module):
    '''
    One encoder block: self-attention, add & norm, feed-forward, add & norm.

    A single nn.LayerNorm instance (self.norm) is applied after both
    sub-layers, so both normalisation steps share the same parameters.
    '''
    def __init__(self, embed_dim, num_heads, dropout):
        super().__init__()
        self.attn_layer = MultiheadAttentionLayer(embed_dim, num_heads, dropout)
        self.feed_forward_layer = FeedForwardLayer(embed_dim, dropout)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src):
        # Sub-layer 1: self-attention with a residual connection, then norm.
        attended = self.norm(self.attn_layer(src) + src)

        # Sub-layer 2: feed-forward with a residual connection, then norm.
        transformed = self.feed_forward_layer(attended)
        return self.norm(transformed + attended)

encoder_layer = EncoderLayer(512, 8, 0.5)

encoder_layer(torch.randn(4, 12, 512)).size()

---------- SUCCESS ----------



torch.Size([4, 12, 512])

In [0]:
class Encoder(nn.Module):
    '''
    Token embedding + learned positional embedding followed by one
    EncoderLayer.

    Args:
        embed_dim: embedding / model dimension.
        num_heads: number of attention heads in the encoder layer.
        dropout: dropout probability forwarded to the encoder layer.
        max_len: number of rows in the positional embedding table, i.e. the
            longest sequence that can be encoded.
        vocab_size: number of distinct token ids. Defaults to ``max_len``,
            which reproduces the previous behaviour where one number sized
            both tables.
    '''
    def __init__(self, embed_dim, num_heads, dropout, max_len, vocab_size=None):
        super(Encoder, self).__init__()

        if vocab_size is None:
            vocab_size = max_len

        self.encoder = EncoderLayer(embed_dim, num_heads, dropout)

        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos = nn.Embedding(max_len, embed_dim)

    def forward(self, src):
        '''
        Args:
            src: integer token ids, assumed to be (batch, seq_len) as in the
                demo call below -- TODO confirm for other callers.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).
        '''
        _embed = self.embed(src)

        # Positional embeddings are looked up by position index 0..seq_len-1,
        # not by token id.
        positions = torch.arange(src.size(1), device=src.device)
        _pos = self.pos(positions).unsqueeze(0)

        _src = _embed.add(_pos)

        # nn.MultiheadAttention (default layout) reads its input as
        # (seq_len, batch, embed_dim). Without the transpose the attention
        # would run across the batch axis, mixing different sentences.
        out = self.encoder(_src.transpose(0, 1))
        return out.transpose(0, 1)

_encoder = Encoder(512, 8, 0.5, 12)

# A batch of sequences/tokens or sentences
_out = _encoder(torch.ones((4, 12), dtype=torch.long))
_out.size()

---------- SUCCESS ----------



torch.Size([4, 12, 512])

In [0]:
class DecoderLayer(nn.Module):
    '''
    One decoder block made of three sub-layers, each followed by
    "add & norm": self-attention, a second attention layer, feed-forward.

    A single nn.LayerNorm instance (self.norm) is shared by all three
    normalisation steps.

    Args:
        embed_dim: embedding / model dimension.
        num_heads: number of heads in both attention layers.
        dropout: dropout probability for the attention and feed-forward
            layers.
    '''
    def __init__(self, embed_dim, num_heads, dropout):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiheadAttentionLayer(embed_dim,
                                                 num_heads,
                                                 dropout)

        self.feed_forward = FeedForwardLayer(embed_dim, dropout)

        self.encoder_attn = MultiheadAttentionLayer(embed_dim,
                                                    num_heads,
                                                    dropout)

        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, trg, positional_encoding):
        '''
        Args:
            trg: embedded target sequence; last dimension is embed_dim.
            positional_encoding: tensor broadcastable to ``trg``; it is what
                gets added to the self-attention output.

        Returns:
            Tensor with the same shape as ``trg``.
        '''
        # TODO: How the mask works in this case.
        # No mask is passed to self_attn yet, so despite the name this is
        # plain (unmasked) self-attention.
        attn_output = self.self_attn(trg)

        # add and norm
        # NOTE(review): the tensor added here is positional_encoding, not
        # trg -- confirm whether the residual should be trg instead.
        attn_output = attn_output.add(positional_encoding)
        norm_output = self.norm(attn_output)

        # encoder multihead attention
        # NOTE(review): no encoder output is passed in, so this layer attends
        # over norm_output only.
        output = self.encoder_attn(norm_output)

        # add and norm (the norm was previously missing from this step)
        output = self.norm(output.add(norm_output))

        # feed forward layer
        fc_out = self.feed_forward(output)

        # add and norm
        out = fc_out.add(output)
        out = self.norm(out)

        return out

dlayer = DecoderLayer(512, 8, 0.5)
trg = torch.randn(4, 12, 512)
dlayer(trg, trg).size()

---------- SUCCESS ----------



torch.Size([4, 12, 512])

In [0]:
class Decoder(nn.Module):
    '''
    Token + learned positional embeddings -> one DecoderLayer -> linear
    projection -> softmax over the vocabulary.

    Args:
        embed_dim: embedding / model dimension.
        num_heads: number of attention heads in the decoder layer.
        dropout: dropout probability forwarded to the decoder layer.
        max_len: number of rows in the positional embedding table, i.e. the
            longest sequence that can be decoded.
        vocab_size: number of token ids and width of the output
            distribution. Defaults to ``max_len``, which reproduces the
            previous behaviour where one number was used for both.
    '''
    def __init__(self, embed_dim, num_heads, dropout, max_len, vocab_size=None):
        super(Decoder, self).__init__()
        if vocab_size is None:
            vocab_size = max_len
        self.decoder = DecoderLayer(embed_dim, num_heads, dropout)
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos = nn.Embedding(max_len, embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)
        # Normalise over the vocabulary (last) axis so that every position
        # gets its own probability distribution. dim=1 normalised across the
        # sequence positions instead.
        self.softmax = nn.Softmax(dim=-1)


    def forward(self, trg):
        '''
        Args:
            trg: integer token ids, assumed to be (batch, seq_len) as in the
                demo call below -- TODO confirm for other callers.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size) whose last axis
            sums to 1.
        '''
        _trg = self.embed(trg)

        # Use the positional table (self.pos), indexed by position; the old
        # code looked the token ids up in self.embed a second time.
        positions = torch.arange(trg.size(1), device=trg.device)
        _pos = self.pos(positions).unsqueeze(0).expand_as(_trg)

        _trg = _trg + _pos

        # The attention layers read (seq_len, batch, embed_dim); transpose in
        # and out so attention runs along the sequence, not the batch.
        attn_out = self.decoder(_trg.transpose(0, 1), _pos.transpose(0, 1))
        out = self.softmax(self.fc(attn_out.transpose(0, 1)))
        return out

_decoder_ = Decoder(512, 8,  0.5, 33)
_decoder_(torch.ones((32, 33), dtype=torch.long)).size()

---------- SUCCESS ----------



torch.Size([32, 33, 33])

In [0]:
class Transformer(nn.Module):
    '''Placeholder for the full model; it only initialises nn.Module so far.'''
    def __init__(self):
        super().__init__()
        

In [0]:
class EncoderLayer(nn.Module):
    '''
    One encoder block: self-attention, add & norm, feed-forward, add & norm.

    A single nn.LayerNorm instance (self.norm) is applied after both
    sub-layers, so both normalisation steps share the same parameters.
    '''
    def __init__(self, embed_dim, num_heads, dropout):
        super().__init__()
        self.attn_layer = MultiheadAttentionLayer(embed_dim, num_heads, dropout)
        self.feed_forward_layer = FeedForwardLayer(embed_dim, dropout)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src):
        # Sub-layer 1: self-attention with a residual connection, then norm.
        attended = self.norm(self.attn_layer(src) + src)

        # Sub-layer 2: feed-forward with a residual connection, then norm.
        transformed = self.feed_forward_layer(attended)
        return self.norm(transformed + attended)


class Encoder(nn.Module):
    '''
    Token embedding + learned positional embedding followed by one
    EncoderLayer.

    Args:
        embed_dim: embedding / model dimension.
        num_heads: number of attention heads in the encoder layer.
        dropout: dropout probability forwarded to the encoder layer.
        max_len: number of rows in the positional embedding table. Defaults
            to 12, the value that used to be hard-coded.
        vocab_size: number of distinct token ids. Defaults to ``max_len``,
            which reproduces the previous table size.
    '''
    def __init__(self, embed_dim, num_heads, dropout, max_len=12, vocab_size=None):
        super(Encoder, self).__init__()

        if vocab_size is None:
            vocab_size = max_len

        self.encoder = EncoderLayer(embed_dim, num_heads, dropout)

        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos = nn.Embedding(max_len, embed_dim)

    def forward(self, src):
        '''
        Args:
            src: integer token ids, assumed to be (batch, seq_len) --
                TODO confirm against the caller.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).
        '''
        _embed = self.embed(src)

        # Positional embeddings are looked up by position index 0..seq_len-1,
        # not by token id.
        positions = torch.arange(src.size(1), device=src.device)
        _pos = self.pos(positions).unsqueeze(0)

        _src = _embed.add(_pos)

        # nn.MultiheadAttention (default layout) reads its input as
        # (seq_len, batch, embed_dim). Without the transpose the attention
        # would run across the batch axis, mixing different sentences.
        out = self.encoder(_src.transpose(0, 1))
        return out.transpose(0, 1)


class FeedForwardLayer(nn.Module):
    '''
    Position-wise feed-forward sub-layer:
    Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout.

    Both linear layers act on the last dimension only, so the input may have
    any number of leading dimensions.

    Args:
        input_size: size of the last dimension of the input. The output has
            the same last-dimension size, so it can be added back to the
            input in a residual connection.
        dropout: dropout probability applied after each activation.
        hidden_size: width of the inner layer. Defaults to 2048, the value
            that used to be hard-coded.
    '''
    def __init__(self, input_size, dropout, hidden_size=2048):
        super(FeedForwardLayer, self).__init__()

        self.dropout = nn.Dropout(dropout)

        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)

        # Project back to input_size rather than a fixed 512; identical for
        # the 512-wide model and correct for every other width.
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=input_size)

        self.relu = nn.ReLU()


    def forward(self, input):
        '''
        Args:
            input: tensor whose last dimension is ``input_size``.

        Returns:
            Tensor with the same shape as ``input``.
        '''
        # No flattening is needed: nn.Linear works on the last dimension.
        # The previous unused ``input.view(...)`` raised on non-contiguous
        # tensors (e.g. after permute/transpose).
        out = self.dropout(self.relu(self.fc1(input)))
        # NOTE(review): the ReLU after the second projection is kept so that
        # existing results do not change, but it forces the sub-layer output
        # to be non-negative -- confirm this is intended.
        out = self.dropout(self.relu(self.fc2(out)))
        print(f"{'-'*10} SUCCESS {'-'*10}\n")
        return out


class MultiheadAttentionLayer(nn.Module):
    '''
    Self-attention wrapper around nn.MultiheadAttention.

    The same tensor is used as query, key and value, and only the attention
    output is returned (the attention weights are discarded).

    The wrapped module is built with its default layout, so the input is
    read as (seq_len, batch, embed_dim).
    '''
    def __init__(self, embed_dim, num_heads, dropout):
        super().__init__()
        attn_config = dict(embed_dim=embed_dim,
                           num_heads=num_heads,
                           dropout=dropout,
                           bias=True)
        self.multi_head_attn_layer = nn.MultiheadAttention(**attn_config)

    def forward(self, src):
        # Index 0 is the attention output, index 1 the attention weights.
        return self.multi_head_attn_layer(src, src, src)[0]


class DecoderLayer(nn.Module):
    '''
    One decoder block made of three sub-layers, each followed by
    "add & norm": self-attention, a second attention layer, feed-forward.

    A single nn.LayerNorm instance (self.norm) is shared by all three
    normalisation steps.

    Args:
        embed_dim: embedding / model dimension.
        num_heads: number of heads in both attention layers.
        fc_out: accepted but not used by this class.
        dropout: dropout probability for the attention and feed-forward
            layers.
    '''
    def __init__(self, embed_dim, num_heads, fc_out, dropout):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiheadAttentionLayer(embed_dim,
                                                 num_heads,
                                                 dropout)

        self.feed_forward = FeedForwardLayer(embed_dim, dropout)

        self.encoder_attn = MultiheadAttentionLayer(embed_dim,
                                                    num_heads,
                                                    dropout)

        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, trg, trg_mask, positional_encoding):
        '''
        Args:
            trg: embedded target sequence; last dimension is embed_dim.
            trg_mask: accepted but not used yet (see the TODO below).
            positional_encoding: tensor broadcastable to ``trg``; it is what
                gets added to the self-attention output.

        Returns:
            Tensor with the same shape as ``trg``.
        '''
        # TODO: How the mask works in this case.
        # trg_mask is not forwarded to self_attn, so despite the name this is
        # plain (unmasked) self-attention.
        attn_output = self.self_attn(trg)

        # add and norm
        # NOTE(review): the tensor added here is positional_encoding, not
        # trg -- confirm whether the residual should be trg instead.
        attn_output = attn_output.add(positional_encoding)
        norm_output = self.norm(attn_output)

        # encoder multihead attention
        # NOTE(review): no encoder output is passed in, so this layer attends
        # over norm_output only.
        output = self.encoder_attn(norm_output)

        # add and norm (the norm was previously missing from this step)
        output = self.norm(output.add(norm_output))

        # feed forward layer
        fc_out = self.feed_forward(output)

        # add and norm
        out = fc_out.add(output)
        out = self.norm(out)

        return out


class Decoder(nn.Module):
    '''
    Token + learned positional embeddings -> one DecoderLayer -> linear
    projection -> softmax over the vocabulary.

    Args:
        embed_dim: embedding / model dimension.
        num_heads: number of attention heads in the decoder layer.
        dropout: dropout probability forwarded to the decoder layer.
        max_len: number of rows in the positional embedding table, i.e. the
            longest sequence that can be decoded.
        vocab_size: number of token ids and width of the output
            distribution. Defaults to ``max_len``, which reproduces the
            previous behaviour where one number was used for both.
    '''
    def __init__(self, embed_dim, num_heads, dropout, max_len, vocab_size=None):
        super(Decoder, self).__init__()
        if vocab_size is None:
            vocab_size = max_len
        # The DecoderLayer defined in this cell takes
        # (embed_dim, num_heads, fc_out, dropout); the old three-argument
        # call raised a TypeError.
        # NOTE(review): fc_out is not used by DecoderLayer, so None is passed
        # as a placeholder -- revisit once it has a meaning.
        self.decoder = DecoderLayer(embed_dim, num_heads,
                                    fc_out=None, dropout=dropout)
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos = nn.Embedding(max_len, embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)
        # Normalise over the vocabulary (last) axis so that every position
        # gets its own probability distribution. dim=1 normalised across the
        # sequence positions instead.
        self.softmax = nn.Softmax(dim=-1)


    def forward(self, trg, trg_mask=None):
        '''
        Args:
            trg: integer token ids, assumed to be (batch, seq_len) --
                TODO confirm against the caller.
            trg_mask: optional mask handed through to the decoder layer
                unchanged.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size) whose last axis
            sums to 1.
        '''
        _trg = self.embed(trg)

        # Use the positional table (self.pos), indexed by position; the old
        # code looked the token ids up in self.embed a second time.
        positions = torch.arange(trg.size(1), device=trg.device)
        _pos = self.pos(positions).unsqueeze(0).expand_as(_trg)

        _trg = _trg + _pos

        # The attention layers read (seq_len, batch, embed_dim); transpose in
        # and out so attention runs along the sequence, not the batch.
        # DecoderLayer.forward in this cell is
        # (trg, trg_mask, positional_encoding).
        attn_out = self.decoder(_trg.transpose(0, 1),
                                trg_mask=trg_mask,
                                positional_encoding=_pos.transpose(0, 1))
        out = self.softmax(self.fc(attn_out.transpose(0, 1)))
        return out