In [1]:
## self- attention is basically doing cosine similarity across the embeddings of every word (word+ position vector)
## first, all(vectors) -> all(vectors) cosine similarity
## for instance, to find the first word's embedding: we find the cosine similarity of it with every other word 
## (that is the affinity of the word to every other word. One with high relation should be given more priority)
## So, this cosine similarity we have here would be taken as a weight for every other word we have
## But now the dimension of the embedding is (n, x) where n is the number of words, We must reduce it to (1, x) 
## We can add all the embeddings multiplied with their weight to get the embedding of the word

## _but wait, there is nothing being trained though. How will it learn_

## Query, Key, Values
## Query - the vector in question for which we are finding the embedding is called the query
## Key- All the vectors with which it finds a similarity
## Values- All the vectors again, when they are about to be multiplied with weights

## Question on my mind: I understand that we are finding the dot products of a vector with another vector. Doesn't the basic definition of 
## cosine similarity mean that we are seeing if two words are alike. For instance, "dog" and "bark". 
## So, for a change if the sentence we are looking at is: "The dog was barked at by the human"
## "human" and "bark" will have less similarity, as compared to the "dog" and "bark" when this situation actually calls for it
## The idea of the attention block is to learn as many Natural Language properties as possible
## Does a different function work then? As long as it's a static way of computing it can't compute relations on the fly. can it?
## There must be a learning for sure, then
## why did we decide for it to happen on query, key and values? and why linear layers?
## what do linear layers learn to do? they learn a combination of weights so that the resultant vector is a representation of the dynamic ways
## the input can be interpreted

## reverse engineer: if the dot product we are finding now, would indeed increase the similarity for "human" and "bark". 
## if cosine similarity remained the same, then? it's the vectors which have been modified. So what the FC layers essentially do is massage
## the vectors so that it is tuned to produce an attention vector for a particular NL property

In [2]:

import math
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

### functions

In [28]:
def softmax(arr: np.ndarray):
    sum_of_exponents = np.sum(np.exp(arr))
    return arr/ sum_of_exponents

### Attention

In [8]:
sent_embedding = np.random.randn(10, 1024)

#### Attention Block without Linear Layer

In [9]:
def attention_block_without_linear_layers(sentence: np.ndarray):
    query= sentence.copy()
    keys = sentence.copy()
    values = sentence.copy()

    weights = cosine_similarity(query, keys)
    weights = softmax(weights)
    weighted_values = np.matmul(weights, values)
    return weighted_values

In [None]:
attention_block_without_linear_layers(sent_embedding).shape

#### Attention Block

In [26]:
def attention_block(sentence: np.ndarray):
    #query
    query = Tensor(sentence.copy())
    query_layer = nn.Linear(1024, 1024)
    query = query_layer(query)

    #keys
    keys = Tensor(sentence.copy())
    keys_layer = nn.Linear(1024, 1024)
    keys = keys_layer(keys)

    #values
    values = Tensor(sentence.copy())
    values_layer = nn.Linear(1024, 1024)
    values = values_layer(values)

    weights = nn.CosineSimilarity()(query, keys)
    weights = nn.Softmax(weights).dim
    weighted_values = torch.matmul(weights, values)
    return weighted_values

In [None]:
attention_block(sent_embedding)

In [12]:
class AttentionBlock(nn.Module):
    def __init__(self, dim= 1024):
        super(AttentionBlock, self).__init__()

        self.query_layer = nn.Linear(dim, dim)
        self.keys_layer = nn.Linear(dim, dim)
        self.values_layer = nn.Linear(dim, dim)

    def forward(self, sentence):

        if not isinstance(sentence, Tensor):
            sentence= Tensor(sentence)

        query= self.query_layer(sentence)
        keys= self.keys_layer(sentence)
        values= self.values_layer(sentence)

        weights = F.cosine_similarity(query.unsqueeze(1), keys.unsqueeze(0), dim=-1)
        weights = F.softmax(weights, dim= -1)

        weighted_values = torch.matmul(weights, values)

        return weighted_values

In [13]:
att= AttentionBlock()
att.forward(sent_embedding)

tensor([[ 0.0299, -0.0435, -0.1451,  ..., -0.4967,  0.2968,  0.0756],
        [ 0.0259, -0.0464, -0.1332,  ..., -0.4950,  0.2985,  0.0762],
        [ 0.0272, -0.0442, -0.1294,  ..., -0.4916,  0.2944,  0.0765],
        ...,
        [ 0.0281, -0.0411, -0.1251,  ..., -0.4970,  0.2939,  0.0725],
        [ 0.0288, -0.0370, -0.1393,  ..., -0.4963,  0.2989,  0.0747],
        [ 0.0361, -0.0275, -0.1311,  ..., -0.4937,  0.3030,  0.0620]],
       grad_fn=<MmBackward0>)

#### Multi Head Attention Block

In [14]:
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, dim= 1024, num_heads= 4):
        super(MultiHeadAttentionBlock, self).__init__()

        assert dim%num_heads==0, "dim should be divisible by num_heads"

        self.dim= dim
        self.heads = num_heads
        self.per_head = dim // num_heads

        self.query_layer = nn.Linear(dim, dim)
        self.keys_layer = nn.Linear(dim, dim)
        self.values_layer = nn.Linear(dim, dim)
        self.linear_layer = nn.Linear(dim, dim)

    def split_head(self, tensor: Tensor):
        batch_size, num_tokens, dim = tensor.size()
        return tensor.view(batch_size, num_tokens, self.heads, self.per_head).transpose(1, 2)

    def forward(self, sentence):

        if not isinstance(sentence, Tensor):
            sentence= Tensor(sentence)

        query= self.split_head(self.query_layer(sentence))
        keys= self.split_head(self.keys_layer(sentence))
        values= self.split_head(self.values_layer(sentence))

        weights = F.cosine_similarity(query.unsqueeze(3), keys.unsqueeze(2), dim=-1) / math.sqrt(self.per_head)
        weights = F.softmax(weights, dim= -1)

        weighted_values = torch.matmul(weights, values)
        weighted_values = weighted_values.transpose(1,2).contiguous().reshape(sentence.shape[0], -1, self.dim)
        attention = self.linear_layer(weighted_values)

        return attention

In [15]:
sent_embedding = np.random.rand(1, 10, 1024)
mha = MultiHeadAttentionBlock()
mha.forward(sent_embedding).shape

torch.Size([1, 10, 1024])

#### Masked Multi-Head Attention Block

In [255]:
class MultiHeadAttentionWithMaskBlock(nn.Module):
    def __init__(self, dim= 1024, num_heads= 4):
        super(MultiHeadAttentionWithMaskBlock, self).__init__()

        assert dim%num_heads==0, "dim should be divisible by num_heads"

        self.dim= dim
        self.heads = num_heads
        self.per_head = dim // num_heads

        self.query_layer = nn.Linear(dim, dim)
        self.keys_layer = nn.Linear(dim, dim)
        self.values_layer = nn.Linear(dim, dim)
        self.linear_layer = nn.Linear(dim, dim)

    def split_head(self, tensor: Tensor):
        batch_size, num_tokens, dim = tensor.size()
        return tensor.view(batch_size, num_tokens, self.heads, self.per_head).transpose(1, 2)

    def forward(self, query, keys, values, mask= None):

        query= self.split_head(self.query_layer(query))
        keys= self.split_head(self.keys_layer(keys))
        values= self.split_head(self.values_layer(values))

        weights = F.cosine_similarity(query.unsqueeze(3), keys.unsqueeze(2), dim=-1) / math.sqrt(self.per_head)
        if mask is not None:
            weights = weights.masked_fill(mask == 0, -1e9)
        weights = F.softmax(weights, dim= -2)

        weighted_values = torch.matmul(weights, values)
        weighted_values = weighted_values.transpose(1,2).contiguous().reshape(query.shape[0], -1, self.dim)
        attention = self.linear_layer(weighted_values)

        return attention

In [204]:
sent_embedding = Tensor(np.random.rand(3, 10, 1024))
sent= Tensor(np.random.rand(3, 10))
sent[:, 8:] = 0
sent

tensor([[0.7044, 0.1143, 0.3322, 0.7135, 0.7482, 0.1063, 0.8904, 0.2262, 0.0000,
         0.0000],
        [0.3648, 0.7871, 0.4774, 0.5742, 0.3992, 0.6542, 0.3569, 0.0488, 0.0000,
         0.0000],
        [0.0378, 0.5281, 0.1939, 0.1050, 0.5639, 0.4289, 0.5317, 0.2887, 0.0000,
         0.0000]])

In [240]:
mask = (sent != 0).unsqueeze(1).unsqueeze(2)
tri = torch.tril(torch.ones(3, 1, 10, 10))
torch.logical_and(mask, tri).long()

tensor([[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]],


        [[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]],


        [[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 1, 1, 1, 0, 

In [206]:

mham = MultiHeadAttentionWithMaskBlock()
res = mham.forward(sent_embedding, sent_embedding, sent_embedding, mask)
res

torch.Size([3, 4, 10, 10])
torch.Size([3, 4, 10, 10])


tensor([[[-0.2608,  0.0836,  0.0352,  ..., -0.0335,  0.1987, -0.2264],
         [-0.2605,  0.0835,  0.0348,  ..., -0.0333,  0.1985, -0.2263],
         [-0.2603,  0.0836,  0.0346,  ..., -0.0330,  0.1986, -0.2266],
         ...,
         [-0.2610,  0.0834,  0.0351,  ..., -0.0332,  0.1986, -0.2264],
         [-0.2608,  0.0835,  0.0350,  ..., -0.0332,  0.1985, -0.2265],
         [-0.2607,  0.0835,  0.0350,  ..., -0.0333,  0.1985, -0.2263]],

        [[-0.3146,  0.0826,  0.0528,  ...,  0.0040,  0.2743, -0.2550],
         [-0.3145,  0.0827,  0.0525,  ...,  0.0041,  0.2743, -0.2550],
         [-0.3147,  0.0827,  0.0527,  ...,  0.0040,  0.2745, -0.2551],
         ...,
         [-0.3147,  0.0827,  0.0524,  ...,  0.0043,  0.2742, -0.2551],
         [-0.3150,  0.0828,  0.0526,  ...,  0.0042,  0.2746, -0.2554],
         [-0.3149,  0.0827,  0.0526,  ...,  0.0041,  0.2744, -0.2552]],

        [[-0.3039,  0.0498,  0.1231,  ..., -0.0834,  0.2766, -0.2127],
         [-0.3043,  0.0496,  0.1235,  ..., -0

### Feed Forward Network

In [18]:
class FeedForwardNetworkBlock(nn.Module):
    def __init__(self, dim= 1024, inter_dim= 512):
        super(FeedForwardNetworkBlock, self).__init__()

        self.ff1 = nn.Linear(dim, inter_dim)
        self.ff2 = nn.Linear(inter_dim, dim)
        self.relu = nn.ReLU()
    
    def forward(self, attention):
        return self.ff2(self.relu(self.ff1(attention)))

In [19]:
ffn = FeedForwardNetworkBlock()
ffn.forward(res)

tensor([[[ 0.0311, -0.0649, -0.0714,  ..., -0.0334,  0.0045,  0.0153],
         [ 0.0311, -0.0649, -0.0714,  ..., -0.0334,  0.0045,  0.0154],
         [ 0.0311, -0.0649, -0.0714,  ..., -0.0334,  0.0045,  0.0153],
         ...,
         [ 0.0311, -0.0649, -0.0714,  ..., -0.0334,  0.0045,  0.0154],
         [ 0.0311, -0.0649, -0.0714,  ..., -0.0334,  0.0045,  0.0154],
         [ 0.0311, -0.0649, -0.0714,  ..., -0.0334,  0.0045,  0.0153]]],
       grad_fn=<ViewBackward0>)

### Positional Encoding

In [20]:
class PositionalEncodingBlock(nn.Module):
    def __init__(self, max_token_length, dim):
        super(PositionalEncodingBlock, self).__init__()

        pe= torch.zeros(max_token_length, dim)
        position = torch.arange(0, max_token_length, dtype=torch.float).unsqueeze(1)
        # div_alt = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
        div = 1 / (10000 ** (torch.arange(0, dim, 2).float() / dim))

        pe[:, 0::2] = torch.sin(position * div)
        pe[:, 1::2] = torch.cos(position * div)
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, tokens):
        return tokens + self.pe[:, :tokens.size(1), :]

In [21]:
pe= PositionalEncodingBlock(100, 1024)
pe.forward(sent_embedding)

tensor([[[ 0.8690,  1.1486,  0.8891,  ...,  1.7113,  0.5925,  1.1869],
         [ 1.2769,  1.1276,  1.4926,  ...,  1.1220,  0.9188,  1.7056],
         [ 1.1858,  0.5714,  1.5746,  ...,  1.3050,  0.7084,  1.8731],
         ...,
         [ 0.7074,  1.0905,  0.7942,  ...,  1.0692,  0.1868,  1.2570],
         [ 1.8714,  0.6506,  1.3127,  ...,  1.9354,  0.8969,  1.8366],
         [ 1.1768, -0.7912,  0.6091,  ...,  1.4076,  0.4606,  1.5468]]])

### Encoder

In [212]:
class EncoderBlock(nn.Module):
    def __init__(self, num_heads: int = 4, dim: int = 1024, inter_dim: int = 512):
        super(EncoderBlock, self).__init__()

        self.mha = MultiHeadAttentionWithMaskBlock(dim, num_heads)
        self.ff = FeedForwardNetworkBlock(dim, inter_dim)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, token_embeddings, input_mask):
        mha_res = self.mha(token_embeddings, token_embeddings, token_embeddings, input_mask)
        add_norm1= self.norm1(torch.add(token_embeddings, mha_res))
        ff_res = self.ff(add_norm1)
        add_norm2 = self.norm2(torch.add(add_norm1, ff_res))
        return add_norm2


### Decoder

In [213]:
class DecoderBlock(nn.Module):
    def __init__(self, num_heads: int = 4, dim: int = 1024, inter_dim: int = 512):
        super(DecoderBlock, self).__init__()

        self.self_attention = MultiHeadAttentionWithMaskBlock(dim, num_heads)
        self.cross_attention = MultiHeadAttentionWithMaskBlock(dim, num_heads)
        self.ff = FeedForwardNetworkBlock(dim, inter_dim)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self, output_embeddings, encoder_output, source_mask, target_mask):
        mmha_res = self.self_attention(output_embeddings, output_embeddings, output_embeddings, target_mask)
        add_norm1= self.norm1(torch.add(output_embeddings, mmha_res))
        mha_res = self.cross_attention(add_norm1, encoder_output, encoder_output, source_mask)
        add_norm2 = self.norm2(torch.add(mmha_res, mha_res))
        ff_res = self.ff(add_norm2)
        add_norm3 = self.norm2(torch.add(add_norm2, ff_res))
        return add_norm3


In [265]:
sent_embedding = Tensor(np.random.randn(3, 10, 1024))
sent= torch.rand(3, 10)
enc_embedding = Tensor(np.random.randn(3, 12, 1024))

In [None]:
target_mask = (sent != 0).unsqueeze(1).unsqueeze(2)
batch_size, sequence_length= sent.size()
tri = torch.tril(torch.ones(batch_size, 1, sequence_length, sequence_length))
torch.logical_and(target_mask, tri).long()

In [263]:
target_mask= torch.ones(1, 1, sent_embedding.size(1), 1)
target_mask[:, :, out_embedding.size(1):, :] = 0
source_mask= torch.zeros(1, 1, sent_embedding.size(1), 1)
source_mask[:, :, out_embedding.size(1):, :] = 1

In [216]:
dec= DecoderBlock()
dec(sent_embedding, sent_embedding, source_mask, target_mask)

torch.Size([1, 4, 10, 10])
torch.Size([1, 4, 10, 10])
torch.Size([1, 4, 10, 10])
torch.Size([1, 4, 10, 10])


tensor([[[ 0.1505, -1.4318, -0.8466,  ..., -0.7459,  1.2129,  1.6794],
         [ 0.1497, -1.4256, -0.8490,  ..., -0.7419,  1.2107,  1.6744],
         [ 0.1459, -1.4208, -0.8456,  ..., -0.7463,  1.2139,  1.6787],
         ...,
         [ 0.7678, -0.9228,  0.2006,  ..., -1.3777,  0.2824, -0.8879],
         [ 0.7692, -0.9339,  0.2005,  ..., -1.3825,  0.2834, -0.8881],
         [ 0.7591, -0.9308,  0.2029,  ..., -1.3822,  0.2785, -0.8891]]],
       grad_fn=<NativeLayerNormBackward0>)

### Transformer

In [259]:
class TransformerModule(nn.Module):
    def __init__(self, dim, encoder_vocab_size, decoder_vocab_size, max_token_length, num_heads, num_layers):
        super(TransformerModule, self).__init__()

        self.encoder_embedding = nn.Embedding(encoder_vocab_size, dim)
        self.decoder_embedding = nn.Embedding(decoder_vocab_size, dim)
        self.positional_encoder = PositionalEncodingBlock(max_token_length, dim)

        self.encoder_layers = nn.ModuleList([EncoderBlock(num_heads, dim, 2* dim) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderBlock(num_heads, dim, 2* dim) for _ in range(num_layers)])
        
        self.linear = nn.Linear(dim, decoder_vocab_size)
        self.softmax = nn.Softmax()

    def generate_mask(self, input, target):
        input_mask = (input != 0).unsqueeze(1).unsqueeze(2)
        target_mask = (target != 0).unsqueeze(1).unsqueeze(2)
        batch_size, sequence_length= target.size()
        tri = torch.tril(torch.ones(batch_size, 1, sequence_length, sequence_length))
        target_mask = torch.logical_and(target_mask, tri).long()

        return input_mask, target_mask

    def forward(self, input, target):
        input_mask, target_mask = self.generate_mask(input, target)

        input_embedding = self.encoder_embedding(input)
        output_embedding = self.decoder_embedding(target)

        encoder_output = input_embedding
        for encoder_layer in self.encoder_layers:
            encoder_output = encoder_layer(encoder_output, input_mask)

        decoder_output = output_embedding
        for decoder_layer in self.decoder_layers:
            decoder_output = decoder_layer(decoder_output, encoder_output, input_mask, target_mask)

        linear_output = self.linear(decoder_output)
        return self.softmax(linear_output)


In [260]:
transformer = TransformerModule(
    dim= 64,
    encoder_vocab_size=500,
    decoder_vocab_size=500,
    max_token_length=100,
    num_heads=4,
    num_layers=2,
)

In [261]:
input_sentences = torch.randint(1, 500, (4, 100))
target_sentences = torch.randint(1, 500, (4, 100))
transformer(input_sentences, target_sentences)

  return self._call_impl(*args, **kwargs)


tensor([[[0.2345, 0.3223, 0.3191,  ..., 0.2616, 0.2889, 0.2379],
         [0.2345, 0.3218, 0.3178,  ..., 0.2617, 0.2894, 0.2377],
         [0.2346, 0.3212, 0.3163,  ..., 0.2618, 0.2896, 0.2378],
         ...,
         [0.2552, 0.1833, 0.2444,  ..., 0.2038, 0.3027, 0.2636],
         [0.2673, 0.1612, 0.2405,  ..., 0.1898, 0.3266, 0.2705],
         [0.2797, 0.1431, 0.2414,  ..., 0.1703, 0.3629, 0.2758]],

        [[0.2566, 0.2335, 0.2245,  ..., 0.2718, 0.2672, 0.2174],
         [0.2578, 0.2329, 0.2252,  ..., 0.2724, 0.2671, 0.2183],
         [0.2594, 0.2322, 0.2258,  ..., 0.2732, 0.2674, 0.2189],
         ...,
         [0.3828, 0.2000, 0.2150,  ..., 0.3144, 0.3016, 0.2409],
         [0.3703, 0.2111, 0.2139,  ..., 0.3292, 0.2913, 0.2274],
         [0.3740, 0.2112, 0.1952,  ..., 0.3746, 0.2602, 0.2187]],

        [[0.2387, 0.2130, 0.2173,  ..., 0.2352, 0.2043, 0.2951],
         [0.2385, 0.2136, 0.2176,  ..., 0.2357, 0.2040, 0.2950],
         [0.2384, 0.2145, 0.2178,  ..., 0.2360, 0.2036, 0.