<a href="https://colab.research.google.com/github/israel-adewuyi/transformer/blob/main/attention_mechanism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:
"""
  Embedding vector of N * d_model, i.e N words/tokens in the sequence, residing in
    a d_model-dimensional space.

"""
# Hyperparameters shared by the cells below.
d_model, d_k = 8, 4   # embedding width; width of each q/k/v vector
batch_size, N = 1, 4  # N is the sequence length
heads = 2             # number of attention heads

## Self-Attention

In [None]:
# K, Q and V each have shape (d_model, d_k) and hold uniform random values
K = torch.rand(d_model, d_k)
Q = torch.rand(d_model, d_k)
V = torch.rand(d_model, d_k)

In [None]:
# Additive causal mask with the same shape as the attention pattern matrix
# (d_model x d_model): 0 on and below the diagonal, -inf strictly above it.
mask = torch.triu(torch.full((d_model, d_model), -float('inf')), diagonal=1)

In [None]:
# Scaled dot-product scores; adding the mask sends every entry above the
# diagonal to -inf.
Z = torch.matmul(Q, K.T) / np.sqrt(d_k) + mask
Z

tensor([[0.3811,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4749, 0.0926,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.7087, 0.2542, 0.5864,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.5527, 0.2629, 0.3430, 0.3673,   -inf,   -inf,   -inf,   -inf],
        [0.7194, 0.4565, 0.5250, 0.5358, 0.3033,   -inf,   -inf,   -inf],
        [0.5079, 0.1389, 0.5416, 0.3255, 0.2208, 0.4135,   -inf,   -inf],
        [0.5179, 0.2049, 0.3409, 0.3261, 0.1632, 0.2434, 0.3883,   -inf],
        [0.3646, 0.2434, 0.1226, 0.2567, 0.1531, 0.2558, 0.4314, 0.4805]])

In [None]:
# Row-wise softmax: the -inf entries receive zero weight.
Z_mask = torch.softmax(Z, dim=-1)
Z_mask.shape

torch.Size([8, 8])

In [None]:
# Weighted sum of the rows of V using the masked attention weights.
attention = torch.matmul(Z_mask, V)
attention

tensor([[0.9856, 0.6100, 0.5396, 0.9817],
        [0.9688, 0.5313, 0.3871, 0.7837],
        [0.6311, 0.4680, 0.2634, 0.6950],
        [0.6687, 0.4565, 0.2728, 0.6136],
        [0.5776, 0.4102, 0.3798, 0.6209],
        [0.5661, 0.3990, 0.3281, 0.5517],
        [0.6352, 0.4415, 0.3763, 0.5837],
        [0.5927, 0.4746, 0.4344, 0.5776]])

### Putting it all together

In [None]:
def self_attention(K, Q, V, is_mask_present):
    """Compute scaled dot-product self-attention, optionally causally masked.

    The mask size and the scaling factor are read from the inputs themselves
    rather than from notebook-level globals, so the function works for any
    sequence length and key width, and for inputs that carry leading
    batch/head dimensions.

    Args:
        K: Key tensor of shape ``(..., k_len, d_k)``.
        Q: Query tensor of shape ``(..., q_len, d_k)``.
        V: Value tensor of shape ``(..., k_len, d_v)``.
        is_mask_present: When truthy, row ``i`` may only attend to columns
            ``j <= i`` (lower-triangular / causal mask).

    Returns:
        Tensor of shape ``(..., q_len, d_v)``: for each query row, the
        softmax-weighted sum of the rows of ``V``.
    """
    key_dim = K.shape[-1]
    # transpose(-2, -1) instead of .T so inputs with more than 2 dims work too.
    attention_pattern = (Q @ K.transpose(-2, -1)) / np.sqrt(key_dim)

    if is_mask_present:
        q_len, k_len = attention_pattern.shape[-2:]
        # triu(diagonal=1) keeps -inf strictly above the diagonal and writes 0
        # everywhere else, giving an additive mask that broadcasts over any
        # leading dimensions.
        causal_mask = torch.triu(
            torch.full(
                (q_len, k_len),
                float('-inf'),
                dtype=attention_pattern.dtype,
                device=attention_pattern.device,
            ),
            diagonal=1,
        )
        attention_pattern = attention_pattern + causal_mask

    # Entries at -inf get exactly zero weight after the softmax.
    return F.softmax(attention_pattern, dim=-1) @ V

In [None]:
# Sanity check: with masking on, the function should reproduce the
# step-by-step `attention` result computed above.
print(self_attention(K, Q, V, is_mask_present=True))

tensor([[0.9856, 0.6100, 0.5396, 0.9817],
        [0.9688, 0.5313, 0.3871, 0.7837],
        [0.6311, 0.4680, 0.2634, 0.6950],
        [0.6687, 0.4565, 0.2728, 0.6136],
        [0.5776, 0.4102, 0.3798, 0.6209],
        [0.5661, 0.3990, 0.3281, 0.5517],
        [0.6352, 0.4415, 0.3763, 0.5837],
        [0.5927, 0.4746, 0.4344, 0.5776]])


## Multi-Head Attention

In [None]:
# Random input batch: batch_size sequences of N tokens, each a d_model-wide vector.
X = torch.rand(batch_size, N, d_model)
X, X.shape

(tensor([[[0.3271, 0.7711, 0.1463, 0.7642, 0.5292, 0.4177, 0.9999, 0.5407],
          [0.4655, 0.9250, 0.5578, 0.8686, 0.8262, 0.7051, 0.6217, 0.1012],
          [0.8937, 0.7151, 0.2110, 0.1878, 0.2648, 0.0572, 0.6349, 0.0100],
          [0.6035, 0.3324, 0.1408, 0.9069, 0.8992, 0.7332, 0.4593, 0.1390]]]),
 torch.Size([1, 4, 8]))

In [None]:
# One linear layer emits q, k and v for every head at once, so it needs
# 3 * heads * d_k output features.
qkv_layer = torch.nn.Linear(in_features=d_model, out_features=3 * heads * d_k)
qkv_layer

Linear(in_features=8, out_features=24, bias=True)

In [None]:
# Project every token once; the last dimension holds the concatenated
# q/k/v features for all heads (3 * heads * d_k values per token).
qkv = qkv_layer(X)
qkv.shape

torch.Size([1, 4, 24])

In [None]:
"""
    For each batch, for each input word/token in the sequence, for each attention
        head, there are q, k and v matrices with dims of d_k
"""
# Split the feature dimension into (heads, 3 * d_k) so each head owns its own q/k/v slice.
qkv = torch.reshape(qkv, (batch_size, N, heads, 3 * d_k))

In [None]:
# After the reshape the dims are (batch_size, N, heads, 3 * d_k).
qkv.shape

torch.Size([1, 4, 2, 12])

In [None]:
# Cut the last dimension into three equal pieces: queries, keys, values.
q, k, v = torch.chunk(qkv, chunks=3, dim=-1)

In [None]:
# q (like k and v) has dims (batch_size, N, heads, d_k).
q.shape

torch.Size([1, 4, 2, 4])

In [None]:
def single_head_attention(Q, K, V):
    """Compute unmasked scaled dot-product attention for one head.

    The scaling factor is taken from the last dimension of ``K`` instead of a
    notebook-level global, so the function stays correct for any key width.

    Args:
        Q: Query tensor of shape ``(..., q_len, d_k)``.
        K: Key tensor of shape ``(..., k_len, d_k)``.
        V: Value tensor of shape ``(..., k_len, d_v)``.

    Returns:
        Tensor of shape ``(..., q_len, d_v)``: for each query row, the
        softmax-weighted sum of the rows of ``V``.
    """
    key_dim = K.shape[-1]
    # Swap only the last two dims so leading batch dims are left untouched.
    scores = (Q @ K.transpose(-2, -1)) / np.sqrt(key_dim)
    attention_pattern = F.softmax(scores, dim=-1)
    return attention_pattern @ V

In [None]:
# Run attention once per head, slicing that head's q, k and v out of dim 2.
outputs = [
    single_head_attention(q[:, :, head], k[:, :, head], v[:, :, head])
    for head in range(heads)
]

In [None]:
# One attention output per head, each with dims (batch_size, N, d_k).
outputs

[tensor([[[-0.0820,  0.5960, -0.6383,  0.1604],
          [-0.0837,  0.5961, -0.6411,  0.1603],
          [-0.0815,  0.5950, -0.6378,  0.1599],
          [-0.0839,  0.5955, -0.6417,  0.1591]]], grad_fn=<UnsafeViewBackward0>),
 tensor([[[ 0.6695, -0.6061, -0.3127, -0.0378],
          [ 0.6654, -0.6082, -0.3121, -0.0362],
          [ 0.6689, -0.6024, -0.3109, -0.0373],
          [ 0.6635, -0.6063, -0.3105, -0.0353]]], grad_fn=<UnsafeViewBackward0>)]

### Putting it together