# Call Imports

In [1]:
import torch
import numpy as np

# Make Random Q, K, and V tensors

`T` is the number of timesteps (the sequence length)  
`d` is the dimensionality of each query, key, and value vector

In [2]:
# Seed the RNG so the random Q, K, V below are identical on every Restart & Run All.
# NOTE: outputs saved in this notebook before the seed was added will differ until re-run.
torch.manual_seed(0)

# T timesteps, each with a d-dimensional query, key, and value vector
T, d = 3, 4

# Draw a single (T, 3*d) random matrix and split its last axis into three (T, d) pieces
q, k, v = torch.rand((T, d*3)).chunk(3, dim=-1)

print(f'q shape: {q.shape}')
print(f'k shape: {k.shape}')
print(f'v shape: {v.shape}')

q shape: torch.Size([3, 4])
k shape: torch.Size([3, 4])
v shape: torch.Size([3, 4])


# Compute Attention (With Causal Mask)

$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^{\top}}{\sqrt{d}}\right)V
$$

# First Compute $\frac{QK^{\top}}{\sqrt{d}}$

# Outputs a (T, T) matrix

In [3]:
# Scaled dot-product scores: each query row dotted with each key row, then divided by sqrt(d)
qk = torch.matmul(q, k.T) / np.sqrt(d)

print(qk)

tensor([[1.0366, 0.4480, 0.8684],
        [0.6112, 0.2692, 0.4616],
        [0.3284, 0.1526, 0.4017]])


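# Masking is applied by setting masked values to $-\infty$ before softmax

$\oslash$ means element-wise division:
$$
\text{softmax}(x) = \exp(x) \oslash \sum\limits_{i}\exp(x_i)
$$
Because $\exp(-\infty) = 0$, a masked entry adds nothing to the sum and receives a softmax weight of exactly $0$.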

In [7]:
# Draw a random 5-element vector to demonstrate masking
example = torch.rand(5)
print(f'v: {example}')

# Mask the final entry by overwriting it with negative infinity
example[-1] = -float('inf')
print(f'\nv with masked end value: {example}')

# Softmax over the vector's only axis, taken after the final entry was masked
s_example = example.softmax(dim=0)
print(f'\napply softmax: {s_example}')

v: tensor([0.1602, 0.6989, 0.0781, 0.5008, 0.2713])

v with masked end value: tensor([0.1602, 0.6989, 0.0781, 0.5008,   -inf])

apply softmax: tensor([0.1984, 0.3400, 0.1827, 0.2789, 0.0000])


# Now Apply Causal Attention Mask and Compute Softmax

In [8]:
# Build the causal mask: True strictly above the main diagonal
attn_mask = torch.ones((T, T), dtype=torch.bool).triu(diagonal=1)
print('Attention mask (True values are the ones that get masked out):')
print(attn_mask)

# Fill masked scores with -inf on a copy (qk itself is untouched), then softmax each row
sqk = qk.masked_fill(attn_mask, float('-inf')).softmax(dim=-1)
print('\nQK^T/(sqrt(d)) after applying softmax:')
print(sqk)


Attention mask (True values are the ones that get masked out):
tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])

QK^T/(sqrt(d)) after applying softmax:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5847, 0.4153, 0.0000],
        [0.3431, 0.2878, 0.3692]])


# Now Matrix Multiply with V and you have your attention output

In [9]:
# Multiply the (T, T) attention weights by V: each output row is a weighted sum of V's rows
attn_out = torch.matmul(sqk, v)

print('output:')
print(attn_out)

output:
tensor([[0.2759, 0.8454, 0.4397, 0.1585],
        [0.4682, 0.7469, 0.2908, 0.4935],
        [0.6005, 0.5586, 0.2971, 0.6820]])


# Compare with Original V

In [10]:
# Show V, the attention weights, and the attention output one after another for comparison
print('V:', v, sep='\n')
print('\nQK^T/(sqrt(d)) after applying softmax:', sqk, sep='\n')
print('\nAttention Output:', attn_out, sep='\n')

V:
tensor([[0.2759, 0.8454, 0.4397, 0.1585],
        [0.7389, 0.6082, 0.0813, 0.9651],
        [0.7942, 0.2534, 0.3329, 0.9478]])

QK^T/(sqrt(d)) after applying softmax:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5847, 0.4153, 0.0000],
        [0.3431, 0.2878, 0.3692]])

Attention Output:
tensor([[0.2759, 0.8454, 0.4397, 0.1585],
        [0.4682, 0.7469, 0.2908, 0.4935],
        [0.6005, 0.5586, 0.2971, 0.6820]])


# Notice that row 0 of attention output is just the first value vector

In [11]:
# Element-wise equality between output row 0 and the first row of V
torch.eq(attn_out[0], v[0])

tensor([True, True, True, True])

# Now see that row 1 of attention output is just a linear combination of the first 2 rows of V

In [12]:
# Compare row 1 of the attention output with the hand-written weighted sum of V's first two rows.
# torch.isclose is used instead of ==: the matmul and the manual sum may accumulate
# floating-point products in a different order, so bit-exact equality is not guaranteed.
torch.isclose(attn_out[1], sqk[1, 0]*v[0] + sqk[1, 1]*v[1])

tensor([True, True, True, True])

# Now see that row 2 of attention output is just a linear combination of all 3 rows of V

In [14]:
# Compare row 2 of the attention output with the hand-written weighted sum of all three rows of V.
# torch.isclose is used instead of ==: the matmul and the manual sum may accumulate
# floating-point products in a different order, so bit-exact equality is not guaranteed.
torch.isclose(attn_out[2], sqk[2, 0]*v[0] + sqk[2, 1]*v[1] + sqk[2, 2]*v[2])

tensor([True, True, True, True])