### Generate Data

In [2]:
import numpy as np
import math

Every word is represented by three vectors: a query vector, a key vector, and a value vector.


In [36]:
"""
L = Length of the input sequence. (Example use here: 'My name is X')
q = query vector
k = key vector
v = value vector
d_k, d_v = size of each of these vectors
"""

# Fixed seed so that Restart Kernel -> Run All reproduces the same Q, K and V
# (and therefore the same outputs in every cell below).
SEED = 42
rng = np.random.default_rng(SEED)

# L: sequence length; d_k: size of each query/key vector; d_v: size of each value vector.
L, d_k, d_v = 4, 8, 8

# One row per word, drawn from a standard normal distribution.
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [14]:
# Print each matrix under its label, one after another.
for heading, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
    print(heading, matrix)

Q
 [[-0.67762928  0.70211587 -0.47525356 -2.56295576  1.5551187   1.60511125
   0.75763105 -0.51958392]
 [-0.02430746 -0.00934381 -0.66844778  0.750387   -1.39911559 -0.06681684
  -0.37380486 -1.16584116]
 [ 1.8900711  -0.22283168 -0.69120335 -0.0424152  -1.30495227  0.39678753
  -1.4265933  -0.31808167]
 [-0.88579704  1.40833268  0.07738356  1.30144203 -1.94665242 -0.41506677
   1.62032265  1.02627814]]
K
 [[ 0.71578513  1.16526636  0.06398878  0.67762784  1.80759377  1.19916105
   1.43557652 -1.37724867]
 [ 0.17332983  0.64572944 -1.03915836  0.53539524 -0.7198133   0.74833529
   1.20027689 -0.32163997]
 [ 1.31550364 -0.59087156  0.08534911 -0.28457603  0.83859706 -0.33220705
   0.81194889  0.67836294]
 [ 0.85123885  1.01049603  1.25692968 -0.00598653  1.04928531  1.22448944
   0.99894432  0.40709721]]
V
 [[-0.97358127  1.22023975  1.878248    0.01825846 -0.06955402  0.67219912
  -0.7610852   0.44161022]
 [ 0.6988489  -0.58548244  0.53943009 -1.02228291 -1.22176621  1.05666854
   0.8

### Self Attention

$$
\text{Self-Attention} = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) \times V
$$

Where:
- $Q$: Query matrix
- $K$: Key matrix
- $V$: Value matrix
- $d_k$: Dimensionality of keys (for scaling).

\
In order to create an initial attention matrix, every word needs to look at every other word to see how strong its affinity towards that word is. \
This is captured by the query (what the current word is looking for) and the key (what each word has to offer).

In [16]:
# Raw attention scores: entry (i, j) is the dot product of query i with key j.
q @ k.T

array([[-1.51403088, -4.32808532,  0.25131218,  3.27331444],
       [ 0.28690374,  3.17322338, -0.03762125, -1.72145933],
       [-1.41952919, -0.32966579, -2.62113747, -2.32791178],
       [ 0.16659567, -5.2896757 ,  0.17253028, -0.2001398 ]])

In [17]:
# Why the denominator needs sqrt(d_k): compare the variance of q and k
# with the variance of their unscaled dot products.

np.var(q), np.var(k), np.var(q @ k.T)

(0.980184447628651, 0.9449598889046289, 4.8561735839417315)

In [22]:
# Divide the raw scores by sqrt(d_k), then re-check the variances.
scaled = (q @ k.T) / math.sqrt(d_k)

np.var(q), np.var(k), np.var(scaled)

(0.980184447628651, 0.9449598889046289, 0.6070216979927163)

In [23]:
# Display the scaled score matrix (one row per query, one column per key).
scaled

array([[-0.53529075, -1.53020924,  0.08885227,  1.15729142],
       [ 0.10143579,  1.12190389, -0.01330112, -0.60862778],
       [-0.50187936, -0.11655446, -0.92671204, -0.8230411 ],
       [ 0.05890046, -1.87018278,  0.06099866, -0.0707601 ]])

###  Masking

* This is to ensure words don't get context from words generated in the future.
* Not required in the encoders, but required in the decoders.

In [25]:
# L x L matrix with ones on and below the diagonal and zeros above it:
# row i marks the positions (0..i) that word i is allowed to attend to.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [26]:
# Build the additive mask from scratch rather than rewriting `mask` in place.
# The in-place version was not idempotent: running the cell a second time
# turned every entry into -inf, because the zeros written by the first run
# matched `mask == 0` on the second.
# Allowed positions (on/below the diagonal) -> 0, future positions -> -inf.
mask = np.where(np.tri(L, dtype=bool), 0.0, -np.inf)

In [27]:
# Display the additive mask: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [28]:
# Adding the mask keeps allowed scores unchanged (+0) and sets blocked ones to -inf.
scaled + mask

array([[-0.53529075,        -inf,        -inf,        -inf],
       [ 0.10143579,  1.12190389,        -inf,        -inf],
       [-0.50187936, -0.11655446, -0.92671204,        -inf],
       [ 0.05890046, -1.87018278,  0.06099866, -0.0707601 ]])

### Softmax Function

$$
\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_{j=1}^n e^{x_j}}
$$

Where:
- $x_i$: Input value for the $i$-th class.
- $n$: Total number of classes.
- $e$: Euler's number (approximately 2.718).
- $\sum_{j=1}^n e^{x_j}$: Sum of the exponential values for all classes, used for normalization.


In [29]:
def softmax(x):
    """Compute a numerically stable softmax along the last axis.

    Args:
        x: Array-like of scores with any number of leading dimensions.
            Entries equal to -inf (e.g. masked positions) get weight 0.

    Returns:
        np.ndarray with the same shape as ``x``; the entries along the last
        axis are non-negative and sum to 1. A slice along the last axis that
        is entirely -inf yields nan, since there is nothing to normalise.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum does not change the result mathematically,
    # but it keeps np.exp from overflowing to inf (which would produce nan).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims makes the normaliser broadcast correctly for any rank,
    # so no transposes are needed.
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [30]:
# Turn the masked scores into attention weights; positions set to -inf get weight 0.
attention = softmax(scaled + mask)

In [31]:
# Display the attention weights (row i holds the weights word i places on each word).
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.26493623, 0.73506377, 0.        , 0.        ],
       [0.3201054 , 0.47058435, 0.20931026, 0.        ],
       [0.33049381, 0.04801458, 0.33118798, 0.29030363]])

In [32]:
# Each output row is a weighted combination of the rows of v,
# using the corresponding row of attention weights.
new_v = attention @ v
new_v

array([[ 0.49762506,  0.60794233,  0.64642307,  0.27058276, -0.29746273,
        -1.03604353, -1.45607207,  0.67912089],
       [-0.19810272,  0.12632573, -0.59270452, -0.21711228,  2.26130203,
        -0.61541424, -0.75269822, -0.28743564],
       [-0.23103371,  0.18161152, -0.04438343, -0.09248808,  0.9212339 ,
        -0.43617251, -0.79525031, -0.08820247],
       [-0.0043806 , -0.07683883,  0.73943937,  0.11732923, -0.52447543,
        -0.37566379, -0.49724814,  0.05216988]])

In [33]:
# Display the original value matrix for comparison with new_v.
v

array([[ 0.49762506,  0.60794233,  0.64642307,  0.27058276, -0.29746273,
        -1.03604353, -1.45607207,  0.67912089],
       [-0.44886124, -0.04726151, -1.03931856, -0.39289034,  3.18354786,
        -0.46380842, -0.49918386, -0.63580793],
       [-0.85566331,  0.04417569,  1.13601745,  0.02763821, -2.30124631,
         0.54336373, -0.45026779, -0.0305359 ],
       [ 0.4688012 , -0.9993717 ,  0.68710015,  0.12956891,  0.63079671,
        -0.65773818,  0.5410406 , -0.45343629]])

In [34]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """Apply scaled dot-product attention: softmax(q k^T / sqrt(d_k)) v.

    Args:
        q: Query array; the size of its last axis is used as d_k.
        k: Key array. It is transposed with ``k.T``, which reverses all axes,
            so this is intended for 2-D inputs.
        v: Value array, multiplied by the attention weights.
        mask: Optional additive mask added to the scaled scores before the
            softmax (e.g. 0 for allowed positions, -inf for blocked ones).

    Returns:
        Tuple ``(out, attention)``: the attention-weighted values and the
        attention weights themselves.
    """
    scale = math.sqrt(q.shape[-1])
    scores = (q @ k.T) / scale
    if mask is not None:
        scores = scores + mask
    weights = softmax(scores)
    return weights @ v, weights

In [35]:
# Run masked attention end to end, then print the inputs followed by the results.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, matrix in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
    print(heading, matrix)

Q
 [[-1.2087795  -1.50828906 -1.07479776  0.05679351  0.91493615  1.17189436
   0.29009568 -0.1805797 ]
 [ 1.44051816  0.27465887  0.46615202 -0.51871373 -0.70523157 -0.07220557
   0.74783277  0.44657925]
 [ 0.63327162  1.36888217  0.11894     1.1113973   2.26261751 -0.59942165
  -1.75585029  0.94500626]
 [-0.84726224 -0.4119995  -2.24749613 -0.43089725 -0.2738503   0.38349553
   0.07496207 -0.95659101]]
K
 [[-0.96908919  0.640674   -0.25692482 -0.22074116 -1.33844394 -0.42692239
  -0.06836516  1.3179714 ]
 [ 0.23150207  0.17232346  1.85615421 -1.81812641 -0.20550524 -1.13061119
   0.2180622   1.3310825 ]
 [-0.09426905 -1.09219823  0.56409671 -1.25872832 -0.55992421 -0.06979636
  -1.02816564 -0.33305953]
 [-1.71240455 -0.93057752  0.17855682  1.24106356 -1.04459279  1.01278155
   0.0897286   1.86025359]]
V
 [[ 0.49762506  0.60794233  0.64642307  0.27058276 -0.29746273 -1.03604353
  -1.45607207  0.67912089]
 [-0.44886124 -0.04726151 -1.03931856 -0.39289034  3.18354786 -0.46380842
  -0.4