References:
- https://www.youtube.com/watch?v=QCJQG4DuHT0

In [1]:
# https://www.youtube.com/watch?v=QCJQG4DuHT0
import numpy as np
# Seed NumPy's global random generator so the random q, k, v drawn below
# come out the same on every fresh run.
np.random.seed(42)

In [2]:
# L: length of the input sequence (4 words: "my name is Ajay")
# d_k, d_v: sizes of the query/key vectors and of the value vectors
# (8 is chosen purely for illustration)
L, d_k, d_v = 4, 8, 8

# q, k, v each hold one row vector per word
# q = what I am looking for
# k = what I can offer
# v = what I actually offer
# NOTE: the three draws below consume the seeded global random stream in this
# order, so reordering these lines changes the values of q, k and v.
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

In [3]:
# Each matrix has one row per word; every row is that word's 8-dimensional vector.
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
   1.57921282  0.76743473]
 [-0.46947439  0.54256004 -0.46341769 -0.46572975  0.24196227 -1.91328024
  -1.72491783 -0.56228753]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037   1.46564877 -0.2257763
   0.0675282  -1.42474819]
 [-0.54438272  0.11092259 -1.15099358  0.37569802 -0.60063869 -0.29169375
  -0.60170661  1.85227818]]
K
 [[-0.01349722 -1.05771093  0.82254491 -1.22084365  0.2088636  -1.95967012
  -1.32818605  0.19686124]
 [ 0.73846658  0.17136828 -0.11564828 -0.3011037  -1.47852199 -0.71984421
  -0.46063877  1.05712223]
 [ 0.34361829 -1.76304016  0.32408397 -0.38508228 -0.676922    0.61167629
   1.03099952  0.93128012]
 [-0.83921752 -0.30921238  0.33126343  0.97554513 -0.47917424 -0.18565898
  -1.10633497 -1.19620662]]
V
 [[ 0.81252582  1.35624003 -0.07201012  1.0035329   0.36163603 -0.64511975
   0.36139561  1.53803657]
 [-0.03582604  1.56464366 -2.6197451   0.8219025   0.08704707 -0.29900735
   0.09

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention}.V
$$ 

In [4]:
# Raw attention scores: one row per query word and one column per key word,
# which gives a 4x4 matrix for our 4-word sequence.
# Each entry is proportional to how much attention that query word wants to
# pay to that key word.
q @ k.T

# For example, the first row holds the scores of "my" against every word.
# In this run its largest score is in the third column, i.e. the word "is".

array([[-2.72357421,  0.40818741,  2.39601116, -1.18323729],
       [ 5.60012069,  1.1597874 , -4.7248515 ,  2.43859568],
       [ 1.03699903, -3.70553788, -3.0399309 ,  0.04345596],
       [ 0.09460324,  2.97027193,  0.43247995, -0.80026704]])

In [5]:
# Why do we need the sqrt(d_k)?
# We want to keep the variance of the Q.K^T scores small so that their
# values stay stable.

np.var(q), np.var(k), np.var(q @ k.T)

# The variance of the product is much higher than that of q or k,
# so to stabilize the scores we divide them by the square root of the
# dimension of the query vector.

(0.8669372677550163, 0.7119028180363141, 6.837565240983166)

In [6]:
# With the scaling applied, the scores have a much lower variance,
# in the same range as q and k themselves.
scaled = (q @ k.T) / np.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.8669372677550163, 0.7119028180363141, 0.8546956551228956)

## Masking

In [7]:
# Masking
# Needed in the decoder: while building the context of the current word we
# must not look at any word that comes after it -- that would be cheating.
#
# Not needed in the encoder, because all of the inputs are passed into the
# Transformer simultaneously.

mask = np.tri(L)  # L x L matrix: ones on and below the diagonal, zeros above
mask

# For example, "my" can only look at itself,
# and "name" can only look at "my" and "name".

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [8]:
# Turn the 1/0 pattern into an additive mask: allowed positions (on and below
# the diagonal) become 0, future positions (above the diagonal) become -inf.
#
# The mask is rebuilt from the lower-triangular pattern instead of being edited
# in place: an in-place rewrite is not idempotent (re-running the cell would
# turn the allowed zeros into -inf as well).
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [9]:
np.add(scaled, mask)
# Every score above the diagonal is now -infinity.
# Why negative infinity?
# Because the softmax operation comes next: it converts each row into a
# probability distribution, and exp(-inf) = 0 gives the masked positions
# zero weight.
# The resulting values add up to 1, which makes them interpretable and stable.

array([[-0.9629289 ,        -inf,        -inf,        -inf],
       [ 1.97994166,  0.41004677,        -inf,        -inf],
       [ 0.36663452, -1.31010548, -1.07477788,        -inf],
       [ 0.0334473 ,  1.05014971,  0.15290475, -0.28293713]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [10]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Each slice along the last axis is turned into a probability distribution:
  non-negative values that sum to 1.

  Args:
    x: array-like of scores with any number of dimensions. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    An array with the same shape as ``x``. A slice made up entirely of
    ``-inf`` has no valid distribution and comes back as NaN.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum does not change the result mathematically,
  # but it keeps np.exp from overflowing to inf (and the output from turning
  # into NaN) when the scores are large.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims makes the row sums broadcast for inputs of any rank.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [11]:
# No mask: softmax is applied to the scaled scores as they are, so every
# word can put weight on every other word.
attention = softmax(scaled)
attention

# Every row adds up to 1 because softmax turns it into a probability distribution.

array([[0.08431243, 0.25513027, 0.51521078, 0.14534652],
       [0.64059204, 0.1332861 , 0.01664257, 0.2094793 ],
       [0.47006414, 0.08789379, 0.11121405, 0.33082801],
       [0.17794451, 0.49185018, 0.20052305, 0.12968226]])

In [12]:
# With the mask added to the scaled scores before softmax.
# NOTE: this overwrites the unmasked `attention` from the previous cell;
# the cells below use this masked version.
attention = softmax(scaled + mask)
attention

# Notice that no row puts any weight on a word that comes after it.
# This is required for the decoder, but not for the encoder.

array([[1.        , 0.        , 0.        , 0.        ],
       [0.82776862, 0.17223138, 0.        , 0.        ],
       [0.7024564 , 0.13134709, 0.16619652, 0.        ],
       [0.17794451, 0.49185018, 0.20052305, 0.12968226]])

In [13]:
# The value vectors before attention is applied (one row per word)
v

array([[ 0.81252582,  1.35624003, -0.07201012,  1.0035329 ,  0.36163603,
        -0.64511975,  0.36139561,  1.53803657],
       [-0.03582604,  1.56464366, -2.6197451 ,  0.8219025 ,  0.08704707,
        -0.29900735,  0.09176078, -1.98756891],
       [-0.21967189,  0.35711257,  1.47789404, -0.51827022, -0.8084936 ,
        -0.50175704,  0.91540212,  0.32875111],
       [-0.5297602 ,  0.51326743,  0.09707755,  0.96864499, -0.70205309,
        -0.32766215, -0.39210815, -1.46351495]])

In [14]:
# Multiplying the attention matrix by the value matrix gives a new set of
# value vectors, which should better encapsulate the context of each word.
new_v = attention @ v
new_v

# Compare v and new_v: because the attention is masked, the first row is
# exactly the same (the first word can only attend to itself).
# Moving on to later words, notice how different the vectors become.

array([[ 0.81252582,  1.35624003, -0.07201012,  1.0035329 ,  0.36163603,
        -0.64511975,  0.36139561,  1.53803657],
       [ 0.66641301,  1.39213367, -0.51081002,  0.97225045,  0.31434319,
        -0.58550834,  0.31495603,  0.93081668],
       [ 0.52954961,  1.21756173, -0.14905901,  0.7267579 ,  0.1310981 ,
        -0.57583252,  0.41805381,  0.87397953],
       [ 0.01421368,  1.14907671, -0.99239485,  0.60451701, -0.14600018,
        -0.40496816,  0.24215067, -0.82777073]])

In [15]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores with any number of dimensions. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    An array with the same shape as ``x`` whose slices along the last axis
    are non-negative and sum to 1.
  """
  x = np.asarray(x)
  # Subtract the per-row maximum so np.exp cannot overflow on large scores;
  # the result is mathematically unchanged.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims makes the row sums broadcast for inputs of any rank.
  return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(q . k^T / sqrt(d_k) + mask) . v.

  Args:
    q: query array of shape (..., L_q, d_k).
    k: key array of shape (..., L_k, d_k).
    v: value array of shape (..., L_k, d_v).
    mask: optional additive mask that broadcasts against the (..., L_q, L_k)
      score matrix; use 0 for allowed positions and -inf for blocked ones.
      Pass None (the default) for unmasked attention.

  Returns:
    A tuple ``(out, attention)``: ``out`` has shape (..., L_q, d_v) and holds
    the attention-weighted values; ``attention`` has shape (..., L_q, L_k)
    and holds the attention weights, each row summing to 1.
  """
  d_k = q.shape[-1]
  # k.T would reverse *all* axes of a stacked (batched / multi-head) input;
  # swapping only the last two axes is identical for 2-D k and also handles
  # leading batch dimensions.
  k_t = np.swapaxes(k, -1, -2) if np.ndim(k) > 1 else k
  # Scaling by sqrt(d_k) keeps the variance of the scores in check.
  scaled = np.matmul(q, k_t) / np.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [16]:
# in the encoder, we don't need to pass in the mask
# we're going to have new vectors
# and the attention vectors that can actually pay attention to any word

In [17]:
# we only did this for a single attention head
# we can have multiple attention heads and stack their results on
# top of each other in order to get 
# multi-headed attention