# Self Attention in Transformers

## Generate Data

In [1]:
import numpy as np
import math

# Toy problem sizes: L is the sequence length, d_k the width of each
# query/key vector and d_v the width of each value vector.
L, d_k, d_v = 4, 8, 8

# Fix the seed so that "Restart Kernel -> Run All" always produces the same
# Q, K and V (and therefore the same numbers in every later cell).
SEED = 42
np.random.seed(SEED)

# One row per token, entries drawn from a standard normal distribution.
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[ 0.84510417  1.30397454  0.45100508 -0.67824654  2.3060361  -1.73760495
   1.1420773   2.25201506]
 [-0.9376166  -0.81154269  0.63237914  0.55951021  0.92674594 -1.39842678
  -0.14587156 -0.53715604]
 [ 0.23079479  1.69128727 -1.51817407  0.3747922  -1.90436554  1.05433816
  -0.44561986  0.91408995]
 [ 0.24704611 -0.86427673 -0.14569616 -0.91861466  1.37835583 -1.71164589
  -2.33833465 -0.13990678]]
K
 [[ 0.26287575  2.63274413  1.54619777 -0.53768774  0.00913412  0.14395188
  -1.19963181 -0.6381176 ]
 [ 2.69091111  0.46237734  0.26917815 -0.59716749 -0.51476364  0.23443706
  -0.04547368  0.14264032]
 [ 0.89627259  0.38445796 -1.99429564 -1.9715818   2.10926488  0.66549909
   2.19478266  1.36357753]
 [ 0.65696906  0.51045855  0.82746194 -0.1419481   0.4985914  -1.13144129
  -1.16321744  0.42983977]]
V
 [[ 0.6588293   1.1037471  -0.01056483 -0.66015284  0.79941062 -0.63669914
  -0.70311896 -0.46655459]
 [-0.48387534 -0.10511596 -0.13599462  0.278638   -0.9677274  -0.77453905
   0.0

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$

In [2]:
# Raw (unscaled) scores: dot product of every query row with every key row.
q @ k.T

array([[ 1.68102601,  2.07832712, 10.98162318,  4.44558675],
       [-1.38119977, -3.9370655 , -3.54514458,  1.39669891],
       [ 2.05014059,  2.14871229,  0.09908186, -1.52563221],
       [ 0.71878874, -0.24991861, -1.56386356,  5.0146791 ]])

In [3]:
# Motivation for the sqrt(d_k) denominator: compare the variance of q and k
# with the variance of their unscaled dot products.
(q.var(), k.var(), (q @ k.T).var())

(1.4464753310180924, 1.3153693878270274, 12.348347458496317)

In [4]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
(q.var(), k.var(), scaled.var())

(1.4464753310180924, 1.3153693878270274, 1.5435434323120394)

Notice the reduction in variance of the product

In [5]:
# Display the scaled score matrix (q . k^T divided by sqrt(d_k)).
scaled

array([[ 0.59433245,  0.7347996 ,  3.88259011,  1.57175227],
       [-0.48832786, -1.39196286, -1.25339789,  0.49380763],
       [ 0.72483416,  0.75968452,  0.03503073, -0.53939244],
       [ 0.2541302 , -0.08835957, -0.55290926,  1.7729568 ]])

## Masking

- This is to ensure words don't get context from words generated in the future.
- Not required in the encoders, but required in the decoders.

In [7]:
# Lower-triangular matrix of ones: entry (i, j) is 1 when j <= i and 0 otherwise.
all_ones = np.ones(shape=(L, L))
mask = np.tril(all_ones)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [8]:
# Turn the 0/1 lower-triangular mask into an additive mask:
# 0 where attention is allowed, -inf where it is blocked.
#
# The mask is rebuilt from scratch here so that this cell is idempotent:
# converting an existing `mask` in place would turn every entry into -inf
# if the cell were run a second time.
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0.
mask = np.tril(np.ones((L, L)))
mask[mask == 0] = -np.inf
mask[mask == 1] = 0

mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [9]:
# Adding the mask keeps scores where the mask is 0 and turns scores where
# the mask is -inf into -inf.
scaled + mask

array([[ 0.59433245,        -inf,        -inf,        -inf],
       [-0.48832786, -1.39196286,        -inf,        -inf],
       [ 0.72483416,  0.75968452,  0.03503073,        -inf],
       [ 0.2541302 , -0.08835957, -0.55290926,  1.7729568 ]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [10]:
def softmax(x):
    """Softmax over the last axis of ``x``, computed in a numerically stable way.

    Args:
        x: Array-like of scores with at least one dimension. Entries may be
            ``-inf`` (masked positions); those receive weight 0.

    Returns:
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1. A row made up entirely of ``-inf``
        yields ``nan``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; return an empty float array of the same shape.
        return np.exp(x)
    # Subtracting the per-row maximum does not change the result
    # mathematically, but keeps exp() from overflowing to inf (-> nan).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims=True lets the row sums broadcast for any number of leading axes.
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [11]:
# Apply the mask to the scaled scores, then normalise each row into weights.
masked_scores = scaled + mask
attention = softmax(masked_scores)

attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.71169592, 0.28830408, 0.        , 0.        ],
       [0.39414468, 0.40812292, 0.1977324 , 0.        ],
       [0.14874222, 0.10560703, 0.06636536, 0.67928538]])

In [12]:
# Each output row is a weighted sum of the rows of v, weighted by attention.
new_v = attention @ v
new_v

array([[ 0.6588293 ,  1.1037471 , -0.01056483, -0.66015284,  0.79941062,
        -0.63669914, -0.70311896, -0.46655459],
       [ 0.3293829 ,  0.75522695, -0.04672675, -0.38949561,  0.28993753,
        -0.67643895, -0.49932289, -0.28563142],
       [-0.21520548,  0.41560512, -0.17209461, -0.07398176, -0.24842878,
        -0.65678561, -0.15159547, -0.31123537],
       [ 0.8390914 ,  0.44295421,  0.90657162,  0.92010714, -0.90335   ,
         0.10052127, -2.03373757, -0.2291934 ]])

In [13]:
# Display the original value matrix for comparison with new_v.
v

array([[ 0.6588293 ,  1.1037471 , -0.01056483, -0.66015284,  0.79941062,
        -0.63669914, -0.70311896, -0.46655459],
       [-0.48387534, -0.10511596, -0.13599462,  0.278638  , -0.9677274 ,
        -0.77453905,  0.00375995,  0.16098828],
       [-1.40290072,  0.1186923 , -0.5685867 ,  0.36663395, -0.85246773,
        -0.45377942,  0.62711322, -0.97631125],
       [ 1.30328186,  0.41514855,  1.41360249,  1.41993605, -1.27116376,
         0.45214769, -2.90182862, -0.16488692]])

# Function

In [14]:
def softmax(x):
    """Softmax over the last axis of ``x``, computed in a numerically stable way.

    Args:
        x: Array-like of scores with at least one dimension. Entries may be
            ``-inf`` (masked positions); those receive weight 0.

    Returns:
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1. A row made up entirely of ``-inf``
        yields ``nan``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; return an empty float array of the same shape.
        return np.exp(x)
    # Subtracting the per-row maximum does not change the result
    # mathematically, but keeps exp() from overflowing to inf (-> nan).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims=True lets the row sums broadcast for any number of leading axes.
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute softmax(q @ k.T / sqrt(d_k) + mask) @ v.

    Args:
        q: Query matrix; its last dimension is used as d_k.
        k: Key matrix with the same last dimension as ``q``.
        v: Value matrix with one row per key.
        mask: Optional additive mask that is added to the scaled scores
            before the softmax (0 to keep a position, -inf to block it).
            When ``None`` no masking is applied.

    Returns:
        Tuple ``(out, attention)`` where ``attention`` holds the softmax
        weights and ``out`` is ``attention @ v``.
    """
    d_k = q.shape[-1]
    scaled = np.matmul(q, k.T) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    # The softmax, the weighted sum and the return must run whether or not a
    # mask was supplied; only the addition above is conditional.
    attention = softmax(scaled)
    out = np.matmul(attention, v)
    return out, attention

In [15]:
# Run the packaged function on the same inputs and print everything involved.
values, attention = scaled_dot_product_attention(q, k, v, mask)
for heading, array in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
    print(heading, array)

Q
 [[ 0.84510417  1.30397454  0.45100508 -0.67824654  2.3060361  -1.73760495
   1.1420773   2.25201506]
 [-0.9376166  -0.81154269  0.63237914  0.55951021  0.92674594 -1.39842678
  -0.14587156 -0.53715604]
 [ 0.23079479  1.69128727 -1.51817407  0.3747922  -1.90436554  1.05433816
  -0.44561986  0.91408995]
 [ 0.24704611 -0.86427673 -0.14569616 -0.91861466  1.37835583 -1.71164589
  -2.33833465 -0.13990678]]
K
 [[ 0.26287575  2.63274413  1.54619777 -0.53768774  0.00913412  0.14395188
  -1.19963181 -0.6381176 ]
 [ 2.69091111  0.46237734  0.26917815 -0.59716749 -0.51476364  0.23443706
  -0.04547368  0.14264032]
 [ 0.89627259  0.38445796 -1.99429564 -1.9715818   2.10926488  0.66549909
   2.19478266  1.36357753]
 [ 0.65696906  0.51045855  0.82746194 -0.1419481   0.4985914  -1.13144129
  -1.16321744  0.42983977]]
V
 [[ 0.6588293   1.1037471  -0.01056483 -0.66015284  0.79941062 -0.63669914
  -0.70311896 -0.46655459]
 [-0.48387534 -0.10511596 -0.13599462  0.278638   -0.9677274  -0.77453905
   0.0