## Self-Attention Mechanism

In [2]:
import numpy as np
import math

In [3]:
# L: sequence length (number of words); d_k: width of each query/key row;
# d_v: width of each value row.
L = 4
d_k = 8
d_v = 8

# Random stand-ins for the query, key and value matrices (one row per word).
# NOTE: no seed is set, so these values change on every run.
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$

In [7]:
# Motivation for the 1/sqrt(d_k) factor: compare the variance of q and k
# with the variance of their raw (unscaled) dot-product scores.
q.var(), k.var(), (q @ k.T).var()

(1.1030609393732602, 0.6858478742652863, 7.228164681456705)

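From the variances above, q and k each have a variance close to 1, but the variance of their product $QK^T$ is far larger. Scores with such a large spread push softmax towards a near one-hot output with very small gradients, so we divide by $\sqrt{d_k}$ to bring the variance of the scores back in line with that of q and k and keep training stable.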

In [8]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(1.1030609393732602, 0.6858478742652863, 0.903520585182088)

## Masking
This is to ensure words don't get context from words generated in the future.
Not required in the encoders, but required in the decoders.

In [14]:
# Ones strictly above the main diagonal flag the "future" positions
# (column index greater than row index); everything else stays 0.
mask = np.triu(np.ones((L, L)), k=1)
mask

array([[0., 1., 1., 1.],
       [0., 0., 1., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.]])

In [15]:
# Replace the flagged (future) positions with -inf so that, once the mask is
# added to the scores, softmax gives them weight exp(-inf) = 0.
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0.
mask[mask == 1] = -np.inf

In [16]:
# Show the mask after the in-place update: 0 where attention is allowed,
# -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [17]:
# Preview the scaled scores with the mask added elementwise.
scaled + mask

array([[ 0.92584833,        -inf,        -inf,        -inf],
       [ 0.3325317 ,  0.53980251,        -inf,        -inf],
       [ 0.2672791 , -1.43039679, -0.89674047,        -inf],
       [-0.03939342, -0.92651319, -0.73519263,  1.03127194]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [18]:
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Parameters
    ----------
    x : array_like
        Scores. Entries equal to ``-inf`` (masked positions) receive weight 0
        as long as their row contains at least one finite entry.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum does not change the result
    # mathematically but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    # keepdims=True keeps a trailing axis of length 1 so that every row is
    # divided by its own sum.
    return weights / np.sum(weights, axis=-1, keepdims=True)

In [43]:
# Convert the masked, scaled scores into attention weights.
attention = softmax(scaled + mask)

In [44]:
# Weight the value rows by the attention matrix to get the updated values.
new_v = attention @ v
new_v

array([[-8.93239835e-01, -8.68848716e-01,  6.12334560e-03,
        -9.44621979e-01, -6.01999421e-01, -9.13011694e-01,
        -7.52002345e-01,  8.05889380e-01],
       [ 3.91952770e-01, -1.03344328e-01,  1.02396098e+00,
        -8.68597653e-01, -9.10805247e-01,  4.68592114e-04,
        -1.48538129e-01,  1.25948443e-01],
       [-6.49465346e-01, -3.50993332e-01,  5.57215007e-01,
        -7.87876565e-01, -2.24745109e-01, -1.05654241e+00,
        -2.45637701e-01,  5.66995708e-01],
       [-6.65838417e-01,  3.55557769e-01, -6.50745933e-02,
        -1.05337762e+00,  6.86674076e-02,  8.93337764e-01,
        -3.52725324e-01,  4.36782385e-02]])

In [48]:
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Parameters
    ----------
    x : array_like
        Scores of any rank >= 1. Entries equal to ``-inf`` (masked positions)
        receive weight 0 as long as their row has at least one finite entry.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum does not change the result
    # mathematically but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    # keepdims=True lets the row sums broadcast for inputs of any rank,
    # so no transpose is needed.
    return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute ``softmax(q @ k^T / sqrt(d_k) + mask) @ v``.

    Parameters
    ----------
    q : numpy.ndarray
        Queries, shape ``(..., L_q, d_k)``.
    k : numpy.ndarray
        Keys, shape ``(..., L_k, d_k)``.
    v : numpy.ndarray
        Values, shape ``(..., L_k, d_v)``.
    mask : numpy.ndarray, optional
        Additive mask that broadcasts against the ``(..., L_q, L_k)`` scores;
        use ``-inf`` at positions that must receive zero weight.

    Returns
    -------
    out : numpy.ndarray
        Attention-weighted values, shape ``(..., L_q, d_v)``.
    attention : numpy.ndarray
        Attention weights, shape ``(..., L_q, L_k)``; each row sums to 1.
    """
    d_k = q.shape[-1]
    # Swap only the last two axes so leading batch/head axes are preserved;
    # for 2-D input this is the same as k.T.
    scaled = np.matmul(q, np.swapaxes(k, -1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    attention = softmax(scaled)
    out = np.matmul(attention, v)
    return out, attention

In [49]:
# Run masked attention on the sample matrices, then print every input and
# output under its own heading.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, array in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
    print(heading, array)

Q
 [[-1.44263697 -0.09649057 -0.77919781 -0.68726859  1.02303874 -0.05215515
   0.98522207 -1.60586385]
 [-1.6709121  -0.45338648 -0.61241272  0.70945495  1.48492191  0.60092861
  -0.13165875  0.28240596]
 [-1.81694942  1.17219678  1.78118943  0.59666832 -1.30004898 -1.68664413
  -0.95071489  1.03328314]
 [ 1.81093502  0.38335629  1.25714101  0.24707683 -0.05677332 -0.9347236
  -0.50725503 -0.42900001]]
K
 [[-0.46027543  0.42393113  0.15914651  0.40073474  0.56760977 -0.65923352
   1.037014   -0.47217593]
 [-0.01230478  0.50324977 -0.67668636 -1.91831969  1.07177521  1.30880129
  -0.39288505  0.8895817 ]
 [-0.2313224  -0.58233207 -1.29892357  0.40530313 -0.29995753 -0.4368623
   0.97749125 -0.38686029]
 [-0.08679261  0.32476516  1.28829248 -0.30865827 -1.59258497 -0.17311546
  -1.69039193 -0.69127806]]
V
 [[-0.89323983 -0.86884872  0.00612335 -0.94462198 -0.60199942 -0.91301169
  -0.75200234  0.80588938]
 [ 1.43655668  0.51885726  1.8512589  -0.80680512 -1.16180247  0.74294492
   0.341