In [39]:
import numpy as np
import math

In [40]:
L,d_k,d_v=4,8,8 # Length of Inputs 
q=np.random.randn(L,d_k)
k=np.random.rand(L,d_k)
v=np.random.rand(L,d_v)


In [41]:
print('Q\n',q)
print('R\n',k)
print('V\n',v)

Q
 [[-0.21315953  0.78910544 -0.88536451 -0.08402536 -1.29558652  0.98680525
   0.54730747  0.82251438]
 [-0.0769435   0.49750522 -0.57059826  1.12963314 -0.18533279 -0.39592955
  -1.09467784 -0.91169619]
 [-0.40779358 -0.09092223  0.49487442  0.69044584 -1.93557699 -1.13940154
   0.95956753 -0.73286109]
 [ 0.44530471  0.7700848  -1.49958739 -0.40076349  0.13298458 -0.7554102
   0.45856684 -0.22172852]]
R
 [[0.4597203  0.30121742 0.19233243 0.04346007 0.68143938 0.53094025
  0.91946342 0.638468  ]
 [0.38128218 0.79370571 0.59913675 0.11410948 0.89771511 0.42952327
  0.44851225 0.47340741]
 [0.42808702 0.23220304 0.70971639 0.36244481 0.79245553 0.94812597
  0.28845055 0.8399046 ]
 [0.49490092 0.29729715 0.78874859 0.73850375 0.59118655 0.21302009
  0.35734483 0.95234737]]
V
 [[0.10764695 0.37887483 0.50754409 0.27701106 0.9006166  0.36995677
  0.54100142 0.90223275]
 [0.6431698  0.66697757 0.47403811 0.10079472 0.76967747 0.89620527
  0.51817876 0.73439951]
 [0.07240724 0.15107793 0.51

## Self Attention

$$
\text{self attention} = softmax\bigg(\frac{Q.K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention}.V
$$ 

In [42]:
np.matmul(q,k.T) #K.T mean the transpose of the k.T 


array([[ 0.63521175, -0.09935221,  0.19079534, -0.20810418],
       [-1.87127893, -1.10644615, -1.5167069 , -0.95932892],
       [-1.59922476, -1.99592631, -2.54711505, -1.07066432],
       [ 0.10045419, -0.26756046, -1.50488781, -1.159032  ]])

In [43]:
q.var(),k.var(),np.matmul(q,k.T).var()

(np.float64(0.6270709731401924),
 np.float64(0.06420939158995001),
 np.float64(0.7675819380446799))

In [44]:
scaled=np.matmul(q,k.T)/math.sqrt(d_k)
q.var(),k.var(),scaled.var()


(np.float64(0.6270709731401924),
 np.float64(0.06420939158995001),
 np.float64(0.09594774225558497))

In [45]:
scaled

array([[ 0.22458127, -0.03512631,  0.06745634, -0.07357594],
       [-0.66159701, -0.39118779, -0.53623687, -0.33917399],
       [-0.56541134, -0.70566652, -0.90054116, -0.378537  ],
       [ 0.03551592, -0.09459691, -0.53205819, -0.40977969]])

## Masking 
- this is ensure the code don't get context from word generated 
- Not required in the encoders,but required in the decoders
### Tril(.tril())
If you have:
$A = \begin{bmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{bmatrix}$
It gonna become :
$A  =  \begin{bmatrix} 1 & 0 & 0 \\ 4 & 5 & 0 \\ 7 & 8 & 9 \end{bmatrix}$

In [46]:
mask=np.tril(np.ones((L,L)))
mask


array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [48]:
mask[mask == 0] =-np.inf
mask[mask == 1] =0

In [50]:
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [51]:
scaled+mask

array([[ 0.22458127,        -inf,        -inf,        -inf],
       [-0.66159701, -0.39118779,        -inf,        -inf],
       [-0.56541134, -0.70566652, -0.90054116,        -inf],
       [ 0.03551592, -0.09459691, -0.53205819, -0.40977969]])

## Softmax

$$
\text{softmax} = \frac{e^{x_i}}{\sum_j e^x_j}
$$

In [53]:
def softmax(x):
    return (np.exp(x).T/np.sum(np.exp(x), axis=-1)).T

In [54]:
attention=softmax(scaled+mask)

In [55]:
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.43280663, 0.56719337, 0.        , 0.        ],
       [0.38693975, 0.33630343, 0.27675681, 0.        ],
       [0.3240934 , 0.28455283, 0.18372823, 0.20762554]])

In [56]:
new_v=np.matmul(attention,v)
new_v


array([[0.10764695, 0.37887483, 0.50754409, 0.27701106, 0.9006166 ,
        0.36995677, 0.54100142, 0.90223275],
       [0.41139195, 0.54228479, 0.48853972, 0.17706232, 0.82634879,
        0.66844142, 0.52805656, 0.80703885],
       [0.27799229, 0.41272043, 0.49741358, 0.26059078, 0.64629081,
        0.56252213, 0.56340934, 0.67850624],
       [0.35718291, 0.47638751, 0.39782657, 0.34124561, 0.70257271,
        0.5292235 , 0.556155  , 0.66611281]])

## Function


In [58]:
def softmax(x):
    return (np.exp(x).T/np.sum(np.exp(x),axis=-1)).T

def scaled_dot_product_attention(q,k,v,mask=None):
    d_k=q.shape[-1]
    scaled=np.matmul(q,k.T)/math.sqrt(d_k)
    if mask is not None:
        scaled= scaled+mask
    attention= softmax(scaled)
    out=np.matmul(attention,v)
    return out,attention

In [59]:
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[-0.21315953  0.78910544 -0.88536451 -0.08402536 -1.29558652  0.98680525
   0.54730747  0.82251438]
 [-0.0769435   0.49750522 -0.57059826  1.12963314 -0.18533279 -0.39592955
  -1.09467784 -0.91169619]
 [-0.40779358 -0.09092223  0.49487442  0.69044584 -1.93557699 -1.13940154
   0.95956753 -0.73286109]
 [ 0.44530471  0.7700848  -1.49958739 -0.40076349  0.13298458 -0.7554102
   0.45856684 -0.22172852]]
K
 [[0.4597203  0.30121742 0.19233243 0.04346007 0.68143938 0.53094025
  0.91946342 0.638468  ]
 [0.38128218 0.79370571 0.59913675 0.11410948 0.89771511 0.42952327
  0.44851225 0.47340741]
 [0.42808702 0.23220304 0.70971639 0.36244481 0.79245553 0.94812597
  0.28845055 0.8399046 ]
 [0.49490092 0.29729715 0.78874859 0.73850375 0.59118655 0.21302009
  0.35734483 0.95234737]]
V
 [[0.10764695 0.37887483 0.50754409 0.27701106 0.9006166  0.36995677
  0.54100142 0.90223275]
 [0.6431698  0.66697757 0.47403811 0.10079472 0.76967747 0.89620527
  0.51817876 0.73439951]
 [0.07240724 0.15107793 0.51