In [9]:
#from transformer import Transformer
import torch
import numpy as np
import math

In [4]:
L, d_k, d_v = 4, 8, 8
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

print("Q\n", q) # queue
print("K\n", k) # key
print("V\n", v) # value

Q
 [[ 1.28114237  1.44752402 -0.60824052  0.40310601  1.43924712 -1.38296083
   0.95163263  1.51139518]
 [ 0.14419632  1.60078477 -0.55843829  0.20580793  0.69943993 -0.27722628
   0.05123652 -0.4286159 ]
 [-0.63881402 -0.63938131 -0.83124101 -0.86264979 -0.62493197 -0.30883055
  -1.33794331 -0.91399712]
 [-0.767213    1.4008975  -1.24628059 -0.04933137 -0.16553654  0.47415481
   1.08705046 -0.68796479]]
K
 [[-1.34879403 -0.64603967 -0.46963968 -1.14166133 -0.16892808 -0.27302121
  -0.97135169 -0.2109268 ]
 [ 0.26196293  0.31645742  0.69783567  0.77294645 -0.57830048 -1.99095362
   0.30677922  0.26174165]
 [-0.34083638 -1.47527041  0.17580892 -0.7577799  -0.36667739 -0.93340931
   0.17661323 -1.01089128]
 [ 0.93246473  0.03781071  1.82269559  1.04783508 -1.74464269 -0.78567752
   0.27887145 -1.03483974]]
V
 [[ 1.74010932  0.32677542  1.7065823   0.42829971 -1.24734848  0.22574623
  -0.13892477 -1.37359953]
 [ 0.41882027 -1.74923582  1.38457268  0.66992071 -0.29896037 -0.59319946
  -0.0

In [5]:
# self attention matrix will have every word look at every other word in the sentence
# 4 cross 4 matrix for example "My name is Ken" len(4)
np.matmul(q, k.T)

array([[-3.94642711,  3.28944871, -3.58120516, -2.15997606],
       [-1.20318828,  0.36472455, -2.2202434 , -1.15185057],
       [ 4.33221773, -1.28995282,  2.87361742, -1.13321048],
       [-0.24088914, -1.36036143, -1.48137036, -2.05436141]])

In [10]:
q.var(), k.var(), np.matmul(q, k.T).var()
scaled = np.matmul(q, k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var() #minimize skew

(0.8283162637187776, 0.7187877631935535, 0.6445927040445486)

In [19]:
scaled

array([[-1.39527268,  1.16299574, -1.26614723, -0.76366686],
       [-0.42539129,  0.1289496 , -0.78497458, -0.40724067],
       [ 1.53167027, -0.45606719,  1.01597718, -0.40065041],
       [-0.08516717, -0.4809604 , -0.52374351, -0.72632644]])

## Masking ##
- This is to ensure words don't get context from words generated in the future
- Not required in the encoders, but required in the decoders

In [16]:
mask = np.tril(np.ones( (L, L) )) # triangular matrix
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [17]:
mask[mask==0] = -np.infty
mask[mask==1] = 0
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [18]:
scaled + mask

array([[-1.39527268,        -inf,        -inf,        -inf],
       [-0.42539129,  0.1289496 ,        -inf,        -inf],
       [ 1.53167027, -0.45606719,  1.01597718,        -inf],
       [-0.08516717, -0.4809604 , -0.52374351, -0.72632644]])

In [23]:
# softmask, convert a vector into a probability distribution
def softmax(x): 
    return (np.exp(x).T / np.sum(np.exp(x), axis=1)).T
attention = softmax(scaled + mask)

In [24]:
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.36485788, 0.63514212, 0.        , 0.        ],
       [0.57667078, 0.07900681, 0.34432241, 0.        ],
       [0.35152087, 0.23662482, 0.22671478, 0.18513954]])

In [26]:
new_v = np.matmul(attention, v)
new_v
# this new matricies better encapsulates the context of the masked words

array([[ 1.74010932,  0.32677542,  1.7065823 ,  0.42829971, -1.24734848,
         0.22574623, -0.13892477, -1.37359953],
       [ 0.90090299, -0.99178677,  1.50206042,  0.58176339, -0.64498724,
        -0.29440068, -0.1126314 ,  0.22483296],
       [ 1.08994727, -0.28021472,  1.18755641,  0.46786198, -0.84146163,
         0.08912864, -0.11236506, -1.12370844],
       [ 1.19766254, -0.86210486,  0.97427464,  0.71487593, -0.76923779,
        -0.21159648, -0.17616998, -0.49321219]])

In [34]:
# MASK is set to true when we are decoding
def scaled_dot_product_attention(q, k, v, mask=None):
    d_k = q.shape[-1]
    scaled = np.matmul(q, k.T) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    attention = softmax(scaled)
    out = np.matmul(attention, v) # vector
    return out, attention

"""print("Q\n", q) # queue
print("K\n", k) # key
print("V\n", v) # value"""
#apply scaled product def
values, attnetion = scaled_dot_product_attention(q, k, v, mask=None)
print("New V\n", values)
print("Attention\n", attention)

New V
 [[ 0.70361206 -1.5865436   1.16601762  0.74611    -0.43798238 -0.53058441
  -0.1402274   0.70648047]
 [ 1.13572215 -1.18650689  0.94771752  0.79844007 -0.68226004 -0.36894606
  -0.18913867 -0.05737777]
 [ 1.1939916  -0.40243774  1.08971685  0.55470018 -0.85784786  0.01797784
  -0.14037853 -1.03836764]
 [ 1.19766254 -0.86210486  0.97427464  0.71487593 -0.76923779 -0.21159648
  -0.17616998 -0.49321219]]
Attention
 [[1.         0.         0.         0.        ]
 [0.36485788 0.63514212 0.         0.        ]
 [0.57667078 0.07900681 0.34432241 0.        ]
 [0.35152087 0.23662482 0.22671478 0.18513954]]
