References:
- https://machinelearningmastery.com/the-attention-mechanism-from-scratch/

In [37]:
import numpy as np
import scipy
# Import the submodule explicitly: later cells call `scipy.special.softmax`,
# and a bare `import scipy` is not guaranteed to load `scipy.special` on
# older SciPy releases (it would fail with AttributeError on a fresh kernel).
import scipy.special

# Fixed seed for NumPy's global RNG so the randomly drawn weight matrices
# (and every number shown below) are reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

In [38]:
# Encoder representations of four different words, each a length-3 vector.
word_1, word_2, word_3, word_4 = (
    np.array(embedding)
    for embedding in ([1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1])
)

word_1.shape

(3,)

In [39]:
# Combine the four word vectors row-wise into a single matrix.
words = np.stack((word_1, word_2, word_3, word_4))

# rows: sequence length (4 words); columns: embedding dimension (3)
words.shape, words

((4, 3), array([[1, 0, 0],
        [0, 1, 0],
        [1, 1, 0],
        [0, 0, 1]]))

In [40]:
# Draw the query, key and value weight matrices (in that order) from the
# seeded global RNG: each is 3x3 with integer entries sampled from {0, 1, 2}.
W_Q, W_K, W_V = (np.random.randint(3, size=(3, 3)) for _ in range(3))

W_Q.shape, W_Q

((3, 3), array([[2, 0, 2],
        [2, 0, 0],
        [2, 1, 2]]))

In [41]:
# Project each word vector into its query, key and value vectors.
# A length-3 vector times a 3x3 weight matrix yields a length-3 vector.
query_1, key_1, value_1 = word_1 @ W_Q, word_1 @ W_K, word_1 @ W_V
query_2, key_2, value_2 = word_2 @ W_Q, word_2 @ W_K, word_2 @ W_V
query_3, key_3, value_3 = word_3 @ W_Q, word_3 @ W_K, word_3 @ W_V
query_4, key_4, value_4 = word_4 @ W_Q, word_4 @ W_K, word_4 @ W_V

print(query_1.shape, key_1.shape, value_1.shape)

# Collect the per-word vectors as the rows of Q, K and V.
Q = np.stack((query_1, query_2, query_3, query_4))
K = np.stack((key_1, key_2, key_3, key_4))
V = np.stack((value_1, value_2, value_3, value_4))

Q.shape, K.shape, V.shape

(3,) (3,) (3,)


((4, 3), (4, 3), (4, 3))

In [42]:
# Matrix form of the previous cell: one product per weight matrix projects
# every row of `words` at once, giving the queries, keys and values.
Q, K, V = (words @ weight for weight in (W_Q, W_K, W_V))

print('Q', Q.shape, Q)
print('K', K.shape, K)
print('V', V.shape, V)

Q (4, 3) [[2 0 2]
 [2 0 0]
 [4 0 2]
 [2 1 2]]
K (4, 3) [[2 2 2]
 [0 2 1]
 [2 4 3]
 [0 1 1]]
V (4, 3) [[1 1 0]
 [0 1 1]
 [1 2 1]
 [0 0 0]]


In [43]:
# Score the first query against each of the four keys (one dot product per key).
scores = np.array([np.dot(query_1, key) for key in (key_1, key_2, key_3, key_4)])
scores.shape, scores

((4,), array([ 8,  2, 10,  2]))

In [44]:
# Turn the scores into attention weights: scale by the square root of the
# key vector length, then apply a softmax.
weights = scipy.special.softmax(scores / len(key_1) ** 0.5)
weights.shape, weights

((4,), array([0.23608986, 0.00738988, 0.74913039, 0.00738988]))

In [45]:
# Attention output for the first query: each value vector scaled by its
# weight, then summed.
attention = (
    (weights[0] * value_1)
    + (weights[1] * value_2)
    + (weights[2] * value_3)
    + (weights[3] * value_4)
)
attention.shape, attention

((3,), array([0.98522025, 1.74174051, 0.75652026]))

In [46]:
# Same weighted sum as the previous cell, written as a single product of the
# weight vector with the value vectors stacked as rows.
attention = weights @ np.stack((value_1, value_2, value_3, value_4))
attention.shape, attention

((3,), array([0.98522025, 1.74174051, 0.75652026]))

In [47]:
# Display the shape and contents of the query matrix Q.
Q.shape, Q

((4, 3), array([[2, 0, 2],
        [2, 0, 0],
        [4, 0, 2],
        [2, 1, 2]]))

In [48]:
# Display the shape and contents of the key matrix K.
K.shape, K

((4, 3), array([[2, 2, 2],
        [0, 2, 1],
        [2, 4, 3],
        [0, 1, 1]]))

In [49]:
# Display the transposed key matrix, the right-hand operand of the score product.
K.T.shape, K.T

((3, 4), array([[2, 0, 2, 0],
        [2, 2, 4, 1],
        [2, 1, 3, 1]]))

In [50]:
# Score every query against every key in one product:
# entry (i, j) is the dot product of query i with key j.
scores = Q @ K.T
scores.shape, scores

((4, 4), array([[ 8,  2, 10,  2],
        [ 4,  0,  4,  0],
        [12,  2, 14,  2],
        [10,  4, 14,  3]]))

In [51]:
# Scale the scores by the square root of the key dimension, then apply the
# softmax along each row so every query gets its own set of weights.
weights = scipy.special.softmax(scores / K.shape[-1] ** 0.5, axis=-1)
weights.shape, weights

((4, 4),
 array([[2.36089863e-01, 7.38987555e-03, 7.49130386e-01, 7.38987555e-03],
        [4.54826323e-01, 4.51736775e-02, 4.54826323e-01, 4.51736775e-02],
        [2.39275049e-01, 7.43870015e-04, 7.59237211e-01, 7.43870015e-04],
        [8.99501754e-02, 2.81554063e-03, 9.05653685e-01, 1.58059922e-03]]))

In [52]:
# Attention output for all queries: each row is the weighted sum of the
# value vectors using that query's weights.
attention = np.matmul(weights, V)
attention.shape, attention

((4, 3), array([[0.98522025, 1.74174051, 0.75652026],
        [0.90965265, 1.40965265, 0.5       ],
        [0.99851226, 1.75849334, 0.75998108],
        [0.99560386, 1.90407309, 0.90846923]]))

## Summary

In [53]:
import numpy as np
import scipy
# Import the submodule explicitly: `scipy.special.softmax` is used below, and
# a bare `import scipy` is not guaranteed to load `scipy.special` on older
# SciPy releases (it would fail with AttributeError on a fresh kernel).
import scipy.special

# for reproducibility: seed NumPy's global RNG before drawing the weights
SEED = 42
np.random.seed(SEED)

# encoder representations of four different words (length-3 vectors)
word_1 = np.array([1, 0, 0])
word_2 = np.array([0, 1, 0])
word_3 = np.array([1, 1, 0])
word_4 = np.array([0, 0, 1])

# stacking the word embeddings into a single array, one word per row
words = np.array([word_1, word_2, word_3, word_4])

# generating the weight matrices: 3x3 integer matrices with entries in {0, 1, 2};
# the draw order (W_Q, W_K, W_V) matters for reproducing the printed result
W_Q = np.random.randint(3, size=(3, 3))
W_K = np.random.randint(3, size=(3, 3))
W_V = np.random.randint(3, size=(3, 3))

# generating the queries, keys and values (one row per word)
Q = words @ W_Q
K = words @ W_K
V = words @ W_V

# scoring the query vectors against all key vectors:
# entry (i, j) is the dot product of query i with key j
scores = Q @ K.transpose()

# computing the weights by a softmax operation: scores are scaled by the
# square root of the key dimension and normalised along each row
weights = scipy.special.softmax(scores / K.shape[1] ** 0.5, axis=1)

# computing the attention by a weighted sum of the value vectors
attention = weights @ V

print(attention)

[[0.98522025 1.74174051 0.75652026]
 [0.90965265 1.40965265 0.5       ]
 [0.99851226 1.75849334 0.75998108]
 [0.99560386 1.90407309 0.90846923]]
