References:
- https://machinelearningmastery.com/the-attention-mechanism-from-scratch/

## Attention From Scratch

In [1]:
import numpy as np
import scipy
# load the submodule explicitly: a bare `import scipy` is not guaranteed to make
# `scipy.special` (needed later for softmax) available on every SciPy release
import scipy.special

# for reproducibility: seed NumPy's global RNG once, before any random draw
SEED = 42
np.random.seed(SEED)

# record the library versions this notebook was executed with
print('NumPy version:', np.__version__)
print('SciPy version:', scipy.__version__)

NumPy version: 1.22.4
SciPy version: 1.10.1


### Word Representation

In [2]:
# encoder representations of four different words (one 3-dimensional vector each)
word_1, word_2, word_3, word_4 = (
    np.array(components)
    for components in ([1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1])
)

# shape of every word vector
tuple(word.shape for word in (word_1, word_2, word_3, word_4))

((3,), (3,), (3,), (3,))

In [3]:
# stack the word embeddings row-wise into a single array
words = np.stack((word_1, word_2, word_3, word_4))

# rows = seq_length (num of words = 4), columns = embedding dimension (3)
(words.shape, words)

((4, 3),
 array([[1, 0, 0],
        [0, 1, 0],
        [1, 1, 0],
        [0, 0, 1]]))

### Generating Query, Key and Value Matrices

In [4]:
# simulate the projection weights: three consecutive draws of a 3x3 integer
# matrix with entries in {0, 1, 2}, assigned in the order W_Q, W_K, W_V
W_Q, W_K, W_V = (np.random.randint(3, size=(3, 3)) for _ in range(3))

print('W_Q', W_Q.shape, W_Q)
print('W_K', W_K.shape, W_K)
print('W_V', W_V.shape, W_V)

W_Q (3, 3) [[2 0 2]
 [2 0 0]
 [2 1 2]]
W_K (3, 3) [[2 2 2]
 [0 2 1]
 [0 1 1]]
W_V (3, 3) [[1 1 0]
 [0 1 1]
 [0 0 0]]


In [5]:
# project every word with the shared weight matrices to obtain its query, key
# and value; each product is a length-3 vector times a 3x3 matrix -> length 3
all_words = (word_1, word_2, word_3, word_4)
query_1, query_2, query_3, query_4 = (word @ W_Q for word in all_words)
key_1, key_2, key_3, key_4 = (word @ W_K for word in all_words)
value_1, value_2, value_3, value_4 = (word @ W_V for word in all_words)

print(query_1.shape, key_1.shape, value_1.shape)

# collect the per-word vectors as the rows of the Q, K and V matrices
Q = np.stack((query_1, query_2, query_3, query_4))
K = np.stack((key_1, key_2, key_3, key_4))
V = np.stack((value_1, value_2, value_3, value_4))

print('Q', Q.shape, Q)
print('K', K.shape, K)
print('V', V.shape, V)

(3,) (3,) (3,)
Q (4, 3) [[2 0 2]
 [2 0 0]
 [4 0 2]
 [2 1 2]]
K (4, 3) [[2 2 2]
 [0 2 1]
 [2 4 3]
 [0 1 1]]
V (4, 3) [[1 1 0]
 [0 1 1]
 [1 2 1]
 [0 0 0]]


In [6]:
# Matrix version of above: project all words in one product per weight matrix
# each product is 4x3 @ 3x3 = 4x3
Q, K, V = (words @ W for W in (W_Q, W_K, W_V))

print('Q', Q.shape, Q)
print('K', K.shape, K)
print('V', V.shape, V)

Q (4, 3) [[2 0 2]
 [2 0 0]
 [4 0 2]
 [2 1 2]]
K (4, 3) [[2 2 2]
 [0 2 1]
 [2 4 3]
 [0 1 1]]
V (4, 3) [[1 1 0]
 [0 1 1]
 [1 2 1]
 [0 0 0]]


### Example: Working only on the first word

In [7]:
# dot product of the first query with each of the four keys, in key order
scores = np.array([query_1 @ key for key in (key_1, key_2, key_3, key_4)])
(scores.shape, scores)

((4,), array([ 8,  2, 10,  2]))

In [8]:
# scale the scores by the square root of the key length, then normalise them
# with a softmax to obtain the weights
weights = scipy.special.softmax(scores / len(key_1) ** 0.5)
(weights.shape, weights)

((4,), array([0.23608986, 0.00738988, 0.74913039, 0.00738988]))

In [9]:
# attention = sum over the four words of (weight * value vector),
# accumulated in word order
attention = sum(
    weight * value
    for weight, value in zip(weights, (value_1, value_2, value_3, value_4))
)
(attention.shape, attention)

((3,), array([0.98522025, 1.74174051, 0.75652026]))

In [10]:
# same weighted sum written as one product: the 4 weights times the 4x3 stack
# of value vectors gives a length-3 result
attention = weights @ np.stack((value_1, value_2, value_3, value_4))
(attention.shape, attention)

((3,), array([0.98522025, 1.74174051, 0.75652026]))

### Matrix Form

In [11]:
# query matrix: one row per word
(Q.shape, Q)

((4, 3),
 array([[2, 0, 2],
        [2, 0, 0],
        [4, 0, 2],
        [2, 1, 2]]))

In [12]:
# key matrix: one row per word
(K.shape, K)

((4, 3),
 array([[2, 2, 2],
        [0, 2, 1],
        [2, 4, 3],
        [0, 1, 1]]))

In [13]:
# transposed key matrix: one column per word, ready to be multiplied by Q
(K.T.shape, K.T)

((3, 4),
 array([[2, 0, 2, 0],
        [2, 2, 4, 1],
        [2, 1, 3, 1]]))

In [14]:
# entry (i, j) is the dot product of query i with key j
scores = Q @ K.T
(scores.shape, scores)

((4, 4),
 array([[ 8,  2, 10,  2],
        [ 4,  0,  4,  0],
        [12,  2, 14,  2],
        [10,  4, 14,  3]]))

In [15]:
# scale by the square root of the key dimension, then apply the softmax along
# each row so that every query's weights sum to one
weights = scipy.special.softmax(scores / K.shape[-1] ** 0.5, axis=-1)
(weights.shape, weights)

((4, 4),
 array([[2.36089863e-01, 7.38987555e-03, 7.49130386e-01, 7.38987555e-03],
        [4.54826323e-01, 4.51736775e-02, 4.54826323e-01, 4.51736775e-02],
        [2.39275049e-01, 7.43870015e-04, 7.59237211e-01, 7.43870015e-04],
        [8.99501754e-02, 2.81554063e-03, 9.05653685e-01, 1.58059922e-03]]))

In [16]:
# value matrix: one row per word
(V.shape, V)

((4, 3),
 array([[1, 1, 0],
        [0, 1, 1],
        [1, 2, 1],
        [0, 0, 0]]))

In [17]:
# each output row is the weighted sum of the value vectors for one query
attention = np.matmul(weights, V)
(attention.shape, attention)

((4, 3),
 array([[0.98522025, 1.74174051, 0.75652026],
        [0.90965265, 1.40965265, 0.5       ],
        [0.99851226, 1.75849334, 0.75998108],
        [0.99560386, 1.90407309, 0.90846923]]))

## Summary

In [18]:
import numpy as np
# import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose `scipy.special` on every SciPy release
import scipy.special

# for reproducibility: re-seed so the weight matrices below are the same draws
# on every run of this cell
SEED = 42
np.random.seed(SEED)

# encoder representations of four different words
word_1 = np.array([1, 0, 0])
word_2 = np.array([0, 1, 0])
word_3 = np.array([1, 1, 0])
word_4 = np.array([0, 0, 1])

# stacking the word embeddings into a single array: 4 words x 3 dimensions
words = np.array([word_1, word_2, word_3, word_4])

# generating the weight matrices (3x3 integers in {0, 1, 2}); the draw order
# W_Q, W_K, W_V determines which values each matrix receives
W_Q = np.random.randint(3, size=(3, 3))
W_K = np.random.randint(3, size=(3, 3))
W_V = np.random.randint(3, size=(3, 3))

# generating the queries, keys and values: 4x3 @ 3x3 = 4x3
Q = words @ W_Q
K = words @ W_K
V = words @ W_V

# scoring the query vectors against all key vectors: 4x3 @ 3x4 = 4x4
scores = Q @ K.transpose()

# computing the weights by a softmax over each row of the scores, after
# scaling by the square root of the key dimension
weights = scipy.special.softmax(scores / K.shape[1] ** 0.5, axis=1)

# computing the attention by a weighted sum of the value vectors: 4x4 @ 4x3 = 4x3
attention = weights @ V

print(attention)

[[0.98522025 1.74174051 0.75652026]
 [0.90965265 1.40965265 0.5       ]
 [0.99851226 1.75849334 0.75998108]
 [0.99560386 1.90407309 0.90846923]]


## Dependencies

In [19]:
!pip install session-info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting session-info
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... done
Collecting stdlib_list
  Downloading stdlib_list-0.8.0-py3-none-any.whl (63 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.5/63.5 KB 3.6 MB/s eta 0:00:00
Building wheels for collected packages: session-info
  Building wheel for session-info (setup.py) ... done
  Created wheel for session-info: filename=session_info-1.0.0-py3-none-any.whl size=8042 sha256=0811235994491fc716052bf87e24407e9a68d12864596e742bb3bbff6527ae07
  Stored in directory: /root/.cache/pip/wheels/d4/fc/2e/00ca60bac7954b84907efd41baa9b4853500eaeec4228410c6
Successfully built session-info
Installing collected packages: stdlib_list, session-info
Successfully installed session-info-1.0.0 stdlib_list-0.8.0


In [20]:
# imported here rather than at the top because the package is installed by the
# preceding cell
import session_info

# NOTE(review): show() is expected to report the versions of the packages used
# in this session — confirm against the session-info documentation
session_info.show()