In [1]:
import numpy as np
from numpy import array, random

In [2]:
# encoder representations of four different words

word_1 = array([1, 0, 0])
word_2 = array([0, 1, 0])
word_3 = array([1, 1, 0])
word_4 = array([0, 0, 1])

In [3]:
random.seed(42) # to allow us to reproduce the same attention values
W_Q = random.randint(3, size=(3, 3))
W_K = random.randint(3, size=(3, 3))
W_V = random.randint(3, size=(3, 3))

In [4]:
W_Q

array([[2, 0, 2],
       [2, 0, 0],
       [2, 1, 2]])

In [5]:
# generating the queries, keys and values

query_1 = word_1 @ W_Q
key_1 = word_1 @ W_K
value_1 = word_1 @ W_V

query_2 = word_2 @ W_Q
key_2 = word_2 @ W_K
value_2 = word_2 @ W_V

query_3 = word_3 @ W_Q
key_3 = word_3 @ W_K
value_3 = word_3 @ W_V

query_4 = word_4 @ W_Q
key_4 = word_4 @ W_K
value_4 = word_4 @ W_V

print(query_1, key_1, value_4, sep="\n\n")

[2 0 2]

[2 2 2]

[0 0 0]


In [6]:
# scoring the first query vector against all key vectors

scores = array([np.dot(query_1, key_1), np.dot(query_1, key_2), np.dot(query_1, key_3), np.dot(query_1, key_4)])
scores

array([ 8,  2, 10,  2])

In [7]:
# dividing with the dimention of keys

score_d = scores / key_1.shape[0]

In [9]:
# defining the softmax

def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))  # Subtracting max(logits) for numerical stability
    return exp_logits / np.sum(exp_logits, axis=0)

In [11]:
# computing the weights by a softmax operation

weights = softmax(score_d)
weights

array([0.31071295, 0.04205043, 0.6051862 , 0.04205043])

In [13]:
# finding the attention

attention = (weights[0] * value_1) + (weights[1] * value_2) + (weights[2] * value_3) + (weights[3] * value_4)

attention

array([0.91589915, 1.56313577, 0.64723662])

this is the attention of **word_1**