In [88]:
import numpy as np

# Toy vocabulary: every token maps to a 4-dimensional vector.
# 'NULL' is the all-zero vector; '0'..'3' are the four one-hot vectors.
embeddings = {
    "NULL": [0, 0, 0, 0],
    "0": [1, 0, 0, 0],
    "1": [0, 1, 0, 0],
    "2": [0, 0, 1, 0],
    "3": [0, 0, 0, 1],
}

In [89]:
def get_token(embedding, vocab=None):
    """Return the token whose embedding is nearest to `embedding`.

    Nearness is the Euclidean (L2) distance computed by `np.linalg.norm`.

    Parameters
    ----------
    embedding : array-like
        Vector to decode. Lists and tuples are accepted as well as numpy
        arrays; the value is converted with `np.asarray` before use.
    vocab : dict, optional
        Mapping of token -> embedding vector to search. When omitted, the
        notebook-level `embeddings` table is used.

    Returns
    -------
    The key of the closest entry. On a tie, the entry that comes first in
    the mapping's iteration order wins (`np.argmin` returns the first
    minimum).

    Raises
    ------
    ValueError
        If the vocabulary is empty (raised by `np.argmin`).
    """
    table = embeddings if vocab is None else vocab
    # Coerce once so plain Python sequences support element-wise subtraction.
    query = np.asarray(embedding)
    tokens = list(table)
    distances = [np.linalg.norm(query - np.asarray(table[tok])) for tok in tokens]
    return tokens[np.argmin(distances)]

In [182]:
# Token sequence fed through the attention walkthrough.
input_tokens = ['0', '0', '1', '3', '3']

# Look up each token's vector and stack them row-wise, one row per token.
input_embeddings = np.stack(
    [embeddings[tok] for tok in input_tokens],
    axis=0,
)
print(input_embeddings.shape)
input_embeddings

(5, 4)


array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])

In [247]:
# Query weights: 4 rows x 5 columns. Only the [0][0] entry is non-zero.
Wq = [
    [1, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
]

# Key weights: 4 rows x 5 columns, currently all zero.
Wk = [
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
]

# Value weights: 4 rows x 4 columns, currently all zero.
Wv = [
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
]

In [248]:
# Matrix product of the stacked input embeddings with Wq: one query row per token.
queries = input_embeddings @ Wq
print(queries.shape)
queries

(5, 5)


array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [249]:
# Matrix product of the stacked input embeddings with Wk: one key row per token.
keys = input_embeddings @ Wk
# The transpose is what gets multiplied against the queries, so show that view.
print(keys.T.shape)
keys.T

(5, 5)


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [250]:
# Matrix product of the stacked input embeddings with Wv: one value row per token.
values = input_embeddings @ Wv
print(values.shape)
values

(5, 4)


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [251]:
# Raw attention scores: entry [i, j] is the dot product of query i with key j.
attention = queries @ keys.T
print(attention.shape)
attention

(5, 5)


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [252]:
def softmax(x, axis=0):
    """Compute the softmax of `x` along `axis`.

    Parameters
    ----------
    x : array-like
        Scores to normalise. Converted to a float array.
    axis : int, optional
        Axis along which the outputs sum to 1. Defaults to 0, which for a
        1-D input normalises the whole vector.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `x` with non-negative entries that sum
        to 1 along `axis`.
    """
    scores = np.asarray(x, dtype=float)
    # Shift by the max of each slice (not the global max): softmax is
    # shift-invariant, and a per-slice shift keeps every slice's largest
    # exponent at exp(0) = 1, so no slice can underflow to an all-zero
    # denominator and produce NaN.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)

# Apply softmax to each row of the score matrix independently, so every
# row of the result sums to 1. No other scaling is applied in this step.
scaled_attention = np.array(list(map(softmax, attention)))
print(scaled_attention.shape)
scaled_attention

(5, 5)


array([[0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2]])

In [255]:
# Each output row is the sum of the value rows weighted by that token's attention row.
attention_values = scaled_attention @ values
print(attention_values.shape)
attention_values

(5, 4)


array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [256]:
# Decode each attended vector back to its nearest vocabulary token.
outputs = np.array(list(map(get_token, attention_values)))
print(outputs.shape)
outputs

(5,)


array(['NULL', 'NULL', 'NULL', 'NULL', 'NULL'], dtype='<U4')