In [5]:
import torch

#getting the vectors for the input sequence
inputs =  torch.tensor(
    [
        [0.43, 0.15, 0.89], #Your x1
        [0.55, 0.87, 0.66], #journey x2
        [0.57, 0.85, 0.64], #starts x3
        [0.22, 0.58, 0.33], #with x4
        [0.77, 0.25, 0.10], #one  x5
        [0.05, 0.80, 0.55]  #step x6
    ]
)

In [6]:
#calculating the attention scores between the query vector and the input vector.
query = inputs[1]  #second input token

attn_scores_2 = torch.empty(inputs.shape[0]) #creating an empty matrix of legth of the input embeddings

for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query) #dot product between the input token and the query

print(attn_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [7]:
#normalizing the attention scores
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()

print(f"Attention weights: {attn_weights_2_tmp}")
print(f"Sum: {attn_weights_2_tmp.sum()}")

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: 1.0000001192092896


In [8]:
#nirmalization using siftmax in python
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_weights_2_naive = softmax_naive(attn_scores_2)

print(f"Attention weights: {attn_weights_2_naive}")
print(f"Sum: {attn_weights_2_naive.sum()}")

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: 1.0


The naives softmax approach above can encounter numerical instability in case of overflow for too large values and under flow for very small values.. It is more advisable to implement softmax in pytotch

In [10]:
attn_weights = torch.softmax(attn_scores_2, dim=0)

print(f"Attention weights: {attn_weights}")
print(f"Sum: {attn_weights.sum()}")

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: 1.0


In [None]:
#calculating the context vectors
context_vec_2 = torch.zeros(query.shape)

for i, x_i in enumerate(inputs):
    context_vec_2 += attn_scores_2[i]*x_i

print(context_vec_2) #context weight takes the dimension of the vectors

tensor([2.8579, 4.2330, 3.7270])


In [12]:
attn_scores = torch.empty(6, 6)

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)

print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


The for loops above are not computationally efficieent so we will use linear algebra matrix multiplication to optimize the code

In [13]:
attn_scores = inputs @ inputs.T #dot product of each vector pair

print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [15]:
#implement the softmax operation on each row to generate the attention weight
attn_weights = torch.softmax(attn_scores, dim=-1)

print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


The entries of each row should sum up to one. <br>
The dim parameter specifies the dimension of hte input tensor along which the function will be computed, by setting the dim to -1 we are instructing the softmax function to apply the normalization along the last dimension of the attn_scores, the last dimension is the cols.

In [16]:
#calculate the context vector
all_context_vec = attn_weights @ inputs

print(all_context_vec)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])
