# Transformers and Attention 

### Basic Form of Self-Attention

In [1]:
import torch
import torch.nn.functional as F

# Example sentence encoded as integer token ids, one id per word, in word order:
#   can=0, you=7, help=1, me=2, to=5, translate=6, this=4, sentence=3
sentence = torch.tensor([0, 7, 1, 2, 5, 6, 4, 3])

In [2]:
# Fix the RNG seed so the randomly initialised embedding table is reproducible.
torch.manual_seed(123)

# Lookup table with 10 rows, each a 16-dimensional embedding vector.
embed = torch.nn.Embedding(num_embeddings=10, embedding_dim=16)

# Look up one vector per token id; detach() returns a tensor that is cut off
# from the autograd graph.
embedded_sentence = embed(sentence).detach()
print(embedded_sentence.shape)

torch.Size([8, 16])


In [3]:
# Unnormalised attention scores: omega[i, j] is the dot product between the
# i-th and j-th word embeddings.
# The score matrix is sized from the data instead of a hard-coded 8, so the
# cell keeps working if the sentence length changes.
num_tokens = embedded_sentence.shape[0]
omega = torch.empty(num_tokens, num_tokens)
for i, x_i in enumerate(embedded_sentence):
    for j, x_j in enumerate(embedded_sentence):
        omega[i, j] = x_i.dot(x_j)

In [4]:
# Same pairwise scores in one vectorised call: entry (i, j) of X @ X^T is the
# dot product of rows i and j, so the Python-level double loop is not needed.
omega_mat = embedded_sentence @ embedded_sentence.T

In [5]:
# Sanity check: the matmul-based and loop-based score matrices agree
# element-wise within torch.allclose's default tolerances (spelled out here).
torch.allclose(omega_mat, omega, rtol=1e-05, atol=1e-08)

True

In [10]:
# Attention weights: softmax over dim=1 normalises each row of the score
# matrix, so row i holds the weights of the i-th word over all words.
attn_weights = F.softmax(omega_mat, dim=1)
attn_weights.shape

torch.Size([8, 8])

The *i-th* row of the attention weight matrix contains the attention weights of the *i-th* word with respect to every word in the sentence. Each weight indicates how relevant the corresponding word is to the *i-th* word.

In [11]:
# each row should sum to 1.0, because softmax normalised the weights along dim=1
torch.sum(attn_weights, dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [15]:
# Context vectors: each output row is the attention-weighted sum of all word
# embeddings, using that row's attention weights.
context_vectors = attn_weights @ embedded_sentence
print(context_vectors.shape)

torch.Size([8, 16])
