# Understanding Self Attention

In [23]:
import numpy as np
import math

# Toy problem size: 10 tokens, 512-wide query/key vectors, 512-wide value vectors.
sequence_length = 10
d_k = d_v = 512

# Random stand-ins for the query, key and value matrices (one row per token).
# The draws happen in q, k, v order.
q, k = (np.random.randn(sequence_length, d_k) for _ in range(2))
v = np.random.randn(sequence_length, d_v)

# Display the three shapes as the cell output.
(q.shape, k.shape, v.shape)

((10, 512), (10, 512), (10, 512))

> Take the dot product of every Query with every Key to measure their similarity; these similarities are the raw attention scores

> Divide by the square root of the Key dimension, $\sqrt{d_k}$, so that the variance of the scores does not grow with $d_k$

In [24]:
# Similarity of every query with every key (q @ k.T), divided by sqrt(d_k).
# The unscaled product is not computed separately: its result was discarded.
scaled = np.matmul(q, k.T) / math.sqrt(d_k)

# Display the (sequence_length, sequence_length) score matrix.
scaled.shape, scaled

((10, 10),
 array([[-0.24377662,  1.02113081,  1.3023725 ,  1.27063396,  0.61714306,
         -0.24391184, -1.77506907,  0.10631629,  1.42332514,  1.72122403],
        [-1.712277  , -0.34658796,  0.3629974 , -0.76977711,  1.08587583,
          0.16609727,  0.27010721,  0.15325385,  0.18152852,  0.49246579],
        [ 1.05832548,  1.98228014,  2.18850259,  0.82108503,  0.42838988,
          1.8003839 , -0.39505285, -0.02929142, -0.69937748, -0.23668556],
        [ 1.09747933,  0.32267196, -0.20756995, -0.85043791, -0.20697511,
         -0.15425011,  0.59800415, -0.13253693,  1.38956401,  0.55788049],
        [ 1.442453  , -0.17433992,  1.44209349, -0.37922308,  0.06991483,
         -0.76360436,  1.27243722,  0.73817403, -1.301752  , -0.46046847],
        [ 1.9083543 , -1.33721653,  0.99054056,  0.1473052 , -0.4756667 ,
         -0.67855265,  1.67230011,  1.19530252, -1.51350064,  0.12218795],
        [ 1.51843864,  0.3968648 ,  0.73078071,  0.43601877, -2.06270872,
         -0.2010952 ,

> Mask out future positions so that, in the decoder, each position can only attend to itself and to earlier positions

In [25]:
# Additive causal mask: start from a lower-triangular matrix of ones, then turn
# the zeros (future positions) into -inf and the ones (allowed positions) into 0.
# Adding it to the scores makes exp(-inf) == 0 for every masked entry.
mask = np.tril(np.ones((sequence_length, sequence_length)))
# np.inf rather than np.infty: the np.infty alias was removed in NumPy 2.0.
mask[ mask == 0 ] = -np.inf
mask[ mask == 1 ] = 0

mask.shape, mask

((10, 10),
 array([[  0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [  0.,   0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [  0.,   0.,   0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [  0.,   0.,   0.,   0., -inf, -inf, -inf, -inf, -inf, -inf],
        [  0.,   0.,   0.,   0.,   0., -inf, -inf, -inf, -inf, -inf],
        [  0.,   0.,   0.,   0.,   0.,   0., -inf, -inf, -inf, -inf],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0., -inf, -inf, -inf],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., -inf, -inf],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., -inf],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.]]))

> Transform to probability matrix

In [26]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores with any number of leading dimensions. Entries
      equal to ``-inf`` (e.g. from an additive mask) receive probability 0.

  Returns:
    An array with the same shape as ``x`` whose entries along the last axis
    are non-negative and sum to 1. A slice that is entirely ``-inf`` yields
    NaN, because there is nothing to normalise.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; keep the shape and return a float array.
    return np.exp(x)
  # Subtracting the per-row maximum leaves the result unchanged mathematically
  # but keeps np.exp from overflowing to inf (which would give inf/inf = NaN).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  # keepdims makes the denominator broadcast along the last axis for any rank.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
# Add the causal mask to the scaled scores, then normalise with softmax.
attention = softmax(mask + scaled)

# Display the attention matrix together with its shape.
(attention.shape, attention)

((10, 10),
 array([[1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.20331724, 0.79668276, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.15116157, 0.38081127, 0.46802717, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.53346922, 0.24581863, 0.14465507, 0.07605708, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.38264445, 0.07596808, 0.38250691, 0.06189443, 0.09698614,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.5625456 , 0.02190908, 0.2246758 , 0.0966816 , 0.05185499,
         0.04233293, 0.        , 0.        , 0.        , 0.        ],
        [0.40395754, 0.13159591, 0.18376391, 0.13685061, 0.01124769,
         0.07236885, 0.06021548, 0.        , 0.        , 0.        ],
        [0.29288

> Apply to original Value

In [27]:
# Each output row is the attention-weighted combination of the rows of v.
new_v = attention @ v
(new_v.shape, new_v)

((10, 512),
 array([[-0.77619716,  0.78746114, -1.3483487 , ...,  0.72068791,
         -0.13545548, -0.05036879],
        [-0.62739304, -0.32487966,  0.68112591, ..., -0.38931659,
          0.11685185,  0.04739216],
        [-0.29292767, -0.15508948, -0.22815548, ...,  0.15647082,
          0.77177643, -0.12194113],
        ...,
        [-0.50205301,  0.18204522, -0.29486757, ...,  0.14680658,
          0.25700999,  0.13210688],
        [-0.58436974, -0.28711701, -0.10361234, ..., -0.14411789,
          0.44227375,  0.76628905],
        [-0.47548399, -0.01235199, -0.00600266, ..., -0.43051166,
         -0.18088839,  0.12450246]]))

# Class Representation

In [28]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores with any number of leading dimensions. Entries
      equal to ``-inf`` (e.g. from an additive mask) receive probability 0.

  Returns:
    An array with the same shape as ``x`` whose entries along the last axis
    are non-negative and sum to 1. A slice that is entirely ``-inf`` yields
    NaN, because there is nothing to normalise.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; keep the shape and return a float array.
    return np.exp(x)
  # Subtracting the per-row maximum leaves the result unchanged mathematically
  # but keeps np.exp from overflowing to inf (which would give inf/inf = NaN).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  # keepdims makes the denominator broadcast along the last axis for any rank.
  return exp_x / np.sum(exp_x, axis = -1, keepdims=True)

# Numpy edition
def scaled_dot_product_np(q, k, v, masking=None):
  """Scaled dot-product attention with NumPy.

  Args:
    q: query array whose last axis has size d_k.
    k: key array; its last two axes are swapped before the product, so for
      2-D input this is the plain transpose.
    v: value array, multiplied by the attention weights.
    masking: optional additive mask (0 to keep, -inf to hide) that is
      broadcast against the score matrix. ``None`` means no masking.

  Returns:
    A tuple ``(new_v, attention)``: the attention-weighted values and the
    attention weights produced by ``softmax``.

  Note:
    Inputs with leading batch dimensions are only normalised correctly if the
    ``softmax`` in scope works along the last axis of an N-D array.
  """
  d_k = q.shape[-1]
  # Swap only the last two axes: `k.T` would reverse every axis and is only
  # equivalent for 2-D input.
  scaled = np.matmul(q, np.swapaxes(k, -1, -2)) / math.sqrt(d_k)
  if masking is not None:
    # Out-of-place add so a mask with extra leading dimensions can broadcast.
    scaled = scaled + masking
  attention = softmax(scaled)
  new_v = np.matmul(attention, v)
  return new_v, attention

def create_mask_np(sequence_length=512):
  """Build an additive causal mask as a NumPy array.

  Args:
    sequence_length: number of positions; the mask is square with this side.

  Returns:
    A ``(sequence_length, sequence_length)`` float array holding 0 on and
    below the diagonal and ``-inf`` above it.
  """
  mask = np.tril(np.ones((sequence_length, sequence_length)))
  # np.inf rather than np.infty: the np.infty alias was removed in NumPy 2.0.
  mask[ mask == 0 ] = -np.inf
  mask[ mask == 1 ] = 0
  # The original built the mask but never returned it, so callers got None.
  return mask

# Torch edition
import torch
import torch.nn.functional as F

def scaled_dot_product(q, k, v, mask=None):
  """Scaled dot-product attention with PyTorch.

  Args:
    q: query tensor whose last dimension has size d_k.
    k: key tensor; its last two dimensions are transposed before the product.
    v: value tensor, multiplied by the attention weights.
    mask: optional additive mask (0 to keep, -inf to hide) that is broadcast
      against the score tensor. ``None`` means no masking.

  Returns:
    A tuple ``(values, attention)``: the attention-weighted values and the
    softmax-normalised attention weights (normalised over the last dimension).
  """
  d_k = q.size()[-1]
  scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
  if mask is not None:
    # Out-of-place add: an in-place `+=` raises when the mask broadcasts to a
    # larger shape than the scores (e.g. a 4-D mask with 2-D q and k).
    scaled = scaled + mask
  attention = F.softmax(scaled, dim=-1)
  values = torch.matmul(attention, v)
  return values, attention

def create_mask(batch_size=1, num_heads=8, sequence_length=512):
  """Build an additive causal mask as a PyTorch tensor.

  Args:
    batch_size: size of the first dimension of the mask.
    num_heads: size of the second dimension of the mask.
    sequence_length: number of positions; the last two dimensions are square
      with this side.

  Returns:
    A ``(batch_size, num_heads, sequence_length, sequence_length)`` tensor
    holding 0 on and below the diagonal of the last two dimensions and
    ``-inf`` strictly above it.
  """
  mask = torch.full([batch_size, num_heads, sequence_length, sequence_length] , float('-inf'))
  # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes the rest.
  # The original built the mask but never returned it, so callers got None.
  return torch.triu(mask, diagonal=1)

# encoder_value, encoder_attention = scaled_dot_product_np(q, k, v)
# decoder_value, decoder_attention = scaled_dot_product_np(q, k, v, mask)