#  Scaled Dot-Product Attention

![Image](https://machinelearningmastery.com/wp-content/uploads/2022/03/dotproduct_1.png)

1. https://machinelearningmastery.com/how-to-implement-scaled-dot-product-attention-from-scratch-in-tensorflow-and-keras
2. https://learning.rasa.com/transformers/kvq/
3. https://data-science-blog.com/blog/2021/04/07/multi-head-attention-mechanism/
4. https://arxiv.org/pdf/1706.03762.pdf

In [1]:
import numpy as np
# Two stacks of matrices filled with ones. np.dot combines every leading index
# of `a` with every leading index of `c` (6-D result), while np.matmul treats
# the leading axes as a shared batch and multiplies matrix-by-matrix (4-D result).
a = np.ones((9, 5, 7, 4))
c = np.ones((9, 5, 4, 3))
print(np.dot(a, c).shape, np.matmul(a, c).shape, sep="\n")

(9, 5, 7, 9, 5, 3)
(9, 5, 7, 3)


## matmul in NumPy
matmul(x1, x2, /, out=None, *, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])

Matrix product of two arrays.

In [2]:
# With two 1-D arrays, matmul collapses to the inner product and yields a scalar.
a, c = np.array([9, 5, 7, 4]), np.array([9, 5, 4, 3])
np.matmul(a, c)

146

In [3]:
# `a` is the 2x2 identity, so the product below returns `b` unchanged.
a = np.array([[1, 0], [0, 1]])
b = np.array([[4, 1], [2, 2]])

np.matmul(a, b)

array([[4, 1],
       [2, 2]])

## Cast in TensorFlow
Casts a tensor to a new type.

In [4]:
import tensorflow as tf
# Casting float32 values to int32 drops the fractional part (3.1 -> 3, 2.2 -> 2).
x = tf.constant([3.1, 2.2], dtype=tf.float32)
tf.cast(x, dtype=tf.int32)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 2])>

## Softmax 

In [5]:
# Softmax turns the raw vector into positive weights; the second print confirms
# that those weights add up to 1.
inp = np.asarray([1.0, 2.0, 1.0])
layer = tf.keras.layers.Softmax()
values = layer(inp).numpy()
print(values, values.sum(), sep="\n")

[0.21194157 0.57611686 0.21194157]
1.0


In [6]:
# Boolean mask for the Softmax layer: True keeps a position, False removes it;
# the remaining entries are renormalised so they still sum to 1.
mask = np.asarray([True, False, True], dtype=np.bool_)
print(layer(inp, mask).numpy())

[0.5 0.  0.5]


In [7]:
from tensorflow import matmul, math, cast, float32
from tensorflow.keras.layers import Layer
from keras.backend import softmax

# Implementing the Scaled Dot-Product Attention
class DotProductAttention(Layer):
    """Scaled dot-product attention: ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    The layer defines no weights of its own; it only combines the tensors
    passed to ``call``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, queries, keys, values, d_k, mask=None):
        """Compute attention-weighted combinations of ``values``.

        Args:
            queries: Query tensor; multiplied with ``keys`` transposed over its
                last two axes.
            keys: Key tensor scored against ``queries``.
            values: Value tensor combined using the attention weights.
            d_k: Scaling term; the scores are divided by ``sqrt(d_k)``.
            mask: Optional tensor/array broadcastable to the scores. Entries
                that are non-zero (or True) receive a ``-1e9`` penalty before
                the softmax and therefore get (almost) zero weight. This is
                the opposite convention to the ``mask`` argument of
                ``tf.keras.layers.Softmax``, where True keeps a position.

        Returns:
            The product of the softmax-normalised scores with ``values``.
        """
        # Score the queries against the transposed keys
        scores = matmul(queries, keys, transpose_b=True)

        # Scale in the scores' own dtype so that non-float32 inputs do not
        # trigger a dtype mismatch in the division
        scores = scores / math.sqrt(cast(d_k, scores.dtype))

        # Apply mask to the attention scores; the cast lets boolean and
        # integer masks be used as well as floating-point ones
        if mask is not None:
            scores += -1e9 * cast(mask, scores.dtype)

        # Normalise the scores into attention weights
        weights = softmax(scores)

        # Weighted sum of the value vectors
        return matmul(weights, values)

## Example customer database.

In [8]:
# Instantiate the attention layer defined above; it is called on concrete
# tensors in the following cells.
attention = DotProductAttention()

In [9]:
from numpy import random 

# Toy inputs of shape (1, 3, 1), drawn uniformly from [0, 1).
# The tensors are drawn in the order queries, keys, values.
queries, keys, values = (random.random((1, 3, 1)) for _ in range(3))

In [10]:
# Show the three toy tensors, one after another.
print(queries, keys, values, sep="\n")

[[[0.15150703]
  [0.59940113]
  [0.62429778]]]
[[[0.22485589]
  [0.38444304]
  [0.67865426]]]
[[[0.49856748]
  [0.04415472]
  [0.60872006]]]


In [11]:
# d_k is passed as 1 because the toy queries/keys have a last axis of size 1.
attention(queries, keys, values, 1)

<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[0.3862987 ],
        [0.3944249 ],
        [0.39490905]]], dtype=float32)>

In [12]:
from numpy import random

# Sizes used to build the batched example below
batch_size = 64  # Number of sequences in one batch
input_seq_length = 5  # Number of positions in each input sequence
d_k = 64  # Width of the linearly projected query and key vectors
d_v = 64  # Width of the linearly projected value vectors

In [13]:
# Random stand-ins for the projected tensors: queries and keys share the d_k
# width, values use d_v. Draw order is queries, keys, values.
queries, keys = (
    random.random((batch_size, input_seq_length, d_k)) for _ in range(2)
)
values = random.random((batch_size, input_seq_length, d_v))

In [14]:
# Build a fresh layer and run it on the batched inputs; d_k is forwarded so
# the scores are scaled by sqrt(d_k) before the softmax.
attention = DotProductAttention()
attention_output = attention(queries, keys, values, d_k)
print(attention_output)

tf.Tensor(
[[[0.5611781  0.5592531  0.47254524 ... 0.51013523 0.44942003 0.55581266]
  [0.5837234  0.57207894 0.46463716 ... 0.513379   0.45144668 0.5826472 ]
  [0.55580467 0.5453951  0.45683342 ... 0.49314564 0.46573904 0.53475696]
  [0.57554805 0.5696033  0.4642083  ... 0.5080715  0.4548852  0.5723881 ]
  [0.57027376 0.5715811  0.46606666 ... 0.505357   0.45338583 0.56504273]]

 [[0.5418818  0.6224016  0.59333324 ... 0.5632441  0.64164567 0.33463573]
  [0.53591913 0.63183004 0.59787047 ... 0.558737   0.6301525  0.3387793 ]
  [0.5596818  0.6237744  0.5706537  ... 0.5420102  0.63351244 0.32748112]
  [0.53451145 0.62813604 0.61336714 ... 0.54385847 0.62513834 0.34475732]
  [0.54508036 0.6270265  0.58449066 ... 0.55972    0.63672835 0.33292016]]

 [[0.35427496 0.29653743 0.4464199  ... 0.3617815  0.6000948  0.4925366 ]
  [0.3576734  0.28109956 0.43979156 ... 0.36254445 0.60864705 0.49505597]
  [0.36143368 0.29390565 0.44873062 ... 0.34816417 0.5888282  0.48878267]
  [0.3417774  0.3090346

## Exercise!