In [3]:
import numpy as np
import torch
import torch.nn as nn

Basic formula:
$$
y = softmax(\frac{KQ^T}{\sqrt{d}})V
$$

The actual operation we are going to implement:
$$
y = SelfAttention(z^{i}w_K, z^{i}w_Q, z^{i}w_V) \\
= softmax(\frac{z^{i}w_Kw_Q^T(z^{i})^{T}}{\sqrt{d}}) z^{i}w_V
$$

In [4]:
def softmax(Z):
    """
    Row-wise softmax.

    Each row of Z is treated as one set of scores and is normalised
    independently along axis 1. The result has the same shape as Z.
    """
    # Shift every row by its own maximum before exponentiating, so the largest
    # exponent is exp(0) = 1. This prevents overflow and leaves the result
    # unchanged, because the shift cancels in the ratio below.
    row_max = np.max(Z, axis=1, keepdims=True)
    exp_shifted = np.exp(Z - row_max)
    row_total = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / row_total

def self_attention(X, mask, W_KQV, W_out):
    """
    Single-head masked self-attention written with NumPy.

    Parameters
    ----------
    X : 2-D array, one input vector per row.
    mask : additive mask applied to the score matrix before the softmax;
        it must broadcast against ``K @ Q.T`` (one row and one column per
        row of X). Entries of ``-inf`` receive zero attention weight.
    W_KQV : stacked projection matrix ``[W_K  W_Q  W_V]``; its column count
        must be divisible by 3 so that ``X @ W_KQV`` splits into K, Q and V.
    W_out : output projection applied to the attended values.

    Returns
    -------
    (output, attn) : ``attn @ V @ W_out`` and the attention-weight matrix
        produced by the softmax.
    """
    # One matmul produces all three projections; split them column-wise.
    K, Q, V = np.split(X @ W_KQV, 3, axis=1)
    # Scale by the width of the key/query vectors actually being dotted
    # together. This equals X.shape[1] when W_KQV is (d, 3d), but stays
    # correct if the projection changes the feature dimension.
    scores = K @ Q.T / np.sqrt(K.shape[1]) + mask
    attn = softmax(scores)
    return attn @ V @ W_out, attn

$$
K = XW_K\\
Q = XW_Q\\
V = XW_V
$$

=> 

$$
[K \; Q \; V] = X[W_K \; W_Q \; W_V] = XW_{KQV}
$$

In [5]:
# T is the side length of the square mask; d is the embedding width of the layer.
T, d = 100, 64
# PyTorch single-head attention layer, without bias terms and with batch-first inputs.
attn = nn.MultiheadAttention(embed_dim=d, num_heads=1, bias=False, batch_first=True)
# Additive causal mask: -inf strictly above the main diagonal, 0 on and below it.
M = torch.triu(torch.full((T, T), float('-inf')), diagonal=1)

In [6]:
# Display the causal mask: 0 on and below the diagonal, -inf strictly above it.
M

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])