In [5]:
import numpy as np
from nn import LinearLayer,Embedding

Self Attention

In [2]:
# Toy dimensions plus the three projection matrices (query, key, value)
seq_len = 3     # number of tokens in the sequence
embed_dim = 4   # length of each token's feature vector

# one (embed_dim, embed_dim) standard-normal matrix each, drawn in q, k, v order
q_w, k_w, v_w = (np.random.randn(embed_dim, embed_dim) for _ in range(3))

In [3]:
# Self Attention
# Input x: one random feature vector per sequence position, standing in for
# a sequence of embedded words.
x = np.random.randn(seq_len,embed_dim) # shape (seq_len, embed_dim)

In [4]:
# Linear projections: multiply the input by each weight matrix to obtain the
# queries, keys and values (each shaped seq_len x embed_dim)
Q, K, V = (np.dot(x, w) for w in (q_w, k_w, v_w))

In [None]:
# Attention weights: softmax(Q.K^T / sqrt(d_k)) taken row by row
atten_scores = np.dot(Q,K.transpose()) # raw similarity of every query with every key
# scale by the square root of the key width
atten_scores = atten_scores/np.sqrt(K.shape[-1])
# softmax over each row. The row maximum is subtracted before exponentiating
# so large scores cannot overflow to inf (and then nan); the shift cancels in
# the ratio, so the weights are mathematically unchanged.
atten_weights = np.exp(atten_scores - atten_scores.max(axis=-1,keepdims=True))
atten_weights /= atten_weights.sum(axis=-1,keepdims=True)

In [None]:
# Context vectors: each output row is the attention-weighted mix of the value
# rows, giving one feature vector per sequence position
context_vec = atten_weights.dot(V)

In [33]:
# per-row softmax denominators of the scaled scores, kept as a column
np.exp(atten_scores).sum(axis=-1, keepdims=True)

array([[2.47286368],
       [2.95989443],
       [2.09954924]])

Masked Self-Attention

In [19]:
# Masked Attention
# Causal keep-mask: ones on and below the diagonal, zeros above it, so that
# position i may only look at positions <= i
mask = np.tri(seq_len)
print(mask)

[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]]


In [21]:
# zero out the weights that point at future positions (element-wise product
# with the keep-mask)
masked_atten = np.multiply(atten_weights, mask)
print(masked_atten)

[[0.06670566 0.         0.        ]
 [0.40852935 0.03270521 0.        ]
 [0.28575114 0.18995286 0.524296  ]]


In [34]:
# Re-normalise after masking so that every row sums to 1 again.
sums = masked_atten.sum(axis=1, keepdims=True)  # per-row totals, kept as a column
masked_atten_norm = np.divide(masked_atten, sums)
print(sums)
print(masked_atten_norm)

[[1.        ]
 [6.19848035]
 [5.23856068]]
[[1.         0.         0.        ]
 [0.9880419  0.0119581  0.        ]
 [0.81773636 0.08217966 0.10008398]]


In [36]:
# A more efficient approach: write -inf (instead of 0) into the masked
# positions of the raw scores and apply softmax once. Since exp(-inf) is 0,
# the rows come out already normalised.
print(atten_scores)

[[-1.80208863 -5.54754719  0.83465224]
 [ 0.18996209 -2.3350674   0.5031281 ]
 [-0.51091133 -0.91925666  0.09602381]]


In [46]:
# Mask the raw scores instead of the weights.
# NOTE: `mask` is rebound here with the opposite meaning to the earlier
# lower-triangular keep-mask: 1 now marks the strictly-upper (future)
# positions that must be hidden.
mask = np.triu(np.ones(shape=(seq_len, seq_len)), k=1)
# hidden positions become -inf, everything else keeps its score
masked_scores = np.where(mask != 0, -np.inf, atten_scores)
print(masked_scores)

[[-1.80208863        -inf        -inf]
 [ 0.18996209 -2.3350674         -inf]
 [-0.51091133 -0.91925666  0.09602381]]


In [48]:
# Softmax over each row of the masked scores. The row maximum is subtracted
# before exponentiating so large scores cannot overflow; -inf entries stay
# -inf after the shift and still map to a weight of exactly 0.
masked_atten_scores = np.exp(masked_scores - masked_scores.max(axis=-1,keepdims=True))
masked_atten_scores /= masked_atten_scores.sum(axis=-1,keepdims=True)
print(masked_atten_scores)

[[1.         0.         0.        ]
 [0.92587796 0.07412204 0.        ]
 [0.28575114 0.18995286 0.524296  ]]


Multi Head Attention

In [24]:
class Attention:
    """Single-head scaled dot-product attention (holds no parameters)."""

    def forward(self,Q,K,V):
        """Return the attention-weighted combination of the rows of ``V``.

        Args:
            Q: queries, shape (..., seq_q, d_k).
            K: keys, shape (..., seq_k, d_k).
            V: values, shape (..., seq_k, d_v).

        Returns:
            Float array of shape (..., seq_q, d_v): for every query, the
            softmax(Q.K^T / sqrt(d_k))-weighted sum of the value rows.
            Leading (batch) dimensions are broadcast; plain 2-D inputs behave
            as before.
        """
        # similarity of every query with every key, scaled by sqrt(d_k).
        # A new array is produced (no in-place divide) so integer inputs work.
        atten_scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(np.shape(K)[-1])
        # softmax over each row; subtracting the row maximum first keeps exp()
        # from overflowing and cancels out in the ratio
        atten_scores = atten_scores - atten_scores.max(axis=-1, keepdims=True)
        atten_weights = np.exp(atten_scores)
        atten_weights /= atten_weights.sum(axis=-1, keepdims=True)
        # context vectors: weights applied to the value rows
        context_vec = np.matmul(atten_weights, V)
        return context_vec

In [35]:
class MultiHeadAttention:
    """Multi-head self-attention built from independent ``Attention`` heads.

    The input is projected to queries, keys and values with three random
    (embed_dim, embed_dim) matrices; each projection is split along the
    feature axis into ``heads`` equal slices, every slice is attended to
    separately, and the per-head results are concatenated back together.
    There is no output projection.
    """

    def __init__(self,embed_dim,heads):
        """Create the projection weights and one ``Attention`` per head.

        Args:
            embed_dim: length of each input feature vector.
            heads: number of attention heads; must be positive and divide
                ``embed_dim`` evenly.

        Raises:
            ValueError: if ``heads`` is not positive or does not divide
                ``embed_dim`` (otherwise the failure would only surface later
                as a reshape error inside ``forward``).
        """
        if heads < 1 or embed_dim % heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive number of heads ({heads})"
            )
        self.q_w = np.random.randn(embed_dim,embed_dim)
        self.k_w = np.random.randn(embed_dim,embed_dim)
        self.v_w = np.random.randn(embed_dim,embed_dim)
        self.heads = [Attention() for _ in range(heads)]
        self.n_heads = heads
        self.dk = embed_dim//heads  # feature width handled by each head

    def _split_heads(self,proj):
        """Reshape a (seq_len, embed_dim) projection to (heads, seq_len, dk)."""
        return proj.reshape(proj.shape[0],self.n_heads,self.dk).transpose(1,0,2)

    def forward(self,x):
        """Apply multi-head self-attention to ``x``.

        Args:
            x: array of shape (seq_len, embed_dim).

        Returns:
            Array of shape (seq_len, embed_dim): the per-head context vectors
            concatenated along the feature axis.
        """
        # project input to query, key and value
        Q = np.dot(x,self.q_w)
        K = np.dot(x,self.k_w)
        V = np.dot(x,self.v_w)

        # split to heads; heads,seq_len,head_embed
        q_heads = self._split_heads(Q)
        k_heads = self._split_heads(K)
        v_heads = self._split_heads(V)

        # run each head on its own slice, then stitch the features back together
        outputs = [
            head.forward(q,k,v)
            for head,q,k,v in zip(self.heads,q_heads,k_heads,v_heads)
        ]
        return np.concatenate(outputs,axis=-1)



In [9]:
# Worked example: splitting x into 2 heads
heads = 2                       # number of heads
embed_dim = 4                   # full feature width
head_dim = embed_dim // heads   # features handled by each head
x = np.random.randn(3, embed_dim)  # 3 positions, embed_dim features each
print(x)

[[-0.02859584 -1.70349588 -0.06870893 -0.86886527]
 [-0.88822215 -0.50872112 -0.45928556 -0.36587025]
 [ 0.82561654  0.87119922  0.92009989 -0.03337084]]


In [14]:
# split the feature axis into (heads, head_dim): result is seq,head,head_dim
np.reshape(x, (3, heads, head_dim))

array([[[-0.02859584, -1.70349588],
        [-0.06870893, -0.86886527]],

       [[-0.88822215, -0.50872112],
        [-0.45928556, -0.36587025]],

       [[ 0.82561654,  0.87119922],
        [ 0.92009989, -0.03337084]]])

In [15]:
# swap the first two axes so the head index comes first: head,seq,head_dim
np.transpose(np.reshape(x, (3, heads, head_dim)), axes=(1, 0, 2))

array([[[-0.02859584, -1.70349588],
        [-0.88822215, -0.50872112],
        [ 0.82561654,  0.87119922]],

       [[-0.06870893, -0.86886527],
        [-0.45928556, -0.36587025],
        [ 0.92009989, -0.03337084]]])

In [17]:
# keep the head-major view (head, seq, head_dim) for the cells below
x_heads = np.transpose(np.reshape(x, (3, heads, head_dim)), axes=(1, 0, 2))

In [18]:
# Collect each head's (seq, head_dim) slice into a plain Python list.
# NOTE: this rebinds `heads`, which held the integer head count until now, so
# the earlier reshape cells fail if they are re-run after this cell.
heads = []
for head in x_heads:
    heads.append(head)

In [19]:
# display the list of per-head arrays
heads

[array([[-0.02859584, -1.70349588],
        [-0.88822215, -0.50872112],
        [ 0.82561654,  0.87119922]]),
 array([[-0.06870893, -0.86886527],
        [-0.45928556, -0.36587025],
        [ 0.92009989, -0.03337084]])]

In [21]:
# joining the per-head slices along the feature axis restores the original x layout
np.concatenate(heads,axis=-1)

array([[-0.02859584, -1.70349588, -0.06870893, -0.86886527],
       [-0.88822215, -0.50872112, -0.45928556, -0.36587025],
       [ 0.82561654,  0.87119922,  0.92009989, -0.03337084]])

In [36]:
# MHA: multi-head attention with embed_dim=4 split across 2 heads
mha= MultiHeadAttention(4,2)

In [37]:
# run the multi-head attention on x; the result keeps x's (3, 4) shape
mha.forward(x)

array([[ 3.40463364,  2.40482304,  1.26445166, -0.56788762],
       [-0.2439183 , -0.26813713,  1.28104496, -0.49546929],
       [-2.27833796, -1.09096959, -1.3063785 , -0.48213754]])

In [2]:
import numpy as np
from nn import LinearLayer,Linear
from tensor import Tensor
import time

In [None]:
# Skeleton of masked multi-head attention for an autoregressive model
class MultiHeadAttention:
    """Dimension bookkeeping for a masked multi-head attention layer.

    Only the sizes are stored; the four projection slots start out as None.
    NOTE(review): `context_len` is accepted but not used in the visible code,
    and running this cell replaces the earlier two-argument
    `MultiHeadAttention` class of the same name.
    """

    def __init__(self,embed_dim,num_heads,context_len):
        # every head must receive an equal slice of the feature vector
        assert embed_dim%num_heads==0,"feature vector must be divisible by heads"
        self.embed_dim, self.num_heads = embed_dim, num_heads
        self.head_dim = embed_dim//num_heads
        # query/key/value and output projections, not created yet
        self.W_query = self.W_key = self.W_value = self.out_proj = None

        

In [2]:
# 1-D vector of 786 standard-normal samples; rebinds `x` from the earlier cells.
# NOTE(review): 786 may be a typo for 768 — confirm the intended size.
x = np.random.randn(786)