In [1]:
import numpy as np
import tensorflow as tf

In [70]:
def softmax(x):
    expx = np.exp(x-np.max(x, axis=-1))
    return expx/np.sum(expx, axis=-1, keepdims=True)    

def attention(key, data, sharpness=1.0):
    # returns [mb, capacity]
    return softmax(np.einsum("mcl,ml->mc", data, key))

def attention_jacobian(key, data, sharpness=1.0):
    # returns [m,c,l] = dA[m,c]/dK[m,l]
    a = attention(key, data, sharpness)   # [m,c]
    eye = np.eye(data.shape[1])[None,:,:]
    jac = a[:,None,:]*(eye-a[:,:,None])     # softmax jacobian J[m,i,j]= dS(m,i)/da(m,j)
    print("jac:", jac)
    return sharpness*np.einsum("mcx,mxl->mcl", jac, data)
    
    
def key_r_jacobian(key, data, sharpness=1.0):
    # dG[m,l]/dKr[m,l]   -> [m,l,l]
    jac = attention_jacobian(key, data, sharpness)
    return np.einsum("mba,mbj->maj", data, jac)

def key_w_jacobian(key, data, sharpness=1.0)
    


In [71]:
C = 3
L = 2
B = 1
data0 = np.random.random((B,C,L))
data = data0.copy()
key = np.random.random((B,L))


In [72]:
jac = attention_jacobian(key, data)
print("attention jacobian dA/dK:", jac.shape, jac)
key_r_jac = key_r_jacobian(key, data)
print("key R jacobian:", key_r_jac.shape, key_r_jac)

jac: [[[ 0.23965215 -0.1034726  -0.13617955]
  [-0.1034726   0.19230468 -0.08883208]
  [-0.13617955 -0.08883208  0.22501163]]]
attention jacobian dA/dK: (1, 3, 2) [[[-0.040613    0.11095417]
  [-0.01780815 -0.10709514]
  [ 0.05842115 -0.00385903]]]
jac: [[[ 0.23965215 -0.1034726  -0.13617955]
  [-0.1034726   0.19230468 -0.08883208]
  [-0.13617955 -0.08883208  0.22501163]]]
key R jacobian: (1, 2, 2) [[[ 0.01534393 -0.00463273]
  [-0.00463273  0.07510054]]]


In [76]:
def tf_attention(key, data, sharpness=1.0):
    return tf.nn.softmax(tf.reduce_sum(data*key[:,None,:]*sharpness, axis=-1), axis=1)

with tf.GradientTape() as tape:
    d = tf.convert_to_tensor(data)
    k = tf.convert_to_tensor(key)
    tape.watch(d)
    tape.watch(k)
    a = tf_attention(k, d)
    g = tf.reduce_sum(a[:,:,None]*d, axis=1)
print("jacobian:", tape.jacobian(g, k))

jacobian: tf.Tensor(
[[[[ 0.01534393 -0.00463273]]

  [[-0.00463273  0.07510054]]]], shape=(1, 2, 1, 2), dtype=float64)


In [52]:
x = np.random.random((3,))
with tf.GradientTape() as tape:
    x = tf.convert_to_tensor(x)
    tape.watch(x)
    s = tf.nn.softmax(x)
print("jacobian:", tape.jacobian(s, x))

jacobian: tf.Tensor(
[[ 0.22885467 -0.12581106 -0.10304361]
 [-0.12581106  0.22892029 -0.10310923]
 [-0.10304361 -0.10310923  0.20615284]], shape=(3, 3), dtype=float64)
