## HVP

Implement the Hessian-vector product (HVP) using CNTK.
Reference: https://cntk.ai/pythondocs/cntk.ops.functions.html#cntk.ops.functions.Function.forward

In [1]:
import cntk as C
import numpy as np

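Calculate the HVP with the central-difference approximation $Hv \approx \frac{g(w+rv)-g(w-rv)}{2r}$, where $g$ is the gradient with respect to the weights $w$ and $r$ is a small step size.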

In [2]:
def weight_update(w, v, r):
    """Shift every parameter in ``w`` by ``r * v[p]``.

    Args:
        w: Iterable of parameters (e.g. the weights of a neural network);
            each element must expose a ``value`` attribute.
        v: Dictionary keyed by the elements of ``w`` holding the update
            direction for each parameter (e.g. a gradient value).
        r: Scalar multiplier applied to each direction before it is added.

    Returns:
        None. The ``value`` of each parameter is updated.
    """
    for param in w:
        step = r * v[param]
        param.value += step

In [3]:
def HVP(y, x, v, r=1e-4):
    """Approximate a Hessian-vector product with central differences.

    Evaluates ``(g(w + r*v) - g(w - r*v)) / (2*r)``, where ``g`` is the
    gradient of ``y`` with respect to its parameters ``w`` at input ``x``.

    Args:
        y: Function to be differentiated (e.g. a neural network logit). It
            must expose ``inputs``, ``parameters`` and
            ``grad(feed_dict, wrt=...)``.
        x: Value fed to the network input (e.g. an image); it is converted
            to an array of the input's dtype.
        v: Dictionary mapping every parameter of ``y`` to the array that is
            multiplied by the Hessian (e.g. a gradient at a test point).
        r: Finite-difference step size. A smaller step lowers truncation
            error but amplifies floating-point round-off.

    Returns:
        Dictionary mapping each parameter to its component of the
        Hessian-vector product.

    Raises:
        TypeError: If ``v`` is not a dictionary.
        ValueError: If ``r`` is zero.
    """
    if not isinstance(v, dict):
        raise TypeError("v must be a dict mapping parameters to arrays")
    if r == 0:
        raise ValueError("r must be non-zero")

    # NOTE(review): assumes the last entry of y.inputs is the data input of
    # the network -- confirm for networks other than the toy example.
    feed = y.inputs[-1]
    x = np.asarray(x, dtype=feed.dtype)

    w = y.parameters

    # Snapshot the weights so that both evaluation points are computed from
    # the same base and the network can be restored exactly afterwards.
    base = {p: np.array(p.value) for p in w}

    def grad_at(step):
        # Gradient w.r.t. all parameters with the weights moved to w + step*v.
        for p in w:
            p.value = np.asarray(base[p] + step * v[p], dtype=base[p].dtype)
        g = y.grad({feed: x}, wrt=w)
        if not isinstance(g, dict):
            # grad() may return a bare array when there is a single gradient.
            g = {w[0]: g}
        return g

    try:
        g_plus = grad_at(+r)
        g_minus = grad_at(-r)
    finally:
        # Always put the original weights back, even if grad() raised.
        for p in w:
            p.value = base[p]

    return {p: (g_plus[p] - g_minus[p]) / (2 * r) for p in g_plus}

In [4]:
# Toy example: two stacked 1-unit Dense layers without bias or activation,
# so the network computes y = W2 * W1 * x.

x = C.input_variable(shape=(1,))
# (A variant of both layers with bias=True, init_bias=0 was tried as well.)
h = C.layers.Dense(1, activation=None, init=C.uniform(1), bias=False)(x)
y = C.layers.Dense(1, activation=None, init=C.uniform(1), bias=False)(h)

# Two clones of the network; not used by the HVP call below.
y_plus, y_minus = y.clone('clone'), y.clone('clone')

params = y.parameters
x_feed = [[1.]]
# Direction vector: all ones, one array per parameter.
v_feed = dict((p, np.ones_like(p.value)) for p in params)

# Expected output: 1 for each of the two weights.
HVP(y, x_feed, v_feed)

{Parameter('W', [], [1 x 1]): array([[ 1.00016594]], dtype=float32),
 Parameter('W', [], [1 x 1]): array([[ 1.00016594]], dtype=float32)}

원래 답은 1.0, 1.0이 나와야 함. 이런 차이는 hyperparameter r과 관계가 있을 것. r이 작아질수록 오차도 작아질 것으로 예상되지만, r이 지나치게 작아지면 floating-point precision보다 작은 값이 나와서 문제가 생길 수 있음. 특히 gradient 값이 작은 부분에선 큰 문제가 됨.

In [None]:
# neural network architecture

# neural network load

# calculate HVP for an example

# check tact time (elapsed time per HVP call)

In [None]:
# stochastic estimation