In [3]:
import numpy as np


## Numerical Differentiation

In [4]:
def squared(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)


def numerical_diff(f, x, h=1e-4):
    """Approximate the derivative of ``f`` at ``x`` with a forward difference.

    Args:
        f: Callable of one argument.
        x: Point at which the derivative is estimated.
        h: Step size. Defaults to 1e-4, the value that used to be hard-coded.

    Returns:
        ``(f(x + h) - f(x)) / h``.
    """
    return (f(x + h) - f(x)) / h

# Forward-difference estimate of d/dx x**2 at x = 4 (the exact derivative is 8).
numerical_diff(squared, 4)


8.00009999998963

In [5]:
def centered_numerical_diff(f, x, h=1e-4):
    """Approximate the derivative of ``f`` at ``x`` with a central difference.

    Args:
        f: Callable of one argument.
        x: Point at which the derivative is estimated.
        h: Half-width of the difference stencil. Defaults to 1e-4, the value
            that used to be hard-coded.

    Returns:
        ``(f(x + h) - f(x - h)) / (2 * h)``.
    """
    return (f(x + h) - f(x - h)) / (2 * h)

# Central-difference estimate of d/dx x**2 at x = 4 (the exact derivative is 8).
centered_numerical_diff(squared, 4)


7.999999999999119

## Partial Derivative

In [6]:
def numerical_gradient_1d(f, x, h=1e-7):
    """Estimate the gradient of ``f`` at a 1-D point with central differences.

    Args:
        f: Callable that receives the array ``x = [x_1, x_2, ...]`` and is
            expected to return a scalar.
        x: 1-D point at which the gradient is evaluated. A floating-point
            array is perturbed in place one coordinate at a time and restored
            before returning. Any other input (integer array, list, ...) is
            first converted to a float64 copy.
        h: Perturbation applied to each coordinate. Defaults to 1e-7, the
            value that used to be hard-coded.

    Returns:
        Array with one partial-derivative estimate per coordinate of ``x``.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        # An integer array would truncate both ``x[i] +/- h`` and the stored
        # gradient, so promote it to floating point before perturbing.
        x = x.astype(np.float64)

    grad = np.zeros_like(x)

    for i in range(len(x)):
        old = x[i]

        x[i] = old + h
        f_plus = f(x)

        x[i] = old - h
        f_minus = f(x)

        # Put the coordinate back before moving on to the next one.
        x[i] = old

        grad[i] = (f_plus - f_minus) / (2 * h)

    return grad
        

In [7]:
def f(x):
    """Evaluate ``x[0]**2 + x[0]*x[1]`` for a two-component input ``x``."""
    first = x[0]
    second = x[1]
    return first**2 + first * second

# Gradient of a**2 + a*b at (a, b) = (3, 4); analytically (2a + b, a) = (10, 3).
numerical_gradient_1d(f, np.array([3., 4.]))


array([9.99999999, 3.        ])

## D = XW


In [24]:
# softmax
def softmax(z, axis=None):
    """Numerically stable softmax.

    Args:
        z: Array-like of scores.
        axis: Axis along which to normalize. The default ``None`` normalizes
            over all elements of ``z``, which is what the original
            implementation did.

    Returns:
        Array with the same shape as ``z`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalize; keep returning an empty float array.
        return np.exp(z)

    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)


# Cross-entropy for one-hot targets. For a batch, t is M by K and p is K by M,
# and the loss is the trace of t @ log(p).
def cross_entropy(p, t, delta=1e-7):
    """Cross-entropy between predicted probabilities ``p`` and targets ``t``.

    Args:
        p: Predicted probabilities. Either a 1-D vector of length K, or a 2-D
            array that is matmul-compatible with ``t`` (``t`` is M by K and
            ``p`` is K by M).
        t: Target weights (one-hot in this notebook), 1-D of length K or 2-D
            of shape M by K.
        delta: Small constant added to ``p`` so that ``log(0)`` is never
            evaluated. Defaults to 1e-7, the value that used to be hard-coded.

    Returns:
        The scalar ``-sum(t * log(p + delta))`` over matching sample/class
        pairs.
    """
    t = np.asarray(t)
    log_p = np.log(np.asarray(p) + delta)

    if t.ndim == 2 and log_p.ndim == 2:
        # t @ log_p is M by M; only its diagonal pairs sample i's target with
        # sample i's prediction. Summing every entry would also add the
        # cross-sample terms, so take the trace instead.
        return -np.trace(t @ log_p)

    return -np.sum(t @ log_p)


def numerical_gradient(f, x, h=1e-4):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Works for arrays of any number of dimensions: every element of ``x`` is
    perturbed by ``+h`` and ``-h`` in turn while all others are held fixed.

    Args:
        f: Callable that receives the array ``x`` and is expected to return a
            scalar.
        x: Point at which the gradient is evaluated. A floating-point array is
            perturbed in place one element at a time and restored before
            returning. Any other input (integer array, nested list, ...) is
            first converted to a float64 copy.
        h: Perturbation applied to each element. Defaults to 1e-4, the value
            that used to be hard-coded.

    Returns:
        Array with the same shape as ``x`` holding the partial-derivative
        estimates.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        # An integer array would truncate both ``x[idx] +/- h`` and the stored
        # gradient, so promote it to floating point before perturbing.
        x = x.astype(np.float64)

    grad = np.zeros_like(x)

    for idx in np.ndindex(x.shape):
        original = x[idx]

        x[idx] = original + h
        f_plus = f(x)  # f(x + h)

        x[idx] = original - h
        f_minus = f(x)  # f(x - h)

        grad[idx] = (f_plus - f_minus) / (2 * h)

        x[idx] = original  # restore the original value

    return grad

In [25]:

# A single 2 x 3 weight matrix W: two input features, three output scores.
# Fixed values keep the outputs below reproducible; a random alternative
# would be W = np.random.randn(2, 3).
W = np.array(
    [
        [0.47355232, 0.9977393, 0.84668094],
        [0.85557411, 0.03563661, 0.69422093],
    ]
)

# One input sample with two features.
x = np.array([0.6, 0.9])

# Linear scores, one per column of W.
y_hat = np.matmul(x, W)

# Turn the scores into probabilities.
p = softmax(y_hat)

# One-hot target placed on the class that currently has the highest probability.
t = np.zeros(len(p))
t[p.argmax()] = 1

# Show the weights, the target and the resulting loss.
W, t, cross_entropy(p, t)


(array([[0.47355232, 0.9977393 , 0.84668094],
        [0.85557411, 0.03563661, 0.69422093]]),
 array([0., 0., 1.]),
 0.9280682857864075)

In [27]:
def f(W):
    """Loss viewed as a function of the weight matrix ``W`` only.

    Uses the notebook-level input ``x`` and target ``t``: applies ``softmax``
    to ``x @ W`` and returns the ``cross_entropy`` of that against ``t``.
    """
    probabilities = softmax(np.matmul(x, W))
    return cross_entropy(probabilities, t)

# Numerical gradient of the loss f with respect to every entry of W.
numerical_gradient(f, W)


array([[ 0.21924757,  0.14356243, -0.36281   ],
       [ 0.32887136,  0.21534364, -0.544215  ]])