##### 1. 推导softmax clf 的 analytic gradient，并采用 numpy 实现其计算

##### 2. 用numerical gradient 来检验实现的正确性

In [1]:
import numpy as np

In [2]:
x = np.array([0.2,0.3,0.1,0.5]).reshape(1,4) #x n,3072
y = np.array([1]) #n ,1
W = np.random.randn(3,4) #10，3072 10 n

In [3]:
def eval_analytic_grad(W, xi, yi):
    scores = xi.dot(W.T)
    exp_scores = np.exp(scores).reshape(W.shape[0],)
    pro_scores = exp_scores / np.sum(exp_scores)
    grad = np.zeros(W.shape)
    for i in range(W.shape[0]):
        if(i==yi):
            grad[i] = (pro_scores[i] - 1) * xi
        else:
            grad[i] = pro_scores[i] * xi
    return grad

In [4]:
def loss(W, x, y):
    scores = x.dot(W.T)
    exp_scores = np.exp(scores).reshape(W.shape[0],)
    pro_scores = exp_scores / np.sum(exp_scores)
    return -np.log(pro_scores[y])

In [5]:
def eval_numerical_grad(W, xi, yi):
    h = 1e-5
    grad = np.zeros(W.shape)
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            fx = loss(W, xi, yi)
            W[i][j] =  W[i][j] + h
            fx_h = loss(W, xi, yi)
            grad[i][j] = (fx_h - fx) / h
    return grad

In [6]:
numerical_grad = eval_numerical_grad(W,x,y)
analytic_grad = eval_analytic_grad(W,x,y)
print('numerical grad:',numerical_grad)
print('analytic_grad:',analytic_grad)
print('diff',np.sum(np.abs(numerical_grad-analytic_grad)))

numerical grad: [[ 0.03771119  0.05656689  0.01885566  0.09427854]
 [-0.15482872 -0.23224296 -0.07741428 -0.38707116]
 [ 0.11711702  0.17567572  0.05855862  0.29279347]]
analytic_grad: [[ 0.03771116  0.05656673  0.01885558  0.09427789]
 [-0.15482867 -0.232243   -0.07741433 -0.38707166]
 [ 0.11711751  0.17567627  0.05855876  0.29279378]]
diff 3.0538085514855706e-06


In [10]:
print('Analytic gradient:')
for i in range(analytic_grad.shape[0]):
    print(analytic_grad[i])
print('Numerical gradient:')
for i in range(numerical_grad.shape[0]):
    print(numerical_grad[i])

Analytic gradient:
[0.03771116 0.05656673 0.01885558 0.09427789]
[-0.15482867 -0.232243   -0.07741433 -0.38707166]
[0.11711751 0.17567627 0.05855876 0.29279378]
Numerical gradient:
[0.03771119 0.05656689 0.01885566 0.09427854]
[-0.15482872 -0.23224296 -0.07741428 -0.38707116]
[0.11711702 0.17567572 0.05855862 0.29279347]


In [8]:
epoch = 200;
eta = 0.5;
for i in range(epoch):   
    if(i%20==0):
        print('%d loss:%f' %(i,loss(W,x,y)))
    grad = eval_numerical_grad(W,x,y)
    W = W - grad * eta

0 loss:1.487855
20 loss:0.224291
40 loss:0.103308
60 loss:0.065840
80 loss:0.048057
100 loss:0.037754
120 loss:0.031055
140 loss:0.026358
160 loss:0.022886
180 loss:0.020218


In [9]:
W = np.random.randn(3,4) #10，3072 10 n
epoch = 200;
eta = 0.5;
for i in range(epoch):   
    if(i%20==0):
        print('%d loss:%f' %(i,loss(W,x,y)))
    grad = eval_analytic_grad(W,x,y)
    W = W - grad * eta

0 loss:2.248857
20 loss:0.292913
40 loss:0.118931
60 loss:0.072401
80 loss:0.051644
100 loss:0.040016
120 loss:0.032614
140 loss:0.027500
160 loss:0.023760
180 loss:0.020909
