In [114]:
import numpy as np

def step_sigmoid(x):
    """Logistic sigmoid, used as the hidden-layer activation.

    Args:
        x: scalar or NumPy array of pre-activation values.

    Returns:
        1 / (1 + exp(-x)), computed element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def softmax_function(a):
    """Convert raw scores into a probability distribution.

    The softmax is taken along the last axis, so a 1-D input yields one
    distribution and a 2-D input of shape (batch, classes) yields one
    distribution per row.

    Args:
        a: array-like of scores.

    Returns:
        Array with the same shape as ``a`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    a = np.asarray(a)
    # Subtract the per-row maximum before exponentiating so np.exp cannot
    # overflow; this does not change the result of the softmax.
    shifted = a - np.max(a, axis=-1, keepdims=True)
    expa = np.exp(shifted)
    # keepdims=True keeps the reduced axis so the division broadcasts row-wise.
    return expa / np.sum(expa, axis=-1, keepdims=True)

def cross_entropy_error(pred,real):
    """Cross-entropy loss between predicted probabilities and target weights.

    The loss grows as the probability assigned to the correct class shrinks.
    For a 1-D input the result is the plain sum ``-sum(real * log(pred))``.
    For a multi-dimensional input the first axis is treated as the batch axis
    and the summed loss is divided by its length, so the value does not scale
    with the batch size.

    Args:
        pred: array-like of predicted probabilities.
        real: array-like of target weights (e.g. one-hot rows), broadcastable
            against ``pred``.

    Returns:
        Scalar loss (mean over the batch for multi-dimensional input).
    """
    pred = np.asarray(pred)
    real = np.asarray(real)
    delta = 1e-7  # keeps np.log away from log(0) = -inf
    batch_size = pred.shape[0] if pred.ndim > 1 else 1
    return -np.sum(real * np.log(pred + delta)) / batch_size

def numerical_gradient(func,input_points_array):
    """Estimate the gradient of ``func`` by central differences.

    Each element of ``input_points_array`` is perturbed in place by +delta and
    -delta, ``func`` is evaluated at both points, and the element is restored
    afterwards. The array keeps its original shape throughout, so ``func``
    always receives an array of the shape the caller passed in.

    Because the perturbation is written into the caller's array, ``func`` may
    also ignore its argument and read the array through a closure.

    Args:
        func: callable taking the array and returning a scalar.
        input_points_array: NumPy float array of any shape; temporarily
            modified in place and restored before returning.

    Returns:
        Array of the same shape as ``input_points_array`` holding the partial
        derivative of ``func`` with respect to each element.
    """
    delta = 1e-4
    grad = np.zeros_like(input_points_array)

    # np.ndindex walks every index tuple of the original shape, which avoids
    # flattening (and the shape loss / 0-d collapse that comes with it).
    for index in np.ndindex(input_points_array.shape):
        temp_val = input_points_array[index]

        # f(x + h)
        input_points_array[index] = temp_val + delta
        fv1 = func(input_points_array)

        # f(x - h)
        input_points_array[index] = temp_val - delta
        fv2 = func(input_points_array)

        grad[index] = (fv1 - fv2) / (delta * 2)
        # Restore the original value before moving to the next element.
        input_points_array[index] = temp_val

    return grad

def gradient_decent(f,init_x,lr=0.1,step_num=100):
    """Minimise ``f`` with plain gradient descent using numerical gradients.

    Args:
        f: callable taking an array and returning a scalar.
        init_x: array-like starting point. It is copied, so the caller's
            array is left untouched.
        lr: learning rate (step size multiplier).
        step_num: number of descent steps to take.

    Returns:
        Float array holding the position after ``step_num`` updates.
    """
    # Work on a float copy: the update below is in place, and an integer
    # array could not hold the fractional steps.
    x = np.array(init_x, dtype=float)
    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad  # step against the gradient
    return x



In [127]:
class TwoLayerNetwork:
    """Fully connected network with one sigmoid hidden layer and a softmax output.

    Parameters live in ``self.params`` under the keys ``"W1"``, ``"b1"``,
    ``"W2"`` and ``"b2"``.
    """

    def __init__(self,input_size,hidden_size,output_size,weight_initial_std=0.01):
        """Create the parameter arrays.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden-layer units.
            output_size: number of output units.
            weight_initial_std: factor applied to the standard-normal draws
                used for the initial weights.
        """
        first_weights = weight_initial_std * np.random.randn(input_size, hidden_size)
        second_weights = weight_initial_std * np.random.randn(hidden_size, output_size)
        # Biases start at zero; weights start as small Gaussian noise.
        self.params = {
            "W1": first_weights,
            "b1": np.zeros(hidden_size),
            "W2": second_weights,
            "b2": np.zeros(output_size),
        }

    def predict(self,input_array):
        """Run a forward pass and return the output of ``softmax_function``."""
        params = self.params

        # input -> affine -> sigmoid -> affine -> softmax
        hidden_pre = np.dot(input_array, params["W1"]) + params["b1"]
        hidden_out = step_sigmoid(hidden_pre)
        scores = np.dot(hidden_out, params["W2"]) + params["b2"]
        return softmax_function(scores)

    def loss(self,input_arrays,reals):
        """Return ``cross_entropy_error`` between the predictions and ``reals``."""
        return cross_entropy_error(self.predict(input_arrays), reals)

    def accuracy(self,input_arrays,reals):
        """Fraction of rows whose arg-max prediction matches the arg-max of ``reals``.

        Both the predictions and ``reals`` are reduced with ``argmax`` along
        axis 1, so both must be 2-D.
        """
        predicted_labels = np.argmax(self.predict(input_arrays), axis=1)
        true_labels = np.argmax(reals, axis=1)
        hits = np.sum(predicted_labels == true_labels)
        return hits / float(input_arrays.shape[0])

    def numerical_gradient(self,input_arrays,reals):
        """Numerically estimate the loss gradient for every parameter.

        Returns:
            Dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``, each holding
            the gradient array returned by the module-level
            ``numerical_gradient`` for that parameter.
        """
        def loss_W(_ignored):
            # The argument is unused: the module-level numerical_gradient
            # perturbs the parameter array in place, and self.loss reads
            # self.params directly.
            return self.loss(input_arrays, reals)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ("W1", "b1", "W2", "b2")
        }


In [116]:
from  mnist import load_mnist
# Load the dataset. The flags are presumably "scale pixel values" and
# "return one-hot label rows" -- confirm against mnist.load_mnist.
(x_train, t_train) ,(x_test, t_test) = load_mnist(normalize=True,one_hot_label=True)

# hyper parameters

iter_num = 100                   # number of parameter updates
tran_size = x_train.shape[0]     # number of training samples
batch_size = 1                   # samples drawn per update
lr = 0.1                         # learning rate

train_loss_list = []
train_acc_list = []
test_acc_list = []
# Integer division keeps this an int, so the `i % iter_per_epoch == 0` check
# in the training loop fires exactly once per epoch even when tran_size is
# not a multiple of batch_size.
iter_per_epoch = max(tran_size // batch_size, 1)

# Fix the RNG so weight initialisation and mini-batch sampling are
# reproducible across kernel restarts.
np.random.seed(0)

net = TwoLayerNetwork(input_size=784,hidden_size=50,output_size=10)


In [117]:
# mask = np.random.choice(10,5) # array([2, 0, 2, 5, 2])
# print(np.random.randn(10,4)[mask])

# Training loop: each iteration draws a random mini-batch, estimates the loss
# gradient numerically, and applies one gradient-descent update.
for i in range(iter_num):
    print(i)
    # Sample batch_size indices from [0, tran_size); np.random.choice samples
    # with replacement by default.
    batch = np.random.choice(tran_size,batch_size)
    x_batch = x_train[batch]
    t_batch = t_train[batch]

    grad = net.numerical_gradient(x_batch,t_batch)

    for key in ('W1','b1','W2','b2'):
        # grad holds one gradient array per parameter array
        net.params[key] -= lr * grad[key]# gradient descent step

    # Evaluate accuracy on the full train and test sets whenever i is a
    # multiple of iter_per_epoch.
    if i % iter_per_epoch == 0:
        train_acc = net.accuracy(x_train,t_train)
        test_acc = net.accuracy(x_test,t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
    
    # Record the loss on the current mini-batch, measured after the update.
    loss = net.loss(x_batch,t_batch)
    train_loss_list.append(loss)

    


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [123]:
# Display the first-layer weight matrix stored under params['W1'].
net.params['W1']

array([[-9.22222062e-03, -1.52184059e-03,  6.72948419e-03, ...,
         8.00174958e-03, -2.39313615e-03, -1.33481390e-02],
       [-4.49973982e-03,  1.37269519e-02,  4.06990424e-03, ...,
         1.01186751e-02,  1.08493838e-02, -1.97001029e-03],
       [ 1.17904155e-02, -4.63902606e-03, -3.33463003e-03, ...,
        -1.34637463e-02, -6.20693957e-03, -5.91758782e-03],
       ...,
       [ 4.64403001e-03, -1.17841175e-02,  1.79572840e-03, ...,
        -3.56421296e-03, -8.97899883e-03, -1.25014819e-02],
       [ 9.69811792e-03, -1.51452051e-03,  9.83568274e-03, ...,
        -7.65879571e-03, -1.31815423e-02, -1.30733235e-02],
       [ 1.86015706e-04, -6.23936669e-03, -4.48041228e-03, ...,
         1.89045681e-05,  1.92439341e-03, -8.98577422e-04]])

In [135]:
# Show the network output for the first test sample next to that sample's label row.
net.predict(np.array([x_test[0]])),np.array([t_test[0]])

(array([[0.08401433, 0.0626904 , 0.13044008, 0.19866606, 0.05155596,
         0.06534478, 0.2074063 , 0.05673433, 0.11087824, 0.03226952]]),
 array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]))