In [2]:
import gzip
import numpy as np
import os
import struct


# 1 获取数据

一共4个数据包, 分别是 训练数据 及其标签, 验证数据 及其标签

In [3]:
def load_train_images():
    with gzip.open('data/train-images-idx3-ubyte.gz', 'rb') as f:
        magic, n, rows, cols = struct.unpack('>IIII', f.read(16))
        assert magic == 2051
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(n, rows, cols)
    
train_images_defalt = load_train_images()

In [4]:
def load_train_labels():
    with gzip.open('data/train-labels-idx1-ubyte.gz', 'rb') as f:
        magic, n = struct.unpack('>II', f.read(8))
        assert magic == 2049
        return np.frombuffer(f.read(), dtype=np.uint8)

train_labels_defalt = load_train_labels()

In [5]:
def load_verify_images():
    with gzip.open('data/t10k-images-idx3-ubyte.gz', 'rb') as f:
        magic, n, rows, cols = struct.unpack('>IIII', f.read(16))
        assert magic == 2051
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(n, rows, cols)
    
verify_images_defalt = load_verify_images()

In [6]:
def load_verify_labels():
    with gzip.open('data/t10k-labels-idx1-ubyte.gz', 'rb') as f:
        magic, n = struct.unpack('>II', f.read(8))
        assert magic == 2049
        return np.frombuffer(f.read(), dtype=np.uint8)

verify_labels_defalt = load_verify_labels()

## 1.1 数据预处理

### 1.1.1 处理Labels: 把Labels转换为one-hot编码
防止网络认为数字有所含义, 影响网络判断

In [7]:
train_labels_one_hot = np.zeros((train_labels_defalt.size, train_labels_defalt.max() + 1)) # 先全部初始化为0
train_labels_one_hot[np.arange(train_labels_defalt.size), train_labels_defalt] = 1 # 根据label的值，将对应的位置置为1

verify_labels_one_hot = np.zeros((verify_labels_defalt.size, verify_labels_defalt.max() + 1)) 
verify_labels_one_hot[np.arange(verify_labels_defalt.size), verify_labels_defalt] = 1 

i = 0
print(train_labels_defalt[i])
print(train_labels_one_hot[i])


# 要用到的数据
train_labels = train_labels_one_hot
verify_labels = verify_labels_one_hot


5
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


### 1.1.2 处理Images: 把Images展平, 匹配输入 + 把像素灰度归一化
1. 展平: 二维图像 -> 一位数组 -> 输入层感知器
2. 归一化: 把[0, 255]的灰度值 映射到 [0, 1]区间内

In [8]:
train_images_flatten = train_images_defalt.reshape(train_images_defalt.shape[0], -1) # -1: 自动计算列数
verify_images_flatten = verify_images_defalt.reshape(verify_images_defalt.shape[0], -1)

print(train_images_flatten.shape)
print(train_images_flatten[0].shape)
print(train_images_flatten[0])

train_images_normalized = train_images_flatten / 255 # numpy会自动广播 (即对每个元素都除以255)
verify_images_normalized = verify_images_flatten / 255

print(train_images_normalized[0])

# 要用到的数据
train_images = train_images_normalized
verify_images = verify_images_normalized

(60000, 784)
(784,)
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   3  18  18  18 126 136 175  26 166 255
 247 127   0   0   0   0   0   0   0   0   0   0   0   0  30  36  94 154
 170 253 253 253 253 253 225 172 253 242 195  64   0   0   0   0   0   0
   0   0   0   0   0  49 238 253 253 253 253 253 253 253 253 251  93  82
  82  56  39   0   0   0   0   0   0   0   0   0   0   0   0  18 219 253
 253 253 253 253 198 182 247 24

# 2 定义工具函数


In [9]:
# 前向传播所需的函数

def sigmoid(x): # 输入层, 隐藏层的激活函数
    return 1 / (1 + np.exp(-x))

def softmax(x): # 输出层的激活函数
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # 减去最大值, 提高数值稳定性
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def cross_entropy_loss(y, y_hat): # 交叉熵损失函数
    return -np.sum(y * np.log(y_hat)) / y.shape[0]  # 平均损失

# 反向传播所需的函数

def sigmoid_derivative(x): # sigmoid的导数
    return x * (1 - x)

def softmax_and_cross_entropy_derivative(y, y_hat): # softmax和交叉熵的导数
    return y_hat - y

def matrix_derivative(x, delta):
    """
    计算权重的梯度
    x: 输入矩阵 (batch_size, input_dim)
    delta: 误差矩阵 (batch_size, output_dim)
    return: 权重的梯度矩阵 (input_dim, output_dim)
    """
    return np.dot(x.T, delta)



# 3 定义网络

In [10]:
input_size = 28 * 28
hidden_size = 64
output_size = 10

learning_rate = 0.01

# 初始化
np.random.seed(0)
weights_input_hidden = np.random.randn(input_size, hidden_size) 
weights_hidden_output = np.random.randn(hidden_size, output_size) 
bias_input_hidden = np.zeros(hidden_size) 
bias_hidden_output = np.zeros(output_size) 


# 前向传播
def forward(x):
    global weights_input_hidden, weights_hidden_output, bias_input_hidden, bias_hidden_output # global: 用于在函数内部修改全局变量
    
    # 输入层到隐藏层
    hidden_layer_input = np.dot(x, weights_input_hidden) + bias_input_hidden # 线性方程
    hidden_layer_output = sigmoid(hidden_layer_input) # 激活函数
    
    # 隐藏层到输出层
    output_layer_input = np.dot(hidden_layer_output, weights_hidden_output) + bias_hidden_output # 线性方程
    output_layer_output = softmax(output_layer_input) # 激活函数
    
    # return output_output
    return hidden_layer_input, hidden_layer_output, output_layer_input, output_layer_output # 返回每一层的输入和输出, 以便反向传播

# 测试前向传播
_, _, _, output = forward(train_images[0:1]) 
print(output)     


[[2.68663086e-08 4.59179373e-02 6.77974378e-04 2.75506586e-02
  7.74795216e-01 1.94717364e-02 7.94041780e-05 5.08554970e-04
  2.98237201e-02 1.01174772e-01]]


# 4 Train
## 4.1 定义 反向传播 的细节

In [11]:
def train(x, y):
    global weights_input_hidden, weights_hidden_output, bias_input_hidden, bias_hidden_output, learning_rate
    
    # 前向传播
    hidden_layer_input, hidden_layer_output, output_layer_input, output_layer_output = forward(x)
    
    # 反向传播
    loss = cross_entropy_loss(y, output_layer_output)
    
    # hidden to output layer
    ## 权重
    delta_hidden_output = softmax_and_cross_entropy_derivative(y, output_layer_output) # 误差: 交叉熵和softmax的导数
    weights_hidden_output_gradient = matrix_derivative(hidden_layer_output, delta_hidden_output) # 矩阵的导数 * 误差
    weights_hidden_output -= learning_rate * weights_hidden_output_gradient # 更新
    ## 偏置 (省略不讲)
    bias_hidden_output_gradient = softmax_and_cross_entropy_derivative(y, output_layer_output) 
    bias_hidden_output -= learning_rate * bias_hidden_output_gradient.squeeze() # 更新
    
    # input to hidden layer (需要从最末尾开始计算)
    ## 权重
    delta_input_hidden = np.dot(delta_hidden_output, weights_hidden_output.T) * sigmoid_derivative(hidden_layer_output) # 误差: 上一层的误差 * 权重的转置 * sigmoid的导数
    weights_input_hidden_gradient = matrix_derivative(x, delta_input_hidden) # 矩阵的导数 * 误差(这里的误差是上一层的误差)
    weights_input_hidden -= learning_rate * weights_input_hidden_gradient # 更新
    ## 偏置 (省略不讲)
    bias_input_hidden_gradient = delta_input_hidden.sum(axis=0) # 求和
    bias_input_hidden -= learning_rate * bias_input_hidden_gradient # 更新
    
    return loss # 监视训练过程

# 测试训练
index = 1
for _ in range(10):
    loss = train(train_images[index:index+1], train_labels[index:index+1]) # 因为使用了one-hot编码, 所以这里的x和y都是1x10的矩阵(二维), 而不是一个数
    print(loss)



9.16264812588529
7.223333161807007
5.830257800794516
4.705050411253215
3.731189480421831
2.895925896543204
2.213008228387067
1.6775624030332286
1.2733157905064107
0.9784849978958721


## 4.2 开始训练

### 4.2.1 验证

In [12]:
def verify(pred_label, verify_label):
    pred_label = np.argmax(pred_label, axis=1)
    verify_label = np.argmax(verify_label, axis=1)
    accuracy = np.sum(pred_label == verify_label) / verify_label.size
    return accuracy


In [13]:
# 开始训练
for epoch in range(5):
    # 打乱数据
    shuffle_index = np.random.permutation(60000)
    train_images = train_images[shuffle_index]
    train_labels = train_labels[shuffle_index]
    
    for index in range(60000):
        loss = train(train_images[index:index+1], train_labels[index:index+1])
        loss = loss.mean()
        if index % 10000 == 0:
            print(f'epoch: {epoch}, index: {index}, loss: {loss}')
            print(f'accuracy: {verify(forward(verify_images)[3], verify_labels)}')

epoch: 0, index: 0, loss: 8.21996605027983
accuracy: 0.0892
epoch: 0, index: 10000, loss: 1.2390590076281032
accuracy: 0.7833
epoch: 0, index: 20000, loss: 0.16352187075209745
accuracy: 0.8358
epoch: 0, index: 30000, loss: 0.18625180174804273
accuracy: 0.8518
epoch: 0, index: 40000, loss: 0.017464559994938084
accuracy: 0.8649
epoch: 0, index: 50000, loss: 2.336440394287572
accuracy: 0.8795
epoch: 1, index: 0, loss: 0.03388167125926495
accuracy: 0.8892
epoch: 1, index: 10000, loss: 0.0003113355522074509
accuracy: 0.8915
epoch: 1, index: 20000, loss: 2.4832208308321015
accuracy: 0.8934
epoch: 1, index: 30000, loss: 3.5124014555825642
accuracy: 0.8997
epoch: 1, index: 40000, loss: 0.019325449800918094
accuracy: 0.9031
epoch: 1, index: 50000, loss: 3.402583545306575
accuracy: 0.9053
epoch: 2, index: 0, loss: 0.29551786162378396
accuracy: 0.9081
epoch: 2, index: 10000, loss: 0.00176281519735282
accuracy: 0.9067
epoch: 2, index: 20000, loss: 0.31044146769123204
accuracy: 0.9101
epoch: 2, ind