In [172]:
import numpy as np
import pandas as pd
import random

# Sample set 1: ten observations (rows) with three features each (columns).
s1 = np.array(
    (
        (1.58, 2.32, -5.8),
        (0.67, 1.58, -4.78),
        (1.04, 1.01, -3.63),
        (-1.49, 2.18, -3.39),
        (-0.41, 1.21, -4.73),
        (1.39, 3.16, 2.87),
        (1.20, 1.40, -1.89),
        (-0.92, 1.44, -3.22),
        (0.45, 1.33, -4.38),
        (-0.76, 0.84, -1.96),
    ),
    dtype=float,
)

# Sample set 2: same layout as s1.
s2 = np.array(
    (
        (0.21, 0.03, -2.21),
        (0.37, 0.28, -1.8),
        (0.18, 1.22, 0.16),
        (-0.24, 0.93, -1.01),
        (-1.18, 0.39, -0.39),
        (0.74, 0.96, -1.16),
        (-0.38, 1.94, -0.48),
        (0.02, 0.72, -0.17),
        (0.44, 1.31, -0.14),
        (0.46, 1.49, 0.68),
    ),
    dtype=float,
)

# Sample set 3: same layout as s1.
s3 = np.array(
    (
        (-1.54, 1.17, 0.64),
        (5.41, 3.45, -1.33),
        (1.55, 0.99, 2.69),
        (1.86, 3.19, 1.51),
        (1.68, 1.79, -0.87),
        (3.51, -0.22, -1.39),
        (1.40, -0.44, -0.92),
        (0.44, 0.83, 1.97),
        (0.25, 0.68, -0.99),
        (0.66, -0.45, 0.08),
    ),
    dtype=float,
)


def cacl(A: np.ndarray, B: np.ndarray, v: np.ndarray, t: np.ndarray) -> tuple:
    """Run one forward and backward pass of the two-layer network for one sample.

    The network has no bias terms: the hidden layer computes ``tanh(A @ v)``
    and the output layer computes ``sigmoid(B @ hidden)``.

    Args:
        A: Input-to-hidden weight matrix, shape ``(n_hidden, n_in)``.
        B: Hidden-to-output weight matrix, shape ``(n_out, n_hidden)``.
        v: Input vector, shape ``(n_in,)``.
        t: Target vector, shape ``(n_out,)``.

        All four may be NumPy arrays or array-likes (e.g. nested lists).

    Returns:
        A tuple ``(dA, dB, loss)``:

        * ``dA`` -- outer product of the hidden-layer error term and ``v``;
          same shape as ``A``.
        * ``dB`` -- outer product of the output-layer error term and the
          hidden activations; same shape as ``B``.
        * ``loss`` -- half the summed squared error of the current output.

        ``dA`` and ``dB`` are the *negative* gradients of ``loss`` (the error
        term is built from ``t - output``), so they are meant to be ADDED to
        the weights, scaled by a step size, to reduce the loss.
    """
    # Accept array-likes so callers are not forced to wrap lists themselves.
    A = np.asarray(A)
    B = np.asarray(B)
    v = np.asarray(v)
    t = np.asarray(t)

    # Forward pass.
    hidden = np.tanh(A @ v)
    output = 1 / (1 + np.exp(-(B @ hidden)))  # logistic sigmoid

    # Activation derivatives expressed through the activations themselves.
    output_d = output * (1 - output)  # sigmoid'(z) = s * (1 - s)
    hidden_d = 1 - hidden**2  # tanh'(z) = 1 - tanh(z)^2

    # Error terms, propagated from the output layer back to the hidden layer.
    sigma_2 = (t - output) * output_d
    sigma_1 = hidden_d * (sigma_2 @ B)

    # Weight update directions (negative gradients of the loss).
    dA = np.outer(sigma_1, v)
    dB = np.outer(sigma_2, hidden)

    # Half squared error of this sample.
    loss = np.sum((output - t) ** 2) / 2

    return dA, dB, loss


def train(A, B, s, t, delta, theta, mod=1, max_time=-1):
    """Train the two-layer network by repeated passes (epochs) over the samples.

    Args:
        A: Input-to-hidden weight matrix. Updated with ``+=``, so a NumPy
            array passed in is modified in place; pass a copy to keep the
            original.
        B: Hidden-to-output weight matrix. Updated in place like ``A``.
        s: Sequence of input vectors, one per sample.
        t: Sequence of target vectors; ``t[i]`` belongs to ``s[i]``.
        delta: Step size applied to the update directions from ``cacl``.
        theta: Training stops once the mean per-sample loss of an epoch
            drops below this value.
        mod: ``1`` selects batch updates (the update directions of all
            samples are summed and applied once per epoch); any other value
            selects per-sample updates (weights change after every sample).
        max_time: Maximum number of epochs; ``-1`` disables the cap.

    Returns:
        ``(A, B, (x, y))`` where ``x`` holds the 1-based epoch numbers and
        ``y`` the mean per-sample loss recorded for each epoch.
    """
    x, y = [], []
    batch = mod == 1
    while True:
        losses = []
        sum_dA, sum_dB = 0, 0  # only used in batch mode
        for i, sample in enumerate(s):
            dA, dB, loss = cacl(A, B, sample, t[i])
            losses.append(loss)
            if batch:
                sum_dA += dA
                sum_dB += dB
            else:
                A += dA * delta
                B += dB * delta
        if batch:
            A += sum_dA * delta
            B += sum_dB * delta

        # An empty sample set has no defined mean loss; record NaN directly
        # instead of letting np.mean warn about an empty slice.
        mean_loss = np.mean(losses) if losses else np.nan
        x.append(len(x) + 1)
        y.append(mean_loss)

        converged = mean_loss < theta
        out_of_budget = max_time != -1 and len(x) >= max_time
        # A NaN/inf loss never compares below theta, so without this check the
        # loop would spin forever when no epoch cap is set.
        diverged = not np.isfinite(mean_loss)
        if converged or out_of_budget or diverged:
            break
    return A, B, (x, y)


# One-hot target vectors, one per sample set.
X, Y, Z = np.array([1, 0, 0]), np.array([0, 1, 0]), np.array([0, 0, 1])

# All samples stacked row-wise, and the matching target rows:
# every row of s1 is paired with X, every row of s2 with Y, every row of s3 with Z.
s = np.concatenate([s1, s2, s3], axis=0)
t = np.concatenate(
    [
        np.tile(label, (len(group), 1))
        for label, group in ((X, s1), (Y, s2), (Z, s3))
    ],
    axis=0,
)

# n1: number of input features, n3: number of outputs (length of a target vector).
# n2 is presumably the hidden-layer size; it is not referenced in the visible cells.
n1 = s1.shape[1]
n2 = 10
n3 = len(X)

# Step size and loss threshold; the experiment cells below pass their own values.
delta = 0.2
theta = 1e-6

import matplotlib.pyplot as plt

# Global Matplotlib text settings for Chinese labels:
# use the SimHei font and keep the minus sign rendering correctly.
plt.rcParams.update(
    {
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    }
)


## (a) 隐含层不同结点数目对训练精度的影响

In [None]:
import matplotlib.cm as cm

# Experiment (a): batch-train networks with different hidden-layer sizes from
# random initial weights and compare their per-epoch mean loss curves.
fig, axs = plt.subplots(1, 1, figsize=(20, 10))  # a single Axes
be, en = 6, 20  # hidden sizes swept: be, be + 3, ... (strictly below en)
# cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap exists in old and new releases.
colors = plt.get_cmap('tab10').colors
# enumerate() yields the colour index directly, so it no longer depends on
# the sweep starting at 6 with a step of 3.
for color_idx, mid_num in enumerate(range(be, en, 3)):
    A = np.random.randn(mid_num, n1)
    B = np.random.randn(n3, mid_num)

    A, B, (xi, yi) = train(A, B, s, t, 0.01, 0.1, mod=1, max_time=600)
    axs.plot(xi, yi, label=f"{mid_num}", color=colors[color_idx], linewidth=1)
axs.legend()
plt.show()

## (b) 观察不同的梯度更新步长对训练的影响，并给出一些描述或解释

In [None]:
# Experiment (b): effect of the step size on training, for batch updates
# (top panel) and per-sample updates (bottom panel).
fig, axs = plt.subplots(2, 1, figsize=(20, 20))  # two panels stacked vertically
# cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap exists in old and new releases.
colors = plt.get_cmap('Set2').colors
# Every run starts from a copy of the same random weights, so the curves
# differ only by step size (train() updates the arrays it is given in place).
A_ = np.random.randn(10, n1)
B_ = np.random.randn(n3, 10)

# Batch updates, step sizes 0.01, 0.06, ..., 0.26.
for idx, lr in enumerate(np.arange(0.01, 0.3, 0.05)):
    A, B, (xi, yi) = train(A_.copy(), B_.copy(), s, t, lr, 0.01, mod=1, max_time=600)
    axs[0].plot(xi, yi, label=f"{lr:.2f}", color=colors[idx], linewidth=1)
axs[0].legend()
axs[0].set_title("批处理")

# Per-sample updates, step sizes 0.21, 0.26, ..., 0.46.
for idx, lr in enumerate(np.arange(0.21, 0.5, 0.05)):
    A, B, (xi, yi) = train(A_.copy(), B_.copy(), s, t, lr, 0.00001, mod=0, max_time=600)
    axs[1].plot(xi, yi, label=f"{lr:.2f}", color=colors[idx], linewidth=1)
axs[1].legend()
axs[1].set_title("单样本")

plt.show()

## (c) 在网络结构固定的情况下，绘制出目标函数随着迭代步数增加的变化曲线。

In [None]:
# Experiment (c): with a fixed network structure, plot the loss against the
# epoch number (top panel) and the per-epoch loss decrease (bottom panel)
# for batch and per-sample updates.
fig, axs = plt.subplots(2, 1, figsize=(20, 15))  # two panels stacked vertically
# cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap exists in old and new releases.
colors = plt.get_cmap('Set2').colors

# Both runs start from copies of the same random weights
# (train() updates the arrays it is given in place).
A_ = np.random.randn(10, n1)
B_ = np.random.randn(n3, 10)
lr = 0.2  # step size shared by both runs

# (mod value passed to train, legend label, line colour)
for mode, name, color in ((1, "批量更新", colors[4]), (0, "单样本更新", colors[5])):
    A, B, (xi, yi) = train(A_.copy(), B_.copy(), s, t, lr, 0.001, mod=mode, max_time=600)
    axs[0].plot(xi, yi, label=name, color=color, linewidth=1)
    # -diff(yi)[k] is the loss drop from epoch k+1 to k+2, hence one point fewer.
    axs[1].plot(xi[:-1], -np.diff(yi), label=f"{name}损失差", color=color, linewidth=1)
axs[0].legend()
axs[1].legend()

plt.show()