# 神经网络实现详解

本笔记本包含神经网络的完整实现和可视化实验。

## 目录
1. [从零实现神经网络](#从零实现神经网络)
2. [激活函数详解](#激活函数详解)
3. [反向传播算法](#反向传播算法)
4. [梯度消失问题](#梯度消失问题)
5. [深度学习框架](#深度学习框架)
6. [逐层激活可视化](#逐层激活可视化)
7. [总结](#总结)

## 从零实现神经网络

首先，我们从零开始实现一个简单的全连接神经网络。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_circles
from sklearn.model_selection import train_test_split

class NeuralNetwork:
    """Minimal fully connected network trained with full-batch gradient descent.

    Every layer, including the output layer, applies the same activation
    function. Weights are drawn with Xavier (Glorot) uniform initialisation
    and biases start at zero.

    Attributes:
        layer_sizes: Number of units in each layer, input layer first.
        activation: Name of the activation ('relu', 'sigmoid' or 'tanh').
        weights: One ``(n_in, n_out)`` matrix per pair of adjacent layers.
        biases: One ``(1, n_out)`` row vector per pair of adjacent layers.
    """

    _SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid', 'tanh')

    def __init__(self, layer_sizes, activation='relu'):
        """Create the parameters for a network with the given layer sizes.

        Args:
            layer_sizes: Sequence of layer widths, e.g. ``[2, 4, 1]``.
            activation: One of 'relu', 'sigmoid' or 'tanh'.

        Raises:
            ValueError: If ``activation`` is not a supported name.
        """
        # Fail fast: an unknown name would otherwise make _activate return
        # None and surface much later as an obscure error inside forward().
        if activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self._SUPPORTED_ACTIVATIONS}"
            )

        self.layer_sizes = layer_sizes
        self.activation = activation
        self.weights = []
        self.biases = []

        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Xavier/Glorot uniform bound
            limit = np.sqrt(6 / (n_in + n_out))
            self.weights.append(np.random.uniform(-limit, limit, (n_in, n_out)))
            self.biases.append(np.zeros((1, n_out)))

    @staticmethod
    def _sigmoid(z):
        """Return the logistic sigmoid of ``z`` without overflowing.

        ``exp`` is only ever evaluated on non-positive values, so large
        negative inputs no longer trigger an overflow warning.
        """
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    def _activate(self, z, derivative=False):
        """Apply the configured activation (or its derivative) element-wise.

        Args:
            z: Array of pre-activation values.
            derivative: If True, return the derivative evaluated at ``z``.

        Returns:
            Array with the same shape as ``z``.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """
        if self.activation == 'relu':
            if derivative:
                return (z > 0).astype(float)
            return np.maximum(0, z)
        if self.activation == 'sigmoid':
            s = self._sigmoid(z)
            return s * (1 - s) if derivative else s
        if self.activation == 'tanh':
            t = np.tanh(z)
            return 1 - t ** 2 if derivative else t
        # Only reachable if the attribute was changed after construction.
        raise ValueError(f"Unsupported activation {self.activation!r}")

    def forward(self, X):
        """Run a forward pass and cache the intermediate values.

        The pre-activations are stored in ``self.z`` and the activations
        (starting with the input itself) in ``self.a`` for use by
        ``backward``.

        Args:
            X: Input array of shape ``(n_samples, layer_sizes[0])``.

        Returns:
            The activation of the last layer.
        """
        self.a = [X]
        self.z = []

        for w, b in zip(self.weights, self.biases):
            z = np.dot(self.a[-1], w) + b
            self.z.append(z)
            self.a.append(self._activate(z))

        return self.a[-1]

    def backward(self, X, y, learning_rate=0.01):
        """Run one forward/backward pass and update the parameters in place.

        Args:
            X: Input array of shape ``(n_samples, layer_sizes[0])``.
            y: Target array matching the shape of the network output.
            learning_rate: Step size of the gradient-descent update.
        """
        m = X.shape[0]
        output = self.forward(X)

        # The raw residual is used as the output-layer error term; no
        # output-activation derivative is applied. For a sigmoid output this
        # matches the gradient of binary cross-entropy w.r.t. the
        # pre-activation.
        delta = output - y

        n_layers = len(self.weights)
        dW = [None] * n_layers
        db = [None] * n_layers

        for i in reversed(range(n_layers)):
            dW[i] = np.dot(self.a[i].T, delta) / m
            db[i] = np.sum(delta, axis=0, keepdims=True) / m

            if i > 0:
                # Propagate through the (not yet updated) weights, then
                # through the activation of the previous layer.
                delta = np.dot(delta, self.weights[i].T) * \
                    self._activate(self.z[i - 1], derivative=True)

        # Apply all updates only after every gradient has been computed.
        for i in range(n_layers):
            self.weights[i] -= learning_rate * dW[i]
            self.biases[i] -= learning_rate * db[i]

    def train(self, X, y, epochs=1000, learning_rate=0.01, verbose=True):
        """Train on the full batch for a fixed number of epochs.

        Args:
            X: Training inputs.
            y: Training targets.
            epochs: Number of gradient-descent steps.
            learning_rate: Step size passed to ``backward``.
            verbose: If True, print the loss whenever it is recorded.

        Returns:
            List of mean-squared-error values, recorded every 100th epoch
            (epochs 0, 100, 200, ...) after that epoch's update.
        """
        losses = []
        for epoch in range(epochs):
            self.backward(X, y, learning_rate)

            if epoch % 100 == 0:
                output = self.forward(X)
                loss = np.mean(np.square(output - y))
                losses.append(loss)
                if verbose:
                    print(f"Epoch {epoch}, Loss: {loss:.4f}")

        return losses

    def predict(self, X):
        """Return hard 0.0/1.0 predictions by thresholding the output at 0.5."""
        return (self.forward(X) > 0.5).astype(float)

In [None]:
# Sanity-check the network on the XOR problem
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# Two inputs, four hidden units, one output; sigmoid everywhere
nn = NeuralNetwork(layer_sizes=[2, 4, 1], activation='sigmoid')

# Fit on the four XOR samples
losses = nn.train(X, y, epochs=5000, learning_rate=0.1)

# Show the thresholded prediction next to the ground truth for each sample
print("\n预测结果:")
for sample, target in zip(X, y):
    # predict() expects a 2-D batch, so add a leading batch axis
    pred = nn.predict(sample[np.newaxis, :])[0][0]
    print(f"输入: {sample}, 预测: {pred}, 真实: {target[0]}")

## 激活函数详解

让我们对比不同激活函数的特性。

In [None]:
# Plot ReLU, sigmoid and tanh side by side over [-5, 5]
x = np.linspace(-5, 5, 100)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Evaluate each activation on the shared grid
relu = np.maximum(0, x)
sigmoid = 1 / (1 + np.exp(-x))
tanh = np.tanh(x)

# One panel per activation, all formatted the same way
for ax, title, values in zip(axes,
                             ('ReLU', 'Sigmoid', 'Tanh'),
                             (relu, sigmoid, tanh)):
    ax.plot(x, values, linewidth=2)
    ax.set_title(title)
    ax.set_xlabel('x')
    ax.set_ylabel('f(x)')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 反向传播算法

反向传播是神经网络训练的核心算法。让我们详细理解梯度如何向后流动。

In [None]:
def visualize_backprop():
    """Print a trace of one forward pass and the backward deltas.

    Builds a 2-3-1 sigmoid network, feeds it a single sample and prints the
    hidden activations, the output, the squared-error loss and the error
    term (delta) of the output layer and of each hidden layer.
    """
    net = NeuralNetwork([2, 3, 1], activation='sigmoid')

    # A single training example
    sample_x = np.array([[0.5, 0.5]])
    sample_y = np.array([[1]])

    prediction = net.forward(sample_x)
    mse = np.mean((prediction - sample_y) ** 2)

    print("=== 前向传播 ===")
    print(f"输入: {sample_x[0]}")
    # net.a[0] is the input and net.a[-1] the output; the rest are hidden
    for idx, hidden in enumerate(net.a[1:-1], start=1):
        print(f"隐藏层 {idx} 激活: {hidden[0]}")
    print(f"输出: {prediction[0][0]:.4f}")
    print(f"损失: {mse:.4f}")

    print("\n=== 反向传播 ===")
    # Residual times the sigmoid derivative written in terms of the output
    delta = (prediction - sample_y) * prediction * (1 - prediction)
    print(f"输出层 delta: {delta[0]}")

    # Walk from the last weight matrix down to the first hidden layer;
    # layer 0 is the input and has no delta.
    for layer in range(len(net.weights) - 1, 0, -1):
        delta = np.dot(delta, net.weights[layer].T) * \
            net.a[layer] * (1 - net.a[layer])
        print(f"隐藏层 {layer} delta: {delta[0]}")


visualize_backprop()

## 梯度消失问题

深层网络中，梯度可能会在反向传播过程中逐渐消失。让我们演示这个问题。

In [None]:
def demonstrate_vanishing_gradient():
    """Plot and print how a chained gradient shrinks with network depth.

    For depths 2 through 10 the local derivative of sigmoid (and of ReLU) at
    a fixed input of 1.0 is multiplied by itself once per layer. Both curves
    are drawn on a log-scaled y axis and the sigmoid values are printed.
    """
    depths = range(2, 11)
    x = 1.0  # fixed input at which the local derivatives are evaluated

    # Per-layer derivatives do not depend on depth, so compute them once
    sig = 1 / (1 + np.exp(-x))
    sigmoid_local = sig * (1 - sig)
    relu_local = 1 if x > 0 else 0

    def _chained(local_grad, n_layers):
        """Multiply ``local_grad`` into 1, ``n_layers`` times."""
        product = 1
        for _ in range(n_layers):
            product *= local_grad
        return product

    sigmoid_grads = [_chained(sigmoid_local, depth) for depth in depths]
    relu_grads = [_chained(relu_local, depth) for depth in depths]

    plt.figure(figsize=(10, 6))
    plt.semilogy(depths, sigmoid_grads, 'o-', label='Sigmoid')
    plt.semilogy(depths, relu_grads, 's-', label='ReLU')
    plt.xlabel('网络深度')
    plt.ylabel('梯度值（对数坐标）')
    plt.title('梯度消失问题演示')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    print("Sigmoid 梯度随深度变化:")
    for depth, value in zip(depths, sigmoid_grads):
        print(f"深度 {depth}: {value:.6e}")


demonstrate_vanishing_gradient()

## 深度学习框架

实际应用中，我们通常使用 TensorFlow 或 PyTorch 等深度学习框架；下面以 TensorFlow 为例。

In [None]:
# TensorFlow 示例
import tensorflow as tf
from sklearn.datasets import make_circles

# Two noisy concentric rings; labels are reshaped into a column vector
X, y = make_circles(n_samples=500, noise=0.1, factor=0.5, random_state=42)
y = y.reshape(-1, 1)

# 2 -> 16 -> 8 -> 1 classifier: ReLU hidden layers, sigmoid output
layer_stack = [
    tf.keras.layers.Dense(16, activation='relu', input_shape=(2,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit silently on the whole data set
history = model.fit(X, y, epochs=100, batch_size=32, verbose=0)

# Score on the same data the model was trained on
loss, accuracy = model.evaluate(X, y, verbose=0)
print(f"训练集准确率: {accuracy:.2%}")

## 逐层激活可视化

让我们可视化网络各层的激活模式。

In [None]:
def visualize_layer_activations(model, X, y):
    """Draw one heat map per layer showing its activations on ``X[:10]``.

    Args:
        model: Keras model exposing ``layers`` and ``input``.
        X: Input samples; only the first ten rows are fed to the model.
        y: Unused; kept so existing call sites keep working.
    """
    # Auxiliary model that returns the output of every layer at once
    layer_outputs = [layer.output for layer in model.layers]
    activation_model = tf.keras.Model(inputs=model.input, outputs=layer_outputs)

    activations = activation_model.predict(X[:10])
    # NOTE(review): depending on the Keras version a one-layer model may
    # return a bare array rather than a one-element list; normalise so the
    # loop below always sees a sequence of per-layer arrays.
    if not isinstance(activations, (list, tuple)):
        activations = [activations]

    # squeeze=False always returns a 2-D array of axes; with the default,
    # a single subplot comes back as a bare Axes and cannot be zipped over.
    fig, axes = plt.subplots(1, len(activations), figsize=(15, 4),
                             squeeze=False)

    for i, (activation, ax) in enumerate(zip(activations, axes[0])):
        # Transposed so rows are neurons and columns are samples
        im = ax.imshow(activation.T, aspect='auto', cmap='viridis')
        ax.set_title(f'层 {i+1}')
        ax.set_xlabel('样本')
        ax.set_ylabel('神经元')
        plt.colorbar(im, ax=ax)

    plt.tight_layout()
    plt.show()

# visualize_layer_activations(model, X, y)

## 总结

**神经网络的优缺点：**

### 优点
1. 强大的表示能力
2. 自动特征提取
3. 端到端学习

### 缺点
1. 需要大量数据
2. 训练计算成本高
3. 可解释性差（"黑盒"）
4. 容易过拟合

**实践技巧：**
- 使用 ReLU 激活函数缓解梯度消失
- 使用 Batch Normalization 稳定训练
- 使用 Dropout 防止过拟合
- 合理初始化权重（Xavier/He）
- 使用合适的优化器（Adam、SGD+Momentum）