# Task2: 基于PyTorch框架的手写数字识别
## 引入相关库

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import torch.nn as nn
from tqdm import tqdm
import os

## 设置超参数

In [2]:
learning_rate = 0.01
epoches = 100
batch_size = 128

## Re-implement MLP
利用PyTorch的内置神经网络模块（torch.nn.Module的子类），在MLP类中实现两个函数：
+ 在__init__函数中，定义一个网络结构为[784-245-128-10]的MLP模型结构
+ 在forward函数中，实现该MLP模型的前向传播过程

下面是一些供你参考/可能用到的API函数：

- torch.nn.Linear(*in_features*, *out_features*, *bias=True*, *device=None*, *dtype=None*) [
  Link](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)
  - in_features: 输入网络层的特征维度
  - out_features: 输出网络层的特征维度
- torch.nn.Module.forward(**input*) [Link](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.forward)
  - 执行模型的前向过程，继承nn.Module类的类实例可以直接通过变量名加括号实现forward函数的调用，不需要写明调用forward函数
  - 如定义了MLP(nn.Module)，则对于mlp = MLP()，可以通过mlp(**input*)调用
- torch.Tensor.reshape(*shape*) [Link](https://pytorch.org/docs/stable/generated/torch.Tensor.reshape.html)
  - shape: 当前tensor希望修改为的形状，如(2, 2)或(-1, 3)
    - -1指该维度大小根据原数据维度大小和其它给定维度大小计算得到，至多可以给一个-1
- torch.nn.Sigmoid() [Link](https://pytorch.org/docs/stable/generated/torch.nn.Sigmoid.html)

In [3]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        # TODO: 定义上文要求的MLP模型结构
        self.fc1 = nn.Linear(784, 512) # 输入层→隐藏层1（784→512维）
        self.bn1 = nn.BatchNorm1d(512)  # 对隐藏层1输出做归一化处理
        self.dropout1 = nn.Dropout(0.3) # 随机丢弃30%的神经元
        
        self.fc2 = nn.Linear(512, 256) # 隐藏层1→隐藏层2（512→256维）
        self.bn2 = nn.BatchNorm1d(256) # 对隐藏层2输出做归一化处理
        self.dropout2 = nn.Dropout(0.3)
        
        self.fc3 = nn.Linear(256, 10) # 隐藏层2→输出层（256→10维）
        # 带负斜率的ReLU变体
        # 允许负值输入产生微小梯度，避免ReLU在负区梯度为0的问题
        self.relu = nn.LeakyReLU(0.01)
        self._init_weights()

    def _init_weights(self):
        # 隐藏层使用 He(Kaiming) 初始化，适配 LeakyReLU 的负斜率(非线性)参数，确保前向传播时激活值的方差稳定
        # 模式选择：'fan_in'表示权重方差的计算方式为输入节点个数，保持各层输入方差稳定
        nn.init.kaiming_normal_(self.fc1.weight, mode='fan_in', nonlinearity='leaky_relu')
        nn.init.kaiming_normal_(self.fc2.weight, mode='fan_in', nonlinearity='leaky_relu')
        # 输出层使用 Xavier 初始化，平衡梯度传播，适合线性输出层
        nn.init.xavier_normal_(self.fc3.weight)

    def forward(self, x):
        # TODO: 定义MLP模型的前向过程
        x = x.view(-1, 784)
        x = self.relu(self.bn1(self.fc1(x))) # 全连接→BN→激活
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        o = self.dropout2(x) # 输出层不激活

        return self.fc3(o)

## 示例化MLP

In [4]:
mlp = MLP()

## 定义损失函数、优化算法

- torch.nn.CrossEntropyLoss(*weight=None*, *size_average=None*, *ignore_index=- 100*, *reduce=None*, *reduction='mean'*, *label_smoothing=0.0*) [Link](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)
  - loss.backward(): loss通过特定的计算方式获得，如调用CrossEntropyLoss；对loss执行backward()会为计算图中涉及的tensor反向计算梯度，累积到tensor.grad上
- torch.optim.SGD(*params*, *lr=<required parameter>*, *momentum=0*, *dampening=0*, *weight_decay=0*, *nesterov=False*, ***, *maximize=False*, *foreach=None*, *differentiable=False*)  [Link](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html)
  - params: 需优化的参数Tensor
  - lr: 参数优化的学习率
  - zero_grad(): 清空相关参数上累积的梯度
  - step(): 根据tensor上累积的梯度，进行一次参数更新

In [5]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(mlp.parameters(), lr=learning_rate)

## 加载数据集

- 自动下载MNIST数据集到./MNIST路径

In [6]:
transform = transforms.ToTensor()

trainset = torchvision.datasets.MNIST(root="./MNIST", train=True, download=True, transform=transform)
testset = torchvision.datasets.MNIST(root="./MNIST", train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, drop_last=True)

## 训练模型

此处关于loss和optimizer的用法请参考上一段落的API介绍。

In [7]:
mlp.train()

for e in range(epoches):
    t = tqdm(train_loader)
    for img, label in t:
        # Forward img and compute loss
        pred = mlp(img)
        loss = criterion(pred, label)
        
        # TODO: 基于优化器的使用方法，完成反向梯度传播、参数更新
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        t.set_postfix(epoch=e, train_loss=loss.item())

100%|██████████| 468/468 [00:04<00:00, 115.14it/s, epoch=0, train_loss=0.559]
100%|██████████| 468/468 [00:04<00:00, 110.59it/s, epoch=1, train_loss=0.354]
100%|██████████| 468/468 [00:04<00:00, 114.90it/s, epoch=2, train_loss=0.379]
100%|██████████| 468/468 [00:04<00:00, 113.41it/s, epoch=3, train_loss=0.226]
100%|██████████| 468/468 [00:04<00:00, 111.23it/s, epoch=4, train_loss=0.229]
100%|██████████| 468/468 [00:04<00:00, 110.39it/s, epoch=5, train_loss=0.405]
100%|██████████| 468/468 [00:04<00:00, 107.99it/s, epoch=6, train_loss=0.261]
100%|██████████| 468/468 [00:04<00:00, 112.48it/s, epoch=7, train_loss=0.208]
100%|██████████| 468/468 [00:04<00:00, 111.28it/s, epoch=8, train_loss=0.196] 
100%|██████████| 468/468 [00:04<00:00, 111.12it/s, epoch=9, train_loss=0.268] 
100%|██████████| 468/468 [00:04<00:00, 111.89it/s, epoch=10, train_loss=0.128] 
100%|██████████| 468/468 [00:04<00:00, 112.87it/s, epoch=11, train_loss=0.164]
100%|██████████| 468/468 [00:04<00:00, 113.56it/s, epoch=12

## 测试模型

- torch.argmax(*input*, *dim*, *keepdim=False*) [Link](https://pytorch.org/docs/stable/generated/torch.argmax.html)
  - input: 计算基于的tensor
  - dim: 希望按哪个维度求max下标

In [8]:
mlp.eval()

correct_cnt, sample_cnt = 0, 0

t = tqdm(test_loader)
for img, label in t:
    # Predict label for img
    img = img.reshape(img.shape[0], -1)
    pred = mlp(img)
    pred_label = pred.argmax(dim=1)
    correct_cnt += (pred_label == label).sum().item()
    sample_cnt += pred_label.shape[0]

    t.set_postfix(test_acc=correct_cnt/sample_cnt)

100%|██████████| 78/78 [00:00<00:00, 164.65it/s, test_acc=0.981]


## 保存模型

- 将完成训练的模型保存到服务器的model/目录下

- ModelScope服务器端无法长久保存文件，因此请及时下载、本地保存你完成的代码，以及模型的参数文件（model/mlp.pt）。

In [9]:
if not os.path.exists('model/'):
    os.mkdir('model/')

torch.save(mlp.state_dict(), 'model/mlp.pt')