In [None]:
# 本章开始正式介绍卷积神经网络（使用了卷积层的神经网络）

# 比较非卷积 NN (以MLP为例)，MLP的几个明显缺陷是：
# 1. 由于要将输入展平（将第i+1行接到第i行之后，最终形成一个行向量），
# 原本在同一列上相邻的像素（i,j）和（i+1,j）在行向量中相距很远，这种空间邻近模式丢失，
# 可能导致inputs不能被正确识别。
# 2. 1000x1000的单通道图片与第一层隐藏层（假设隐藏单元的个数为256）全连接，
# 权重参数达到 1,000,000 x 256 = 256,000,000 个，按float32存储约占1GB内存/显存；
# 若是3通道彩色图片，则参数为768,000,000个，约占3GB。（即存储开销过大）

# 卷积层的改进：
# 1. 不展平输入图像，这样图像像素在宽与高两个方向上的相关性得以保留；
# 2. 同一卷积核通过滑动窗口作用于输入的不同位置，参数重复利用，避免了参数过多的问题。

In [1]:
# LeNet手写字符识别

![LeNet](./5.5_lenet.png)


特点：
1. 包括卷积模块 + fc模块
2. 卷积模块：两组「卷积层(5x5，无padding) + sigmoid + 2x2最大池化（下采样）层」
3. fc模块：flatten + fc(120) + sigmoid + fc(84) + sigmoid + fc(10)

In [6]:
import time
import torch
import os
from torch import nn, optim
import sys
sys.path.append('../d2lzh/')
import d2lzh_pytorch as d2l
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 构建模型
class LeNet(nn.Module):
    """LeNet-style CNN: a convolutional feature extractor plus a fully connected classifier.

    ``conv`` stacks two stages of (5x5 convolution without padding -> sigmoid ->
    2x2 max pooling with stride 2), producing 6 and then 16 channels.
    ``fc`` maps the flattened features through 120 and 84 sigmoid units to
    10 output scores.

    The first linear layer expects 16*4*4 = 256 features, which is what a
    1-channel 28x28 input yields (28 -> 24 -> 12 -> 8 -> 4 per side).
    """

    def __init__(self):
        super().__init__()

        # Feature extractor; layer order determines the state_dict indices.
        conv_layers = [
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # Classifier head operating on the flattened feature map.
        fc_layers = [
            nn.Linear(in_features=16 * 4 * 4, out_features=120),
            nn.Sigmoid(),
            nn.Linear(in_features=120, out_features=84),
            nn.Sigmoid(),
            nn.Linear(in_features=84, out_features=10),
        ]
        self.fc = nn.Sequential(*fc_layers)

    def forward(self, img):
        """Return the class scores for a batch of images (one row per sample)."""
        batch = img.shape[0]
        # Collapse (channels, height, width) into one feature axis per sample.
        flat = self.conv(img).view(batch, -1)
        return self.fc(flat)

In [7]:
# Instantiate the model and print its layer structure.
net = LeNet()
print(net)

LeNet(
  (conv): Sequential(
    (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
    (1): Sigmoid()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (4): Sigmoid()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=120, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=120, out_features=84, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=84, out_features=10, bias=True)
  )
)


In [8]:
### 训练数据

In [9]:
# Batch size handed to the data-loading helper below.
batch_size = 256
# Dataset directory, relative to the notebook.
# NOTE(review): the folder is spelled 'FashionMINST' (not 'FashionMNIST') —
# presumably it matches an existing directory; confirm before renaming.
root=os.path.join('..', 'Datasets', 'FashionMINST')

# d2l helper (imported from d2lzh_pytorch) that returns the training and test iterators.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size, root=root)

In [10]:
### 修改　精度估算函数使其支持cuda tensor

In [29]:
def evaluate_accuracy(data_iter, net, device=None):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches, where ``y`` holds the
            class indices compared against ``net(X).argmax(dim=1)``.
        net: either an ``nn.Module`` or a plain callable model. A plain
            callable whose code object lists ``is_training`` among its
            variable names is called with ``is_training=False``.
        device: device on which an ``nn.Module`` is evaluated. When ``None``,
            the device of the module's first parameter is used; a module
            without parameters is evaluated on the batches as they are.
            Ignored for plain callables.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, nn.Module)

    if is_module and device is None:
        # next() with a default avoids materialising every parameter and
        # does not fail on parameter-free modules.
        first_param = next(net.parameters(), None)
        if first_param is not None:
            device = first_param.device

    # Hand-written models may take an ``is_training`` flag; look it up once.
    extra_kwargs = {}
    if not is_module:
        code = getattr(net, '__code__', None)
        if code is not None and 'is_training' in code.co_varnames:
            extra_kwargs['is_training'] = False

    # Remember the caller's mode so evaluation does not silently flip an
    # eval-mode model back into training mode.
    was_training = net.training if is_module else False
    if is_module:
        net.eval()

    acc_sum, num = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    if device is not None:
                        X, y = X.to(device), y.to(device)
                    y_hat = net(X)
                else:
                    y_hat = net(X, **extra_kwargs)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                num += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)  # restore whatever mode the caller had

    return acc_sum / num

In [30]:
### 对train_ch3作修改

In [31]:
def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    """Train ``net`` with cross-entropy loss and report metrics after every epoch.

    Args:
        net: the ``nn.Module`` to train; it is moved to ``device``.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        batch_size: not used by this function; kept so existing calls still work.
        optimizer: optimizer that updates ``net``'s parameters.
        device: device on which the model and the batches are placed.
        num_epochs: number of passes over ``train_iter``.

    Returns:
        None. One line per epoch is printed with the mean per-batch loss, the
        training accuracy, the test accuracy and the elapsed time (which
        includes the test evaluation).
    """
    net = net.to(device)
    print('training on ', device)
    loss = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        # Put the model in training mode explicitly instead of relying on
        # whatever mode the caller or the evaluation helper left it in.
        net.train()
        train_loss_sum, train_acc_sum, n, batch_count, start_time = 0.0, 0.0, 0, 0, time.time()

        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            ls = loss(y_hat, y)
            optimizer.zero_grad()
            ls.backward()
            optimizer.step()

            train_loss_sum += ls.cpu().item()   # one (batch-mean) loss value per batch
            train_acc_sum  += (y_hat.argmax(dim=1) == y).sum().item()   # correctly predicted samples
            n += y.shape[0]    # samples seen so far
            batch_count += 1   # batches seen so far

        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train_acc %.3f, test acc %.3f, time %.1f sec'
             % (epoch + 1, train_loss_sum / batch_count,
               train_acc_sum / n, test_acc, time.time() - start_time))

In [32]:
# Learning rate and number of epochs for this run.
# NOTE(review): the recorded output below already shows ~0.89 train accuracy in
# epoch 1, which suggests this cell was re-run on an already-trained `net`;
# re-create `net` first to reproduce a from-scratch run — confirm.
lr, num_epochs = 0.001, 50
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.2821, train_acc 0.894, test acc 0.882, time 1.8 sec
epoch 2, loss 0.2795, train_acc 0.896, test acc 0.875, time 1.7 sec
epoch 3, loss 0.2773, train_acc 0.897, test acc 0.885, time 1.8 sec
epoch 4, loss 0.2753, train_acc 0.897, test acc 0.886, time 1.7 sec
epoch 5, loss 0.2720, train_acc 0.899, test acc 0.882, time 1.7 sec
epoch 6, loss 0.2711, train_acc 0.898, test acc 0.885, time 1.8 sec
epoch 7, loss 0.2695, train_acc 0.900, test acc 0.883, time 1.7 sec
epoch 8, loss 0.2701, train_acc 0.900, test acc 0.883, time 1.7 sec
epoch 9, loss 0.2679, train_acc 0.899, test acc 0.883, time 1.7 sec
epoch 10, loss 0.2666, train_acc 0.900, test acc 0.880, time 1.8 sec
epoch 11, loss 0.2649, train_acc 0.900, test acc 0.877, time 1.8 sec
epoch 12, loss 0.2621, train_acc 0.902, test acc 0.884, time 1.7 sec
epoch 13, loss 0.2612, train_acc 0.903, test acc 0.884, time 1.7 sec
epoch 14, loss 0.2614, train_acc 0.903, test acc 0.887, time 1.7 sec
epoch 15, loss 0.2571, tr