In [1]:
# ---------------------定义网络---------------------
# 定义网络时，需要继承nn.Module，并实现它的forward方法
# 把网络中具有可学习参数的层放在构造函数__init__中。
# 如果某一层不具有可学习的参数，则可以不放其中，而在forward中使用nn.functional代替
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [3]:
class Net(nn.Module):
    def __init__(self):
        # nn.Module子类的函数必须在构造函数中执行父类的构造函数
        # 下式等价于nn.Module.__init__(self)
        super(Net, self).__init__()
        # 卷积层‘1’表示输入图片为单通道，‘6’表示输出通道数，‘5’表示卷积核为5×5
        self.conv1 = nn.Conv2d(1, 6, 5)
        # 卷积层
        self.conv2 = nn.Conv2d(6, 16, 5)
        # 仿射层/全连接层，y = Wx + b
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        
    def forward(self, x):
        # 卷积 -> 激活 -> 池化
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # reshape, '-1'表示自适应
        x = x.view(x.size()[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
net = Net()
print(net)

Net (
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear (400 -> 120)
  (fc2): Linear (120 -> 84)
  (fc3): Linear (84 -> 10)
)


In [4]:
params = list(net.parameters())
print(len(params))

10


In [5]:
for name,parameters in net.named_parameters():
    print(name, ':', parameters.size())

('conv1.weight', ':', torch.Size([6, 1, 5, 5]))
('conv1.bias', ':', torch.Size([6]))
('conv2.weight', ':', torch.Size([16, 6, 5, 5]))
('conv2.bias', ':', torch.Size([16]))
('fc1.weight', ':', torch.Size([120, 400]))
('fc1.bias', ':', torch.Size([120]))
('fc2.weight', ':', torch.Size([84, 120]))
('fc2.bias', ':', torch.Size([84]))
('fc3.weight', ':', torch.Size([10, 84]))
('fc3.bias', ':', torch.Size([10]))


In [6]:
# forward函数的输入输出都是Variable，因此在输入时需要把Tensor封装成Variable
input = Variable(t.randn(1, 1, 32, 32))
out = net(input)
out.size()

torch.Size([1, 10])

In [7]:
net.zero_grad() # 所有参数的梯度清零
out.backward(Variable(t.ones(1,10))) # 反向传播

RuntimeError: element 0 of gradients tuple is not a Tensor or None

In [8]:
# 出错的原因估计是torch.nn只支持mini-batches，不支持一次只输入一个样本

In [9]:
# ---------------------损失函数---------------------
output = net(input)
target = Variable(t.arange(0, 10))
criterion = nn.MSELoss()
loss = criterion(output, target)
loss

Variable containing:
 28.5791
[torch.FloatTensor of size 1]

In [10]:
Variable(t.arange(0, 10))

Variable containing:
 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
[torch.FloatTensor of size 10]

In [11]:
output

Variable containing:
 0.0030  0.0488 -0.1379 -0.0965  0.0228 -0.0128 -0.1067 -0.0114  0.0496  0.0489
[torch.FloatTensor of size 1x10]

In [12]:
# 当调用loss.backward()时，该图会动态生成并自动微分，也会自动计算图中参数的倒数
# 运行.backward()时，观察调用之前和调用之后的grad
net.zero_grad() # 把net中所有可学习参数的梯度清零
print('反向传播之前conv1.bias的梯度')
print(net.conv1.bias.grad)
loss.backward()
print('反向传播之后conv1.bias的梯度')
print(net.conv1.bias.grad)

反向传播之前conv1.bias的梯度
None
反向传播之后conv1.bias的梯度
Variable containing:
1.00000e-02 *
  2.8068
 -6.1308
  1.3126
  1.0651
  0.5337
 -0.6042
[torch.FloatTensor of size 6]



In [14]:
# ---------------------优化器---------------------
# 随机梯度下降法的更新策略：
# weight = weight - learning_rate * gradient

In [18]:
# 手动实现
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate) # inplace 减法
    #print(f)

In [19]:
import torch.optim as optim
# 新建一个优化器，指定要调整的参数和学习率
optimizer = optim.SGD(net.parameters(), lr = 0.01)

# 在训练过程中
# 先梯度清零(与net.zero_grad()效果一样)
optimizer.zero_grad()

# 计算损失
output = net(input)
loss = criterion(output, target)

# 反向传播
loss.backward()

# 更新参数
optimizer.step()