In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np

## nn.Module
* Base class for all neural network modules
* 只要在nn.Module的子類中定義了forward函數，backward函數就會被自動實現（利用Autograd）
* nn.Conv2d 本身也是nn.Module的類別(此時我們可以先不用理解nn.Conv2D做了什麼，只需了解其包含一些參數與操作)

In [3]:
# class Model(nn.Module):
#     def __init__(self):
#         super(Model, self).__init__()
#         self.conv1 = nn.Conv2d(1, 20, 5)
#         self.conv2 = nn.Conv2d(20, 20, 5)

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         return F.relu(self.conv2(x))
class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    self.conv1 = nn.Conv2d(1,20,5)
    self.conv2 = nn.Conv2d(20,20,5)

  def forward(self, x):
    x = F.relu(self.conv1(x))
    return F.relu(self.conv2(x))

In [4]:
model = Model()

### 實踐 forward propagation 
* 為什麼不應該直接call model.forward : https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690

In [7]:
input_ = torch.randn(1,1,124,124)
output = model(input_)
output.shape

torch.Size([1, 20, 116, 116])

### 查看 model 底下的 modules

#### .modules

* model.modules 遞迴的列出所有的 modules

In [8]:
# for module in model.modules():
#     print(module)
for module in model.modules():
  print(module)

Model(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
)
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))


#### .children

* model.children 只列出第一層的子 modules

In [9]:
for module in model.children():
    print(module)

Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))


### 查看 model 內的 parameters (torch.nn.parameter.Parameter)

#### .named_parameters
* named_parameters會列出每個nn.Module底下parameters 的名字,數值
* 同時可以查看 requires_grad是否開啟(for backpropagation)

In [10]:
# for name, param in model.named_parameters():
#     print(name,param.requires_grad)
#     #param.requires_grad=True

for name, param in model.named_parameters():
  print(name, param.requires_grad)
  

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


#### .parameters
* 不會印出名字

In [11]:
for param in model.parameters():
    print(type(param),param.shape, param.requires_grad)

<class 'torch.nn.parameter.Parameter'> torch.Size([20, 1, 5, 5]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20, 20, 5, 5]) True
<class 'torch.nn.parameter.Parameter'> torch.Size([20]) True


#### 計算模型可訓練參數總量

In [12]:
# model_parameters = filter(lambda p: p.requires_grad, model.parameters())
# params = sum([np.prod(p.size()) for p in model_parameters])
# print('總共參數量：' ,params)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('總共參數量：' ,params)

總共參數量： 10540


### Backpropagation

In [13]:
input_ = torch.randn(1,1,124,124)
output = model(input_)

#### 確認 requires_grad為 True (default 就是 True)

In [14]:
for name, param in model.named_parameters():
    print(name,param.requires_grad)

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


#### 此時還沒做backpropagation，parameters沒有gradient value

In [15]:
print(model.conv1.weight.grad)

None


#### 執行backward，完成後就能看到每個parameters底下的gradient value

In [16]:
output.sum().backward()

In [17]:
print(model.conv1.weight.grad)

tensor([[[[  522.9692,   454.4109,  -592.5370,   330.3468,   468.0063],
          [ -529.6297,   927.8635,   665.0973,   162.6237,  -770.4840],
          [  651.2021,  -190.9409,   553.1579,   650.3015,   284.0137],
          [ -103.7130,  -786.4781,    46.0564,   613.2404,   328.7287],
          [   28.6708,   196.4807,  -190.1977,   262.4489, -1008.4745]]],


        [[[  379.6396,    15.3367,   -37.3638,    15.9327,    12.5054],
          [  234.2216,   252.4350,   219.4751,   393.4599,  -257.3908],
          [  -95.9540,   -46.5520,   280.6422,    62.3226,  -171.3840],
          [ -228.7197,    16.7131,   418.8062,  -142.5411,     7.6002],
          [ -323.3826,   377.8612,   126.3426,   -70.7429,   -81.7496]]],


        [[[  844.4539,  -600.1066,  -520.8076,   769.9362, -1079.1150],
          [  993.7142,  1268.3313, -1454.2256,  -703.0765,   120.9676],
          [ -736.9225,   698.8443,   107.3563, -1214.4150,   -74.9459],
          [ -977.1591,  -280.8511,   -29.4711,   217.352

#### 當我們把 parameters 的 requires_grad關閉時，就無法成功的完成backward
* 什麼時候會關閉requires_grad關閉時？ prediction (inference)的階段
* 設定 requires_grad = True 是為了之後要做 backpropagation，在計算每個paramters的 gradient時，我們在forward propagation時需要保留額外的訊息(根據chain rule)，這會導致記憶體使用量上升與計算速度下降，然而只有在 training 階段時我們材需要做backpropagation，在 prediction (inference)的階段，我們則可以設定 requires_grad = False 來提升速度與降低記憶體使用量 

In [18]:
for param in model.parameters():
    param.requires_grad = False

In [19]:
input_ = torch.randn(1,1,124,124)
output = model(input_)

In [20]:
output.sum().backward()

RuntimeError: ignored

#### with torch.no_grad()
* 此行底下的requires_grad都會關閉

In [21]:
for param in model.parameters():
    param.requires_grad = True
with torch.no_grad():
    input_ = torch.randn(1,1,124,124)
    output = model(input_)
    output.sum().backward()

RuntimeError: ignored

### 讓我們自行搭建一個 nn.Module 並試算gradient

In [24]:
# class Model(nn.Module):
#     def __init__(self):
#         super(Model, self).__init__()
#         self.x = torch.nn.Parameter(torch.tensor(2.4,dtype=torch.float32))
#         self.y = torch.nn.Parameter(torch.tensor(4.3,dtype=torch.float32))

#     def forward(self, x):
#         output = x*self.x**2 + x*self.y + x # 可以看成 output = w*x*x + w*y+2
#         return output
class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    self.x = torch.nn.Parameter(torch.tensor(2.4, dtype = torch.float32))
    self.y = torch.nn.Parameter(torch.tensor(4.3, dtype = torch.float32))

  def forward(self,x):
    output = x * self.x**2 + x*self.y + x
    return output

In [25]:
model = Model()
input_ = torch.tensor(1.3, dtype = torch.float32)
output = model(input_)
output.backward()
# output 對 self.x 的偏微分為 2 * w * x = 2 * 1.3 * 2.4 = 6.24 
print('self.x 的 gradient : {}'.format(model.x.grad))
# output 對 self.y 的偏微分為 w = 1.3
print('self.y 的 gradient : {}'.format(model.y.grad))

self.x 的 gradient : 6.240000247955322
self.y 的 gradient : 1.2999999523162842


## Sequential
* nn.Module 的容器

In [30]:
# layer = nn.Sequential(
#                         nn.Conv2d(3,
#                                   20,
#                                   kernel_size=3,
#                                   stride=1,
#                                   padding=1,
#                                   bias=False), 
#                         nn.BatchNorm2d(20),
#                         nn.LeakyReLU(inplace=True))

layer = nn.Sequential(
    nn.Conv2d(3,
              20,
              kernel_size = 3,
              stride = 1,
              padding = 1, 
              bias = False),
    nn.BatchNorm2d(20),
    nn.LeakyReLU(inplace=True)
)

In [31]:
for name, param in layer.named_parameters():
    print(name,param.requires_grad)
    #param.requires_grad=True

0.weight True
1.weight True
1.bias True


In [33]:
input_ = torch.randn(1, 3, 124, 124)
output = layer(input_)
output.shape

torch.Size([1, 20, 124, 124])

#### OrderedDict+Sequential, 讓我們替每一個module命名

In [34]:
from collections import OrderedDict

In [37]:
# layer = nn.Sequential(OrderedDict([
#           ('conv1', nn.Conv2d(1,20,5)),
#           ('relu1', nn.ReLU()),
#           ('conv2', nn.Conv2d(20,64,5)),
#           ('relu2', nn.ReLU())
#         ]))
layer = nn.Sequential(OrderedDict([
                       ('conv1', nn.Conv2d(1,20,5)),
                       ('relu1', nn.ReLU()),
                       ('conv2', nn.Conv2d(20, 64, 5)),
                       ('relu2', nn.ReLU())
]))

In [38]:
for module in layer.modules():
    print(module)
    #param.requires_grad=True

Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
)
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
ReLU()
Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
ReLU()


In [39]:
for name, param in layer.named_parameters():
    print(name,param.requires_grad)
    #param.requires_grad=True

conv1.weight True
conv1.bias True
conv2.weight True
conv2.bias True


In [40]:
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


#### append 新的 module到 sequential上

In [41]:
# import torch.nn as nn

# modules = []
# modules.append(nn.Conv2d(1,20,5))
# modules.append(nn.ReLU())
# modules.append(nn.Conv2d(20,64,5))
# modules.append(nn.ReLU())

# layer = nn.Sequential(*modules)
import torch.nn as nn
modules = []
modules.append(nn.Conv2d(1, 20, 5))
modules.append(nn.ReLU())
modules.append(nn.Conv2d(20, 64, 5))
modules.append(nn.ReLU())

layer = nn.Sequential(*modules)

In [42]:
layer

Sequential(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [43]:
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 另一種方式

In [44]:
# layer = torch.nn.Sequential()
# layer.add_module("conv1", nn.Conv2d(1,20,5))
# layer.add_module("relu1", nn.ReLU())
# layer.add_module("conv2", nn.Conv2d(20,64,5))
# layer.add_module("relu2", nn.ReLU())
layer = torch.nn.Sequential()
layer.add_module("conv1", nn.Conv2d(1,20,5))
layer.add_module("relu1", nn.ReLU())
layer.add_module("conv2", nn.Conv2d(20, 64, 5))
layer.add_module("conv2", nn.ReLU())

In [46]:
layer

Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): ReLU()
)

In [47]:
input_ = torch.randn(1, 1, 124, 124)
output = layer(input_)
print(output.shape)

torch.Size([1, 20, 120, 120])


## ModuleList
* 操作就像是python list, 但其內的module, parameters是可以被追蹤的

In [48]:
# layer = nn.ModuleList()
# layer.append(nn.Conv2d(1,20,5))
# layer.append(nn.ReLU())
# layer.append(nn.Conv2d(20,64,5))
# layer.append(nn.ReLU())
layer = nn.ModuleList()
layer.append(nn.Conv2d(1, 20, 5))
layer.append(nn.ReLU())
layer.append(nn.Conv2d(20, 64, 5))
layer.append(nn.ReLU())

ModuleList(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [49]:
# input_ = torch.randn(1, 1, 124, 124)
# for _, module in enumerate(layer):
#     if _ == 0:
#         output = module(input_)
#     else:
#         output = module(output)
# print(output.shape)
input_ = torch.randn(1,1,124, 124)
for _, module in enumerate(layer):
  if _ == 0:
    output = module(input_)
  else:
    output = module(output)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 可以追蹤是什麼意思？ nn.Module有辦法去獲取ModuleList裡面的資訊

In [50]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layer = nn.ModuleList()
        self.layer.append(nn.Conv2d(1,20,5))
        self.layer.append(nn.ReLU())
        self.layer.append(nn.Conv2d(20,64,5))
        self.layer.append(nn.ReLU())

    def forward(self, x):
        for module in self.layer:
            x = module(x)
        return x

In [51]:
model = Model()

In [52]:
model

Model(
  (layer): ModuleList(
    (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
    (3): ReLU()
  )
)

In [53]:
input_ = torch.randn(1, 1, 124, 124)
output = model(input_)
print(output.shape)

torch.Size([1, 64, 116, 116])


* 如果是一般的 python list

In [54]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layer = []
        self.layer.append(nn.Conv2d(1,20,5))
        self.layer.append(nn.ReLU())
        self.layer.append(nn.Conv2d(20,64,5))
        self.layer.append(nn.ReLU())

    def forward(self, x):
        for module in self.layer:
            x = module(x)
        return

In [55]:
model = Model()

In [56]:
model

Model()