In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim

### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected building block: a linear layer plus a role-specific tail.

    The modules appended after ``nn.Linear`` depend on the block's role:

    * hidden block (``is_output`` falsy): Dropout -> BatchNorm1d -> LeakyReLU
    * output block with a single unit: Sigmoid
    * output block with several units: Softmax over dim 1

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer has a bias term.
        dropout: Drop probability of the hidden-block Dropout; not used by
            output blocks.
        is_output: Build an output block instead of a hidden block.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # Every variant starts with the same linear projection.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))
        # Stored under `linear` so sub-modules stay addressable as linear[i].
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the block's sequential stack and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Four-stage multilayer perceptron assembled from LinearBNAC blocks.

    Feature widths go input_dimention -> 128 -> 1024 -> 32 -> output_classes;
    the last stage is built with ``is_output=True``.

    Args:
        input_dimention: Number of features per input sample.
        output_classes: Width of the output stage.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        # Hidden stages, registered in the order they are applied.
        self.layer1 = LinearBNAC(in_channels=input_dimention, out_channels=128)
        self.layer2 = LinearBNAC(in_channels=128, out_channels=1024)
        self.layer3 = LinearBNAC(in_channels=1024, out_channels=32)
        # Output stage.
        self.output = LinearBNAC(in_channels=32, out_channels=output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through the three hidden stages and then the output stage."""
        hidden = self.layer3(self.layer2(self.layer1(x)))
        return self.output(hidden)
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# Ten-way classifier over 256-dimensional feature vectors.
model = Model(input_dimention=256, output_classes=10)

# Adam optimizer over all of the model's parameters.
learning_rate = 1e-4
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [5]:
# Shape of the synthetic mini-batch: 4 samples with 256 features each.
batch_size, input_features = 4, 256

# Random inputs drawn from a standard normal distribution.
dummy_input = torch.randn(batch_size, input_features)

# One integer class index per sample, stored as int64.
# Disabled alternative (random float labels):
#   torch.empty(4, dtype=torch.float).random_(10)
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [6]:
# Forward pass on the synthetic batch. No model.eval() call appears before this
# point, so dropout is presumably still active and the printed values vary per run.
output = model(dummy_input)
print(output)

tensor([[0.1741, 0.1028, 0.1339, 0.0602, 0.1255, 0.1472, 0.0522, 0.0708, 0.0835,
         0.0498],
        [0.1205, 0.1706, 0.0546, 0.0971, 0.1083, 0.0909, 0.0735, 0.1148, 0.1102,
         0.0595],
        [0.1029, 0.1371, 0.0880, 0.0752, 0.1559, 0.1036, 0.1091, 0.0732, 0.0944,
         0.0605],
        [0.1364, 0.1196, 0.0317, 0.0737, 0.1545, 0.1070, 0.0266, 0.1921, 0.1010,
         0.0573]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [7]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [8]:
# Negative log-likelihood loss; it is fed log-probabilities, not raw probabilities.
criterion = NLLLoss()

In [9]:
# The model already ends in Softmax, so `output` holds probabilities and NLLLoss
# needs their logarithm. A probability that underflows to exactly 0 would give
# log(0) = -inf (infinite loss, NaN gradients), so clamp to the smallest positive
# normal value of the dtype before taking the log.
loss = criterion(torch.log(output.clamp_min(torch.finfo(output.dtype).tiny)), target)

### 完成 back propagation 並更新權重

In [10]:
# Backpropagate the loss: fills the .grad attribute of the parameters
# (the next cell prints the first layer's gradient).
loss.backward()

In [11]:
# Inspect the first Linear layer right after backward(): the weights have not
# been updated yet, while .grad is now populated.
first_linear = model.layer1.linear[0]
print(f'weight : {first_linear.weight}')
print('\n')
print(f'grad : {first_linear.weight.grad}')

weight : Parameter containing:
tensor([[ 0.0387,  0.0101, -0.0257,  ...,  0.0073, -0.0487,  0.0072],
        [ 0.0436,  0.0526, -0.0449,  ..., -0.0414, -0.0086, -0.0449],
        [ 0.0138,  0.0077,  0.0482,  ...,  0.0361,  0.0115,  0.0472],
        ...,
        [-0.0210,  0.0137,  0.0022,  ...,  0.0394,  0.0123,  0.0287],
        [ 0.0035,  0.0163, -0.0039,  ..., -0.0115,  0.0615,  0.0273],
        [-0.0192, -0.0370, -0.0539,  ...,  0.0537,  0.0135,  0.0601]],
       requires_grad=True)


grad : tensor([[ 0.1952, -0.1153,  0.1882,  ...,  0.1929, -0.1104, -0.2503],
        [-0.0974, -0.0230,  0.1486,  ..., -0.0189, -0.0489,  0.0741],
        [-0.0432, -0.0380,  0.0167,  ...,  0.0378, -0.0191, -0.1038],
        ...,
        [-0.0868,  0.0204, -0.4313,  ...,  0.6616, -0.0604, -0.3534],
        [-0.3748, -0.1435,  0.0517,  ...,  0.1723, -0.0884, -0.3708],
        [ 1.2663,  0.4143, -0.4671,  ...,  1.1927, -0.1391,  0.6286]])


In [12]:
# Apply one optimizer update using the gradients currently stored on the parameters.
optimizer.step()

In [13]:
# Inspect the first Linear layer after optimizer.step(): in the recorded output
# the weights have moved by about 1e-4 while .grad is unchanged.
first_linear = model.layer1.linear[0]
print(f'weight : {first_linear.weight}')
print('\n')
print(f'grad : {first_linear.weight.grad}')

weight : Parameter containing:
tensor([[ 0.0386,  0.0102, -0.0258,  ...,  0.0072, -0.0486,  0.0073],
        [ 0.0437,  0.0527, -0.0450,  ..., -0.0413, -0.0085, -0.0450],
        [ 0.0139,  0.0078,  0.0481,  ...,  0.0360,  0.0116,  0.0473],
        ...,
        [-0.0209,  0.0136,  0.0023,  ...,  0.0393,  0.0124,  0.0288],
        [ 0.0036,  0.0164, -0.0040,  ..., -0.0116,  0.0616,  0.0274],
        [-0.0193, -0.0371, -0.0538,  ...,  0.0536,  0.0136,  0.0600]],
       requires_grad=True)


grad : tensor([[ 0.1952, -0.1153,  0.1882,  ...,  0.1929, -0.1104, -0.2503],
        [-0.0974, -0.0230,  0.1486,  ..., -0.0189, -0.0489,  0.0741],
        [-0.0432, -0.0380,  0.0167,  ...,  0.0378, -0.0191, -0.1038],
        ...,
        [-0.0868,  0.0204, -0.4313,  ...,  0.6616, -0.0604, -0.3534],
        [-0.3748, -0.1435,  0.0517,  ...,  0.1723, -0.0884, -0.3708],
        [ 1.2663,  0.4143, -0.4671,  ...,  1.1927, -0.1391,  0.6286]])


### 清空 gradient

In [14]:
# Reset the stored gradients so the next backward() does not accumulate onto them.
# NOTE(review): the recorded output shows zero tensors; newer PyTorch releases may
# set .grad to None here instead — confirm against the installed version.
optimizer.zero_grad()

In [15]:
# Inspect the first Linear layer after zero_grad(): the weights keep their
# updated values and, in the recorded output, .grad is all zeros.
first_linear = model.layer1.linear[0]
print(f'weight : {first_linear.weight}')
print('\n')
print(f'grad : {first_linear.weight.grad}')

weight : Parameter containing:
tensor([[ 0.0386,  0.0102, -0.0258,  ...,  0.0072, -0.0486,  0.0073],
        [ 0.0437,  0.0527, -0.0450,  ..., -0.0413, -0.0085, -0.0450],
        [ 0.0139,  0.0078,  0.0481,  ...,  0.0360,  0.0116,  0.0473],
        ...,
        [-0.0209,  0.0136,  0.0023,  ...,  0.0393,  0.0124,  0.0288],
        [ 0.0036,  0.0164, -0.0040,  ..., -0.0116,  0.0616,  0.0274],
        [-0.0193, -0.0371, -0.0538,  ...,  0.0536,  0.0136,  0.0600]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
