In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected block that is either a hidden stage or an output head.

    Args:
        in_channels: Size of each input feature vector.
        out_channels: Size of each output feature vector.
        bias: Whether the linear layer learns an additive bias.
        dropout: Drop probability used by the hidden-stage dropout layer.
        is_output: When False (default) the block is
            Linear -> Dropout -> BatchNorm1d -> LeakyReLU.
            When True the block is Linear -> Sigmoid if ``out_channels == 1``,
            otherwise Linear -> Softmax over dimension 1.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super(LinearBNAC, self).__init__()
        # The affine layer is common to every variant and is always created first.
        fc = nn.Linear(in_channels, out_channels, bias=bias)
        if not is_output:
            tail = [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            # Single output unit: squash to a probability with a sigmoid.
            tail = [nn.Sigmoid()]
        else:
            # Several output units: normalise across dimension 1.
            tail = [nn.Softmax(dim=1)]
        self.linear = nn.Sequential(fc, *tail)

    def forward(self, x):
        """Run ``x`` through the block's sequential stack and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Four-stage fully connected network built from ``LinearBNAC`` blocks.

    Three hidden blocks narrow the features 128 -> 64 -> 32 and a final
    ``LinearBNAC`` created with ``is_output=True`` produces the predictions.

    Args:
        input_dimention: Number of features in each input sample.
        output_classes: Number of units in the output block (default 1).
    """

    def __init__(self, input_dimention, output_classes=1):
        super(Model, self).__init__()
        # Hidden stages; attribute names are kept because other cells access them.
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        # Output head.
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through the three hidden blocks and then the output block."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# Build the classifier: 256 input features, 10 output classes.
model = Model(input_dimention=256,output_classes=10)
# Adam over every model parameter, with learning rate 1e-3 and weight decay 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-3)

In [5]:
# One random mini-batch: `batch_size` samples with `input_features` values each.
batch_size = 4
input_features = 256
dummy_input = torch.randn(batch_size, input_features)

# Forward pass on the random batch.
prediction = model(dummy_input)

# Ground-truth class index for each of the four samples, stored as int64.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [6]:
# Forward pass whose result is used for the loss below. The output head ends in
# softmax, so each printed row is a probability distribution over the 10 classes.
output = model(dummy_input)
print(output)

tensor([[0.0819, 0.0682, 0.0977, 0.1358, 0.1519, 0.0607, 0.0716, 0.0417, 0.1508,
         0.1396],
        [0.2085, 0.0617, 0.1058, 0.0737, 0.0894, 0.0606, 0.0807, 0.0534, 0.0629,
         0.2033],
        [0.0731, 0.0727, 0.1169, 0.0835, 0.1379, 0.0866, 0.1128, 0.1113, 0.1028,
         0.1023],
        [0.1862, 0.0615, 0.0378, 0.1024, 0.2061, 0.1206, 0.0846, 0.0302, 0.0510,
         0.1195]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [7]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [8]:
# Negative log-likelihood loss. The model already applies softmax, so the
# logarithm is taken explicitly when the loss is computed.
criterion = NLLLoss()


In [9]:
# Exercise template: loss = criterion(torch.log(<your input>), <your target>)
# The model output is already a softmax probability, so take its log before
# handing it to the negative log-likelihood criterion.
# Clamp to a tiny positive floor first: a probability that underflows to exactly
# 0 would give log(0) = -inf and turn the loss and its gradients into inf/NaN.
loss = criterion(torch.log(torch.clamp(output, min=1e-12)), target)

### 完成 back propagation（計算梯度）並更新權重

In [10]:
# Back-propagate the loss; this fills the `.grad` of the model parameters
# (printed in the next cell).
loss.backward()

In [11]:
# Inspect the first linear layer: weights are still the initial ones, and the
# gradient has just been populated by backward().
first_linear_weight = model.layer1.linear[0].weight
print('weight : {}'.format(first_linear_weight))
print('\n')
print('grad : {}'.format(first_linear_weight.grad))

weight : Parameter containing:
tensor([[ 0.0432, -0.0288,  0.0422,  ...,  0.0333,  0.0266,  0.0556],
        [ 0.0227, -0.0260,  0.0312,  ...,  0.0284,  0.0182, -0.0456],
        [-0.0465, -0.0581, -0.0081,  ...,  0.0398,  0.0237, -0.0583],
        ...,
        [ 0.0259, -0.0057, -0.0495,  ..., -0.0597,  0.0158, -0.0104],
        [-0.0231, -0.0553, -0.0210,  ...,  0.0276, -0.0293, -0.0350],
        [-0.0268,  0.0299, -0.0472,  ...,  0.0515,  0.0096, -0.0547]],
       requires_grad=True)


grad : tensor([[ 1.2182e-03,  3.8655e-02, -2.9386e-02,  ..., -5.2881e-02,
          3.2501e-02, -3.0949e-02],
        [-8.8570e-03,  2.2299e-02, -5.3892e-03,  ...,  3.7316e-03,
         -2.9559e-02, -8.2788e-03],
        [ 3.3034e-04, -3.7261e-03,  5.3577e-04,  ...,  2.2806e-03,
         -1.7010e-04,  1.8871e-04],
        ...,
        [-2.7321e-02, -7.6902e-02, -3.2328e-02,  ...,  3.1032e-02,
         -3.3806e-02,  3.8291e-02],
        [ 9.5917e-02,  2.0031e-01,  9.8496e-02,  ..., -1.1444e-01,
       

In [12]:
# Apply one optimizer update using the gradients computed by backward().
# The weights change; the stored gradients are left as they are.
optimizer.step()

In [13]:
# Inspect the first linear layer again: the weights have moved after the
# optimizer step, while the gradient is unchanged.
first_linear_weight = model.layer1.linear[0].weight
print('weight : {}'.format(first_linear_weight))
print('\n')
print('grad : {}'.format(first_linear_weight.grad))

weight : Parameter containing:
tensor([[ 0.0422, -0.0298,  0.0432,  ...,  0.0343,  0.0256,  0.0566],
        [ 0.0237, -0.0270,  0.0322,  ...,  0.0274,  0.0192, -0.0446],
        [-0.0475, -0.0571, -0.0091,  ...,  0.0388,  0.0247, -0.0593],
        ...,
        [ 0.0269, -0.0047, -0.0485,  ..., -0.0607,  0.0168, -0.0114],
        [-0.0241, -0.0563, -0.0220,  ...,  0.0286, -0.0303, -0.0340],
        [-0.0258,  0.0309, -0.0462,  ...,  0.0505,  0.0086, -0.0557]],
       requires_grad=True)


grad : tensor([[ 1.2182e-03,  3.8655e-02, -2.9386e-02,  ..., -5.2881e-02,
          3.2501e-02, -3.0949e-02],
        [-8.8570e-03,  2.2299e-02, -5.3892e-03,  ...,  3.7316e-03,
         -2.9559e-02, -8.2788e-03],
        [ 3.3034e-04, -3.7261e-03,  5.3577e-04,  ...,  2.2806e-03,
         -1.7010e-04,  1.8871e-04],
        ...,
        [-2.7321e-02, -7.6902e-02, -3.2328e-02,  ...,  3.1032e-02,
         -3.3806e-02,  3.8291e-02],
        [ 9.5917e-02,  2.0031e-01,  9.8496e-02,  ..., -1.1444e-01,
       

### 清空 gradient

In [14]:
# Clear the stored gradients so the next backward() does not add onto them.
optimizer.zero_grad()

In [15]:
# Inspect the first linear layer once more: the weights keep their updated
# values and the gradient has been cleared.
first_linear_weight = model.layer1.linear[0].weight
print('weight : {}'.format(first_linear_weight))
print('\n')
print('grad : {}'.format(first_linear_weight.grad))

weight : Parameter containing:
tensor([[ 0.0422, -0.0298,  0.0432,  ...,  0.0343,  0.0256,  0.0566],
        [ 0.0237, -0.0270,  0.0322,  ...,  0.0274,  0.0192, -0.0446],
        [-0.0475, -0.0571, -0.0091,  ...,  0.0388,  0.0247, -0.0593],
        ...,
        [ 0.0269, -0.0047, -0.0485,  ..., -0.0607,  0.0168, -0.0114],
        [-0.0241, -0.0563, -0.0220,  ...,  0.0286, -0.0303, -0.0340],
        [-0.0258,  0.0309, -0.0462,  ...,  0.0505,  0.0086, -0.0557]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
