In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """A fully connected layer followed by a role-dependent tail.

    The wrapped ``nn.Sequential`` (stored as ``self.linear``) always starts
    with ``nn.Linear(in_channels, out_channels, bias=bias)`` and continues with:

    * hidden layer (``is_output=False``): Dropout -> BatchNorm1d -> LeakyReLU
    * output layer with a single unit: Sigmoid
    * output layer with several units: Softmax over dim 1

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer has a bias term.
        dropout: Dropout probability; only used for hidden layers.
        is_output: Build an output head (probabilities) instead of a hidden layer.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super(LinearBNAC, self).__init__()
        # The affine map is common to every variant and is always index 0.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages.extend([
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ])
        elif out_channels == 1:
            # Single-unit head: squash to (0, 1).
            stages.append(nn.Sigmoid())
        else:
            # Multi-class head: normalise across the class dimension.
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the linear layer and its tail and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Small fully connected classifier: input -> 128 -> 64 -> 32 -> output_classes.

    ``layer1`` is a ``LinearBNAC`` hidden layer, ``layer2`` and ``layer3`` are
    plain ``nn.Linear`` layers, and ``output`` is a ``LinearBNAC`` output head,
    so the network returns probabilities rather than logits.

    Args:
        input_dimention: Number of features in each input vector.
        output_classes: Number of units in the output head.
    """

    def __init__(self, input_dimention, output_classes=1):
        super(Model, self).__init__()
        # Sub-modules are created in forward order.
        self.layer1 = LinearBNAC(input_dimention, 128)
        # NOTE(review): layer2 and layer3 have no activation between them or
        # before the output Linear; confirm this is intentional.
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Apply layer1, layer2, layer3 and the output head in sequence."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [36]:
# Build a 10-class classifier for 256-dimensional inputs, and an Adam optimizer
# (with weight decay) over all of its parameters.
model = Model(input_dimention=256, output_classes=10)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)

In [37]:
# Synthetic mini-batch: `batch_size` random feature vectors of width
# `input_features`, drawn from a standard normal distribution.
batch_size = 4
input_features = 256
dummy_input = torch.randn(batch_size, input_features)

# One class index per sample, stored as int64 (torch.long).
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [38]:
# Forward pass on the dummy batch. The output head ends in Softmax, so each row
# of `output` holds class probabilities over the 10 classes (not logits).
output = model(dummy_input)
print(output)

tensor([[0.1285, 0.0939, 0.1215, 0.0599, 0.1146, 0.1136, 0.0876, 0.0998, 0.0989,
         0.0816],
        [0.1029, 0.0842, 0.0935, 0.0870, 0.1149, 0.0981, 0.1334, 0.0866, 0.1152,
         0.0840],
        [0.1337, 0.1099, 0.0777, 0.0711, 0.1222, 0.0849, 0.1056, 0.1100, 0.0922,
         0.0925],
        [0.1239, 0.0786, 0.0795, 0.0841, 0.1494, 0.0923, 0.1353, 0.0732, 0.0933,
         0.0904]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟與原理需要相同

In [39]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [40]:
# The model already applies Softmax, so use NLLLoss and take the log of the
# output when computing the loss. CrossEntropyLoss would apply log-softmax
# a second time.
criterion = NLLLoss()

In [41]:
# NLLLoss expects log-probabilities, while the model returns Softmax
# probabilities, so take the log here.
# Clamp first: a probability that underflows to exactly 0 would give
# log(0) = -inf and turn the loss and its gradients into inf/NaN. The floor is
# the smallest positive normal number of the output dtype, so ordinary
# probabilities pass through unchanged.
log_probs = torch.log(torch.clamp(output, min=torch.finfo(output.dtype).tiny))
loss = criterion(log_probs, target)
loss

tensor(2.2075, grad_fn=<NllLossBackward>)

### 完成back propagation並更新梯度

In [42]:
# Back-propagate the loss; this fills in `.grad` on the model parameters
# (inspected in the next cell).
loss.backward()

In [43]:
# After backward(): show the weights of the first Linear inside layer1 together
# with the gradient that was just computed for them.
layer1_linear = model.layer1.linear[0]
print('weight : {}'.format(layer1_linear.weight))
print('\n')
print('grad : {}'.format(layer1_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0081, -0.0354,  0.0548,  ...,  0.0269, -0.0479,  0.0596],
        [-0.0245,  0.0241,  0.0508,  ..., -0.0383, -0.0368,  0.0595],
        [ 0.0012, -0.0259,  0.0446,  ...,  0.0111,  0.0108, -0.0057],
        ...,
        [ 0.0278,  0.0206, -0.0387,  ...,  0.0385,  0.0253, -0.0475],
        [ 0.0275, -0.0298,  0.0123,  ...,  0.0056, -0.0177,  0.0362],
        [ 0.0012, -0.0533,  0.0419,  ..., -0.0334, -0.0103,  0.0613]],
       requires_grad=True)


grad : tensor([[ 3.3597e-04, -3.2556e-04,  2.6416e-03,  ...,  9.4995e-04,
         -3.0276e-03, -1.0230e-03],
        [ 1.2469e-04,  9.9443e-04,  2.1933e-03,  ...,  7.5693e-04,
         -1.6727e-03, -3.1982e-04],
        [-2.4734e-03, -1.4547e-02,  1.6177e-02,  ..., -7.1748e-04,
         -1.9863e-02,  1.5758e-03],
        ...,
        [-1.1231e-02, -5.0577e-02, -4.2793e-02,  ..., -1.9578e-02,
          2.3238e-02,  2.7673e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
       

In [44]:
# Apply one Adam update using the gradients produced by backward().
# The recorded output of the next cell shows the weights changing while
# `.grad` stays the same.
optimizer.step()

In [45]:
# After optimizer.step(): the same weights have moved slightly, while the
# stored gradient is still the one computed by backward().
layer1_linear = model.layer1.linear[0]
print('weight : {}'.format(layer1_linear.weight))
print('\n')
print('grad : {}'.format(layer1_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0082, -0.0353,  0.0547,  ...,  0.0268, -0.0478,  0.0597],
        [-0.0246,  0.0240,  0.0507,  ..., -0.0384, -0.0367,  0.0596],
        [ 0.0013, -0.0258,  0.0445,  ...,  0.0112,  0.0109, -0.0058],
        ...,
        [ 0.0279,  0.0207, -0.0386,  ...,  0.0386,  0.0252, -0.0476],
        [ 0.0274, -0.0297,  0.0122,  ...,  0.0055, -0.0176,  0.0361],
        [ 0.0013, -0.0534,  0.0420,  ..., -0.0335, -0.0104,  0.0612]],
       requires_grad=True)


grad : tensor([[ 3.3597e-04, -3.2556e-04,  2.6416e-03,  ...,  9.4995e-04,
         -3.0276e-03, -1.0230e-03],
        [ 1.2469e-04,  9.9443e-04,  2.1933e-03,  ...,  7.5693e-04,
         -1.6727e-03, -3.1982e-04],
        [-2.4734e-03, -1.4547e-02,  1.6177e-02,  ..., -7.1748e-04,
         -1.9863e-02,  1.5758e-03],
        ...,
        [-1.1231e-02, -5.0577e-02, -4.2793e-02,  ..., -1.9578e-02,
          2.3238e-02,  2.7673e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
       

### 清空 gradient

In [46]:
# Clear the gradients so the next backward() does not accumulate onto this
# step's values. In the recorded output `.grad` prints as an all-zero tensor.
# NOTE: recent PyTorch releases default to set_to_none=True, in which case
# `.grad` becomes None instead of a zero tensor.
optimizer.zero_grad()

In [47]:
# After optimizer.zero_grad(): the weights keep their updated values and only
# the stored gradient has been reset.
layer1_linear = model.layer1.linear[0]
print('weight : {}'.format(layer1_linear.weight))
print('\n')
print('grad : {}'.format(layer1_linear.weight.grad))

weight : Parameter containing:
tensor([[-0.0082, -0.0353,  0.0547,  ...,  0.0268, -0.0478,  0.0597],
        [-0.0246,  0.0240,  0.0507,  ..., -0.0384, -0.0367,  0.0596],
        [ 0.0013, -0.0258,  0.0445,  ...,  0.0112,  0.0109, -0.0058],
        ...,
        [ 0.0279,  0.0207, -0.0386,  ...,  0.0386,  0.0252, -0.0476],
        [ 0.0274, -0.0297,  0.0122,  ...,  0.0055, -0.0176,  0.0361],
        [ 0.0013, -0.0534,  0.0420,  ..., -0.0335, -0.0104,  0.0612]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
