In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected building block: a Linear layer plus its trailing stages.

    The stages that follow the Linear layer depend on the role of the block:

    * hidden block (``is_output=False``):
      Linear -> Dropout -> BatchNorm1d -> LeakyReLU (in-place)
    * single-unit output (``is_output=True`` and ``out_channels == 1``):
      Linear -> Sigmoid
    * multi-unit output (``is_output=True`` otherwise):
      Linear -> Softmax over dim 1

    Args:
        in_channels: Number of input features of the Linear layer.
        out_channels: Number of output features of the Linear layer.
        bias: Whether the Linear layer carries a bias term.
        dropout: Drop probability; only used by hidden blocks.
        is_output: Build an output head instead of a hidden block.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # Every variant starts with the same affine projection.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))
        # Kept under the name ``linear`` so sub-module indices stay the same.
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked stages and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Four-stage MLP assembled from ``LinearBNAC`` blocks.

    Three hidden blocks narrow the features 128 -> 64 -> 32, followed by an
    output block built with ``is_output=True`` (Sigmoid when
    ``output_classes == 1``, Softmax over dim 1 otherwise).

    Args:
        input_dimention: Number of features per input sample (the spelling is
            part of the public signature and is kept as-is).
        output_classes: Number of units produced by the output block.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        first_width, second_width, third_width = 128, 64, 32
        self.layer1 = LinearBNAC(input_dimention, first_width)
        self.layer2 = LinearBNAC(first_width, second_width)
        self.layer3 = LinearBNAC(second_width, third_width)
        self.output = LinearBNAC(third_width, output_classes, is_output=True)

    def forward(self, x):
        """Apply the three hidden blocks and the output block in order."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x


### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# Ten-way classifier over 256-dimensional feature vectors.
model = Model(
    input_dimention=256,
    output_classes=10,
)
# Adam over every model parameter, with weight decay as regularisation.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-3,
)

In [6]:
batch_size = 4
input_features = 256
# Random stand-in batch of shape (batch_size, input_features); it is not
# seeded, so the values differ on every run.
dummy_input = torch.randn(batch_size, input_features)

# One class index per sample. Written as integer literals because the labels
# are class indices stored as int64 (torch.long), not real-valued targets.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [7]:
# Forward pass on the dummy batch. No model.eval() call appears in the cells
# shown, so Dropout and BatchNorm1d presumably run in training mode here.
output = model(dummy_input)
# Each row is the Softmax output of the final block: one probability per class.
print(output)

tensor([[0.1126, 0.1762, 0.1937, 0.0973, 0.0530, 0.0675, 0.0838, 0.0812, 0.0513,
         0.0833],
        [0.0718, 0.0922, 0.2005, 0.0962, 0.1334, 0.0883, 0.0883, 0.0764, 0.0650,
         0.0879],
        [0.1173, 0.1443, 0.1635, 0.0412, 0.0658, 0.0849, 0.0960, 0.0646, 0.1332,
         0.0892],
        [0.0841, 0.1182, 0.1347, 0.1095, 0.0578, 0.1578, 0.0777, 0.0933, 0.0890,
         0.0778]], grad_fn=<SoftmaxBackward0>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss 最適合：模型的輸出層已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟與原理需要相同

In [8]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [9]:
# NLLLoss expects log-probabilities as input; the model's output block already
# applies Softmax, so only a log has to be taken before calling this criterion.
criterion = NLLLoss()

In [10]:
# The model emits probabilities, so convert them to log-probabilities for NLLLoss.
# NOTE(review): log(softmax(x)) becomes -inf if a probability underflows to 0;
# LogSoftmax + NLLLoss (or CrossEntropyLoss on raw logits) is the numerically
# safer form -- consider it if this pattern is reused outside the exercise.
loss = criterion(torch.log(output), target)

### 完成back propagation並更新梯度

In [11]:
# Back-propagate from the scalar loss; this fills in .grad on the parameters
# (the next cell prints the gradient of the first Linear layer's weight).
loss.backward()

In [12]:
# Snapshot of the first Linear layer after backward() and before the
# optimizer step: the weight is still the initial one, grad is now populated.
print(f'weight : {model.layer1.linear[0].weight}')
print('\n')
print(f'grad : {model.layer1.linear[0].weight.grad}')

weight : Parameter containing:
tensor([[ 0.0499,  0.0043, -0.0200,  ...,  0.0394,  0.0335, -0.0498],
        [-0.0577, -0.0111,  0.0504,  ..., -0.0484, -0.0587, -0.0391],
        [-0.0326,  0.0474,  0.0052,  ...,  0.0447,  0.0226, -0.0352],
        ...,
        [ 0.0560,  0.0033,  0.0624,  ...,  0.0311, -0.0291,  0.0279],
        [-0.0561,  0.0140, -0.0579,  ..., -0.0477,  0.0131, -0.0527],
        [-0.0049, -0.0284,  0.0386,  ..., -0.0473, -0.0161,  0.0452]],
       requires_grad=True)


grad : tensor([[-4.3875e-04,  3.1590e-04,  1.4305e-03,  ...,  1.1045e-03,
          1.2792e-03,  1.0117e-03],
        [ 2.7689e-02,  1.4445e-02,  5.9352e-02,  ...,  1.7079e-01,
          2.2467e-01, -9.1824e-02],
        [-9.1062e-03,  2.7616e-02,  4.3894e-03,  ...,  3.9027e-02,
          7.5643e-03, -4.6782e-03],
        ...,
        [ 3.0281e-03, -4.4459e-03, -2.6227e-03,  ..., -5.0397e-03,
         -3.3687e-04, -2.9794e-03],
        [ 5.6751e-03, -3.7922e-01,  2.6699e-02,  ..., -7.0738e-01,
       

In [13]:
# Apply one Adam update to the parameters using the gradients from backward().
optimizer.step()

In [14]:
# Same snapshot after optimizer.step(): the weight values have moved, while
# the stored gradient is untouched because it has not been cleared yet.
print(f'weight : {model.layer1.linear[0].weight}')
print('\n')
print(f'grad : {model.layer1.linear[0].weight.grad}')

weight : Parameter containing:
tensor([[ 0.0509,  0.0033, -0.0210,  ...,  0.0384,  0.0325, -0.0508],
        [-0.0587, -0.0121,  0.0494,  ..., -0.0494, -0.0597, -0.0381],
        [-0.0316,  0.0464,  0.0042,  ...,  0.0437,  0.0216, -0.0342],
        ...,
        [ 0.0550,  0.0043,  0.0634,  ...,  0.0321, -0.0281,  0.0289],
        [-0.0571,  0.0150, -0.0589,  ..., -0.0467,  0.0141, -0.0537],
        [-0.0039, -0.0294,  0.0396,  ..., -0.0483, -0.0151,  0.0462]],
       requires_grad=True)


grad : tensor([[-4.3875e-04,  3.1590e-04,  1.4305e-03,  ...,  1.1045e-03,
          1.2792e-03,  1.0117e-03],
        [ 2.7689e-02,  1.4445e-02,  5.9352e-02,  ...,  1.7079e-01,
          2.2467e-01, -9.1824e-02],
        [-9.1062e-03,  2.7616e-02,  4.3894e-03,  ...,  3.9027e-02,
          7.5643e-03, -4.6782e-03],
        ...,
        [ 3.0281e-03, -4.4459e-03, -2.6227e-03,  ..., -5.0397e-03,
         -3.3687e-04, -2.9794e-03],
        [ 5.6751e-03, -3.7922e-01,  2.6699e-02,  ..., -7.0738e-01,
       

### 清空 gradient

In [15]:
# Clear the stored gradients so a later backward() does not accumulate onto
# them. In the recorded run below, .grad reads None afterwards.
optimizer.zero_grad()

In [16]:
# Final snapshot after zero_grad(): the updated weight is kept, and the
# gradient slot has been cleared.
print(f'weight : {model.layer1.linear[0].weight}')
print('\n')
print(f'grad : {model.layer1.linear[0].weight.grad}')

weight : Parameter containing:
tensor([[ 0.0509,  0.0033, -0.0210,  ...,  0.0384,  0.0325, -0.0508],
        [-0.0587, -0.0121,  0.0494,  ..., -0.0494, -0.0597, -0.0381],
        [-0.0316,  0.0464,  0.0042,  ...,  0.0437,  0.0216, -0.0342],
        ...,
        [ 0.0550,  0.0043,  0.0634,  ...,  0.0321, -0.0281,  0.0289],
        [-0.0571,  0.0150, -0.0589,  ..., -0.0467,  0.0141, -0.0537],
        [-0.0039, -0.0294,  0.0396,  ..., -0.0483, -0.0151,  0.0462]],
       requires_grad=True)


grad : None
