In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoLayerNet(nn.Module):
    """
    Fully connected network with a single hidden layer, written as a custom
    nn.Module: the layers are created in __init__ and wired together in forward.
    """
    def __init__(self, D_in, H, D_out):
        super().__init__()
        # Input -> hidden and hidden -> output affine layers.
        self.linear1 = nn.Linear(D_in, H)
        self.linear2 = nn.Linear(H, D_out)

    def forward(self, x):
        # Map the input tensor to the hidden layer and zero out negative
        # activations, then project the result to the output size.
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    
# Problem sizes: number of samples, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and regression targets. They are drawn before the model is
# built so the random number stream is consumed in the same order as before.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)
# Sum (rather than average) the squared errors over all elements.
loss_fn = nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(200):
    # Drop the gradients of the previous step before computing new ones.
    optimizer.zero_grad()

    # Forward pass and loss for the current weights.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Backward pass followed by one SGD update.
    loss.backward()
    optimizer.step()

0 720.0653076171875
1 663.3251953125
2 614.59423828125
3 572.47119140625
4 535.4714965820312
5 502.67022705078125
6 472.8448486328125
7 445.65667724609375
8 420.64337158203125
9 397.44085693359375
10 375.7000427246094
11 355.3523254394531
12 336.023681640625
13 317.5968017578125
14 300.1414794921875
15 283.4847412109375
16 267.6226806640625
17 252.48182678222656
18 238.0513458251953
19 224.2923126220703
20 211.24942016601562
21 198.87649536132812
22 187.14193725585938
23 175.97097778320312
24 165.36468505859375
25 155.2587127685547
26 145.67869567871094
27 136.64906311035156
28 128.14297485351562
29 120.11801147460938
30 112.57102966308594
31 105.45256042480469
32 98.77152252197266
33 92.4934310913086
34 86.61393737792969
35 81.1101303100586
36 75.95616149902344
37 71.12483978271484
38 66.59734344482422
39 62.36522674560547
40 58.40692901611328
41 54.702518463134766
42 51.2402229309082
43 47.99764633178711
44 44.96238327026367
45 42.12602233886719
46 39.4791259765625
47 37.009654998779

**control flow and weight sharing**   

Each forward pass builds a dynamic computation graph, so ordinary Python control flow can be used inside `forward`. Weight sharing among the innermost layers is implemented by simply reusing the same Module several times when defining the forward pass.

In [14]:
import random
import torch
import torch.nn as nn

class DynamicNet(nn.Module):
    """
    Fully connected network whose hidden depth is re-drawn at random on every
    forward call; the extra hidden layers all share one set of weights.
    """
    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.input_linear = nn.Linear(D_in, H)
        # Single H -> H layer that forward may apply several times in a row.
        self.middle_linear = nn.Linear(H, H)
        self.output_linear = nn.Linear(H, D_out)

    def forward(self, x):
        """
        The computation graph is rebuilt on each call, so plain Python
        control flow (loops, conditionals) can decide the shape of the model
        while the forward pass is being defined.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Draw how many times (0, 1, 2 or 3) the shared middle layer is applied.
        repeats = random.randint(0, 3)
        while repeats > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            repeats -= 1
        return self.output_linear(hidden)
        
# Problem sizes: number of samples, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, drawn before the model is constructed so the
# random number stream is consumed in the same order as before.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
model = DynamicNet(D_in, H, D_out)
# Sum (rather than average) the squared errors over all elements.
loss_fn = nn.MSELoss(reduction="sum")
# SGD with momentum.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(200):
    # Drop the gradients of the previous step before computing new ones.
    optimizer.zero_grad()

    # Forward pass; the model picks its own depth at random on each call.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Backward pass followed by one optimizer update.
    loss.backward()
    optimizer.step()

0 680.8883666992188
1 677.4639892578125
2 678.362548828125
3 677.9025268554688
4 551.5308227539062
5 671.5601196289062
6 667.9768676757812
7 408.26690673828125
8 359.4100036621094
9 662.3095703125
10 662.2933349609375
11 230.71780395507812
12 634.9635009765625
13 615.9895629882812
14 656.5223388671875
15 559.65283203125
16 651.8340454101562
17 112.62139892578125
18 628.0842895507812
19 639.66796875
20 632.5289306640625
21 380.13568115234375
22 345.82305908203125
23 106.98684692382812
24 104.65642547607422
25 571.0079345703125
26 552.107421875
27 451.140869140625
28 73.8045654296875
29 379.9350280761719
30 160.9483642578125
31 146.0660400390625
32 58.363826751708984
33 336.92108154296875
34 51.7586784362793
35 106.48206329345703
36 280.08587646484375
37 84.62905883789062
38 90.7477798461914
39 51.864437103271484
40 52.197731018066406
41 52.832008361816406
42 52.791358947753906
43 109.14999389648438
44 39.60805892944336
45 27.833356857299805
46 16.688016891479492
47 13.216318130493164
48

Create a small ConvNet  

All networks are derived from the base class nn.Module:  
1. In the constructor, declare all the layers you want to use  
2. In the forward function, define how the model is going to be run, from input to output

卷积操作   
输出 output_shape[0] = floor((input_shape[0] - filter_shape[0] + 2 * padding) / stride) + 1  
nn.Conv2d （对由多个输入平面组成的输入信号进行二维卷积）  
$out(N_i, C_{out_j}) = bias(C_{out_j}) + \sum_{k=0}^{C_{in} - 1} weight(C_{out_j}, k) \star input(N_i, k)$  
其中 $\star$ 表示二维互相关（cross-correlation）运算，而不是逐元素乘法。

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MNISTConvNet(nn.Module):
    """
    Small convolutional classifier for single-channel 28x28 images.

    Pipeline: conv(5x5) -> ReLU -> max-pool(2) -> conv(5x5) -> ReLU ->
    max-pool(2) -> flatten -> fc(320, 50) -> ReLU -> fc(50, 10).
    forward returns the raw (unnormalised) class scores of the last layer.
    """

    def __init__(self):
        # declare all the layers you want to use
        super().__init__()
        # nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv1 = nn.Conv2d(1, 10, 5)
        # nn.MaxPool2d(kernel_size, stride=None, padding=0)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(10, 20, 5)
        self.pool2 = nn.MaxPool2d(2, 2)
        # nn.Linear(in_features, out_features); 320 = 20 channels * 4 * 4
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, input):
        """
        Run the network from input to output.

        Args:
            input: tensor of size (N, 1, 28, 28); the shape examples in the
                comments below use N = 1.

        Returns:
            Tensor of size (N, 10) with one raw score (logit) per class.
        """
        # input size is (N, C_in, H, W) ===> (1, 1, 28, 28)
        x = self.conv1(input)
        # output_shape = (input_shape - filter_shape + 2 * padding) / stride + 1
        # output = (28 - 5 + 2*0)/1 + 1 = 24
        #  x size is (N, C_out, H_out, W_out) ===> (1, 10, 24, 24)
        x = F.relu(x)
        x = self.pool1(x)
        #  x size is (N, 10, H_out_2, W_out_2) ===> (1, 10, 12, 12)
        x = self.conv2(x)
        #  x size is (N, 20, H_out_3, W_out_3) ===> (1, 20, 8, 8)
        x = F.relu(x)
        x = self.pool2(x)
        #  x size is (N, 20, H_out_4, W_out_4) ===> (1, 20, 4, 4)
        x = x.view(x.size(0), -1)
        #  x size is (N, 20*H_out_4*W_out_4)   ===> (1, 320)
        x = F.relu(self.fc1(x))
        #  x size is (N, 50)   ===> (1, 50)
        # No activation on the last layer: nn.CrossEntropyLoss expects raw
        # logits, and a ReLU here would clamp every class score to be >= 0.
        x = self.fc2(x)
        #  x size is (N, 10)   ===> (1, 10)
        return x
    
# Build the network and print its layer summary.
model = MNISTConvNet()
print(model)

# Push one random single-channel 28x28 input (a batch of one) through the model.
input_X = torch.randn(1, 1, 28, 28)
output = model(input_X)

# The target stores one class index per sample. Here the single sample is
# labelled as class 3; the model emits 10 scores, one per class.
target = torch.tensor([3], dtype=torch.long)
print(target.shape)

# Cross-entropy between the model scores and the target index, then backprop.
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(output, target)
loss.backward()
print(loss)

MNISTConvNet(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)
torch.Size([1])
tensor(2.1832, grad_fn=<NllLossBackward>)


 **hyperparameters and parameters**  
 
Hyperparameters are set by the user, for example the number of training epochs, the batch size, and the learning rate. During training we tune these hyperparameters to obtain the best performance of the model. Parameters, in contrast, are learned: the learnable parameters (i.e. weights and biases) of the model are stored in the model's parameters and can be accessed with `model.parameters()`.

In [48]:
import torchvision

model = torchvision.models.resnet18(pretrained=True)
# Freeze the pretrained base: requires_grad=False on every existing parameter.
for param in model.parameters():
    param.requires_grad = False
# Replace the last fully connected layer with a new 10-output classifier.
# The input width is read from the existing head instead of hard-coding 512,
# so the line stays correct if the backbone is swapped for another ResNet.
model.fc = nn.Linear(model.fc.in_features, 10)
# optimize only the classifier
optimizer = torch.optim.SGD(model.fc.parameters(), lr=1e-4, momentum=0.9)

# model's state_dict
# state_dict is a dictionary object that maps each entry name (e.g.
# "conv1.weight", "bn1.running_mean") to its tensor. It is built once and
# iterated with items() rather than being rebuilt for every printed row.
for param_name, param_tensor in model.state_dict().items():
    print(param_name, "\t", param_tensor.size())
print(" ")
# The optimizer's state_dict is printed the same way, one key per line.
for var_name, var_value in optimizer.state_dict().items():
    print(var_name, "\t", var_value)

conv1.weight 	 torch.Size([64, 3, 7, 7])
bn1.weight 	 torch.Size([64])
bn1.bias 	 torch.Size([64])
bn1.running_mean 	 torch.Size([64])
bn1.running_var 	 torch.Size([64])
bn1.num_batches_tracked 	 torch.Size([])
layer1.0.conv1.weight 	 torch.Size([64, 64, 3, 3])
layer1.0.bn1.weight 	 torch.Size([64])
layer1.0.bn1.bias 	 torch.Size([64])
layer1.0.bn1.running_mean 	 torch.Size([64])
layer1.0.bn1.running_var 	 torch.Size([64])
layer1.0.bn1.num_batches_tracked 	 torch.Size([])
layer1.0.conv2.weight 	 torch.Size([64, 64, 3, 3])
layer1.0.bn2.weight 	 torch.Size([64])
layer1.0.bn2.bias 	 torch.Size([64])
layer1.0.bn2.running_mean 	 torch.Size([64])
layer1.0.bn2.running_var 	 torch.Size([64])
layer1.0.bn2.num_batches_tracked 	 torch.Size([])
layer1.1.conv1.weight 	 torch.Size([64, 64, 3, 3])
layer1.1.bn1.weight 	 torch.Size([64])
layer1.1.bn1.bias 	 torch.Size([64])
layer1.1.bn1.running_mean 	 torch.Size([64])
layer1.1.bn1.running_var 	 torch.Size([64])
layer1.1.bn1.num_batches_tracked 	 torch.

**Checkpoints**  

Checkpointing means saving a snapshot of the model parameters after every epoch of training. 
Creating checkpoints while training the model allows you to load the saved weights later and resume training from any epoch that has a checkpoint.