In [None]:
# code is adapted from d2l.ai - Chapter 5 with additions and other experiments

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

## Layers and Blocks

In [70]:
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[ 0.2049,  0.1291, -0.2545, -0.1309, -0.3377,  0.1748,  0.1382, -0.0494,
         -0.0386,  0.1386],
        [ 0.3130,  0.1367, -0.3545,  0.0466, -0.2267,  0.1971,  0.1951, -0.0432,
          0.0279,  0.2282]], grad_fn=<AddmmBackward>)

## Custom Blocks

In [5]:
class MLP(nn.Module):
    # Declare a layer with model parameters. Here, we declare two fully
    # connected layers
    def __init__(self):
        # Call the constructor of the `MLP` parent class `Module` to perform
        # the necessary initialization. In this way, other function arguments
        # can also be specified during class instantiation, such as the model
        # parameters, `params` (to be described later)
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # Hidden layer
        self.out = nn.Linear(256, 10)  # Output layer

    # Define the forward propagation of the model, that is, how to return the
    # required model output based on the input `X`
    def forward(self, X):
        # Note here we use the funtional version of ReLU defined in the
        # nn.functional module.
        return self.out(F.relu(self.hidden(X)))

In [6]:
net = MLP()

net(X)

tensor([[ 0.0198, -0.1493, -0.0520, -0.0848, -0.3571,  0.0851,  0.0647,  0.0770,
         -0.3174, -0.2176],
        [ 0.1417, -0.2123,  0.0369, -0.1612, -0.3030,  0.1386,  0.1829,  0.0036,
         -0.1788, -0.2200]], grad_fn=<AddmmBackward>)

In [None]:
#net.apply(init_fn) #kaiming_uniform

## Custom Sequential Block

In [7]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # Here, `module` is an instance of a `Module` subclass. We save it
            # in the member variable `_modules` of the `Module` class, and its
            # type is OrderedDict
            self._modules[str(idx)] = module

    def forward(self, X):
        # OrderedDict guarantees that members will be traversed in the order
        # they were added
        for block in self._modules.values():
            X = block(X)
        return X

In [8]:
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[-0.0233,  0.0504, -0.1248, -0.1506,  0.0467,  0.0192,  0.2111,  0.0061,
          0.3107, -0.0725],
        [ 0.0296,  0.1258, -0.0918, -0.1302,  0.0314, -0.1343,  0.0989,  0.1752,
          0.1854, -0.1540]], grad_fn=<AddmmBackward>)

## Tensor Ops in forward()

In [68]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # Random weight parameters that will not compute gradients and
        # therefore keep constant during training
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
        self.linear2 = nn.Linear(20, 21)
        self.linear3 = nn.Linear(21, 20)

    def forward(self, X):
        X2 = X.clone()
        X2 = self.linear2(X2)
        X2 = self.linear3(X2)
        X = self.linear(X)
        # Use the created constant parameters, as well as the `relu` and `mm`
        # functions
        X += X2
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the fully-connected layer. This is equivalent to sharing
        # parameters with two fully-connected layers
        X = self.linear(X)
        # Control flow
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [71]:
net = FixedHiddenMLP()
net(X)

tensor(0.1283, grad_fn=<SumBackward0>)

## Nested Blocks

In [26]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(-0.1106, grad_fn=<SumBackward0>)

## Params management

In [72]:
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

tensor([[-0.2727],
        [-0.2193]], grad_fn=<AddmmBackward>)

In [28]:
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.0528,  0.4438, -0.3360, -0.4050],
                      [-0.2925, -0.3386, -0.2287, -0.0947],
                      [ 0.1488,  0.4780,  0.0384,  0.2653],
                      [-0.1007,  0.0819,  0.4925,  0.4800],
                      [ 0.4627,  0.2437,  0.4010, -0.0476],
                      [-0.2228, -0.2406, -0.3776, -0.1833],
                      [-0.2953, -0.3693,  0.3597,  0.3371],
                      [ 0.2478,  0.2785, -0.1406,  0.0031]])),
             ('0.bias',
              tensor([ 0.4938, -0.3451, -0.3527,  0.2861, -0.4172, -0.2159,  0.2688, -0.0559])),
             ('2.weight',
              tensor([[ 0.0450,  0.0599, -0.2341, -0.2108, -0.1914, -0.0172, -0.3188,  0.1648]])),
             ('2.bias', tensor([0.0291]))])

In [34]:
net[0].state_dict()

OrderedDict([('weight',
              tensor([[ 0.0528,  0.4438, -0.3360, -0.4050],
                      [-0.2925, -0.3386, -0.2287, -0.0947],
                      [ 0.1488,  0.4780,  0.0384,  0.2653],
                      [-0.1007,  0.0819,  0.4925,  0.4800],
                      [ 0.4627,  0.2437,  0.4010, -0.0476],
                      [-0.2228, -0.2406, -0.3776, -0.1833],
                      [-0.2953, -0.3693,  0.3597,  0.3371],
                      [ 0.2478,  0.2785, -0.1406,  0.0031]])),
             ('bias',
              tensor([ 0.4938, -0.3451, -0.3527,  0.2861, -0.4172, -0.2159,  0.2688, -0.0559]))])

## Targeting Params

In [37]:
net[0].bias.data

tensor([ 0.4938, -0.3451, -0.3527,  0.2861, -0.4172, -0.2159,  0.2688, -0.0559])

In [35]:
net[0].weight.data[:4,:4] = 99.

net[0].weight.data

tensor([[ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
        [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
        [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
        [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
        [ 4.6275e-01,  2.4371e-01,  4.0105e-01, -4.7602e-02],
        [-2.2279e-01, -2.4063e-01, -3.7756e-01, -1.8331e-01],
        [-2.9535e-01, -3.6929e-01,  3.5972e-01,  3.3714e-01],
        [ 2.4776e-01,  2.7851e-01, -1.4060e-01,  3.1351e-03]])

In [38]:
net.state_dict()['2.bias'].data

tensor([0.0291])

## All params

In [39]:
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [None]:
## <named_parameters()>, children, parameters, modules

In [40]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [41]:
list(net.named_parameters())

[('0.weight',
  Parameter containing:
  tensor([[ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
          [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
          [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
          [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
          [ 4.6275e-01,  2.4371e-01,  4.0105e-01, -4.7602e-02],
          [-2.2279e-01, -2.4063e-01, -3.7756e-01, -1.8331e-01],
          [-2.9535e-01, -3.6929e-01,  3.5972e-01,  3.3714e-01],
          [ 2.4776e-01,  2.7851e-01, -1.4060e-01,  3.1351e-03]],
         requires_grad=True)),
 ('0.bias',
  Parameter containing:
  tensor([ 0.4938, -0.3451, -0.3527,  0.2861, -0.4172, -0.2159,  0.2688, -0.0559],
         requires_grad=True)),
 ('2.weight',
  Parameter containing:
  tensor([[ 0.0450,  0.0599, -0.2341, -0.2108, -0.1914, -0.0172, -0.3188,  0.1648]],
         requires_grad=True)),
 ('2.bias',
  Parameter containing:
  tensor([0.0291], requires_grad=True))]

In [None]:
## <parameters()>

In [42]:
list(net.parameters())

[Parameter containing:
 tensor([[ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
         [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
         [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
         [ 9.9000e+01,  9.9000e+01,  9.9000e+01,  9.9000e+01],
         [ 4.6275e-01,  2.4371e-01,  4.0105e-01, -4.7602e-02],
         [-2.2279e-01, -2.4063e-01, -3.7756e-01, -1.8331e-01],
         [-2.9535e-01, -3.6929e-01,  3.5972e-01,  3.3714e-01],
         [ 2.4776e-01,  2.7851e-01, -1.4060e-01,  3.1351e-03]],
        requires_grad=True),
 Parameter containing:
 tensor([ 0.4938, -0.3451, -0.3527,  0.2861, -0.4172, -0.2159,  0.2688, -0.0559],
        requires_grad=True),
 Parameter containing:
 tensor([[ 0.0450,  0.0599, -0.2341, -0.2108, -0.1914, -0.0172, -0.3188,  0.1648]],
        requires_grad=True),
 Parameter containing:
 tensor([0.0291], requires_grad=True)]

In [43]:
list(net.named_children())

[('0', Linear(in_features=4, out_features=8, bias=True)),
 ('1', ReLU()),
 ('2', Linear(in_features=8, out_features=1, bias=True))]

In [44]:
list(net.children())

[Linear(in_features=4, out_features=8, bias=True),
 ReLU(),
 Linear(in_features=8, out_features=1, bias=True)]

## Collecting Parameters

In [47]:
X.shape

torch.Size([2, 4])

In [45]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4),
                         nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # Nested here
        net.add_module(f'block {i}', block1())
    return net

def block3():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block_2 {i}', block2())
    return net

rgnet2 = nn.Sequential(block3(), nn.Linear(4,1))
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)



tensor([[-0.2556],
        [-0.2556]], grad_fn=<AddmmBackward>)

In [48]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [51]:
rgnet.add_module('fc', nn.Linear(1,1, bias=True))
rgnet

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
  (fc): Linear(in_features=1, out_features=1, bias=True)
)

In [52]:
list(rgnet.named_parameters())

[('0.block 0.0.weight',
  Parameter containing:
  tensor([[ 0.1639,  0.2251, -0.4275, -0.4276],
          [-0.1055,  0.1964, -0.2566, -0.3146],
          [-0.0649,  0.1358, -0.2405, -0.4776],
          [-0.2704, -0.0756, -0.1673,  0.4028],
          [-0.1686,  0.3300,  0.0248,  0.0440],
          [-0.0455, -0.0727,  0.1271,  0.0872],
          [-0.3912, -0.1992,  0.0052, -0.0626],
          [-0.1598,  0.0920, -0.2388, -0.4927]], requires_grad=True)),
 ('0.block 0.0.bias',
  Parameter containing:
  tensor([ 0.3822,  0.1829,  0.0156, -0.1684,  0.0761,  0.1758, -0.0640, -0.4413],
         requires_grad=True)),
 ('0.block 0.2.weight',
  Parameter containing:
  tensor([[ 2.9448e-04, -1.2497e-01, -2.6117e-01,  3.0121e-01, -2.4586e-01,
            1.1525e-01,  1.1274e-01, -2.8943e-01],
          [ 6.0678e-05, -3.0486e-01, -1.8523e-01,  1.7244e-01, -8.3854e-02,
           -2.0622e-01, -1.7788e-01,  1.3064e-01],
          [ 6.0024e-02,  3.1581e-01,  2.8538e-02,  2.4565e-02, -3.3780e-01,
       

In [None]:
## <parameters()>

In [55]:
list(rgnet.parameters())

[Parameter containing:
 tensor([[ 0.1639,  0.2251, -0.4275, -0.4276],
         [-0.1055,  0.1964, -0.2566, -0.3146],
         [-0.0649,  0.1358, -0.2405, -0.4776],
         [-0.2704, -0.0756, -0.1673,  0.4028],
         [-0.1686,  0.3300,  0.0248,  0.0440],
         [-0.0455, -0.0727,  0.1271,  0.0872],
         [-0.3912, -0.1992,  0.0052, -0.0626],
         [-0.1598,  0.0920, -0.2388, -0.4927]], requires_grad=True),
 Parameter containing:
 tensor([ 0.3822,  0.1829,  0.0156, -0.1684,  0.0761,  0.1758, -0.0640, -0.4413],
        requires_grad=True),
 Parameter containing:
 tensor([[ 2.9448e-04, -1.2497e-01, -2.6117e-01,  3.0121e-01, -2.4586e-01,
           1.1525e-01,  1.1274e-01, -2.8943e-01],
         [ 6.0678e-05, -3.0486e-01, -1.8523e-01,  1.7244e-01, -8.3854e-02,
          -2.0622e-01, -1.7788e-01,  1.3064e-01],
         [ 6.0024e-02,  3.1581e-01,  2.8538e-02,  2.4565e-02, -3.3780e-01,
          -3.0268e-01,  2.7581e-01, -3.2305e-01],
         [ 3.1388e-01, -2.5387e-01,  1.0965e-01

In [56]:
list(rgnet.named_children())

[('0',
  Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )),
 ('1', Linear(in_features=4, out_features=1, bias=True)),
 ('fc', Linear(in_features=1, out_features=1, bias=True))]

In [57]:
list(rgnet.children())

[Sequential(
   (block 0): Sequential(
     (0): Linear(in_features=4, out_features=8, bias=True)
     (1): ReLU()
     (2): Linear(in_features=8, out_features=4, bias=True)
     (3): ReLU()
   )
   (block 1): Sequential(
     (0): Linear(in_features=4, out_features=8, bias=True)
     (1): ReLU()
     (2): Linear(in_features=8, out_features=4, bias=True)
     (3): ReLU()
   )
   (block 2): Sequential(
     (0): Linear(in_features=4, out_features=8, bias=True)
     (1): ReLU()
     (2): Linear(in_features=8, out_features=4, bias=True)
     (3): ReLU()
   )
   (block 3): Sequential(
     (0): Linear(in_features=4, out_features=8, bias=True)
     (1): ReLU()
     (2): Linear(in_features=8, out_features=4, bias=True)
     (3): ReLU()
   )
 ),
 Linear(in_features=4, out_features=1, bias=True),
 Linear(in_features=1, out_features=1, bias=True)]

In [58]:
rgnet[0][1][0].bias.data

tensor([ 0.3766, -0.0611, -0.4432, -0.4459, -0.0248, -0.1326, -0.4823, -0.4727])

In [59]:
rgnet.state_dict()['0.block 1.0.bias']

tensor([ 0.3766, -0.0611, -0.4432, -0.4459, -0.0248, -0.1326, -0.4823, -0.4727])

In [60]:
rgnet2(X)

tensor([[-0.0186],
        [-0.0186]], grad_fn=<AddmmBackward>)

In [61]:
from IPython.display import clear_output

In [63]:
list(rgnet2.named_modules())

for mod in rgnet.named_modules():
    print(mod)
    input()
    clear_output()

('0.block 2.1', ReLU())


KeyboardInterrupt: Interrupted by user

In [None]:
rgnet2

In [None]:
## 1 -> 2 -> 3 -> 4 

In [66]:
rgnet3 = nn.Module()
rgnet3.add_module('first', rgnet2)
rgnet3

Sequential(
  (first): Sequential(
    (0): Sequential(
      (block_2 0): Sequential(
        (block 0): Sequential(
          (0): Linear(in_features=4, out_features=8, bias=True)
          (1): ReLU()
          (2): Linear(in_features=8, out_features=4, bias=True)
          (3): ReLU()
        )
        (block 1): Sequential(
          (0): Linear(in_features=4, out_features=8, bias=True)
          (1): ReLU()
          (2): Linear(in_features=8, out_features=4, bias=True)
          (3): ReLU()
        )
        (block 2): Sequential(
          (0): Linear(in_features=4, out_features=8, bias=True)
          (1): ReLU()
          (2): Linear(in_features=8, out_features=4, bias=True)
          (3): ReLU()
        )
        (block 3): Sequential(
          (0): Linear(in_features=4, out_features=8, bias=True)
          (1): ReLU()
          (2): Linear(in_features=8, out_features=4, bias=True)
          (3): ReLU()
        )
      )
      (block_2 1): Sequential(
        (block 0): Seq

## Initialization

In [None]:
## weights are initialized with kaiming uniform by default

In [67]:
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [73]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)

net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0040, -0.0066,  0.0019, -0.0078]), tensor(0.))

In [74]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)

net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [75]:
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.0836, -0.3756,  0.3619,  0.3465])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [None]:
"""
U(5,10)   with probability 1/4
0         with probability 1/2
U(−10,−5) with probability 1/4
"""
0

In [83]:
def my_init(m):
    if type(m) == nn.Linear:
        print(
            "Init",
            *[(name, param.shape) for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 0.0000, -0.0000, -9.3964, -8.0282],
        [-9.0053,  0.0000, -0.0000, -7.8644]], grad_fn=<SliceBackward>)

## Tied Params

In [94]:
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared,
                    nn.ReLU(), nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


## Transfer Learning

In [84]:
from torchvision.models import resnet18

In [85]:
model = resnet18(pretrained=True)

In [87]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [86]:
list(model.named_children())

[('conv1',
  Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)),
 ('bn1',
  BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)),
 ('relu', ReLU(inplace=True)),
 ('maxpool',
  MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)),
 ('layer1',
  Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=

In [115]:
## Rough way

module_list = list(model.children())
final = module_list[-1]
in_feats = final.in_features
in_feats

new_model = nn.Sequential(*module_list[:-1],nn.Linear(in_feats, 10, bias=True))
new_model

Sequential(
  (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [115]:
## Rough way

module_list = list(model.children())
final = module_list[-1]
initial = module_list[0]
new_initial = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

new_initial.weight.data = initial.weight.data.mean(-3)

module_list[0] = new_initial

in_feats = final.in_features
in_feats

new_model = nn.Sequential(*module_list[:-1],nn.Linear(in_feats, 10, bias=True))
new_model

Sequential(
  (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [None]:
# Kaiming He 

## File I/O

In [None]:
## Anything serializable

In [88]:
x = torch.arange(4)
torch.save(x, 'x-file')

x2 = torch.load('x-file')
x2

tensor([0, 1, 2, 3])

In [89]:
y = torch.zeros(4)
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [90]:
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [95]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [96]:
Y  = net(X)

In [97]:
torch.save(net.state_dict(), 'mlp_params')

In [98]:
clone = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared,
                    nn.ReLU(), nn.Linear(8, 1))
clone.load_state_dict(torch.load('mlp_params'))
clone.eval()

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=8, bias=True)
  (5): ReLU()
  (6): Linear(in_features=8, out_features=1, bias=True)
)

In [99]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True],
        [True]])

## GPUs

In [114]:
!nvidia-smi

Sun Jun 13 18:36:57 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX330      WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   64C    P0    N/A /  N/A |    471MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [103]:
torch.device('cpu'), torch.cuda.device('cuda'), torch.cuda.device('cuda:0')

(device(type='cpu'),
 <torch.cuda.device at 0x1f913f864c0>,
 <torch.cuda.device at 0x1f913f86ee0>)

In [104]:
torch.cuda.device_count()

1

In [105]:
def try_gpu(i=0): 
    """Return gpu(i) if exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    devices = [
        torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [106]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [107]:
x = x.to(torch.device('cuda'))
# x = x.cuda()
# x = x.cpu()
x.device

device(type='cuda', index=0)

In [108]:
x.numpy()

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [109]:
x.cpu().numpy()

array([1, 2, 3], dtype=int64)

In [110]:
x = torch.ones(2,3, device=try_gpu(0))
x

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [111]:
Z = x.cpu()
print(x)
print(Z)
x+Z

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')
tensor([[1., 1., 1.],
        [1., 1., 1.]])


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [112]:
net[0].weight.data.device

device(type='cpu')

In [113]:
net.to(torch.device('cuda'))
# net.cuda()
net[0].weight.data.device

device(type='cuda', index=0)

In [None]:
"""
delta = 1e-3
for epoch in range(epochs):
    loss = 
    
"""

## Extras

In [115]:
## Rough way

module_list = list(model.children())
final = module_list[-1]
initial = module_list[0]
new_initial = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

new_initial.weight.data = initial.weight.data.mean(-3)

module_list[0] = new_initial

in_feats = final.in_features
in_feats

new_model = nn.Sequential(*module_list[:-1],nn.Linear(in_feats, 10, bias=True))
new_model

Sequential(
  (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con