In [4]:
import torch
from torch import nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
%config InlineBackend.figure_formats = ['svg']
# Apply matplotlib's 'fivethirtyeight' style sheet to figures drawn in this notebook
plt.style.use('fivethirtyeight')

In [12]:
# Two-layer MLP assembled from built-in layers (256 hidden units, 10 outputs)
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

# Random batch: 2 rows, 20 columns
X = torch.rand(2,20)
# Run a forward pass and display the shape of the result
net(X).shape



torch.Size([2, 10])

In [13]:
class MLP(nn.Module):
    """Multilayer perceptron: a 256-unit hidden layer with ReLU, then a 10-unit output layer."""

    def __init__(self):
        super().__init__()
        # Lazy layers: only the output widths are given here.
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)

    def forward(self, x):
        """Return the output-layer values for the batch ``x``."""
        activations = F.relu(self.hidden(x))
        return self.out(activations)

In [18]:
# Instantiate the hand-written MLP and run it on the batch X created in an earlier cell
net = MLP()
net(X)

tensor([[-0.2365,  0.0398, -0.0060, -0.1194,  0.0429, -0.0620,  0.2160, -0.0042,
          0.1008, -0.1062],
        [-0.2079,  0.0912, -0.0548, -0.1818,  0.0892, -0.1424,  0.0525,  0.0170,
          0.0544, -0.0792]], grad_fn=<AddmmBackward0>)

In [29]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: modules to chain; they are applied in the order given.
    """

    def __init__(self, *args):
        super().__init__()

        # Register each module under its position ("0", "1", ...) so that its
        # parameters are tracked by this container.
        for idx, module in enumerate(args):
            self.add_module(str(idx), module)

    def forward(self, X):
        """Feed ``X`` through every registered module in order and return the result."""
        # Walk the registered slots instead of self.children(): children()
        # yields each distinct module only once, so a layer passed in twice
        # (weight sharing) would otherwise be applied a single time.
        for module in self._modules.values():
            if module is None:  # add_module accepts None placeholders; skip them
                continue
            X = module(X)
        return X


In [37]:
# Same architecture as the nn.Sequential example, built with the custom container
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

# Print each registered child module on its own line
for nnn in net.children():
    print (nnn)

# The container's own repr lists the children under their string indices
print (net)

LazyLinear(in_features=0, out_features=256, bias=True)
ReLU()
LazyLinear(in_features=0, out_features=10, bias=True)
MySequential(
  (0): LazyLinear(in_features=0, out_features=256, bias=True)
  (1): ReLU()
  (2): LazyLinear(in_features=0, out_features=10, bias=True)
)




In [63]:
class FixedHiddenMLP(nn.Module):
    """MLP that combines a trainable layer with a constant random projection.

    The single ``linear`` layer is applied twice, so both applications share
    the same parameters; a fixed random 20x20 matrix sits in between. The
    forward pass returns a scalar tensor.
    """

    def __init__(self) -> None:
        super().__init__()
        # Constant random weights that are not parameters. Registering them as
        # a buffer (rather than a plain tensor attribute) makes .to(), .double()
        # etc. move and cast them together with the parameters.
        # persistent=False keeps the state_dict keys exactly as they were.
        self.register_buffer('rand_weight', torch.rand((20, 20)), persistent=False)
        self.linear = nn.LazyLinear(20)

    def forward(self, X):
        """Return the sum of the rescaled activations as a scalar tensor."""
        X = self.linear(X)
        # Fixed projection plus a constant offset of 1, then ReLU
        X = F.relu(X @ self.rand_weight + 1)

        # Reuse the same layer: its parameters are shared with the first call
        X = self.linear(X)

        # Halve the activations until their absolute sum is at most 1
        while X.abs().sum() > 1:
            X /= 2.0
        return X.sum()

In [65]:
# Run the model on the batch X from an earlier cell; the forward pass yields a single scalar
net = FixedHiddenMLP()
print (net(X))

# The printed module lists only the `linear` submodule
print (net)

#net.rand_weight

tensor(0.2793, grad_fn=<SumBackward0>)
FixedHiddenMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


In [66]:
class NestMLP(nn.Module):
    """Block that wraps a small ``nn.Sequential`` (64 -> ReLU -> 32 -> ReLU) followed by a 16-unit layer."""

    def __init__(self) -> None:
        super().__init__()
        stages = [nn.LazyLinear(64), nn.ReLU(), nn.LazyLinear(32), nn.ReLU()]
        self.net = nn.Sequential(*stages)
        self.linear = nn.LazyLinear(16)

    def forward(self, X):
        """Run the nested stack, then the final linear layer."""
        features = self.net(X)
        return self.linear(features)


In [94]:
# Custom blocks and built-in layers can be mixed freely inside one Sequential
chimera = nn.Sequential(NestMLP(), nn.LazyLinear(20), FixedHiddenMLP())
print(chimera(X))

print(chimera)

# Counted after the forward pass above, i.e. once the lazy layers have concrete shapes
print('Number of parameters in model is: ', sum(p.numel() for p in chimera.parameters()))

tensor(0.0212, grad_fn=<SumBackward0>)
Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=20, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=32, bias=True)
      (3): ReLU()
    )
    (linear): Linear(in_features=32, out_features=16, bias=True)
  )
  (1): Linear(in_features=16, out_features=20, bias=True)
  (2): FixedHiddenMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)
Numer of parameters in model is:  4712




In [83]:
# List the dotted names under which every parameter of the nested model is registered
for name, p in chimera.named_parameters():
    print (name)

0.net.0.weight
0.net.0.bias
0.net.2.weight
0.net.2.bias
0.linear.weight
0.linear.bias
1.weight
1.bias
2.linear.weight
2.linear.bias


In [92]:
# Look up one weight tensor through its dotted state_dict key
chimera.state_dict()['0.net.0.weight']

tensor([[-0.1260, -0.0682,  0.1857,  ..., -0.2194, -0.0924,  0.0373],
        [-0.1225,  0.0505, -0.1204,  ..., -0.1875,  0.0770,  0.1464],
        [ 0.0537, -0.1504,  0.2096,  ...,  0.2202, -0.2199, -0.1091],
        ...,
        [ 0.1355,  0.2057,  0.1611,  ...,  0.1853, -0.2099,  0.1138],
        [ 0.0966, -0.0052, -0.0559,  ..., -0.0286, -0.0376, -0.1118],
        [ 0.0082,  0.0724, -0.0150,  ..., -0.0298, -0.0066, -0.1291]])

In [97]:
# Small MLP: one hidden layer of 8 units and a single output
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
# Batch of 2 examples with 4 features; rebinds the notebook-level X
X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [99]:
# Parameters of the output layer (index 2 of the Sequential) as a name -> tensor mapping
net[2].state_dict()

OrderedDict([('weight',
              tensor([[-0.2361, -0.1360,  0.3349,  0.1423, -0.3353, -0.0525, -0.1059, -0.0160]])),
             ('bias', tensor([0.0776]))])

In [113]:
# Bias of the output layer: its wrapper type and the underlying values.
# Only the last expression of a cell is displayed, so this line and the next
# are evaluated but produce no output.
type(net[2].bias), net[2].bias.data

# Gradient slot of the weight; identity comparison is the correct test for None
net[2].weight.grad is None

# Name and shape of every parameter in the network
print([(name, param.shape) for name, param in net.named_parameters()])

[('0.weight', torch.Size([8, 4])), ('0.bias', torch.Size([8])), ('2.weight', torch.Size([1, 8])), ('2.bias', torch.Size([1]))]


In [125]:
# One layer object placed at two positions of the Sequential, so both positions use the same parameters
shared = nn.LazyLinear(8)
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.LazyLinear(1))

# Forward pass on the batch X from an earlier cell, before the weights are inspected
net(X)

# The rows compare equal both before and after the in-place edit:
# net[2] and net[4] refer to the same layer, not to copies
print (net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0,0] = 100
print (net[2].weight.data[0] == net[4].weight.data[0])



tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])




In [129]:
# Rebuild the small 8-unit MLP and run one batch through it before the initialisers below are applied
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape



torch.Size([2, 1])

In [131]:
def init_normal(module):
    """Initialise a plain ``nn.Linear``: weights ~ N(0, 0.01^2), bias zero.

    Intended for use with ``net.apply(init_normal)``. Only modules whose type
    is exactly ``nn.Linear`` are touched; every other module (subclasses
    included) is left as it is. A layer built with ``bias=False`` gets its
    weight initialised and is otherwise skipped.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.normal_(module.weight, mean=0, std=0.01)
    # bias is None when the layer was created with bias=False
    if module.bias is not None:
        nn.init.zeros_(module.bias)

# Run init_normal over the modules of net; the returned module is displayed as the cell output
net.apply(init_normal)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [133]:
def init_constant(module):
    """Initialise a plain ``nn.Linear``: every weight set to 1, bias set to 0.

    Only modules whose type is exactly ``nn.Linear`` are touched. A layer
    built with ``bias=False`` gets its weight initialised and is otherwise
    skipped.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 1)
    # bias is None when the layer was created with bias=False
    if module.bias is not None:
        nn.init.zeros_(module.bias)
# Re-initialise net with init_constant, then display the first weight row and first bias of layer 0
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [134]:
def init_xavier(module):
    """Apply Xavier-uniform initialisation to the weight of an exact ``nn.Linear``.

    Modules of any other type are ignored; the bias is not modified.
    """
    is_plain_linear = type(module) is nn.Linear
    if not is_plain_linear:
        return
    nn.init.xavier_uniform_(module.weight)
def init_42(module):
    """Fill the weight of an exact ``nn.Linear`` with the constant 42.

    Modules of any other type are ignored; the bias is not modified.
    """
    if type(module) is not nn.Linear:
        return
    fill_value = 42
    nn.init.constant_(module.weight, fill_value)

# Different initialisers per layer: Xavier for the first Linear, the constant 42 for the last
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([0.2354, 0.1718, 0.1633, 0.3361])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [135]:
# Fresh network; no input has been passed through it yet
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

In [136]:
# Before any forward pass the lazy layer's weight is still an uninitialised placeholder
net[0].weight

<UninitializedParameter>

In [139]:
# Passing a batch with 20 features through the network gives the first layer a concrete weight shape
X = torch.randn(2,20)
net(X)

net[0].weight.shape

torch.Size([256, 20])

In [140]:
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the overall mean from its input."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, X):
        """Return ``X`` shifted so that the mean over all of its elements is zero."""
        overall_mean = X.mean()
        return X - overall_mean


# Stand-alone instance used in the next cell to check the layer on a small vector
layer = CenteredLayer()

In [141]:
# The input has mean 3, so the output should be the values shifted by -3
layer(torch.tensor([1.0, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [142]:
# The custom layer can be used inside a Sequential like any built-in layer
net = nn.Sequential(nn.LazyLinear(128), CenteredLayer())



In [143]:
# After centring, the mean of the output should be zero up to floating-point error
Y = net(torch.rand(4, 8))
Y.mean()

tensor(3.7253e-09, grad_fn=<MeanBackward0>)

In [151]:
class MyLinear(nn.Module):
    """Fully connected layer with a built-in ReLU: ``relu(X @ weight + bias)``.

    Args:
        in_units: size of the last dimension of the input.
        out_units: number of output features.
    """

    def __init__(self, in_units, out_units) -> None:
        super().__init__()
        # Wrapping the tensors in nn.Parameter registers them on the module;
        # both start from standard-normal samples.
        self.weight = nn.Parameter(torch.randn(in_units, out_units))
        self.bias = nn.Parameter(torch.randn(out_units,))

    def forward(self, X):
        """Return the ReLU of the affine transform of ``X``."""
        # Use the Parameters themselves, not `.data`: going through `.data`
        # bypasses autograd, so no gradient would ever reach weight or bias.
        linear = X @ self.weight + self.bias
        return F.relu(linear)



In [152]:
# 5 input units, 3 output units; the weight is stored as (in_units, out_units)
linear = MyLinear(5, 3)
print (linear.weight)

Parameter containing:
tensor([[ 1.4096,  0.8063, -0.0838],
        [-1.6516, -0.9158, -0.4086],
        [-0.2189, -0.5343,  0.2757],
        [-0.6843, -0.5116, -0.4355],
        [-1.0724,  1.0991, -1.6686]], requires_grad=True)


In [154]:
class MLP(nn.Module):
    """MLP used for the save/load example: 256 hidden units with ReLU, 10 outputs.

    The final layer is stored under the attribute name ``output``.
    """

    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, x):
        """Return the output-layer values for the batch ``x``."""
        h = self.hidden(x)
        h = F.relu(h)
        return self.output(h)

# Reference model, input and output; Y is compared with the reloaded model's output further down
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)



In [155]:
# Display the reference output
Y

tensor([[-0.1782, -0.0168,  0.2794,  0.2977,  0.0740, -0.2114, -0.1714, -0.0461,
          0.0619, -0.4791],
        [ 0.1122,  0.1589, -0.2360,  0.1217,  0.6947, -0.1850,  0.0623, -0.3756,
          0.0451, -0.3886]], grad_fn=<AddmmBackward0>)

In [156]:
# Save the model's state_dict (not the module object) to 'mlp.params', a path relative to the working directory
torch.save(net.state_dict(), 'mlp.params')

In [157]:
# Fresh instance of the same architecture, to receive the saved parameters
clone = MLP()



In [158]:
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, instead of allowing arbitrary pickled objects.
clone.load_state_dict(torch.load('mlp.params', weights_only=True))
# Switch to evaluation mode; the returned module is displayed as the cell output
clone.eval()

MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

In [159]:
# Same parameters and same input, so the outputs are compared element by element with exact equality
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

In [163]:
def cpu():
    """Return the ``torch.device`` object that designates the CPU."""
    cpu_device = torch.device('cpu')
    return cpu_device

# Ask PyTorch whether the MPS backend can be used on this machine
torch.backends.mps.is_available()

True

In [164]:
# No device is specified here; the displayed device shows where the tensor was placed
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [175]:
# Prefer the MPS device, but fall back to the CPU so the cell also runs on
# machines where that backend is not available.
target = 'mps:0' if torch.backends.mps.is_available() else 'cpu'
X = torch.ones(2, 3, device=target)
Y = torch.ones(2, 3, device=target)

In [176]:
# Display both tensors together with the device they are stored on
X, Y

(tensor([[1., 1., 1.],
         [1., 1., 1.]], device='mps:0'),
 tensor([[1., 1., 1.],
         [1., 1., 1.]], device='mps:0'))

In [177]:
# Both operands were created on the same device; the displayed result carries that device too
X + Y

tensor([[2., 2., 2.],
        [2., 2., 2.]], device='mps:0')

In [178]:
net = nn.Sequential(nn.LazyLinear(1))
# Move the model to the MPS device when it exists, otherwise keep it on the
# CPU, so the cell does not fail on machines without that backend.
net = net.to(device='mps:0' if torch.backends.mps.is_available() else 'cpu')



In [179]:
# Forward pass with the model and the input X from the earlier cells
net(X)

tensor([[0.0100],
        [0.0100]], device='mps:0', grad_fn=<LinearBackward0>)

In [181]:
# Remove the notebook-level name X
del X

In [182]:
# Remove the remaining notebook-level names.
# NOTE(review): the recorded NameError suggests this cell was run when Y had
# already been deleted (e.g. a second execution) - confirm and clear the stale output.
del Y

del net

NameError: name 'Y' is not defined