# Building simple NN with pytorch

A linear regression fit: 

$y = w_1 * x_1 + w_2 * x_2 + b$

$Y = X \vec w + b$

In [1]:
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms

# from torch.nn import functional as F

In [2]:
def synthetic_data(w, b, num_examples):
    """Generate y = Xw + b + noise"""
    # draw from normal distribution
    X = torch.normal(0, 1, (num_examples, len(w)))
    # make use of broadcasting here
    y = torch.matmul(X, w) + b
    # add some noise
    y += torch.normal(0, 0.01, y.shape)
    return X, y.reshape((-1, 1))


def load_array(data_arrays, batch_size, is_train=True):
    """Construct a Pytorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

In [3]:
true_w = torch.tensor([2, 3.4])
true_b = 4.2
features, labels = synthetic_data(true_w, true_b, 1000)
features.shape, labels.shape

(torch.Size([1000, 2]), torch.Size([1000, 1]))

In [4]:
batch_size = 10
data_iter = load_array((features, labels), batch_size)

In [5]:
# sanity test
next(iter(data_iter))

[tensor([[ 1.7426, -1.8236],
         [-0.0831, -1.6884],
         [ 0.6476, -0.1389],
         [-0.0250, -0.4641],
         [-1.1752,  0.1984],
         [-1.6225, -1.8055],
         [-0.8927,  0.5474],
         [ 0.4309,  1.1383],
         [ 1.5729, -0.3027],
         [ 0.8980, -0.3180]]),
 tensor([[ 1.4806],
         [-1.6905],
         [ 5.0158],
         [ 2.5794],
         [ 2.5536],
         [-5.1871],
         [ 4.2896],
         [ 8.9372],
         [ 6.3109],
         [ 4.9221]])]

In [6]:
# the fully-connected layer is defined in the Linear class
model = nn.Sequential(nn.Linear(2, 1))

In [7]:
# initialize w1, w2, and b
model[0].weight.data.normal_(0, 0.01)
model[0].bias.data.fill_(0)

tensor([0.])

In [8]:
# define the cost function and optmizer
loss = nn.MSELoss()  # squared L2 norm
trainer = torch.optim.SGD(model.parameters(), lr=0.03)

In [9]:
# the most simplistic training loop
num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        l_estimation = loss(model(X), y)
        trainer.zero_grad()  # reset the gradients
        l_estimation.backward()  # find the new gradients based on the current loss
        trainer.step()  # update the parameters

    l_estimation = loss(model(features), labels)
    print(f"epoch {epoch + 1}, loss {l_estimation:f}")

epoch 1, loss 0.000196
epoch 2, loss 0.000102
epoch 3, loss 0.000102


## Logistic/Softmax regression and cross entropy loos

The logistic function:
$\displaystyle f(x)= \sigma(x) = {\frac {1}{1+e^{-x}}}$

The softmax function:


The cross entropy function: 
${\displaystyle H(p,q)=-\sum _{x\in {\mathcal {X}}}p(x)\,\log q(x)}{\displaystyle H(p,q)=-\sum _{x\in {\mathcal {X}}}p(x)\,\log q(x)}$

In [10]:
def load_data_fashion_mnist(batch_size, resize=None):
    """Donwload the Fashion-MNIST dataset and load it into memory"""
    transformations = [transforms.ToTensor()]
    if resize:
        transformations.insert(0, transforms.Resize(resize))

    transformations = transforms.Compose(transformations)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="../data", train=True, transform=transformations, download=True
    )
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=transformations, download=True
    )
    return (
        data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=2),
        data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=2),
    )


def initialize_weights_randomly(layer):
    if isinstance(layer, nn.Linear):
        nn.init.normal_(layer.weight, std=0.01)


def init_normal(layer):
    if isinstance(layer, nn.Linear):
        nn.init.normal_(layer.weight, mean=0, std=0.01)
        nn.init.zeros_(layer.bias)


def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())


def train_loop(net, data_iter, trainer, loss_function, num_epochs=3):
    for epoch in range(num_epochs):
        for X, y in data_iter:
            y_hat = net(X)
            l_estimation = loss_function(y_hat, y)
            trainer.zero_grad()  # reset the gradients
            l_estimation.backward()  # find the new partial derivatives & gradients based on the current loss
            trainer.step()  # update the parameters with 1 iteration of Gradient Descent

        acc = accuracy(net(X), y)
        print(f"epoch {epoch + 1}, accuracy {acc:.2f}")

In [11]:
model = nn.Sequential(nn.Flatten(), nn.Linear(784, 10))
model.apply(initialize_weights_randomly)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=10, bias=True)
)

In [12]:
train_iter, test_iter = load_data_fashion_mnist(batch_size=100)
# for X, y in train_iter:
#     print(X.shape, y.shape)
#     break

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100.0%


Extracting ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100.0%
3.0%

Extracting ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100.0%
100.0%


Extracting ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw



In [13]:
train_loop(
    model,
    data_iter=train_iter,
    trainer=torch.optim.SGD(model.parameters(), lr=0.1),
    loss_function=nn.CrossEntropyLoss(),
    num_epochs=10,
)

epoch 1, accuracy 84.00
epoch 2, accuracy 82.00
epoch 3, accuracy 83.00
epoch 4, accuracy 85.00
epoch 5, accuracy 91.00
epoch 6, accuracy 84.00
epoch 7, accuracy 90.00
epoch 8, accuracy 88.00
epoch 9, accuracy 89.00
epoch 10, accuracy 85.00


### Inspecting the model


In [14]:
model.state_dict()

OrderedDict([('1.weight',
              tensor([[ 0.0008,  0.0025, -0.0004,  ..., -0.0317, -0.0184, -0.0082],
                      [-0.0033, -0.0025, -0.0019,  ..., -0.0153,  0.0078, -0.0076],
                      [ 0.0119,  0.0011, -0.0039,  ...,  0.0806,  0.0449,  0.0015],
                      ...,
                      [ 0.0245, -0.0116,  0.0023,  ..., -0.0103, -0.0027, -0.0094],
                      [ 0.0072, -0.0151, -0.0237,  ..., -0.0823, -0.0302, -0.0224],
                      [-0.0005,  0.0022, -0.0029,  ..., -0.0169,  0.0070,  0.0028]])),
             ('1.bias',
              tensor([ 0.5171, -0.6317, -0.0854,  0.4342, -1.4327,  2.6480,  0.5326,  0.0185,
                      -0.5418, -1.4618]))])

In [15]:
model[1].state_dict()

OrderedDict([('weight',
              tensor([[ 0.0008,  0.0025, -0.0004,  ..., -0.0317, -0.0184, -0.0082],
                      [-0.0033, -0.0025, -0.0019,  ..., -0.0153,  0.0078, -0.0076],
                      [ 0.0119,  0.0011, -0.0039,  ...,  0.0806,  0.0449,  0.0015],
                      ...,
                      [ 0.0245, -0.0116,  0.0023,  ..., -0.0103, -0.0027, -0.0094],
                      [ 0.0072, -0.0151, -0.0237,  ..., -0.0823, -0.0302, -0.0224],
                      [-0.0005,  0.0022, -0.0029,  ..., -0.0169,  0.0070,  0.0028]])),
             ('bias',
              tensor([ 0.5171, -0.6317, -0.0854,  0.4342, -1.4327,  2.6480,  0.5326,  0.0185,
                      -0.5418, -1.4618]))])

In [16]:
for n, _ in model.named_parameters():
    print(n)

1.weight
1.bias


In [17]:
print(model)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=10, bias=True)
)


#### Looking into nested architecture

In [18]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(4, 8), nn.ReLU())


def block2(size: int = 2):
    net = nn.Sequential()
    for i in range(size):
        net.add_module(f"inner block {i}", block1())
    return net

In [19]:
nested_net = nn.Sequential(block2(), nn.Linear(4, 1))

In [20]:
print(nested_net)

Sequential(
  (0): Sequential(
    (inner block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=4, out_features=8, bias=True)
      (3): ReLU()
    )
    (inner block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=4, out_features=8, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [21]:
# accessing a specific layer. The first Linear inside the "inner block 0":
nested_net[0][1][0].weight.data

tensor([[ 0.4445,  0.1798,  0.1653,  0.1982],
        [-0.0170,  0.0357,  0.1242,  0.2522],
        [ 0.2361,  0.0823, -0.3425, -0.2136],
        [-0.0222, -0.0907,  0.3527, -0.1307],
        [ 0.3407, -0.1102, -0.2743, -0.2541],
        [ 0.2337,  0.3699,  0.2471, -0.1450],
        [-0.3615,  0.1531, -0.3844, -0.3738],
        [-0.3422, -0.2349,  0.3745,  0.3075]])

## Load and save parameters


In [22]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        flatted = self.flatten(x)
        return self.linear_relu_stack(flatted)  # logits

In [23]:
model = MLP()
print(model)

MLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [24]:
X = torch.randn(size=(1, 28, 28))
logits = model(X)
print(logits)

tensor([[ 0.0402,  0.0196,  0.0289, -0.1012,  0.1121,  0.2018,  0.1489, -0.1140,
          0.0565, -0.0296]], grad_fn=<AddmmBackward0>)


In [25]:
# Now we applies the Softmax function to the logits Tensor
# rescaling them so that the elements lie in the range [0,1] and sum to 1.
pred_probabilities = nn.Softmax(dim=1)(logits)
print(pred_probabilities)
print(torch.argmax(pred_probabilities))

tensor([[0.0999, 0.0979, 0.0988, 0.0867, 0.1074, 0.1175, 0.1114, 0.0856, 0.1016,
         0.0932]], grad_fn=<SoftmaxBackward0>)
tensor(5)


In [26]:
# saving
torch.save(model.state_dict(), "mlp.params")

In [27]:
# loading
clone = MLP()
clone.load_state_dict(torch.load("mlp.params"))

<All keys matched successfully>