# Оптимизация нейронных сетей. Метод обратного распространения ошибки (ноутбук)

> Узнаем, как устроена оптимизация нейронных сетей. Оптимизация в `PyTorch`.

## План ноутбука

1. Высокоуровневое API для обучения нейросетей в `PyTorch`
2. Обучение первой нейросети в `PyTorch`

## Высокоуровневое API для обучения нейросетей в `PyTorch`

In [1]:
import torch
import torch.nn as nn

### Создание объекта нейросети

In [2]:
# Three linear layers (700 -> 500 -> 200 -> 10) with ReLU between them.
# Layers passed positionally are labelled by index, 0..4, in the repr.
net = nn.Sequential(
    nn.Linear(700, 500),
    nn.ReLU(),
    nn.Linear(500, 200),
    nn.ReLU(),
    nn.Linear(200, 10)
)

In [6]:
net

Sequential(
  (0): Linear(in_features=700, out_features=500, bias=True)
  (1): ReLU()
  (2): Linear(in_features=500, out_features=200, bias=True)
  (3): ReLU()
  (4): Linear(in_features=200, out_features=10, bias=True)
)

In [7]:
from collections import OrderedDict

# Same architecture, but built from an OrderedDict so every layer gets a
# name; a named layer can then be reached as an attribute (net.linear1).
net = nn.Sequential(
    OrderedDict(
        [
            ('linear1', nn.Linear(700, 500)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(500, 200)),
            ('relu2', nn.ReLU()),
            ('linear3', nn.Linear(200, 10))
        ]
    )
)

In [8]:
net

Sequential(
  (linear1): Linear(in_features=700, out_features=500, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=500, out_features=200, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=200, out_features=10, bias=True)
)

In [9]:
net.linear1

Linear(in_features=700, out_features=500, bias=True)

In [10]:
# A batch of 6 random samples with 700 features each.
input_tensor = torch.rand(6, 700)

# Forward pass: (6, 700) in, (6, 10) out.
net(input_tensor).shape

torch.Size([6, 10])

In [16]:
class CustomTaskNetwork(nn.Module):
    """Fully connected network mapping 700 input features to 10 outputs.

    ``linear2`` is applied twice in ``forward``, so the two hidden
    500 -> 500 steps share a single set of weights.
    """

    def __init__(self):
        super().__init__()

        # Submodules show up in repr() in the order they are assigned here:
        # linear1, linear3, linear2, activation.
        self.linear1 = nn.Linear(in_features=700, out_features=500)
        self.linear3 = nn.Linear(in_features=500, out_features=10)
        self.linear2 = nn.Linear(in_features=500, out_features=500)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the network and return the final layer's output."""
        hidden = self.activation(self.linear1(x))

        # Two consecutive passes through the same layer object.
        for _ in range(2):
            hidden = self.activation(self.linear2(hidden))

        return self.linear3(hidden)

In [17]:
net = CustomTaskNetwork()

In [19]:
net.__repr__()

'CustomTaskNetwork(\n  (linear1): Linear(in_features=700, out_features=500, bias=True)\n  (linear3): Linear(in_features=500, out_features=10, bias=True)\n  (linear2): Linear(in_features=500, out_features=500, bias=True)\n  (activation): ReLU()\n)'

In [15]:
net(input_tensor)

tensor([[ 0.1047,  0.0018, -0.0319,  0.0088,  0.0206, -0.0232, -0.0438, -0.0296,
         -0.0596,  0.0264],
        [ 0.0695, -0.0082, -0.0272,  0.0197,  0.0149, -0.0096, -0.0450, -0.0269,
         -0.0376,  0.0147],
        [ 0.0836, -0.0125, -0.0277,  0.0262,  0.0144, -0.0084, -0.0455, -0.0274,
         -0.0429,  0.0231],
        [ 0.0628, -0.0069, -0.0311,  0.0112,  0.0018, -0.0231, -0.0261, -0.0272,
         -0.0663,  0.0373],
        [ 0.0950,  0.0019, -0.0376,  0.0258,  0.0259, -0.0222, -0.0506, -0.0409,
         -0.0361,  0.0272],
        [ 0.0803, -0.0290, -0.0264, -0.0017,  0.0162, -0.0159, -0.0376, -0.0209,
         -0.0666,  0.0297]], grad_fn=<AddmmBackward0>)

In [20]:
# Move the model to the first GPU when one is available; otherwise keep it on
# the CPU so this cell does not raise on machines without CUDA.
net.to(torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))

CustomTaskNetwork(
  (linear1): Linear(in_features=700, out_features=500, bias=True)
  (linear3): Linear(in_features=500, out_features=10, bias=True)
  (linear2): Linear(in_features=500, out_features=500, bias=True)
  (activation): ReLU()
)

In [27]:
net.linear1.weight.device

device(type='cpu')

In [26]:
net.cpu()

CustomTaskNetwork(
  (linear1): Linear(in_features=700, out_features=500, bias=True)
  (linear3): Linear(in_features=500, out_features=10, bias=True)
  (linear2): Linear(in_features=500, out_features=500, bias=True)
  (activation): ReLU()
)

In [30]:
net.train()
net.eval()

net.linear1.weight

Parameter containing:
tensor([[ 3.5266e-02,  5.7545e-03, -3.4416e-02,  ..., -2.6693e-02,
         -3.0254e-02, -5.2823e-03],
        [-4.8144e-03,  1.9251e-02, -1.7009e-02,  ..., -8.3402e-03,
         -3.5727e-02, -3.1722e-02],
        [ 2.5009e-02, -2.3408e-02, -2.4295e-02,  ..., -5.3888e-03,
          8.9848e-03,  3.1952e-02],
        ...,
        [ 3.3807e-02,  2.1817e-03,  8.7364e-03,  ..., -3.3118e-02,
         -7.7291e-05,  3.4851e-02],
        [ 1.2947e-02,  3.3807e-02, -3.7735e-02,  ...,  1.5305e-02,
          1.3197e-02, -1.5404e-02],
        [ 1.6492e-02, -1.9562e-02, -5.0703e-03,  ..., -2.2453e-02,
         -3.2365e-02,  1.2563e-02]])

In [32]:
list(net.parameters())

[Parameter containing:
 tensor([[ 3.5266e-02,  5.7545e-03, -3.4416e-02,  ..., -2.6693e-02,
          -3.0254e-02, -5.2823e-03],
         [-4.8144e-03,  1.9251e-02, -1.7009e-02,  ..., -8.3402e-03,
          -3.5727e-02, -3.1722e-02],
         [ 2.5009e-02, -2.3408e-02, -2.4295e-02,  ..., -5.3888e-03,
           8.9848e-03,  3.1952e-02],
         ...,
         [ 3.3807e-02,  2.1817e-03,  8.7364e-03,  ..., -3.3118e-02,
          -7.7291e-05,  3.4851e-02],
         [ 1.2947e-02,  3.3807e-02, -3.7735e-02,  ...,  1.5305e-02,
           1.3197e-02, -1.5404e-02],
         [ 1.6492e-02, -1.9562e-02, -5.0703e-03,  ..., -2.2453e-02,
          -3.2365e-02,  1.2563e-02]]),
 Parameter containing:
 tensor([-8.9737e-03, -2.8011e-02, -1.7497e-02, -3.1679e-02, -1.2169e-02,
         -3.7304e-02,  8.6797e-04, -1.5913e-02, -5.3632e-03, -1.1712e-02,
          1.3446e-02, -3.5661e-02,  1.7712e-02, -3.4449e-02,  3.1551e-02,
         -5.8992e-03,  2.5977e-03, -2.6701e-02, -1.8009e-02,  2.2785e-02,
          3.

In [33]:
net.state_dict()

OrderedDict([('linear1.weight',
              tensor([[ 3.5266e-02,  5.7545e-03, -3.4416e-02,  ..., -2.6693e-02,
                       -3.0254e-02, -5.2823e-03],
                      [-4.8144e-03,  1.9251e-02, -1.7009e-02,  ..., -8.3402e-03,
                       -3.5727e-02, -3.1722e-02],
                      [ 2.5009e-02, -2.3408e-02, -2.4295e-02,  ..., -5.3888e-03,
                        8.9848e-03,  3.1952e-02],
                      ...,
                      [ 3.3807e-02,  2.1817e-03,  8.7364e-03,  ..., -3.3118e-02,
                       -7.7291e-05,  3.4851e-02],
                      [ 1.2947e-02,  3.3807e-02, -3.7735e-02,  ...,  1.5305e-02,
                        1.3197e-02, -1.5404e-02],
                      [ 1.6492e-02, -1.9562e-02, -5.0703e-03,  ..., -2.2453e-02,
                       -3.2365e-02,  1.2563e-02]])),
             ('linear1.bias',
              tensor([-8.9737e-03, -2.8011e-02, -1.7497e-02, -3.1679e-02, -1.2169e-02,
                      -3.7304e-02, 

In [35]:
# Persist only the state dict (parameter tensors keyed by name) to 'model.pt'.
torch.save(net.state_dict(), 'model.pt')

In [37]:
torch.load('model.pt')

OrderedDict([('linear1.weight',
              tensor([[ 3.5266e-02,  5.7545e-03, -3.4416e-02,  ..., -2.6693e-02,
                       -3.0254e-02, -5.2823e-03],
                      [-4.8144e-03,  1.9251e-02, -1.7009e-02,  ..., -8.3402e-03,
                       -3.5727e-02, -3.1722e-02],
                      [ 2.5009e-02, -2.3408e-02, -2.4295e-02,  ..., -5.3888e-03,
                        8.9848e-03,  3.1952e-02],
                      ...,
                      [ 3.3807e-02,  2.1817e-03,  8.7364e-03,  ..., -3.3118e-02,
                       -7.7291e-05,  3.4851e-02],
                      [ 1.2947e-02,  3.3807e-02, -3.7735e-02,  ...,  1.5305e-02,
                        1.3197e-02, -1.5404e-02],
                      [ 1.6492e-02, -1.9562e-02, -5.0703e-03,  ..., -2.2453e-02,
                       -3.2365e-02,  1.2563e-02]])),
             ('linear1.bias',
              tensor([-8.9737e-03, -2.8011e-02, -1.7497e-02, -3.1679e-02, -1.2169e-02,
                      -3.7304e-02, 

In [38]:
# Read the state dict back from 'model.pt' and copy it into the existing net.
net.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

### Оптимизаторы

In [39]:
from torch import optim

In [40]:
optim.SGD, optim.Adam

(torch.optim.sgd.SGD, torch.optim.adam.Adam)

In [42]:
# Adam over every parameter of net, with betas and learning rate set explicitly.
optimizer = optim.Adam(net.parameters(), betas=(0.9, 0.999), lr=1e-3)

In [43]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [44]:
# SGD with two parameter groups: linear1 takes the top-level lr=1e-2, while
# linear2 overrides it with lr=1e-3. momentum=0.9 applies to both groups.
# linear3 is not passed in, so this optimizer does not update it.
optimizer = optim.SGD(
    [
        {'params': net.linear1.parameters()},
        {'params': net.linear2.parameters(), 'lr': 1e-3}
    ],
    lr=1e-2,
    momentum=0.9
)

In [45]:
optimizer

SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 0.01
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
)

In [47]:
optimizer.step()

In [46]:
optimizer.zero_grad()

### Функции потерь

In [48]:
nn.L1Loss, nn.MSELoss, nn.CrossEntropyLoss

(torch.nn.modules.loss.L1Loss,
 torch.nn.modules.loss.MSELoss,
 torch.nn.modules.loss.CrossEntropyLoss)

In [49]:
loss = nn.MSELoss()

In [50]:
loss

MSELoss()

In [51]:
# x is created with requires_grad=True so that backward() fills in x.grad.
x = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)

# `loss` is the nn.MSELoss instance created in the previous cells.
output = loss(x, target)

print(output)

# Backpropagate from the loss value into x.
output.backward()

tensor(1.2666, grad_fn=<MseLossBackward0>)


In [52]:
x.grad

tensor([[ 0.0048,  0.0302, -0.1696,  0.0088, -0.1233],
        [-0.0516, -0.1192, -0.0600,  0.1681, -0.0162],
        [ 0.0931, -0.4144,  0.0005,  0.0449, -0.2476]])

### Датасеты и даталоадеры

In [53]:
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset

In [54]:
n_features = 2
n_objects = 300

torch.manual_seed(0);

In [55]:
# Ground-truth weights of the synthetic linear model.
w_true = torch.randn(n_features)

# Features: uniform values in [-5, 5), then column j is scaled by 2 * j + 1.
X = (torch.rand(n_objects, n_features) - 0.5) * 10
X *= (torch.arange(n_features) * 2 + 1)
# Targets: linear response plus standard normal noise, as a column (n_objects, 1).
Y = (X @ w_true + torch.randn(n_objects)).unsqueeze(1)

X.shape, Y.shape

(torch.Size([300, 2]), torch.Size([300, 1]))

In [56]:
X

tensor([[-1.9258e+00,  4.0224e+00],
        [-9.9066e-02,  1.1893e+01],
        [-4.4372e-01,  3.9692e+00],
        [-1.5111e+00, -2.9485e+00],
        [-4.7767e+00, -9.9342e+00],
        [-2.0611e+00,  5.5565e-01],
        [ 1.9767e+00,  9.0003e+00],
        [-3.3897e+00, -6.5319e+00],
        [ 1.8161e+00,  1.2456e+01],
        [-1.0290e+00,  1.1225e+01],
        [-8.0592e-01,  1.5872e+00],
        [ 4.5274e+00, -1.3915e+01],
        [-3.1477e+00, -3.7975e+00],
        [-1.9490e+00,  1.2960e+01],
        [-3.2409e+00, -6.9050e+00],
        [-3.4932e+00, -1.4048e+01],
        [-2.9187e+00,  1.2894e+01],
        [ 2.2311e+00,  7.2701e+00],
        [ 2.6296e-01, -7.6903e+00],
        [ 8.4592e-01, -1.4005e+01],
        [-3.6128e+00, -7.7330e+00],
        [ 3.1547e+00,  8.7948e+00],
        [-2.2175e+00, -5.4124e-01],
        [ 3.1978e+00,  1.4912e+01],
        [ 1.9844e+00,  2.0264e+00],
        [ 3.3524e+00, -8.8320e+00],
        [ 9.3172e-01, -1.1630e+01],
        [-3.4654e+00, -7.748

In [57]:
Y

tensor([[-3.4378e+00],
        [-3.7890e+00],
        [-2.5952e+00],
        [-7.3753e-01],
        [-4.4356e+00],
        [-4.3357e+00],
        [ 1.3836e+00],
        [-3.7483e+00],
        [-1.2653e+00],
        [-4.0966e+00],
        [-3.2285e+00],
        [ 1.0745e+01],
        [-4.5013e+00],
        [-7.2812e+00],
        [-3.4633e+00],
        [-1.4592e+00],
        [-7.5205e+00],
        [-1.3306e+00],
        [ 3.2050e+00],
        [ 5.3060e+00],
        [-3.8119e+00],
        [ 1.7163e+00],
        [-4.1767e+00],
        [-1.9737e-01],
        [ 2.6804e+00],
        [ 8.2850e+00],
        [ 3.5710e+00],
        [-3.1805e+00],
        [ 9.2787e-01],
        [-6.0633e+00],
        [ 2.6105e+00],
        [-5.3974e-03],
        [ 9.3413e-01],
        [-4.0694e+00],
        [-4.8873e-01],
        [ 4.1689e+00],
        [ 5.9016e+00],
        [ 2.5146e+00],
        [-1.0889e+01],
        [ 9.8984e+00],
        [ 2.9398e+00],
        [ 7.1161e-01],
        [ 6.0663e+00],
        [-2

In [58]:
dataset = TensorDataset(X, Y)

In [59]:
dataset[7]

(tensor([-3.3897, -6.5319]), tensor([-3.7483]))

In [60]:
X[7], Y[7]

(tensor([-3.3897, -6.5319]), tensor([-3.7483]))

In [61]:
class CustomDataset(Dataset):
    """Synthetic linear-regression dataset generated at construction time.

    Features are sampled uniformly from [-5, 5) and column ``j`` is then
    scaled by ``2 * j + 1``. Targets are ``X @ w_true`` plus standard normal
    noise; no trailing dimension is added to them.
    """

    def __init__(self, w_true, n_features, n_objects):
        column_scale = torch.arange(n_features) * 2 + 1
        centered = (torch.rand(n_objects, n_features) - 0.5) * 10
        self.X = centered * column_scale

        noise = torch.randn(n_objects)
        self.Y = torch.matmul(self.X, w_true) + noise

    def __len__(self):
        """Return the number of stored targets."""
        return len(self.Y)

    def __getitem__(self, item):
        """Return the ``(features, target)`` pair stored at index ``item``."""
        features = self.X[item]
        target = self.Y[item]
        return features, target

In [62]:
dataset = CustomDataset(w_true, n_features, n_objects)

In [65]:
dataset[7]

(tensor([4.4391, 9.1902]), tensor(4.4021))

In [66]:
dataset.X[7]

tensor([4.4391, 9.1902])

In [67]:
from torch.utils.data import DataLoader

In [68]:
# Mini-batches of 4 samples in shuffled order; drop_last=True discards a
# trailing batch that would contain fewer than 4 samples.
loader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)

In [70]:
X.shape

torch.Size([300, 2])

### Общая структура обучения модели

In [None]:
# Skeleton of one training epoch. model, dataloader, optimizer and loss_fn
# are placeholders in this cell.
model.train()

for x, y in dataloader:
    # Reset gradients before computing new ones for this batch.
    optimizer.zero_grad()

    # Forward pass.
    output = model(x)

    loss = loss_fn(output, y)

    # Backward pass: compute gradients of the loss.
    loss.backward()

    # Update the parameters from the gradients.
    optimizer.step()

In [71]:
from tqdm import tqdm

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer


def train(model: nn.Module, data_loader: DataLoader, optimizer: Optimizer, loss_fn):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Network to optimize; switched to train mode here.
        data_loader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        loss_fn: Callable applied as ``loss_fn(model(inputs), targets)``.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()

    running_loss = 0

    for inputs, targets in tqdm(data_loader):
        optimizer.zero_grad()

        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        # The loss value was computed before the step, so reading it
        # afterwards yields the same number.
        running_loss += batch_loss.item()

    n_batches = len(data_loader)
    return running_loss / n_batches


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


@torch.inference_mode()
def evaluate(model: nn.Module, data_loader: DataLoader, loss_fn):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode (and left in it), and the whole call
    runs under ``torch.inference_mode()``.

    Args:
        model: Network to evaluate.
        data_loader: Iterable of ``(inputs, targets)`` batches.
        loss_fn: Callable applied as ``loss_fn(model(inputs), targets)``.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.eval()

    running_loss = 0

    for inputs, targets in tqdm(data_loader):
        predictions = model(inputs)
        running_loss += loss_fn(predictions, targets).item()

    n_batches = len(data_loader)
    return running_loss / n_batches


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


sns.set(style='darkgrid')


def plot_stats(
    train_loss: list[float],
    valid_loss: list[float],
    title: str
):
    """Draw the train and validation loss curves on one figure and show it.

    Args:
        train_loss: Training loss values to plot.
        valid_loss: Validation loss values to plot.
        title: Prefix of the figure title; ``' loss'`` is appended to it.
    """
    plt.figure(figsize=(16, 8))
    plt.title(title + ' loss')

    curves = (
        (train_loss, 'Train loss'),
        (valid_loss, 'Valid loss'),
    )
    for values, label in curves:
        plt.plot(values, label=label)

    plt.legend()
    plt.show()

In [None]:
from IPython.display import clear_output


def fit(model, train_loader, valid_loader, optimizer, loss_fn, num_epochs, title):
    """Train ``model`` for ``num_epochs`` epochs, redrawing the loss plot each epoch.

    Args:
        model: Network passed to ``train`` and ``evaluate``.
        train_loader: Batches used for the training pass of every epoch.
        valid_loader: Batches used for the evaluation pass of every epoch.
        optimizer: Optimizer forwarded to ``train``.
        loss_fn: Loss callable forwarded to ``train`` and ``evaluate``.
        num_epochs: Number of train/evaluate rounds to run.
        title: Plot title prefix forwarded to ``plot_stats``.
    """
    train_loss_history, valid_loss_history = [], []

    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, loss_fn)
        valid_loss = evaluate(model, valid_loader, loss_fn)

        train_loss_history.append(train_loss)
        valid_loss_history.append(valid_loss)

        # wait=True postpones clearing until the next output (the new plot)
        # arrives, so the cell does not go blank between epochs.
        clear_output(wait=True)

        plot_stats(train_loss_history, valid_loss_history, title)

## Обучение первой нейросети в `PyTorch`

In [None]:
class CustomTaskNetwork(nn.Module):
    """Single linear layer mapping ``in_features`` inputs to one output.

    Args:
        in_features: Size of each input sample. When omitted, the
            notebook-level ``n_features`` is read at construction time,
            which is what the class did before this argument existed.
    """

    def __init__(self, in_features=None):
        super().__init__()

        if in_features is None:
            # Backward-compatible default: fall back to the notebook global.
            in_features = n_features

        self.linear = nn.Linear(in_features, 1)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)


# Instantiate the network defined in this cell.
model = CustomTaskNetwork()

# Adam over all of the model's parameters.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Mean squared error as the training objective.
loss_fn = nn.MSELoss()

In [None]:
from torch.utils.data import random_split


# Wrap the tensors X and Y created earlier in the notebook.
dataset = TensorDataset(X, Y)

# 80/20 train/validation split. The second length is computed as the
# remainder, so the two lengths always sum to len(dataset). The explicitly
# seeded generator fixes the random assignment.
train_dataset, valid_dataset = random_split(
    dataset,
    (int(len(dataset) * 0.8), len(dataset) -  int(len(dataset) * 0.8)),
    generator=torch.Generator().manual_seed(300)
)

In [None]:
# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=10, shuffle=False)

In [None]:
fit(model, train_loader, valid_loader, optimizer, loss_fn, 10, 'Simple fc')