In [1]:
%matplotlib inline
import numpy as np
import torch
torch.set_printoptions(edgeitems=2)

In [19]:
# Eleven paired measurements: t_c is the regression target, t_u the raw input.
t_c = torch.tensor([
    0.5, 14.0, 15.0, 28.0, 11.0, 8.0,
    3.0, -4.0, 6.0, 13.0, 21.0,
])
t_u = torch.tensor([
    35.7, 55.9, 58.2, 81.9, 56.3, 48.9,
    33.9, 21.8, 48.4, 60.4, 68.4,
])
# Input rescaled by a factor of 0.1; several later cells train on this instead of t_u.
t_un = t_u * 0.1

In [20]:
def model(t_u, w, b):
    """Linear model: scale the input by ``w`` and shift it by ``b``.

    Args:
        t_u: Input value(s).
        w: Multiplicative weight.
        b: Additive bias.

    Returns:
        ``w * t_u + b``.
    """
    scaled = w * t_u
    return scaled + b

In [21]:
def loss_fn(t_p, t_c):
    """Mean squared error between predictions ``t_p`` and targets ``t_c``.

    Args:
        t_p: Predicted values.
        t_c: Target values.

    Returns:
        The mean of the element-wise squared differences.
    """
    residuals = t_p - t_c
    return (residuals ** 2).mean()

In [22]:
import torch.optim as optim

# Display the names exposed by torch.optim (the optimizer classes are among them).
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'Optimizer',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'lr_scheduler']

In [26]:
# Fresh two-element parameter tensor tracked by autograd; model() receives it
# unpacked as (w, b).
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-4
# The optimizer is handed the tensor it will update on step().
optimizer = optim.Adam([params], lr=learning_rate)

In [27]:
# Forward pass on the raw input t_u, then backpropagate to fill params.grad.
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)
# No zero_grad() in this cell: re-running it accumulates into params.grad.
loss.backward()



In [30]:
# Apply one optimizer update using the gradients currently stored on params.
# NOTE(review): a single Adam step at lr=1e-4 should move each parameter by
# roughly 1e-4, so the recorded output below looks stale (execution counts
# skip from 27 to 30) — re-run from a fresh kernel to confirm.
optimizer.step()

params

tensor([2.5523, 0.2128], requires_grad=True)

In [35]:
# Start over with fresh parameters and a new Adam optimizer at a larger learning rate.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.Adam([params], lr=learning_rate)

# This time the forward pass uses the rescaled input t_un.
t_p = model(t_un, *params)
loss = loss_fn(t_p, t_c)

optimizer.zero_grad() # clear any stored gradients so backward() does not accumulate onto them
loss.backward()
optimizer.step()

params

tensor([1.0100, 0.0100], requires_grad=True)

In [38]:
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Fit ``params`` by running ``n_epochs`` optimizer steps on the full data.

    Args:
        n_epochs: Number of update steps to perform.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per epoch.
        params: Iterable unpacked into ``model`` after ``t_u``.
        t_u: Model input.
        t_c: Targets the predictions are compared against.

    Returns:
        The same ``params`` object that was passed in.
    """
    for step in range(n_epochs):
        epoch = step + 1  # epochs are reported 1-based

        # Forward pass and loss in one expression.
        loss = loss_fn(model(t_u, *params), t_c)

        # Reset stored gradients, backpropagate, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 500:
            continue
        print('Epoch %d, Loss %f' % (epoch, float(loss)))

    return params

In [39]:
# Fresh parameters, trained with plain SGD on the rescaled input.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate) # the optimizer holds a reference to this params tensor

training_loop(
    n_epochs = 5000, 
    optimizer = optimizer,
    params = params, # same tensor object that was given to the optimizer
    t_u = t_un,
    t_c = t_c)

Epoch 500, Loss 7.860118
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957697
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927680
Epoch 4500, Loss 2.927651
Epoch 5000, Loss 2.927648


tensor([  5.3671, -17.3012], requires_grad=True)

In [40]:
# Same experiment with Adam, a larger learning rate and fewer epochs.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate) # Adam in place of the SGD optimizer used in the previous cell

training_loop(
    n_epochs = 2000, 
    optimizer = optimizer,
    params = params,
    t_u = t_u, # raw input here, not the rescaled t_un used in the SGD run
    t_c = t_c)

Epoch 500, Loss 7.612903
Epoch 1000, Loss 3.086700
Epoch 1500, Loss 2.928578
Epoch 2000, Loss 2.927646


tensor([  0.5367, -17.3021], requires_grad=True)

We’ve touched on a lot of the essential concepts that will enable you to train
complicated deep learning models while knowing what’s going on under the hood:
backpropagation to estimate gradients, autograd, and optimizing weights of models by
using gradient descent or other optimizers. We don’t have a whole lot more to cover.
The rest is mostly filling in the blanks, however extensive they are.


In [44]:
# Hold out 20% of the samples (rounded down) for validation.
n_samples = t_u.shape[0]
n_val = int(0.2 * n_samples)
n_train = n_samples - n_val

# NOTE(review): no seed is set in the visible cells, so the split (and every
# loss that depends on it) differs from run to run.
shuffled_indices = torch.randperm(n_samples)

# Slice on n_train rather than -n_val: if n_val were 0, [:-0] would be empty
# and [-0:] would be everything, silently inverting the split.
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

train_indices, val_indices  # display the two disjoint index sets

(tensor([ 0,  4,  2,  8,  5,  7,  6,  9, 10]), tensor([1, 3]))

In [45]:
# torch.randperm?

In [46]:
# Index inputs and targets with the same index tensors so the pairs stay aligned.
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]

val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]

# Rescaled copies of the inputs, using the same 0.1 factor as t_un.
train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u

In [47]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split while reporting the loss on both splits.

    Args:
        n_epochs: Number of update steps to perform.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per epoch.
        params: Iterable unpacked into ``model`` after the input.
        train_t_u: Training input.
        val_t_u: Validation input.
        train_t_c: Training targets.
        val_t_c: Validation targets.

    Returns:
        The same ``params`` object that was passed in.
    """
    epoch = 1
    while epoch <= n_epochs:
        # The same model and loss are evaluated on both splits. This version
        # does not disable autograd for the validation pass.
        train_loss = loss_fn(model(train_t_u, *params), train_t_c)
        val_loss = loss_fn(model(val_t_u, *params), val_t_c)

        optimizer.zero_grad()
        train_loss.backward()  # only the training loss is backpropagated
        optimizer.step()

        # Report the first three epochs and then every 500th.
        should_report = epoch <= 3 or epoch % 500 == 0
        if should_report:
            print('Epoch {}, Training loss {}, Validation loss {}'.format(
                epoch, float(train_loss), float(val_loss)))

        epoch += 1

    return params

In [48]:
# Fresh parameters and SGD optimizer for the train/validation experiment.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

training_loop(
    n_epochs = 3000, 
    optimizer = optimizer,
    params = params,
    train_t_u = train_t_un, # rescaled training input
    val_t_u = val_t_un, # rescaled validation input
    train_t_c = train_t_c,
    val_t_c = val_t_c)

Epoch 1, Training loss 46.760398864746094, Validation loss 231.5820770263672
Epoch 2, Training loss 29.0880126953125, Validation loss 140.4360809326172
Epoch 3, Training loss 24.947715759277344, Validation loss 104.99067687988281
Epoch 500, Training loss 7.474353313446045, Validation loss 27.957183837890625
Epoch 1000, Training loss 3.901845693588257, Validation loss 13.457063674926758
Epoch 1500, Training loss 3.1253573894500732, Validation loss 8.595168113708496
Epoch 2000, Training loss 2.956590175628662, Validation loss 6.7411112785339355
Epoch 2500, Training loss 2.919907569885254, Validation loss 5.966399192810059
Epoch 3000, Training loss 2.9119338989257812, Validation loss 5.624724388122559


tensor([  5.0222, -15.9270], requires_grad=True)

Here, we’re not being entirely fair to the model. The validation set is small, so the
validation loss will be meaningful only up to a point. In any case, note that the validation
loss is higher than your training loss, although not by an order of magnitude. The fact
that a model performs better on the training set is expected since the model parameters
are being shaped by the training set. Your main goal is to also see both the training
loss and the validation loss decreasing. Although ideally, both losses would be
roughly the same value, as long as validation loss stays reasonably close to the training
loss, you know that your model is continuing to learn generalized things about your
data. In figure 4.13, case C is ideal, and D is acceptable. In case A, the model isn’t
learning at all, and in case B, you see overfitting.


In [49]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split; evaluate the validation split without autograd.

    Same contract as the earlier two-split ``training_loop``: losses are
    printed for the first three epochs and every 500th, and ``params`` is
    returned. The validation forward pass runs under ``torch.no_grad()``.

    Args:
        n_epochs: Number of update steps to perform.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per epoch.
        params: Iterable unpacked into ``model`` after the input.
        train_t_u: Training input.
        val_t_u: Validation input.
        train_t_c: Training targets.
        val_t_c: Validation targets.

    Returns:
        The same ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)

        # Validation is only monitored, never backpropagated, so skip graph construction.
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            val_loss = loss_fn(val_t_p, val_t_c)
            # Sanity check that gradient tracking really is off in this context.
            assert not val_loss.requires_grad

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report the losses so the validation value computed above is not discarded.
        if epoch <= 3 or epoch % 500 == 0:
            print('Epoch {}, Training loss {}, Validation loss {}'.format(
                epoch, float(train_loss), float(val_loss)))

    return params

In [50]:
def calc_forward(t_u, t_c, is_train, params=None):
    """Compute the loss for one forward pass, tracking gradients only when training.

    Args:
        t_u: Input passed to ``model``.
        t_c: Targets passed to ``loss_fn``.
        is_train: Forwarded to ``torch.set_grad_enabled``; when false, the
            forward pass runs without autograd tracking.
        params: Iterable unpacked into ``model`` after ``t_u``. When omitted,
            the notebook-level ``params`` variable is used, which was the
            only behaviour before this argument existed.

    Returns:
        Whatever ``loss_fn`` returns for the model output and ``t_c``.
    """
    if params is None:
        # Backward-compatible fallback to the global the function used to read implicitly.
        params = globals()["params"]

    with torch.set_grad_enabled(is_train):
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)
    return loss