# 4.1 Learning is parameter estimation

In [None]:
### 4.1.1 A hot problem ###
import torch
# Eleven paired readings of the same temperatures:
#   t_c - values in Celsius
#   t_u - the matching readings from the second thermometer
#         (treated as Fahrenheit by the plots in this notebook)
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]
)
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]
)

In [None]:
### 4.1.2 Choosing a linear model as a first try ###
# Linear model: t_c = w * t_u + b
# w is called the weight (the slope) and b the bias (the offset).

In [None]:
### 4.1.3 Less loss is what you want ###
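# Two candidate loss functions: |t_p - t_c| and (t_p - t_c)^2.
# The squared difference behaves more nicely: it is differentiable everywhere,
# including at t_p = t_c, where the absolute value has a kink.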

In [None]:
### 4.1.4 From problem to PyTorch ###
# The model
def model(t_u, w, b):
    """Linear model: scale the input ``t_u`` by weight ``w`` and shift by bias ``b``."""
    scaled = w * t_u
    return scaled + b
# Loss function
def loss_fn(t_p, t_c):
    """Mean squared error between predictions ``t_p`` and targets ``t_c``."""
    residuals = t_p - t_c
    return (residuals ** 2).mean()

# Initialize the parameters: weight starts at 1, bias at 0.
w = torch.ones(1) # One-element tensor (not a 0-d scalar); w * t_u in model() relies on broadcasting.
b = torch.zeros(1)

# Run the model once with the initial parameters to get the first predictions.
t_p = model(t_u, w, b)
print('t_p: ', t_p)

# Mean squared error of those initial predictions against the Celsius targets.
loss = loss_fn(t_p, t_c)
print('loss: ', loss)

In [None]:
### 4.1.5 Down along the gradient ###
# Basic gradient descent using a central finite difference:
#   grad = [loss(p + delt) - loss(p - delt)] / (2 * delt)
# The parentheses around (2 * delt) are required: `x / 2 * delt` is parsed as
# `(x / 2) * delt`, which would scale the estimate by delt**2.
delt = 0.1
# Rate of change of the loss with respect to w (b held fixed).
grad_w = (loss_fn(model(t_u, w+delt, b), t_c) - loss_fn(model(t_u, w-delt, b), t_c)) / (2.0*delt)
# Rate of change of the loss with respect to b (w held fixed).
grad_b = (loss_fn(model(t_u, w, b+delt), t_c) - loss_fn(model(t_u, w, b-delt), t_c)) / (2.0*delt)
# One descent step: move each parameter against its gradient, scaled by the learning rate.
learning_rate = 1e-2
w = w - learning_rate*grad_w
b = b - learning_rate*grad_b

In [None]:
### 4.1.6 Getting analytical ###
# apply the chain rule and compute the derivative of the loss.
# d loss_fn / d w = (d loss_fn / d t_p) * (d t_p / d w)

# d loss_fn / d t_p
def dloss_fn(t_p, t_c):
    """Element-wise derivative of the squared error ``(t_p - t_c)**2`` w.r.t. ``t_p``.

    The averaging over samples is not applied here; ``grad_fn`` takes the mean.
    """
    residual = t_p - t_c
    return 2 * residual

# d t_p / d w
def dmodel_dw(t_u, w, b):
    """Derivative of the linear model ``w * t_u + b`` with respect to ``w``."""
    # The model is linear in w, so the derivative is the input itself;
    # w and b are accepted only to keep the signature parallel to model().
    derivative = t_u
    return derivative
# d t_p / d b
def dmodel_db(t_u, w, b):
    """Derivative of the linear model ``w * t_u + b`` with respect to ``b``."""
    # b enters the model additively, so the derivative is the constant 1.
    derivative = 1.0
    return derivative

# grad function.
# grad_loss = (grad_w, grad_b)
def grad_fn(t_u, t_c, t_p, w, b):
    """Analytic gradient of the mean squared error w.r.t. ``(w, b)``.

    Applies the chain rule per sample (d loss / d t_p times d t_p / d param),
    averages over the samples, and stacks the two means into one tensor
    ordered as ``[grad_w, grad_b]``.
    """
    dloss_dtp = dloss_fn(t_p, t_c)
    per_sample_dw = dloss_dtp * dmodel_dw(t_u, w, b)
    per_sample_db = dloss_dtp * dmodel_db(t_u, w, b)
    means = [per_sample_dw.mean(), per_sample_db.mean()]
    return torch.stack(means)

In [None]:
### 4.1.7 The training loop ###
def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    """Run plain gradient descent with the hand-written gradient.

    ``params`` holds ``(w, b)``. Each epoch computes predictions, the loss and
    the analytic gradient, takes one step of size ``learning_rate`` against the
    gradient, and prints the loss, the updated parameters and the gradient.
    Returns the parameters after the final epoch.
    """
    for step in range(n_epochs):
        epoch = step + 1
        weight, bias = params

        # Forward pass, then loss and gradient for the current parameters.
        predictions = model(t_u, weight, bias)
        current_loss = loss_fn(predictions, t_c)
        gradient = grad_fn(t_u, t_c, predictions, weight, bias)

        # Out-of-place update: the caller's tensor is left untouched.
        params = params - gradient*learning_rate

        print('epoch: %d, loss: %f' %(epoch, float(current_loss)))
        print('params: ', params)
        print('grad: ', gradient)
    return params

In [None]:
# First run: raw inputs, learning rate 1e-2.
training_loop(
    n_epochs=100,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_u,
    t_c=t_c,
)
# Observed outcome: the loss blows up instead of decreasing.

In [None]:
# Same run with a smaller learning rate: 1e-4.
training_loop(
    n_epochs=100,
    learning_rate=1e-4,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_u,
    t_c=t_c,
)

In [None]:
# Rescale the input by 0.1 so the learning rate can stay at 1e-2.
t_un = t_u*0.1
training_loop(
    n_epochs=100,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_un,  # rescaled input
    t_c=t_c,
)

In [None]:
# Train longer (5000 epochs) on the rescaled input; the learning rate stays at 1e-2.
# The returned parameters are kept in `params` for the plot that follows.
t_un = t_u*0.1
params = training_loop(
    n_epochs=5000,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_un,  # rescaled input
    t_c=t_c,
)

In [None]:
# Plot the fitted linear model together with the raw measurements.
# https://matplotlib.org/3.2.1/api/_as_gen/matplotlib.pyplot.plot.html
%matplotlib inline
from matplotlib import pyplot as plt
# Predictions are computed on the rescaled input t_un (the scaling used for
# training); *params unpacks the trained tensor into w and b.
t_p = model(t_un, *params)
# New figure with labelled axes.
fig = plt.figure(dpi=600)
plt.xlabel('Fahrenheit')
plt.ylabel('Celsius')
# Draw fitted line against the original (unscaled) t_u values on the x axis.
plt.plot(t_u.numpy(), t_p.detach().numpy())
plt.plot(t_u.numpy(), t_c.numpy(), 'o') # 'o': circle marker, i.e. the raw data points

# 4.2 PyTorch's autograd: Backpropagate all things

In [None]:
# Data
# Targets: temperatures in Celsius.
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]
)
# Inputs: the matching readings, labelled Fahrenheit in this notebook.
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]
)

# The model
def model(t_u, w, b):
    """Return the linear prediction ``w * t_u + b``."""
    weighted_input = w * t_u
    prediction = weighted_input + b
    return prediction
# Loss function
def loss_fn(t_p, t_c):
    """Return the mean of the squared differences between ``t_p`` and ``t_c``."""
    error = t_p - t_c
    squared_error = error ** 2
    return squared_error.mean()
# Parameters: [w, b] held in one tensor; requires_grad=True asks autograd to
# track operations on it so gradients can be computed later.
params = torch.tensor([1.0, 0.0], requires_grad=True)
# No backward pass has run yet, so .grad is expected to print as None here.
print('params.grad: ', params.grad)

In [None]:
# Forward pass through model and loss; *params unpacks the tensor into w and b.
loss = loss_fn(model(t_u, *params), t_c)
# Backward pass: autograd fills params.grad with d loss / d params.
loss.backward()
print('params.grad: ', params.grad)

In [None]:
# Autograd training loop
def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    """Gradient descent where autograd supplies the gradient.

    ``params`` must be a tensor created with ``requires_grad=True`` holding
    ``(w, b)``. Each epoch runs the forward pass, back-propagates the loss,
    takes one step of size ``learning_rate`` and prints the loss, the updated
    parameters and the gradient that was used for the step.
    Returns the parameters after the final epoch.
    """
    for epoch in range(1, n_epochs+1):
        # Clear any gradient left on the incoming tensor before calling
        # `loss.backward()`; backward() accumulates into .grad.
        if params.grad is not None:
            params.grad.zero_()
        # Forward pass
        t_p = model(t_u, *params)
        # Backward pass
        loss = loss_fn(t_p, t_c)
        loss.backward()
        # Keep a reference to the gradient before `params` is rebound below:
        # the replacement tensor is a fresh leaf whose .grad is None, so
        # reading params.grad after the update would always print None.
        grad = params.grad
        # Update params
        # detach(): cut the new value out of the computation graph so the next
        #           backward pass only covers the current epoch.
        # requires_grad_(): turn tracking back on for the new leaf tensor.
        params = (params-learning_rate*grad).detach().requires_grad_()

        # print
        print('epoch: %d, loss: %f' %(epoch, float(loss)))
        print('params: ', params)
        print('grad: ', grad)
    # return params.
    return params

In [None]:
# Normalization: rescale the input by 0.1 before training.
t_un = t_u*0.1
training_loop(
    n_epochs=5000,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0], requires_grad=True),
    t_u=t_un,
    t_c=t_c,
)

In [None]:
### 4.2.1 Optimizers a la carte ###
# List what torch.optim provides.
# NOTE(review): dir(optim) is not the last expression of the cell, so a
# notebook will not display its result; wrap it in print() to see the list.
import torch.optim as optim
dir(optim)

# One manual optimization step with SGD.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)
# Forward pass on the rescaled input t_un (defined in an earlier cell).
t_p = model(t_un, *params)
# Loss
loss = loss_fn(t_p, t_c)
# Clear gradients held by the optimizer's parameters before back-propagating.
optimizer.zero_grad()
# Backward: populate params.grad.
loss.backward()
# Update params in place using the gradient just computed.
optimizer.step()
print('params: ', params)

In [None]:
# Training loop with optimizer.
# Autograd training loop
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Train ``params`` with the given optimizer.

    ``optimizer`` is expected to have been built over ``params``; each epoch
    runs forward, zeroes the gradients, back-propagates and calls
    ``optimizer.step()``. Progress is printed every 500 epochs.
    Returns ``params`` (the same tensor object that was passed in).
    """
    report_every = 500
    for epoch in range(1, n_epochs + 1):
        # Forward pass and loss for the current parameters.
        loss = loss_fn(model(t_u, *params), t_c)

        # Reset gradients, back-propagate, then let the optimizer update params.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % report_every != 0:
            continue
        print('epoch: %d, loss: %f' %(epoch, float(loss)))
        print('params: ', params)
    return params

In [None]:
# Train with the existing `optimizer` and `params` globals on the rescaled input.
# Normalization: rescale the input by 0.1.
t_un = t_u*0.1
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c,
)

In [None]:
# Switch to the Adam optimizer, starting again from fresh parameters.
params = torch.tensor([1.0, 0.0], requires_grad=True)
# The learning rate is raised to 1e-1; Adam won't even blink.
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate)
# The raw t_u is passed: no input normalization is applied for this run.
training_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,
    t_c=t_c,
)

In [None]:
### 4.2.2 Training, validation, and overfitting ###
# Randomly hold out 20% of the samples for validation.
n_samples = t_u.shape[0]
n_val = int(0.2*n_samples)
# Split at an explicit boundary instead of using negative slices: when n_val
# is 0 (fewer than 5 samples), `[:-0]` is empty and `[-0:]` is the whole
# tensor, which would silently swap the roles of the two sets.
n_train = n_samples - n_val
shuffled_indices = torch.randperm(n_samples)
train_indices = shuffled_indices[:n_train] # Everything before the last n_val samples.
val_indices = shuffled_indices[n_train:] # The last n_val samples.

# Get train & val set.
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]
val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]

# Rewrite train loop.
def train_loop(n_epochs, optimizer, params, train_t_u, train_t_c, val_t_u, val_t_c):
    """Train on the training split while reporting the validation loss.

    Only the training loss is back-propagated; the validation loss is computed
    with the same parameters purely for reporting. Both losses are printed
    every 500 epochs. Returns ``params``.
    """
    log_interval = 500
    for epoch in range(1, n_epochs + 1):
        # Forward pass and loss on each split.
        train_loss = loss_fn(model(train_t_u, *params), train_t_c)
        val_loss = loss_fn(model(val_t_u, *params), val_t_c)

        # Reset gradients, back-propagate the training loss, update params.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch % log_interval == 0:
            print('epoch: %d, train_loss: %f, val_loss: %f' %(epoch, train_loss, val_loss))
    return params

# Fresh parameters and a fresh Adam optimizer for the train/validation run.
params = torch.tensor([1.0, 0.0], requires_grad=True)
# The learning rate is raised to 1e-1; Adam won't even blink.
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate)
# The splits are passed as-is: no input normalization is applied.
train_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_u,
    train_t_c=train_t_c,
    val_t_u=val_t_u,
    val_t_c=val_t_c,
)

In [None]:
### 4.2.3 Nits in autograd and switching it off ###

# Switch off autograd when validation.
def calc_forward(t_u, t_c, params, is_train):
    """Forward pass plus loss, with autograd enabled only when ``is_train`` is true.

    Returns the loss of the linear model on ``(t_u, t_c)``.
    """
    # set_grad_enabled(False) switches off graph recording for validation.
    with torch.set_grad_enabled(is_train):
        predictions = model(t_u, *params)
        return loss_fn(predictions, t_c)

# Rewrite train loop
def train_loop_plus(n_epochs, optimizer, params, train_t_u, train_t_c, val_t_u, val_t_c):
    """Same loop as ``train_loop``, but the forward passes go through ``calc_forward``.

    The training loss is computed with autograd on and back-propagated; the
    validation loss is computed with autograd off and only reported.
    Both losses are printed every 500 epochs. Returns ``params``.
    """
    log_interval = 500
    for epoch in range(1, n_epochs + 1):
        # Forward passes: gradients tracked for training, not for validation.
        train_loss = calc_forward(train_t_u, train_t_c, params, True)
        val_loss = calc_forward(val_t_u, val_t_c, params, False)

        # Reset gradients, back-propagate the training loss, update params.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch % log_interval != 0:
            continue
        print('epoch: %d, train_loss: %f, val_loss: %f' %(epoch, train_loss, val_loss))
    return params

# Run the autograd-aware loop on the un-normalized splits.
# NOTE: `optimizer` and `params` are the globals from the earlier cells, so this
# continues from whatever state they were left in rather than starting fresh.
train_loop_plus(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_u,
    train_t_c=train_t_c,
    val_t_u=val_t_u,
    val_t_c=val_t_c,
)

# Exercises

In [None]:
# 1. Redefine the model to be w2 * t_u ** 2 + w1 * t_u + b
def model_2rd(t_u, w1, w2, b):
    """Second-order model: ``w2 * t_u**2 + w1 * t_u + b``.

    Note the parameter order: the linear weight ``w1`` comes before the
    quadratic weight ``w2``.
    """
    quadratic_term = w2 * (t_u ** 2)
    linear_term = w1 * t_u
    return quadratic_term + linear_term + b
# Switch off autograd when validation.
def calc_forward(t_u, t_c, params, is_train):
    """Forward pass plus loss for the quadratic model ``model_2rd``.

    Autograd is enabled only when ``is_train`` is true, so validation passes
    do not record a graph. Returns the loss on ``(t_u, t_c)``.
    """
    with torch.set_grad_enabled(is_train):
        # Uses `model_2rd` in place of the linear `model`.
        predictions = model_2rd(t_u, *params)
        return loss_fn(predictions, t_c)

# Three parameters for the quadratic model, in model_2rd's order: w1, w2, b.
params = torch.tensor([1.0, 1.0, 0.0], requires_grad=True)
# The learning rate is raised to 1e-1; Adam won't even blink.
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate)

# Train on the un-normalized splits and keep the result for plotting.
params = train_loop_plus(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_u,
    train_t_c=train_t_c,
    val_t_u=val_t_u,
    val_t_c=val_t_c,
)

In [None]:
# Plot 
%matplotlib inline
from matplotlib import pyplot as plt
# Create plot
plt.figure(dpi=600)
plt.xlabel('Fahrenheit')
plt.ylabel('Celsius')

# Data
t_p = model_2rd(t_u, *params)
# plt fitted model.
plt.plot(t_u.numpy(), t_p.detach().numpy())
# plt raw data
plt.plot(t_u.numpy(), t_c.numpy(), 'o')