<a href="https://colab.research.google.com/github/kscaman/DL_ENS/blob/main/DL_ENS_Optimization_algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Optimization algorithms

Code adapted from [DataFlowr](https://dataflowr.github.io/website/modules/4-optimization-for-deep-learning/), itself adapted from the [optimization chapter](http://www.d2l.ai/chapter_optimization/) of Dive into Deep Learning

In [None]:
%matplotlib inline

import numpy as np
import torch
import matplotlib.pyplot as plt


def get_gradient(f, x, noise_std=0):
    """Return the gradient of ``f`` at ``x`` with additive Gaussian noise.

    Args:
        f: Callable mapping a tensor to a scalar tensor; ``backward()`` is
            called on its output without arguments.
        x: Point at which the gradient is evaluated. The computation runs on
            a detached tensor, so the ``requires_grad`` flag of the tensor
            passed in is left untouched.
        noise_std: Standard deviation of the Gaussian noise added to the
            gradient (0 gives the exact gradient).

    Returns:
        A tensor shaped like ``x`` holding ``grad f(x) + noise_std * noise``,
        where ``noise`` has independent standard normal entries.
    """
    # Detach so that no autograd history from earlier iterations is carried
    # over, then turn the result into a leaf whose ``.grad`` gets populated.
    point = x.detach().requires_grad_(True)
    f(point).backward()
    gradient = point.grad
    # randn_like draws the noise with the gradient's own dtype and device,
    # so the sum below also works for tensors that do not live on the CPU.
    noise = torch.randn_like(gradient)
    return gradient + noise_std * noise

def optimize(update, num_iter=20, init_s=0):
    """Optimize the objective function of 2d variables with a customized update.

    Starting from the fixed point ``(-5, -2)``, ``update`` is applied
    ``num_iter`` times and every iterate is recorded.

    Args:
        update: Callable ``update(x, s)`` returning the updated position ``x``
            and the (possibly updated) memory terms ``s``.
        num_iter: Number of update steps to perform.
        init_s: Initial value of the memory terms handed to the first call of
            ``update``.

    Returns:
        A tensor stacking the starting point and all iterates along
        dimension 0 (``num_iter + 1`` rows of 2 coordinates).
    """
    x = torch.Tensor([-5, -2])
    s = init_s
    all_x = [x]
    for _ in range(num_iter):
        x, s = update(x, s)
        all_x.append(x)
    # Count the performed steps from the trace instead of reusing the loop
    # variable, which is unbound when no iteration is run (num_iter == 0).
    print('epoch %d, x1 %f, x2 %f' % (len(all_x) - 1, x[0], x[1]))
    return torch.stack(all_x, dim=0)

def show_trace_2d(f, all_x, color='red', is_new_plot=True):
    """Plot an optimization trajectory and the objective values along it.

    The first axis shows the iterates in ``all_x`` together with level sets
    of ``f`` on a fixed grid; the second axis shows ``f`` evaluated at each
    iterate. With ``is_new_plot=False`` the first two axes of the current
    figure are reused, so that several runs can be drawn in the same figure.
    """
    if is_new_plot:
        _, (trace_ax, value_ax) = plt.subplots(1, 2, figsize=(15, 5))
    else:
        current_axes = plt.gcf().axes
        trace_ax, value_ax = current_axes[0], current_axes[1]

    # Trajectory of the iterates x_t
    trace_ax.plot(all_x[:,0], all_x[:,1], '-o', color=color)
    values = np.array([f(torch.Tensor([point[0], point[1]])) for point in all_x])

    # Level sets of f, evaluated on a regular grid
    grid_x1 = np.arange(-5.5, 3.5, 0.1)
    grid_x2 = np.arange(-3.0, 2.0, 0.1)
    levels = np.array(
        [[f(torch.Tensor([a, b])) for a in grid_x1] for b in grid_x2]
    )
    trace_ax.contour(grid_x1, grid_x2, levels, colors='blue')
    trace_ax.set_xlim([-5.5, 3.5])
    trace_ax.set_ylim([-3.0, 2.0])
    trace_ax.set_xlabel('x1')
    trace_ax.set_ylabel('x2')

    # Objective values f(x_t) against the iteration index
    value_ax.plot(values, color=color)
    value_ax.set_xlabel('time ($t$)')
    value_ax.set_ylabel('function value ($f(x_t)$)')

In [None]:
# The function to optimize
def f(x):
    """Quadratic objective ``0.1 * x[0]**2 + 2 * x[1]**2``."""
    first, second = x[0], x[1]
    return 0.1 * first ** 2 + 2 * second ** 2

## Gradient descent

In [None]:
eta = 0.4

def gd(x, s):
    """Perform one gradient descent step on ``f`` with the global step size ``eta``.

    The memory term ``s`` is ignored; 0 is returned in its place.
    """
    step = eta * get_gradient(f, x)
    return x - step, 0

In [None]:
eta = 0.4
show_trace_2d(f, optimize(gd))

In [None]:
eta = 0.6
show_trace_2d(f, optimize(gd))

## Momentum

In [None]:
eta, gamma = 0.4, 0.5

def momentum(x, v):
    """Exercise stub for the momentum update, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        v: Memory term carried from one iteration to the next by ``optimize``
            (its first value is ``init_s``, 0 by default).

    Returns:
        The pair ``(x, v)``. Until the exercise is completed, both values are
        returned unchanged.
    """
    # NOTE(review): the implementation presumably uses the globals ``eta`` and
    # ``gamma`` set in the notebook cells -- confirm.
    #
    # YOUR CODE HERE
    #
    return x, v

In [None]:
eta = 0.4
show_trace_2d(f, optimize(momentum))

In [None]:
eta = 0.6
show_trace_2d(f, optimize(momentum))

In [None]:
eta, gamma = 0.05, 0.9
show_trace_2d(f, optimize(momentum))

## Nesterov accelerated gradient

In [None]:
eta, gamma = 0.05, 0.9

def nesterov(x, v):
    """Exercise stub for the Nesterov accelerated gradient update, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        v: Memory term carried from one iteration to the next by ``optimize``
            (its first value is ``init_s``, 0 by default).

    Returns:
        The pair ``(x, v)``. Until the exercise is completed, both values are
        returned unchanged.
    """
    # NOTE(review): the implementation presumably uses the globals ``eta`` and
    # ``gamma`` set in the notebook cells -- confirm.
    #
    # YOUR CODE HERE
    #
    return x, v

In [None]:
# Draw both runs in the same figure (Nesterov in red, momentum in green),
# as done in the other comparison cells of this notebook.
show_trace_2d(f, optimize(nesterov))
show_trace_2d(f, optimize(momentum), color='green', is_new_plot=False)

## Adagrad

In [None]:
import math

def adagrad(x, s):
    """Exercise stub for the Adagrad update, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        s: Memory term carried from one iteration to the next by ``optimize``
            (its first value is ``init_s``, 0 by default).

    Returns:
        The pair ``(x, s)``. Until the exercise is completed, both values are
        returned unchanged.
    """
    # Small constant made available to the implementation (unused by the stub);
    # presumably a numerical-stability term for a division -- confirm.
    eps = 1e-6
    #
    # YOUR CODE HERE (hint: use x**a for coordinate-wise power a)
    #
    return x, s

In [None]:
eta = 0.4
show_trace_2d(f, optimize(adagrad))

In [None]:
eta = 1.5
show_trace_2d(f, optimize(adagrad))

## RMSProp

In [None]:
def rmsprop(x, s):
    """Exercise stub for the RMSProp update, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        s: Memory term carried from one iteration to the next by ``optimize``
            (its first value is ``init_s``, 0 by default).

    Returns:
        The pair ``(x, s)``. Until the exercise is completed, both values are
        returned unchanged.
    """
    # Small constant made available to the implementation (unused by the stub);
    # presumably a numerical-stability term for a division -- confirm.
    eps = 1e-6
    #
    # YOUR CODE HERE
    #
    return x, s

In [None]:
eta, gamma = 0.4, 0.9
show_trace_2d(f, optimize(rmsprop))

## Adam

In [None]:
def adam(x, s):
    """Exercise stub for the Adam update, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        s: Memory terms, unpacked as the triple ``(m, v, t)``; the notebook
            starts the optimization with ``init_s=(0, 0, 0)``.

    Returns:
        The pair ``(x, (m, v, t))``. Until the exercise is completed, all
        values are returned unchanged.
    """
    # Hyper-parameters made available to the implementation (unused by the stub)
    beta1, beta2, eps = 0.9, 0.99, 1e-6
    # NOTE(review): presumably first-moment estimate, second-moment estimate
    # and iteration counter -- confirm.
    m, v, t = s
    #
    # YOUR CODE HERE
    #
    return x, (m, v, t)

In [None]:
eta = 0.8
show_trace_2d(f, optimize(adam, init_s=(0, 0, 0)))

## AMSGrad

In [None]:
def amsgrad(x, s):
    """Exercise stub for the AMSGrad update, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        s: Memory terms, unpacked as the triple ``(m, v, v_bar)``; the
            notebook starts the optimization with
            ``init_s=(0, 0, torch.Tensor([0, 0]))``.

    Returns:
        The pair ``(x, (m, v, v_bar))``. Until the exercise is completed, all
        values are returned unchanged.
    """
    # Hyper-parameters made available to the implementation (unused by the stub)
    beta1, beta2, eps = 0.9, 0.99, 1e-6
    # NOTE(review): presumably first-moment estimate, second-moment estimate
    # and running maximum of the latter -- confirm.
    m, v, v_bar = s
    #
    # YOUR CODE HERE
    #
    return x, (m, v, v_bar)

In [None]:
eta = 1
show_trace_2d(f, optimize(amsgrad, init_s=(0, 0, torch.Tensor([0, 0]))))
show_trace_2d(f, optimize(adam, init_s=(0, 0, 0)), color ='green', is_new_plot=False)

# Stochastic optimization and schedulers

We will now see what happens when the gradient is noisy.

In [None]:
eta = 0.4

def sgd(x, s):
    """Perform one gradient step on ``f`` using a noisy gradient (noise std 0.8).

    The step size is the global ``eta``. The memory term ``s`` is ignored;
    0 is returned in its place.
    """
    noisy_step = eta * get_gradient(f, x, noise_std=0.8)
    return x - noisy_step, 0

In [None]:
eta = 0.4
show_trace_2d(f, optimize(sgd, num_iter=200))
eta = 0.2
show_trace_2d(f, optimize(sgd, num_iter=200), color='green', is_new_plot=False)
eta = 0.05
show_trace_2d(f, optimize(sgd, num_iter=200), color='blue', is_new_plot=False)

A high learning rate is faster, but a small learning rate leads to better final performance.
Propose a modification of SGD that uses this behavior to its advantage.

In [None]:
def sgd_with_scheduler(x, t):
    """Exercise stub for SGD with a step-size schedule, in the ``update(x, s)`` form expected by ``optimize``.

    Args:
        x: Current iterate.
        t: Memory term carried from one iteration to the next by ``optimize``
            (its first value is ``init_s``, 0 by default).

    Returns:
        The pair ``(x, t)``. Until the exercise is completed, both values are
        returned unchanged.
    """
    # NOTE(review): ``t`` is presumably meant to count iterations so that the
    # step size can depend on it -- confirm.
    #
    # YOUR CODE HERE
    #
    return x, t

In [None]:
eta = 0.4
show_trace_2d(f, optimize(sgd, num_iter=200))
eta = 0.2
show_trace_2d(f, optimize(sgd, num_iter=200), color='green', is_new_plot=False)
eta = 0.05
show_trace_2d(f, optimize(sgd, num_iter=200), color='blue', is_new_plot=False)
show_trace_2d(f, optimize(sgd_with_scheduler, num_iter=200), color='yellow', is_new_plot=False)

In [None]:
# Generating a dataset of two Gaussian classes and 100 samples
N = 100
# Row vector (1, 1) of shape (1, 2): the two class means are +v and -v
v = torch.Tensor([1,1]).unsqueeze(0)
# Labels drawn uniformly in {-1, +1}, stored as a float column of shape (N, 1)
Y = 2 * torch.randint(low=0, high=2, size=(N, 1), dtype=torch.float) - 1
# Standard Gaussian noise around the mean of each sample's class, shape (N, 2)
X = 1 * torch.randn(N, 2) + Y @ v
# Scatter plot of the samples: class +1 in red, class -1 in blue
plt.plot(X[Y.flatten() > 0,0], X[Y.flatten() > 0,1], 'o', color='red')
plt.plot(X[Y.flatten() < 0,0], X[Y.flatten() < 0,1], 'o', color='blue')

In [None]:
def mse_linear(theta):
    """Exercise stub for the objective evaluated at the parameters ``theta``.

    NOTE(review): judging by the name, this is presumably the mean squared
    error of a linear model on the dataset ``(X, Y)`` -- confirm.

    Raises:
        NotImplementedError: Always, until the exercise is completed. The
            explicit statement also gives the function a body, which a
            comment-only block does not.
    """
    #
    # YOUR CODE HERE
    # WARNING: theta should be of size (2,1)
    #
    raise NotImplementedError("mse_linear is left as an exercise")

mini_batch_size = 1
def mse_linear_mb(theta, t):
    """Exercise stub for the mini-batch version of the objective at ``theta``.

    NOTE(review): presumably evaluates the objective on a random subset of
    ``mini_batch_size`` samples, with ``t`` an iteration index -- confirm.

    Raises:
        NotImplementedError: Always, until the exercise is completed. The
            explicit statement also gives the function a body, which a
            comment-only block does not.
    """
    #
    # YOUR CODE HERE
    # WARNING: theta should be of size (2,1)
    # HINT: use torch.randperm
    #
    raise NotImplementedError("mse_linear_mb is left as an exercise")

In [None]:
def adam(x, s):
    """Exercise stub redefining ``adam`` for the mini-batch experiment below.

    NOTE(review): presumably meant to take its gradients from the mini-batch
    objective ``mse_linear_mb`` instead of ``f`` -- confirm.

    Args:
        x: Current iterate.
        s: Memory terms, unpacked as the triple ``(m, v, t)``; the notebook
            starts the optimization with ``init_s=(0, 0, 0)``.

    Returns:
        The pair ``(x, (m, v, t))``. Until the exercise is completed, all
        values are returned unchanged.
    """
    # Hyper-parameters made available to the implementation (unused by the stub)
    beta1, beta2, eps = 0.9, 0.99, 1e-6
    m, v, t = s
    #
    # YOUR CODE HERE
    #
    return x, (m, v, t)

eta = 0.1
mini_batch_size = 1
show_trace_2d(mse_linear, optimize(adam, num_iter=100, init_s=(0, 0, 0)))
mini_batch_size = 5
show_trace_2d(mse_linear, optimize(adam, num_iter=100, init_s=(0, 0, 0)), color='green', is_new_plot=False)
mini_batch_size = 100
show_trace_2d(mse_linear, optimize(adam, num_iter=100, init_s=(0, 0, 0)), color='blue', is_new_plot=False)

# Pytorch optimizers and schedulers
Typical training loop, optimizers and schedulers in Pytorch

In [None]:
# MINIMAL (AND INCOMPLETE) EXAMPLE
# NOTE: `dataloader`, `model`, `criterion` and `optimizer` are not defined in
# this cell; they must exist before the loop can run.
for epoch in range(20):
    for inputs, targets in dataloader:
        # Reset the gradients accumulated by the previous mini-batch
        optimizer.zero_grad()
        # Forward pass and loss on the current mini-batch
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Backward pass, then one optimizer step
        loss.backward()
        optimizer.step()

In [None]:
# MORE DETAILED AND COMPLETE EXAMPLE

### PARAMETERS
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_epochs = 100

### DATA, MODEL, LOSS, OPTIMIZER AND SCHEDULER
dataloader = ... # YOUR DATA
model = ... # YOUR MODEL
criterion = ... # YOUR LOSS FUNCTION
optimizer = ... # YOUR OPTIMIZER
scheduler = ... # YOUR SCHEDULER

### TRAINING LOOP
# Prepares the model for training (needed for some models)
model.train()

for epoch in range(num_epochs):
    # One training epoch over the whole dataset
    for inputs, targets in dataloader:
        # One mini-batch, put on the desired device (cpu or gpu)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Reinitialize the gradients before any computation
        optimizer.zero_grad()

        # Computation of the model's output and loss on the mini-batch
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Computation of the gradient on the mini-batch
        loss.backward()
        
        # One iteration of the optimizer (update of the model's parameters)
        optimizer.step()

        # Then we can compute statistics and store loss values
        ...
        
    # Update of the step-size
    scheduler.step()
    print('Loss: {:.4f} Acc: {:.4f}'.format(..., ...))

In [None]:
# Typical optimizers and schedulers
import torch.optim as optim

# Each assignment below is an alternative: pick one optimizer...
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# ...and one scheduler attached to it.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# lr_lambda is first evaluated at t = 0, so the count is shifted by one to
# avoid a division by zero: the factor is 1, 1/sqrt(2), 1/sqrt(3), ...
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda t: 1 / (t + 1) ** 0.5)

In [None]:
# If you want to implement your own optimizer
class SuperDuperOptimizer(optim.Optimizer):
    """Skeleton of a custom optimizer built on ``torch.optim.Optimizer``.

    Args:
        params: Iterable of parameters (or parameter groups) to optimize.
        lr: Step size, stored as the default ``lr`` of every parameter group.
    """

    def __init__(self, params, lr):
        defaults = dict(lr=lr)
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Perform one optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            is given.
        """
        loss = None
        if closure is not None:
            loss = closure()
        # The update rule goes here: loop over ``self.param_groups`` and
        # modify each parameter using its ``.grad``.
        ...
        return loss