In [217]:


import torch as t
from torch import nn, optim
import numpy as np
from typing import Callable, Iterable

import utils

In [225]:
def rosenbrocks_banana(x: t.Tensor, y: t.Tensor, a=1, b=100) -> t.Tensor:
    return (a - x) ** 2 + b * (y - x**2) ** 2 + 1

x_range = [-2, 2]
y_range = [-1, 3]
fig = utils.plot_fn(rosenbrocks_banana, x_range, y_range, log_scale=True)
fig.show()

In [218]:
def opt_fn_with_sgd(fn: Callable, xy: t.Tensor, lr=0.001, momentum=0.98, n_iters: int = 100):
    '''
    Optimize the a given function starting from the specified point.

    xy: shape (2,). The (x, y) starting point.
    n_iters: number of steps.

    Return: (n_iters, 2). The (x,y) BEFORE each step. So out[0] is the starting point.
    '''
    assert xy.requires_grad
    out = []
    v = t.zeros_like(xy)
    xys = [xy]
    for i in range(n_iters):
        z = fn(xy[0], xy[1])
        z.backward()
        with t.no_grad():
            if momentum is not None:
                v = momentum*v + (1-momentum)*xy.grad
                xy -= lr * v
            else: 
                xy -= lr * xy.grad
        xy.grad.zero_()
        out.append(xy.clone())
    return t.stack(out).detach()

test = opt_fn_with_sgd(rosenbrocks_banana, t.tensor([1.0,4.0], requires_grad=True), n_iters=10)
assert test.shape == t.Size([10,2]), test.shape

In [274]:
def opt_fn_with_sgd(fn: Callable, xy: t.Tensor, lr=0.001, momentum=0.98, n_iters: int = 100):
    '''
    Optimize the a given function starting from the specified point.

    xy: shape (2,). The (x, y) starting point.
    n_iters: number of steps.

    Return: (n_iters, 2). The (x,y) BEFORE each step. So out[0] is the starting point.
    '''
    assert xy.requires_grad
    # optimizer = optim.SGD([xy], lr = lr, momentum=momentum, weight_decay=0)
    optimizer = SGD([xy], lr = lr, momentum=momentum, weight_decay=0)
    xys = t.zeros((n_iters, 2))
    for i in range(n_iters):
        xys[i] = xy.detach()
        z = fn(xy[0], xy[1])
        z.backward()
        optimizer.step()
        optimizer.zero_grad()
    return xys

test = opt_fn_with_sgd(rosenbrocks_banana, t.tensor([1.0,2.0], requires_grad=True), n_iters=10)
assert test.shape == t.Size([10,2]), test.shape

SGD(lr=0.001, momentum=0.98, weight_decay = 0


In [275]:
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]
fig = utils.plot_optimization_sgd(opt_fn_with_sgd, rosenbrocks_banana, xy, x_range, y_range, lr=0.001, momentum=0.98, show_min=True, n_iters=100)
fig.show()

SGD(lr=0.001, momentum=0.98, weight_decay = 0


In [310]:
class SGD:
    params: list

    def __init__(self, params: Iterable[t.nn.parameter.Parameter], lr: float, momentum: float, weight_decay: float):
        '''Implements SGD with momentum.

        Like the PyTorch version, but assume nesterov=False, maximize=False, and dampening=0
            https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD
        '''
        self.params = list(params)
        self.lr = lr 
        self.mu = momentum
        self.weight_decay = weight_decay
        self.gs = [t.zeros_like(p) for p in self.params]
        self.t = 1

        print(self)

    def zero_grad(self) -> None:
        for param in self.params:
            if param.requires_grad and param.grad is not None:
                param.grad.zero_()

    def step(self) -> None:
        with t.no_grad():
            for i, param in enumerate(self.params):
                previous_gs = self.gs[i].clone()
                self.gs[i] = param.grad.clone()
                if self.weight_decay != 0:
                    self.gs[i] = self.gs[i] + self.weight_decay*param
                if self.mu != 0 and self.t>1:
                    self.gs[i] = self.mu*previous_gs + self.gs[i] 

                self.params[i] -= self.lr*self.gs[i]

        self.t += 1
    def __repr__(self) -> str:
        # Should return something reasonable here, e.g. "SGD(lr=lr, ...)"
        return f'SGD(lr={self.lr}, momentum={self.mu}, weight_decay = {self.weight_decay}'

utils.test_sgd(SGD)

SGD(lr=0.1, momentum=0.0, weight_decay = 0.0

Testing configuration:  {'lr': 0.1, 'momentum': 0.0, 'weight_decay': 0.0}
SGD(lr=0.1, momentum=0.7, weight_decay = 0.0

Testing configuration:  {'lr': 0.1, 'momentum': 0.7, 'weight_decay': 0.0}
SGD(lr=0.1, momentum=0.5, weight_decay = 0.0

Testing configuration:  {'lr': 0.1, 'momentum': 0.5, 'weight_decay': 0.0}
SGD(lr=0.1, momentum=0.5, weight_decay = 0.05

Testing configuration:  {'lr': 0.1, 'momentum': 0.5, 'weight_decay': 0.05}
SGD(lr=0.2, momentum=0.8, weight_decay = 0.05

Testing configuration:  {'lr': 0.2, 'momentum': 0.8, 'weight_decay': 0.05}


In [268]:
class RMSprop:
    def __init__(
        self,
        params: Iterable[t.nn.parameter.Parameter],
        lr: float,
        alpha: float,
        eps: float,
        weight_decay: float,
        momentum: float,
    ):
        '''Implements RMSprop.

        Like the PyTorch version, but assumes centered=False
            https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html#torch.optim.RMSprop
        '''
        self.params = list(params)
        self.lr = lr 
        self.mu = momentum
        self.weight_decay = weight_decay
        self.gs = [t.zeros_like(p) for p in self.params]
        self.vs = [t.zeros_like(p) for p in self.params]
        self.alpha = alpha
        self.eps = eps
        self.t = 1
        self.bs = [t.zeros_like(p) for p in self.params]

    def zero_grad(self) -> None:
        for param in self.params:
            if param.requires_grad and param.grad is not None:
                param.grad = t.zeros_like(param.grad)
                
    def step(self) -> None:
        with t.no_grad():
            for i, param in enumerate(self.params):
                self.gs[i] = param.grad.clone()
                if self.weight_decay != 0:
                    self.gs[i] = self.gs[i] + self.weight_decay*param

                self.vs[i] = self.alpha*self.vs[i] + (1-self.alpha)*self.gs[i]**2
                v_est = self.vs[i].clone()

                if self.mu > 0:
                    self.bs[i] =      self.mu*self.bs[i]+ self.gs[i]/(t.sqrt(v_est) + self.eps)
                    self.params[i] -= self.lr*self.bs[i]
                else: 
                    self.params[i] -= self.lr*self.gs[i]/(t.sqrt(v_est) + self.eps)

        self.t += 1

    def __repr__(self) -> str:
        return f'SGD(lr={self.lr}, momentum={self.mu}, weight_decay = {self.weight_decay}, alpha = {self.alpha}, eps = {self.eps}'


utils.test_rmsprop(RMSprop)


Testing configuration:  {'lr': 0.1, 'alpha': 0.9, 'eps': 0.001, 'weight_decay': 0.0, 'momentum': 0.0}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.0}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.5}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.0}


In [290]:
class Adam:
    def __init__(
        self,
        params: Iterable[t.nn.parameter.Parameter],
        lr: float,
        betas: tuple[float, float],
        eps: float,
        weight_decay: float,
    ):
        '''Implements Adam.

        Like the PyTorch version, but assumes amsgrad=False and maximize=False
            https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam
        '''
        self.params = list(params)
        self.lr = lr 
        self.weight_decay = weight_decay
        self.eps = eps
        self.t = 1
        self.beta1, self.beta2 = betas
        self.gs = [t.zeros_like(p) for p in self.params]
        self.mi = [t.zeros_like(p) for p in self.params]
        self.vi = [t.zeros_like(p) for p in self.params]


    def zero_grad(self) -> None:
        for param in self.params:
            if param.requires_grad and param.grad is not None:
                param.grad = t.zeros_like(param.grad)
                
    def step(self) -> None:
        with t.no_grad():
            for i, param in enumerate(self.params):
                self.gs[i] = param.grad.clone()
                if self.weight_decay != 0:
                    self.gs[i] = self.gs[i] + self.weight_decay*param

                self.mi[i] = self.beta1*self.mi[i] + (1-self.beta1)*self.gs[i]
                self.vi[i] = self.beta2*self.vi[i] + (1-self.beta2)*self.gs[i]**2
                m_hat = self.mi[i]/(1-self.beta1**self.t)
                v_hat = self.vi[i]/(1-self.beta2**self.t)

                self.params[i] -= self.lr*m_hat/(t.sqrt(v_hat) + self.eps)

        self.t += 1

    def __repr__(self) -> str:
        return f'SGD(lr={self.lr}, betas={self.beta1, self.beta2}, weight_decay = {self.weight_decay}, eps = {self.eps}'


utils.test_adam(Adam)


Testing configuration:  {'lr': 0.1, 'betas': (0.8, 0.95), 'eps': 0.001, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'betas': (0.8, 0.9), 'eps': 0.001, 'weight_decay': 0.05}

Testing configuration:  {'lr': 0.2, 'betas': (0.9, 0.95), 'eps': 0.01, 'weight_decay': 0.08}


In [292]:
def opt_fn(fn: Callable, xy: t.Tensor, optimizer_class, optimizer_kwargs, n_iters: int = 100):
    '''Optimize the a given function starting from the specified point.

    optimizer_class: one of the optimizers you've defined, either SGD, RMSprop, or Adam
    optimzer_kwargs: keyword arguments passed to your optimiser (e.g. lr and weight_decay)
    '''
    assert xy.requires_grad
    optimizer = optimizer_class([xy], **optimizer_kwargs)
    xys = t.zeros((n_iters, 2))
    for i in range(n_iters):
        xys[i] = xy.detach()
        z = fn(xy[0], xy[1])

        z.backward()
        optimizer.step()
        optimizer.zero_grad()
    return xys
    
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]
optimizers = [
    (SGD, dict(lr=1e-3, momentum=0.98, weight_decay=0)),
    (RMSprop, dict(lr=5e-4,  alpha = 0.98, momentum=0.98, eps = 1e-12, weight_decay=0)),
    (Adam, dict(lr=5e-4, betas=[0.98,1], eps = 1e-12, weight_decay=0)),
]

fn = rosenbrocks_banana
fig = utils.plot_optimization(opt_fn, fn, xy, optimizers, x_range, y_range)

fig.show()

SGD(lr=0.001, momentum=0.98, weight_decay = 0
SGD(lr=0.001, momentum=0.98, weight_decay = 0


In [306]:
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]
optimizers = [
    (SGD, dict(lr=1e-3, momentum=0.98, weight_decay=0)),
    (RMSprop, dict(lr=5e-4,  alpha = 0.98, momentum=0.98, eps = 1e-12, weight_decay=0)),
    (Adam, dict(lr=5e-4, betas=[0.98,1], eps = 1e-12, weight_decay=0)),
]

def rosenbrocks_banana(x: t.Tensor, y: t.Tensor, a=1, b=100) -> t.Tensor:
    return (a - x) + b * (y - x**2) ** 2 + 1

fig = utils.plot_optimization(opt_fn, rosenbrocks_banana, xy, optimizers, x_range, y_range)

fig.show()

SGD(lr=0.001, momentum=0.98, weight_decay = 0
SGD(lr=0.001, momentum=0.98, weight_decay = 0


In [307]:
# Moving on to Learning Rate Schedulers
import torch

model = [nn.Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer = optim.SGD(model, 0.1)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(20):
    for input, target in dataset:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
    scheduler.step()

NameError: name 'dataset' is not defined

In [339]:
class ExponentialLR():
    def __init__(self, optimizer, gamma):
        '''Implements ExponentialLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ExponentialLR.html
        '''
        self.gamma = gamma 
        self.optimizer = optimizer

    def step(self):
        if 'param_groups' in dir(self.optimizer):
            for i in self.optimizer.param_groups:
                i['lr'] =  self.gamma*i['lr']
        else: 
            self.optimizer.lr = self.gamma*self.optimizer.lr

    def __repr__(self):
        return 'ExponentialLR({})'.format(self.optimizer.__repr__)

utils.test_ExponentialLR(ExponentialLR, t.optim.SGD)

Testing ExponentialLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  gamma=0.5

All tests in `test_ExponentialLR` passed!


In [360]:
class SGD:

    def __init__(self, params, **kwargs):
        '''Implements SGD with momentum.

        Accepts parameters in groups, or an iterable.

        Like the PyTorch version, but assume nesterov=False, maximize=False, and dampening=0
            https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD
        kwargs can contain lr, momentum or weight_decay
        '''
        self.param_groups = {}
        self.all_params = []
        print(kwargs)
        if 'lr' not in kwargs: # assume we are in a param group situation
            for i, param_group in enumerate(params): 
                assert 'lr' in param_group.keys()
                self.param_groups[i] = param_group 
                self.param_groups[i]["gs"] = [t.zeros_like(p) for p in param_group['params']]
                self.param_groups[i]["t"] = 1
                self.param_groups[i]["weight_decay"] = param_group.get("weight_decay", 0.0)
                self.param_groups[i]["momentum"] = param_group.get("momentum", 0.0)
                self.param_groups[i]['params']= param_group['params']
                self.all_params.append(param_group['params'].values())
        
            assert len(self.all_params) == len(set(self.all_params)) # params only appear once 
        else: 
            self.param_groups[0] = {
                "params":params,
                "lr":kwargs["lr"],
                "momentum":kwargs["momentum"],
                "weight_decay":kwargs["weight_decay"],
                "gs": [t.zeros_like(p) for p in params],
                "t": 1
            }

            
        self.t = 1

    def zero_grad(self) -> None:
        for param_group in self.param_groups.values():
            # print(param_group)
            for param in param_group["params"]:
                if param.requires_grad and param.grad is not None:
                    param.grad.zero_()

    def step(self) -> None: 
        for i, param_group in enumerate(self.param_groups):
            self.step_param_group(i)

    def step_param_group(self, i) -> None:

        param_group = self.param_groups[i]
        lr = param_group['lr']
        mu = param_group['momentum']
        weight_decay = param_group['weight_decay']
        gs = param_group['gs']
        params = param_group['params']

        with t.no_grad():
            for i, param in enumerate(params):
                previous_gs = gs[i].clone()
                gs[i] = param.grad.clone()
                if weight_decay != 0:
                    gs[i] = gs[i] + weight_decay*param
                if self.mu != 0 and self.t>1:
                    gs[i] = mu*previous_gs + gs[i] 

                param_group["params"][i] -= lr*gs[i]

        self.t += 1
    def __repr__(self) -> str:
        # Should return something reasonable here, e.g. "SGD(lr=lr, ...)"
        return f'SGD(lr={self.lr}, momentum={self.mu}, weight_decay = {self.weight_decay}'

utils.test_sgd(SGD)

{'lr': 0.1, 'momentum': 0.0, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.0, 'weight_decay': 0.0}


AssertionError: Tensor-likes are not close!

Mismatched elements: 56 / 64 (87.5%)
Greatest absolute difference: 0.0067813098430633545 at index (15, 0) (up to 1e-05 allowed)
Greatest relative difference: 0.5964056311370575 at index (13, 1) (up to 0 allowed)