Examples of various optimizers

In [182]:
import torch
from torch.optim import SGD
from typing import List

# Problem size: m samples, n features. lr is the learning rate passed to every optimizer in this notebook.
m = 10
n = 5
lr = 0.01

# Random least-squares problem: X is (m, n), b is the (n, 1) weight column, y is the (m, 1) target column.
# NOTE(review): no RNG seed is set in this cell, so these values differ after every kernel restart.
X = torch.rand(m, n)
b = torch.rand(n, 1)
# gen_data() returns detached clones, so this flag on the global b is not inherited by the copies it hands out.
b.requires_grad = True
y = torch.rand(m, 1)

def loss(X, b, y):
    """Return the mean squared error between ``y`` and the linear prediction ``X @ b``.

    The result is a scalar tensor, so ``.backward()`` can be called on it directly.
    """
    residual = y - X @ b
    return residual.pow(2).mean()

def gen_data():
    """Return independent copies of the module-level ``X``, ``b`` and ``y``.

    Each copy is detached from any autograd graph and cloned, so in-place
    updates made by one optimizer run never leak into the next run.
    """
    return tuple(source.detach().clone() for source in (X, b, y))

In [185]:
class MySGD():
    """Minimal stochastic gradient descent: ``w <- w - lr * grad``.

    Args:
        weight_tensors: tensors to optimise. Any iterable is accepted; it is
            materialised into a list so it can be walked on every call
            (a generator would otherwise be exhausted after the first pass).
        lr: learning rate (step size).
    """

    def __init__(self, weight_tensors: List[torch.Tensor], lr: float):
        self.weight_tensors = list(weight_tensors)
        self.lr = lr

    def zero_grad(self):
        """Reset every existing gradient to zero in place."""
        for wt in self.weight_tensors:
            if wt.grad is not None:
                wt.grad.zero_()

    def step(self):
        """Apply one gradient-descent update to every tensor that has a gradient."""
        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for wt in self.weight_tensors:
                # A tensor that took no part in the last backward pass has
                # grad None; leave it untouched instead of crashing.
                if wt.grad is None:
                    continue
                wt -= wt.grad * self.lr

# Number of optimisation steps each demo loop runs.
# NOTE(review): this name shadows Python's builtin `iter` for the rest of the notebook.
iter = 5
def run_my_sgd():
    """Run ``iter`` steps of the hand-written MySGD and print the weights after each one."""
    features, weights, targets = gen_data()
    weights.requires_grad = True
    sgd = MySGD([weights], lr=lr)
    print('--- My SGD ---')

    for step in range(iter):
        sgd.zero_grad()
        # Forward pass and backward pass in one expression.
        loss(features, weights, targets).backward()
        sgd.step()
        print(f'weights after iteration {step + 1} : {weights.squeeze()}')

def run_sgd():
    """Run ``iter`` steps of ``torch.optim.SGD`` and print the weights after each one."""
    features, weights, targets = gen_data()
    weights.requires_grad = True
    reference_sgd = SGD([weights], lr=lr)
    print('--- built in SGD ---')

    for step in range(iter):
        reference_sgd.zero_grad()
        # Forward pass and backward pass in one expression.
        loss(features, weights, targets).backward()
        reference_sgd.step()
        print(f'weights after iteration {step + 1} : {weights.squeeze()}')

# Run the hand-written SGD and torch's SGD back to back; both start from clones of the same X, b, y.
run_my_sgd()
run_sgd()

--- My SGD ---
weights after iteration 1 : tensor([0.1353, 0.0951, 0.7788, 0.4300, 0.6636], grad_fn=<SqueezeBackward0>)
weights after iteration 2 : tensor([0.1281, 0.0893, 0.7703, 0.4204, 0.6546], grad_fn=<SqueezeBackward0>)
weights after iteration 3 : tensor([0.1211, 0.0836, 0.7619, 0.4111, 0.6458], grad_fn=<SqueezeBackward0>)
weights after iteration 4 : tensor([0.1143, 0.0781, 0.7539, 0.4020, 0.6373], grad_fn=<SqueezeBackward0>)
weights after iteration 5 : tensor([0.1077, 0.0727, 0.7460, 0.3933, 0.6290], grad_fn=<SqueezeBackward0>)
--- built in SGD ---
weights after iteration 1 : tensor([0.1353, 0.0951, 0.7788, 0.4300, 0.6636], grad_fn=<SqueezeBackward0>)
weights after iteration 2 : tensor([0.1281, 0.0893, 0.7703, 0.4204, 0.6546], grad_fn=<SqueezeBackward0>)
weights after iteration 3 : tensor([0.1211, 0.0836, 0.7619, 0.4111, 0.6458], grad_fn=<SqueezeBackward0>)
weights after iteration 4 : tensor([0.1143, 0.0781, 0.7539, 0.4020, 0.6373], grad_fn=<SqueezeBackward0>)
weights after itera

Example of the Adam optimizer

In [190]:
from torch.optim import Adam

# Step count for the demo loops, plus the hyper-parameters passed to both Adam and MyAdam below.
iter = 5
# (beta1, beta2): decay factors of the running averages of the gradient and of the squared gradient.
betas=(0.9, 0.999)
# Small constant added to the denominator of the update.
eps=1e-08

class MyAdam(MySGD):
    """Hand-written Adam: bias-corrected running averages of grad and grad**2.

    Args:
        weight_tensors: tensors to optimise.
        lr: learning rate (step size).
        betas: ``(beta1, beta2)`` decay factors for the first- and
            second-moment running averages.
        eps: constant added to the denominator of the update.
    """

    def __init__(self, weight_tensors: List[torch.Tensor], lr: float, betas, eps):
        super().__init__(weight_tensors, lr)
        self.betas = betas
        self.eps = eps
        # One first-moment (m) and one second-moment (v) buffer per tensor.
        self.m = [torch.zeros_like(wt) for wt in self.weight_tensors]
        self.v = [torch.zeros_like(wt) for wt in self.weight_tensors]
        # Number of step() calls made so far; drives the bias correction.
        self.i = 0

    def step(self):
        """Apply one Adam update to every tensor that has a gradient."""
        beta1, beta2 = self.betas

        # Advance the counter once per optimisation step. Incrementing it per
        # tensor would inflate it when several tensors are optimised and skew
        # the bias correction for every tensor after the first.
        self.i += 1
        correction1 = 1 - beta1 ** self.i
        correction2 = 1 - beta2 ** self.i

        # Neither the moment updates nor the weight update belong in the graph.
        with torch.no_grad():
            for ind, wt in enumerate(self.weight_tensors):
                grad = wt.grad
                # Skip tensors that took no part in the last backward pass.
                # The step counter is shared, so it still advances for them.
                if grad is None:
                    continue

                self.m[ind] = (1 - beta1) * grad + beta1 * self.m[ind]
                self.v[ind] = (1 - beta2) * grad ** 2 + beta2 * self.v[ind]

                m_hat = self.m[ind] / correction1
                v_hat = self.v[ind] / correction2

                wt -= self.lr * m_hat / (torch.sqrt(v_hat) + self.eps)


def run_adam():
    """Run ``iter`` steps of ``torch.optim.Adam`` and print the parameter after each one."""
    features, weights, targets = gen_data()
    weights.requires_grad_()
    reference_adam = Adam([weights], lr=lr, betas=betas, eps=eps)
    print('--- built in adam ---')
    for step in range(iter):
        reference_adam.zero_grad()
        # Forward pass and backward pass in one expression.
        loss(features, weights, targets).backward()
        reference_adam.step()
        print(f'After step {step + 1}, param is {weights.squeeze()}')
        

def my_adam():
    """Run ``iter`` steps of the hand-written MyAdam and print the parameter after each one."""
    features, weights, targets = gen_data()
    weights.requires_grad_()
    adam = MyAdam([weights], lr=lr, betas=betas, eps=eps)
    print('--- My adam ---')
    for step in range(iter):
        adam.zero_grad()
        # Forward pass and backward pass in one expression.
        loss(features, weights, targets).backward()
        adam.step()
        print(f'After step {step + 1}, param is {weights.squeeze()}')

# Run torch's Adam and the hand-written MyAdam back to back; both start from clones of the same X, b, y.
run_adam()
my_adam()




--- built in adam ---
After step 1, param is tensor([0.1327, 0.0912, 0.7776, 0.4300, 0.6629], grad_fn=<SqueezeBackward0>)
After step 2, param is tensor([0.1227, 0.0812, 0.7676, 0.4200, 0.6529], grad_fn=<SqueezeBackward0>)
After step 3, param is tensor([0.1127, 0.0712, 0.7576, 0.4100, 0.6429], grad_fn=<SqueezeBackward0>)
After step 4, param is tensor([0.1028, 0.0613, 0.7477, 0.4001, 0.6329], grad_fn=<SqueezeBackward0>)
After step 5, param is tensor([0.0929, 0.0514, 0.7378, 0.3902, 0.6230], grad_fn=<SqueezeBackward0>)
--- My adam ---
After step 1, param is tensor([0.1327, 0.0912, 0.7776, 0.4300, 0.6629], grad_fn=<SqueezeBackward0>)
After step 2, param is tensor([0.1227, 0.0812, 0.7676, 0.4200, 0.6529], grad_fn=<SqueezeBackward0>)
After step 3, param is tensor([0.1127, 0.0712, 0.7576, 0.4100, 0.6429], grad_fn=<SqueezeBackward0>)
After step 4, param is tensor([0.1028, 0.0613, 0.7477, 0.4001, 0.6329], grad_fn=<SqueezeBackward0>)
After step 5, param is tensor([0.0929, 0.0514, 0.7378, 0.3902,

Example of the RMSprop optimizer

In [192]:
from torch.optim import RMSprop

# Hyper-parameters passed to both RMSprop and MyRMSprop below.
# alpha: decay factor of the running average of the squared gradient.
alpha=0.99
# Small constant added to the denominator of the update.
eps=1e-08

class MyRMSprop(MySGD):
    """Hand-written RMSprop: scale each gradient by a running RMS of its history.

    Args:
        weight_tensors: tensors to optimise.
        lr: learning rate (step size).
        alpha: decay factor of the running average of the squared gradient.
        eps: constant added to the denominator of the update.
    """

    def __init__(self, weight_tensors: List[torch.Tensor], lr: float, alpha, eps):
        super().__init__(weight_tensors, lr)
        self.alpha = alpha
        self.eps = eps
        # One running average of grad**2 per tensor.
        self.v = [torch.zeros_like(wt) for wt in self.weight_tensors]

    def step(self):
        """Apply one RMSprop update to every tensor that has a gradient."""
        # Neither the running average nor the weight update belong in the graph.
        with torch.no_grad():
            for ind, wt in enumerate(self.weight_tensors):
                grad = wt.grad
                # Skip tensors that took no part in the last backward pass
                # instead of failing on a None gradient.
                if grad is None:
                    continue
                self.v[ind] = (1 - self.alpha) * grad ** 2 + self.alpha * self.v[ind]
                wt -= self.lr * grad / (torch.sqrt(self.v[ind]) + self.eps)


def run_rmsprop():
    """Run ``iter`` steps of ``torch.optim.RMSprop`` and print the parameter after each one."""
    features, weights, targets = gen_data()
    weights.requires_grad_()
    reference_rmsprop = RMSprop([weights], lr=lr, alpha=alpha, eps=eps)
    print('--- built in RMSprop ---')
    for step in range(iter):
        reference_rmsprop.zero_grad()
        # Forward pass and backward pass in one expression.
        loss(features, weights, targets).backward()
        reference_rmsprop.step()
        print(f'After step {step + 1}, param is {weights.squeeze()}')
        

def my_rmsprop():
    """Run ``iter`` steps of the hand-written MyRMSprop and print the parameter after each one."""
    features, weights, targets = gen_data()
    weights.requires_grad_()
    rmsprop = MyRMSprop([weights], lr=lr, alpha=alpha, eps=eps)
    print('--- My RMSprop ---')
    for step in range(iter):
        rmsprop.zero_grad()
        # Forward pass and backward pass in one expression.
        loss(features, weights, targets).backward()
        rmsprop.step()
        print(f'After step {step + 1}, param is {weights.squeeze()}')

# Run torch's RMSprop and the hand-written MyRMSprop back to back; both start from clones of the same X, b, y.
run_rmsprop()
my_rmsprop()

--- built in RMSprop ---
After step 1, param is tensor([0.0427, 0.0012, 0.6876, 0.3400, 0.5729], grad_fn=<SqueezeBackward0>)
After step 2, param is tensor([-0.0122, -0.0535,  0.6315,  0.2860,  0.5167],
       grad_fn=<SqueezeBackward0>)
After step 3, param is tensor([-0.0485, -0.0895,  0.5933,  0.2511,  0.4785],
       grad_fn=<SqueezeBackward0>)
After step 4, param is tensor([-0.0741, -0.1147,  0.5654,  0.2274,  0.4505],
       grad_fn=<SqueezeBackward0>)
After step 5, param is tensor([-0.0926, -0.1327,  0.5443,  0.2112,  0.4292],
       grad_fn=<SqueezeBackward0>)
--- My RMSprop ---
After step 1, param is tensor([0.0427, 0.0012, 0.6876, 0.3400, 0.5729], grad_fn=<SqueezeBackward0>)
After step 2, param is tensor([-0.0122, -0.0535,  0.6315,  0.2860,  0.5167],
       grad_fn=<SqueezeBackward0>)
After step 3, param is tensor([-0.0485, -0.0895,  0.5933,  0.2511,  0.4785],
       grad_fn=<SqueezeBackward0>)
After step 4, param is tensor([-0.0741, -0.1147,  0.5654,  0.2274,  0.4505],
       