Examples of various optimizers

In [174]:
import torch
from torch.optim import SGD

# Problem setup for a small random least-squares task.
SEED = 0  # fixed so that Restart & Run All reproduces the same numbers
torch.manual_seed(SEED)

m = 10     # number of rows in X / entries in y
n = 5      # number of columns in X / entries in b
lr = 0.01  # learning rate shared by every optimizer below

X = torch.rand(m, n)   # design matrix, shape (m, n)
b = torch.rand(n, 1)   # weights being optimized, shape (n, 1)
b.requires_grad_(True)  # b is the only tensor that needs a gradient
y = torch.rand(m, 1)   # targets, shape (m, 1)

def loss(X, b, y):
    """Mean squared error of the linear prediction ``X @ b`` against ``y``.

    Returns a scalar tensor: the mean over all elements of ``(y - X @ b) ** 2``.
    """
    residual = y - X @ b
    return residual.pow(2).mean()

def gen_data():
    """Return detached clones of the module-level ``X``, ``b`` and ``y``.

    The result is a 3-tuple in the order ``(X, b, y)``. Each element is a
    fresh copy, so callers can modify it without touching the globals.
    """
    copies = tuple(t.detach().clone() for t in (X, b, y))
    return copies


# --- Hand-written SGD step ---------------------------------------------------
# One backward pass fills b.grad; the plain SGD update is b - lr * grad.
# The result is stored (b itself is not modified here) so it can be checked
# against the built-in optimizer below.
L = loss(X, b, y)
L.backward()
my_sgd_weights = b - b.grad * lr
print(f'weights after my sgd : {my_sgd_weights.squeeze()}')

# --- Built-in SGD step -------------------------------------------------------
optimizer = SGD([b], lr=lr)
# zero_grad() clears the gradient left over from the manual pass above, so a
# separate `b.grad = None` is not needed.
optimizer.zero_grad()
L = loss(X, b, y)
L.backward()
optimizer.step()  # updates b in place; later gen_data() calls clone this updated b

print(f'weights after built in sgd : {b.squeeze()}')

# Check the agreement programmatically instead of comparing printouts by eye.
assert torch.allclose(my_sgd_weights.detach(), b.detach()), \
    'manual SGD step disagrees with torch.optim.SGD'

weights after my sgd : tensor([0.6387, 0.2775, 0.6478, 0.3678, 0.7727], grad_fn=<SqueezeBackward0>)
weights after built in sgd : tensor([0.6387, 0.2775, 0.6478, 0.3678, 0.7727], grad_fn=<SqueezeBackward0>)


Example for Adam optimizer

In [175]:
from torch.optim import Adam

# Adam settings shared by run_adam() and my_adam().
iter = 5                # number of optimizer steps (note: shadows the builtin `iter`)
betas = (0.9, 0.999)    # decay rates for the first / second moment estimates
eps = 1e-8              # added to the denominator of the update


def run_adam():
    """Take ``iter`` steps with ``torch.optim.Adam`` and print ``b`` after each.

    Works on a fresh copy of the data from ``gen_data()`` and uses the
    module-level ``lr``, ``betas`` and ``eps``. Returns nothing.
    """
    X, b, y = gen_data()
    b.requires_grad_(True)
    print('--- built in adam ---')
    adam = Adam([b], lr=lr, betas=betas, eps=eps)
    for step in range(1, iter + 1):
        adam.zero_grad()
        loss(X, b, y).backward()
        adam.step()
        print(f'After step {step}, param is {b.squeeze()}')
        

def my_adam():
    """Take ``iter`` hand-written Adam steps and print ``b`` after each.

    Works on a fresh copy of the data from ``gen_data()`` and uses the
    module-level ``lr``, ``betas`` and ``eps``. Each step updates the two
    running moment estimates, applies bias correction, and then updates ``b``
    in place. Returns nothing.
    """
    X, b, y = gen_data()
    b.requires_grad_(True)
    print('--- my adam ---')

    beta1, beta2 = betas
    first_moment = torch.zeros_like(b)   # running average of the gradient
    second_moment = torch.zeros_like(b)  # running average of the squared gradient

    for step in range(1, iter + 1):
        # Clear the gradient from the previous step before accumulating a new one.
        if b.grad is not None:
            b.grad.zero_()

        loss(X, b, y).backward()
        grad = b.grad

        first_moment = (1 - beta1) * grad + beta1 * first_moment
        second_moment = (1 - beta2) * grad ** 2 + beta2 * second_moment

        # Bias correction: both moments start at zero, so early values are scaled up.
        first_hat = first_moment / (1 - beta1 ** step)
        second_hat = second_moment / (1 - beta2 ** step)

        # The parameter update itself must not be recorded by autograd.
        with torch.no_grad():
            b -= lr * first_hat / (torch.sqrt(second_hat) + eps)

        print(f'After step {step}, param is {b.squeeze()}')

# Run the built-in optimizer first, then the hand-written one, so the two
# per-step printouts appear one after the other and can be compared.
run_adam()
my_adam()




--- built in adam ---
After step 1, param is tensor([0.6287, 0.2675, 0.6378, 0.3578, 0.7627], grad_fn=<SqueezeBackward0>)
After step 2, param is tensor([0.6187, 0.2576, 0.6278, 0.3478, 0.7527], grad_fn=<SqueezeBackward0>)
After step 3, param is tensor([0.6087, 0.2476, 0.6178, 0.3378, 0.7427], grad_fn=<SqueezeBackward0>)
After step 4, param is tensor([0.5988, 0.2377, 0.6078, 0.3279, 0.7328], grad_fn=<SqueezeBackward0>)
After step 5, param is tensor([0.5888, 0.2278, 0.5979, 0.3179, 0.7228], grad_fn=<SqueezeBackward0>)
--- my adam ---
After step 1, param is tensor([0.6287, 0.2675, 0.6378, 0.3578, 0.7627], grad_fn=<SqueezeBackward0>)
After step 2, param is tensor([0.6187, 0.2576, 0.6278, 0.3478, 0.7527], grad_fn=<SqueezeBackward0>)
After step 3, param is tensor([0.6087, 0.2476, 0.6178, 0.3378, 0.7427], grad_fn=<SqueezeBackward0>)
After step 4, param is tensor([0.5988, 0.2377, 0.6078, 0.3279, 0.7328], grad_fn=<SqueezeBackward0>)
After step 5, param is tensor([0.5888, 0.2278, 0.5979, 0.3179,