In [71]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [72]:
# Toy softmax-regression problem: `window_size` random 2-feature samples,
# `n_classes` output classes and a single trainable weight matrix `w`.
torch.manual_seed(1)
n_classes = 5
window_size = 4
y = torch.randint(0, n_classes, (window_size,))  # one integer class label per sample

x = torch.rand(window_size, 2)
w = torch.rand(2, n_classes, requires_grad=True)
x_1 = x @ w  # logits, shape (window_size, n_classes)

# Softmax written out by hand (mathematically the same as F.softmax(x_1, dim=1)).
x_1_exp = x_1.exp()
probs = x_1_exp / x_1_exp.sum(dim=1, keepdim=True)

# `probs` is a non-leaf tensor, so autograd discards its gradient unless
# retain_grad() is called BEFORE backward(). `w` is a leaf created with
# requires_grad=True and always receives `.grad`, so it needs no such call.
probs.retain_grad()

# Mean negative log-probability of the true class (cross-entropy).
loss = -probs[torch.arange(window_size), y].log().mean()
print(loss)
loss.backward()

tensor(1.7352, grad_fn=<NegBackward0>)


In [73]:
# Snapshot of the gradient stored on `w`, read through the public `.grad`
# attribute. The copy is detached, so the update cells below work on plain
# numbers that do not alias `w.grad`.
w_gradient = w.grad.detach().clone()
w_gradient

tensor([[ 0.0814,  0.0924,  0.0859, -0.0889, -0.1708],
        [-0.0688,  0.1538,  0.1344, -0.0244, -0.1950]])

### Gradient Descent

In [74]:
lr = 0.01

# Vanilla gradient descent: w <- w - lr * grad.
# The step is applied in place under no_grad() so autograd does not record it
# and `w` stays a trainable leaf tensor instead of turning into a graph node.
with torch.no_grad():
    w -= lr * w_gradient

### Gradient Descent with Momentum

In [75]:
# Velocity buffer starts at zero; zeros_like already matches w's dtype, so no
# extra cast is needed (a cast would silently downcast a float64 `w`).
v = torch.zeros_like(w)
lr = 0.1
beta = 0.99

# Momentum as an exponential moving average of the gradient:
#   v <- beta * v + (1 - beta) * grad
#   w <- w - lr * v
# Applied in place under no_grad() so the update is not tracked by autograd.
with torch.no_grad():
    v = beta * v + (1.0 - beta) * w_gradient
    w -= lr * v


### Adaptive Gradient Algorithm (AdaGrad)

In [76]:
# Running sum of squared gradients, one entry per weight.
v = torch.zeros_like(w)
lr = 0.01
# Epsilon only guards against division by zero, so it must be tiny
# (1e-10 is the torch.optim.Adagrad default).
epsilon = 1e-10

# AdaGrad:
#   v <- v + grad**2
#   w <- w - lr * grad / (sqrt(v) + epsilon)
# The numerator is the gradient itself; the accumulator `v` only rescales the
# learning rate per weight. Applied in place under no_grad() so the update is
# not tracked by autograd.
with torch.no_grad():
    v = v + w_gradient ** 2
    w -= lr * w_gradient / (v.sqrt() + epsilon)

In [77]:
def ManualAdaGrad(learning_rate, parameter, eps=1e-10):
    """Print and return the first AdaGrad update of ``parameter.weight``.

    Models a single step starting from a zero accumulator, so the accumulated
    squared gradient equals ``grad**2``. The layer itself is not modified: the
    result is a new tensor.

    Args:
        learning_rate: Step size.
        parameter: Object exposing ``.weight`` and a populated ``.weight.grad``
            (for example an ``nn.Linear`` after ``loss.backward()``).
        eps: Stability term added to the denominator after the square root.
            Defaults to 1e-10, the ``torch.optim.Adagrad`` default.

    Returns:
        The updated weight tensor.
    """
    with torch.no_grad():
        w = parameter.weight
        grad = parameter.weight.grad

        # First step from a zero accumulator: state_sum = 0 + grad**2.
        state_sum = grad ** 2

        # eps is added AFTER the square root, as torch.optim.Adagrad does;
        # putting it under the root changes the step for small gradients.
        new_weight = w - learning_rate * grad / (state_sum.sqrt() + eps)
        print("Manual Result from the Update")
        print(new_weight)
    return new_weight


# Compare one hand-written AdaGrad step against torch.optim.Adagrad.
# Setup: a (2, 3) random input goes through a 3->3 linear layer, the six
# outputs are laid out as a single row of logits, and the target class is 5.
torch.manual_seed(0)
learning_rate = 0.1
x = torch.rand(2, 3)
linear = nn.Linear(3, 3)
y = torch.tensor([5])
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(linear.parameters(), lr=learning_rate)

# Weights before any update.
print(linear.weight)

# Forward pass, reshaped to one sample with six logits, then backward.
x = linear(x).reshape(1, -1)
loss = loss_fn(x, y)
loss.backward()

# The manual step is printed first because optimizer.step() changes the
# layer's weights in place.
ManualAdaGrad(learning_rate, linear)
optimizer.step()

print("Update from the pytorch optimizer")
print(linear.weight)


Parameter containing:
tensor([[-0.0114,  0.4578, -0.0512],
        [ 0.1528, -0.1745, -0.1135],
        [-0.5516, -0.3824, -0.2380]], requires_grad=True)
Manual Result from the Update
tensor([[-0.1114,  0.3578, -0.1512],
        [ 0.0528, -0.2745, -0.2135],
        [-0.4516, -0.2824, -0.1380]])
Update from the pytorch optimizer
Parameter containing:
tensor([[-0.1114,  0.3578, -0.1512],
        [ 0.0528, -0.2745, -0.2135],
        [-0.4516, -0.2824, -0.1380]], requires_grad=True)


### RMSProp - Root Mean Square Propagation

In [78]:
def ManualRmsProp(learning_rate, parameter, alpha=0.99, eps=1e-8):
    """Print and return the first RMSProp update of ``parameter.weight``.

    Models a single step starting from a zero running average of squared
    gradients. Works on copies of the weight and gradient, so the layer
    itself is not modified.

    Args:
        learning_rate: Step size.
        parameter: Object exposing ``.weight`` and a populated ``.weight.grad``
            (for example an ``nn.Linear`` after ``loss.backward()``).
        alpha: Decay factor of the running average of squared gradients.
        eps: Stability term added to the denominator after the square root.

    Returns:
        The updated weight tensor.
    """
    with torch.no_grad():
        w = parameter.weight.detach().clone()
        grad = parameter.weight.grad.detach().clone()

        # Running average of grad**2, starting from zero.
        square_avg = torch.zeros_like(w)
        square_avg = alpha * square_avg + (1 - alpha) * grad ** 2

        # eps is added AFTER the square root, as torch.optim.RMSprop does;
        # putting it under the root changes the step for small gradients.
        w = w - learning_rate * grad / (square_avg.sqrt() + eps)
        print("Manual Result from the Update")
        print(w)
    return w


# Compare one hand-written RMSProp step against torch.optim.RMSprop.

# --- data and model (seeded so the printed numbers are reproducible) ---
torch.manual_seed(0)
x = torch.rand(2, 3)
linear = nn.Linear(3, 3)
y = torch.tensor([5])  # target class index among the six flattened logits

# --- loss and optimizer ---
learning_rate = 0.1
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(linear.parameters(), lr=learning_rate)

print(linear.weight)

# --- forward / backward: the (2, 3) layer output becomes one row of six logits ---
x = linear(x).reshape(1, -1)
loss = loss_fn(x, y)
loss.backward()

# --- manual step first, then the optimizer (which updates the layer in place) ---
ManualRmsProp(learning_rate, linear)
optimizer.step()

print("Update from the pytorch optimizer")
print(linear.weight)


Parameter containing:
tensor([[-0.0114,  0.4578, -0.0512],
        [ 0.1528, -0.1745, -0.1135],
        [-0.5516, -0.3824, -0.2380]], requires_grad=True)
Manual Result from the Update
tensor([[-1.0114, -0.5422, -1.0512],
        [-0.8472, -1.1745, -1.1135],
        [ 0.4483,  0.6176,  0.7620]])
Update from the pytorch optimizer
Parameter containing:
tensor([[-1.0114, -0.5422, -1.0512],
        [-0.8472, -1.1745, -1.1135],
        [ 0.4484,  0.6176,  0.7620]], requires_grad=True)


### Adam - Adaptive Moment Estimation

In [83]:
def ManualAdamGrad(parameter, lr=0.1, betas=(0.9, 0.999), eps=1e-8):
    """Print and return the first Adam update of ``parameter.weight``.

    Models step 1 starting from zero first- and second-moment estimates.
    Works on copies of the weight and gradient, so the layer itself is not
    modified.

    Args:
        parameter: Object exposing ``.weight`` and a populated ``.weight.grad``
            (for example an ``nn.Linear`` after ``loss.backward()``).
        lr: Step size.
        betas: Decay factors ``(beta_1, beta_2)`` of the first- and
            second-moment running averages.
        eps: Stability term added to the denominator after the square root.

    Returns:
        The updated weight tensor.
    """
    beta_1, beta_2 = betas
    step_no = 1  # this function always models the very first optimizer step

    with torch.no_grad():
        w = parameter.weight.detach().clone()
        grad = parameter.weight.grad.detach().clone()

        # Moment estimates start at zero.
        m = torch.zeros_like(w)
        v = torch.zeros_like(w)

        m = beta_1 * m + (1.0 - beta_1) * grad
        v = beta_2 * v + (1.0 - beta_2) * grad ** 2

        # Bias correction compensates for the zero initialisation.
        m_hat = m / (1 - beta_1 ** step_no)
        v_hat = v / (1 - beta_2 ** step_no)

        # eps is added AFTER the square root, as torch.optim.Adam does;
        # putting it under the root changes the step for small gradients.
        w -= lr * m_hat / (v_hat.sqrt() + eps)
        print("Manual Update")
        print(w)
    return w



# Compare one hand-written Adam step against torch.optim.Adam.
torch.manual_seed(0)  # fixed seed: the random input and layer init are reproducible

lr = 0.1
x = torch.rand(2, 3)
linear = nn.Linear(3, 3)
optimizer = optim.Adam(linear.parameters(), lr=lr)

y = torch.tensor([5])
loss_fn = nn.CrossEntropyLoss()

# Initial weights, for reference.
print(linear.weight)

# The six outputs of the layer are treated as one sample's logits.
x = linear(x).reshape(1, -1)
loss = loss_fn(x, y)
loss.backward()

# Print the manual result before optimizer.step() changes the layer in place.
ManualAdamGrad(linear)
optimizer.step()

print("Update from the pytorch optimizer")
print(linear.weight)


Parameter containing:
tensor([[-0.0114,  0.4578, -0.0512],
        [ 0.1528, -0.1745, -0.1135],
        [-0.5516, -0.3824, -0.2380]], requires_grad=True)
Manual Update
tensor([[-0.1114,  0.3578, -0.1512],
        [ 0.0528, -0.2745, -0.2135],
        [-0.4516, -0.2824, -0.1380]])
Update from the pytorch optimizer
Parameter containing:
tensor([[-0.1114,  0.3578, -0.1512],
        [ 0.0528, -0.2745, -0.2135],
        [-0.4516, -0.2824, -0.1380]], requires_grad=True)
