## PyTorch basics: Gradient

#### Gradient Descent

A basic first-order optimisation algorithm for minimising a loss function $ L $ is called gradient descent. At each step, the parameters are moved a small distance in the direction opposite to the gradient:

$$
G = \theta \longleftarrow \theta - \lambda \frac{\partial L}{\partial \theta}
$$

$ \theta $ : parameters (inputs) of the loss function \
$ \lambda $ : learning rate \
$ \frac{\partial L}{\partial \theta} $ : gradient of L with respect to its parameters

In [1]:
def square_function(theta):
    """Return ``2 * theta``, the derivative of the square loss ``theta ** 2``.

    Despite its name, this helper yields the gradient of the square loss,
    not the loss value itself.
    """
    gradient = 2 * theta
    return gradient

In [2]:
# One step of gradient descent
def one_step_gradient_descent(theta, lr):
    """Apply a single gradient-descent update and return the new theta.

    theta: current parameter value.
    lr: learning rate that scales the value returned by square_function.
    """
    update = lr * square_function(theta)
    return theta - update

In [3]:
# One update from theta = 1 with lr = 0.2: 1 - 0.2 * (2 * 1) = 0.6
one_step_gradient_descent(1, 0.2)

0.6

In [4]:
# Repeated single-step gradient descent
def gradient_descent(initial_theta, lr, steps):
    """Run ``steps`` gradient-descent updates and return the final theta.

    initial_theta: starting value of theta.
    lr: learning rate passed to every single-step update.
    steps: number of updates to perform; theta is printed after each one.
    """
    current = initial_theta
    for _step in range(steps):
        current = one_step_gradient_descent(theta=current, lr=lr)
        print("𝜃: ", current)
    return current

# Run 20 descent steps from theta = 1.0 with learning rate 0.2;
# gradient_descent prints theta after every step.
theta_opt = gradient_descent(
    initial_theta=1.0,
    lr=0.2,
    steps=20
)
print(f"𝜃 after optimization: {theta_opt}")

𝜃:  0.6
𝜃:  0.36
𝜃:  0.216
𝜃:  0.1296
𝜃:  0.07776
𝜃:  0.046655999999999996
𝜃:  0.027993599999999997
𝜃:  0.016796159999999997
𝜃:  0.010077695999999997
𝜃:  0.006046617599999998
𝜃:  0.003627970559999999
𝜃:  0.002176782335999999
𝜃:  0.0013060694015999993
𝜃:  0.0007836416409599996
𝜃:  0.00047018498457599973
𝜃:  0.00028211099074559984
𝜃:  0.00016926659444735988
𝜃:  0.00010155995666841592
𝜃:  6.093597400104955e-05
𝜃:  3.656158440062973e-05
𝜃 after optimization: 3.656158440062973e-05


### Automatic Gradient Descent in PyTorch

In [5]:
import torch

In [6]:
# Build a 2x2 tensor and wrap it in a Parameter.
tensor = torch.Tensor([[1, 2], [3, 4]])
parameters = torch.nn.Parameter(tensor) # a Parameter keeps track of the operations applied to it
print(parameters)

Parameter containing:
tensor([[1., 2.],
        [3., 4.]], requires_grad=True)


`requires_grad=True` indicates that the parameters will keep track of all the operations applied to them, so that the gradients can be computed automatically when needed.

In [7]:
# Every time we perform an operation on a torch Parameter,
# the operation is recorded on the resulting tensor (shown as its grad_fn).
temp = parameters + 5
print(temp)

tensor([[6., 7.],
        [8., 9.]], grad_fn=<AddBackward0>)


grad_fn=<AddBackward0> indicates that it remembers the last operation performed on the parameters (addition with 5)

In [8]:
# Apply an element-wise exponential to the parameters.
temp = torch.exp(parameters)
print(temp)
# the recorded operation is now the exponential (grad_fn=<ExpBackward0>)

tensor([[ 2.7183,  7.3891],
        [20.0855, 54.5982]], grad_fn=<ExpBackward0>)


In [22]:
# A one-element parameter theta and the square loss theta * theta.
t0 = torch.Tensor([1])
theta = torch.nn.Parameter(t0)

loss = theta * theta
print(loss)

tensor([1.], grad_fn=<MulBackward0>)


In [23]:
# backward() has not been called yet, so no gradient is stored on theta.
print(theta.grad)

None


Here it is None because we have not yet called `backward()` on the loss, so no gradient has been computed and stored in `theta.grad`.

In [24]:
# backward() computes d(loss)/d(theta) = 2 * theta and stores it in theta.grad.
loss = theta * theta
loss.backward()
print(theta.grad)

tensor([2.])


#### Calculate automatic gradients

In [25]:
def gradient_autograd(tensor):
    """Return the autograd gradient of the square loss at ``tensor``.

    The loss is ``sum(tensor * tensor)``, so the returned gradient equals
    ``2 * tensor`` and has the same shape as ``tensor``.

    Args:
        tensor: floating-point torch tensor at which to evaluate the gradient.

    Returns:
        A tensor holding the gradient of the loss with respect to ``tensor``.
    """
    # Wrap the input in a fresh Parameter so every call starts with an empty
    # .grad and nothing accumulates between calls.
    theta = torch.nn.Parameter(tensor)
    # Reduce to a scalar: backward() without arguments only accepts scalar
    # outputs, so this also supports tensors with more than one element.
    loss = torch.sum(theta * theta)
    loss.backward()
    return theta.grad

In [32]:
def gd_torch_autograd(initial_theta, lr, steps):
    """Run gradient descent using gradients obtained from gradient_autograd.

    After each update, the new tensor is printed together with the gradient
    evaluated at that new value.

    Args:
        initial_theta: starting tensor.
        lr: learning rate multiplying the gradient in each update.
        steps: number of update steps to perform.

    Returns:
        The tensor obtained after ``steps`` updates (``initial_theta`` itself
        when no step is performed).
    """
    theta = initial_theta
    if steps <= 0:
        return theta
    grad = gradient_autograd(theta)
    for _ in range(steps):
        theta = theta - lr * grad
        # The gradient at the new point is both printed now and reused for
        # the next update, so it is computed only once per step.
        grad = gradient_autograd(theta)
        print(theta, "-- gd: ", grad)
    return theta

In [33]:
# Ten autograd-based descent steps starting from theta = 1 with lr = 0.2.
initial_theta = torch.Tensor([1])
gd_torch_autograd(initial_theta, lr=0.2, steps=10)

tensor([0.6000]) -- gd:  tensor([1.2000])
tensor([0.3600]) -- gd:  tensor([0.7200])
tensor([0.2160]) -- gd:  tensor([0.4320])
tensor([0.1296]) -- gd:  tensor([0.2592])
tensor([0.0778]) -- gd:  tensor([0.1555])
tensor([0.0467]) -- gd:  tensor([0.0933])
tensor([0.0280]) -- gd:  tensor([0.0560])
tensor([0.0168]) -- gd:  tensor([0.0336])
tensor([0.0101]) -- gd:  tensor([0.0202])
tensor([0.0060]) -- gd:  tensor([0.0121])


tensor([0.0060])

In [34]:
# Gradient of theta * theta evaluated at the starting tensor.
gradient_autograd(initial_theta)

tensor([2.])

##### let's calculate

tensor = tensor - lr * gradient_autograd(tensor)

1 - 0.2 (2) = 0.6              -- g: 2*0.6 = 1.200 \
0.6000 - 0.2 (1.200) = 0.3600  -- g: 2*0.3600 = 0.7200 \
0.3600 - 0.2 (0.7200) = 0.2160 -- g: 2*0.2160 = 0.4320  \
0.2160 - 0.2 (0.4320) = 0.1296 -- g: 2*0.1296 = 0.2592 \
  ...  \
0.0060 - 0.2 (0.0121) = 0.00358

> Finally, we solve the mystery of gradient >_< 

In [35]:
# Fresh parameter: .grad is None before backward() and holds the gradient after it.
t = torch.Tensor([1])
theta = torch.nn.Parameter(t)

loss = theta * theta
print(theta.grad)
loss.backward()
print(theta.grad)

None
tensor([2.])


In [36]:
# backward() only filled theta.grad; the value of theta itself is unchanged.
print(theta)

Parameter containing:
tensor([1.], requires_grad=True)


The value of theta is not updated!

#### Optimizer

PyTorch optimizers automatically perform parameter updates based on the computed gradients.

Its two most important methods are `zero_grad()` and `step()`.

In [37]:
# Parameters to optimise and the learning rate used by the optimizer.
list_parameters = [theta]
learning_rate = 0.2

# each optimizer is initialised with a list of the parameters to optimize
optimizer = torch.optim.SGD(params=list_parameters, lr=learning_rate)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.2
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [38]:
# Gradients accumulate: theta.grad already holds 2 from the earlier backward()
# call, and every further backward() adds another 2 * theta = 2 to it.
loss = theta * theta
loss.backward()
print(f"After 1st loss.backward(): theta.grad={theta.grad}")

loss = theta * theta  # recompute the loss before calling backward() again
loss.backward()
print(f"After 2nd loss.backward(): theta.grad={theta.grad}")

loss = theta * theta 
loss.backward()
print(f"After 3rd loss.backward(): theta.grad={theta.grad}")

After 1st loss.backward(): theta.grad=tensor([4.])
After 2nd loss.backward(): theta.grad=tensor([6.])
After 3rd loss.backward(): theta.grad=tensor([8.])


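`loss.backward()` computes the gradients and adds them to the `.grad` attribute of each parameter, so the gradients accumulate across successive calls. To avoid this, call `zero_grad()` before each backward pass to clear the stored gradients (as the output below shows, `.grad` is reset to `None`).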

In [40]:
# zero_grad() : clears the gradients stored on the optimised parameters
optimiser = torch.optim.SGD(params=list_parameters, lr=learning_rate)
optimiser.zero_grad()  # discard the gradient accumulated by earlier cells
loss = theta * theta
loss.backward()
print(f"After 1st loss.backward(): theta.grad={theta.grad}")

optimiser.zero_grad()
print(f"After optimiser.zero_grad(): theta.grad={theta.grad}")

loss = theta * theta   # recompute the loss before calling backward() again
loss.backward()
print(f"After 2nd loss.backward(): theta.grad={theta.grad}")

After 1st loss.backward(): theta.grad=tensor([2.])
After optimiser.zero_grad(): theta.grad=None
After 2nd loss.backward(): theta.grad=tensor([2.])


In [41]:
# step() : performs one optimisation step
optimiser = torch.optim.SGD(params=list_parameters, lr=learning_rate)
optimiser.zero_grad()
loss = theta * theta
loss.backward()
print(f"Before optimization step: {theta}")
optimiser.step()
print(f"After optimization step: {theta}")

Before optimization step: Parameter containing:
tensor([1.], requires_grad=True)
After optimization step: Parameter containing:
tensor([0.6000], requires_grad=True)


In [42]:
# gradient descent with optimizer
def gd_with_optimizer(initial_theta, lr, steps):
    """Minimise ``sum(theta * theta)`` with the SGD optimizer.

    The parameter is printed after every optimisation step.

    Args:
        initial_theta: starting tensor; it is left unmodified.
        lr: learning rate given to ``torch.optim.SGD``.
        steps: number of optimisation steps to perform.

    Returns:
        The optimised ``torch.nn.Parameter`` after ``steps`` updates.
    """
    # Parameter(t) shares storage with t, and optimizer.step() updates the
    # parameter in place; clone first so the caller's tensor is not changed.
    theta = torch.nn.Parameter(initial_theta.detach().clone(), requires_grad=True)
    optimizer = torch.optim.SGD(params=[theta], lr=lr)
    for _ in range(steps):
        optimizer.zero_grad()  # drop the gradient from the previous step
        loss = torch.sum(theta * theta)  # scalar loss, as backward() requires
        loss.backward()
        optimizer.step()
        print(theta)
    return theta

# Ten SGD steps on a two-element tensor with lr = 0.2.
initial_tensor = torch.Tensor([1,-1]) 
gd_with_optimizer(initial_tensor, lr=0.2, steps=10)

Parameter containing:
tensor([ 0.6000, -0.6000], requires_grad=True)
Parameter containing:
tensor([ 0.3600, -0.3600], requires_grad=True)
Parameter containing:
tensor([ 0.2160, -0.2160], requires_grad=True)
Parameter containing:
tensor([ 0.1296, -0.1296], requires_grad=True)
Parameter containing:
tensor([ 0.0778, -0.0778], requires_grad=True)
Parameter containing:
tensor([ 0.0467, -0.0467], requires_grad=True)
Parameter containing:
tensor([ 0.0280, -0.0280], requires_grad=True)
Parameter containing:
tensor([ 0.0168, -0.0168], requires_grad=True)
Parameter containing:
tensor([ 0.0101, -0.0101], requires_grad=True)
Parameter containing:
tensor([ 0.0060, -0.0060], requires_grad=True)


Parameter containing:
tensor([ 0.0060, -0.0060], requires_grad=True)

In [43]:
# Adam optimizer over the same parameter list, with no other arguments given;
# evaluating it displays its settings.
adam_opt = torch.optim.Adam(params=list_parameters)
adam_opt

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [44]:
# RMSprop optimizer over the same parameter list, with no other arguments given;
# evaluating it displays its settings.
rms_opt = torch.optim.RMSprop(params=list_parameters)
rms_opt

RMSprop (
Parameter Group 0
    alpha: 0.99
    capturable: False
    centered: False
    differentiable: False
    eps: 1e-08
    foreach: None
    lr: 0.01
    maximize: False
    momentum: 0
    weight_decay: 0
)

In [45]:
# L-BFGS optimizer over the same parameter list, with no other arguments given;
# evaluating it displays its settings.
lbg_opt = torch.optim.LBFGS(params=list_parameters)
lbg_opt

LBFGS (
Parameter Group 0
    history_size: 100
    line_search_fn: None
    lr: 1
    max_eval: 25
    max_iter: 20
    tolerance_change: 1e-09
    tolerance_grad: 1e-07
)