In [None]:
# Using different types of learning rate policies in PyTorch

In [1]:
import torch

In [3]:
# `torch.optim` is the PyTorch package that implements the optimization algorithms.
# A single linear layer (20 inputs -> 10 outputs) supplies the parameters to optimize.
model = torch.nn.Linear(in_features=20, out_features=10)
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=0.01,
    momentum=0.9,
)
# Bare trailing expression: the notebook renders the optimizer's parameter groups.
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.01
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
)

In [None]:
# Per-parameter options can also be supplied when the optimizer is defined.
# Instead of a flat iterable of parameters, pass an iterable of dicts. Each dict
# defines a separate parameter group and must contain a 'params' key holding the
# parameters that belong to it; any other key overrides the keyword default for
# that group only.

# `model` (a single torch.nn.Linear) has no `base` / `classifier` sub-modules, so
# this cell builds its own two-stage network exposing those names. A separate
# variable is used so that `model` and `optimizer` are left untouched.
two_stage_model = torch.nn.Sequential()
two_stage_model.add_module('base', torch.nn.Linear(20, 16))
two_stage_model.add_module('classifier', torch.nn.Linear(16, 10))

perlayer_optimizer = torch.optim.SGD(
    [
        {'params': two_stage_model.base.parameters()},
        {'params': two_stage_model.classifier.parameters(), 'lr': 1e-3}
    ], lr=1e-2, momentum=0.9
)

"""
This means that two_stage_model.base’s parameters will use the default learning rate of 1e-2,
two_stage_model.classifier’s parameters will use a learning rate of 1e-3.
And a momentum of 0.9 will be used for all parameters.
"""

In [None]:
# Understanding the base class of Optimizers
# CLASS   torch.optim.Optimizer(params, defaults) is the base class for all optimizers
"""
 1. params (iterable) – an iterable of torch.Tensor s or dict s.
                        Specifies what Tensors should be optimized.

 2. defaults – (dict): a dict containing default values of optimization options
                       (used when a parameter group doesn’t specify them).
"""
# For more reference, visit here :
# https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer

In [None]:
# How to adjust the learning rate during the training process
# torch.optim.lr_scheduler provides several methods to adjust the learning rate
# based on the number of epochs.  (********important)

# Learning rate scheduling should be applied after optimizer’s update.
lr_policy_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[5, 10, 20], gamma=0.2)
"""
milestones (list): List of epoch indices. Must be increasing.
gamma (float): Multiplicative factor of learning rate decay. Default: 0.1.
"""
# A bare torch.utils.data.Dataset() holds no samples and raises
# NotImplementedError as soon as it is iterated, so a small synthetic regression
# set is built instead. Feature/target widths are read from `model` (expected to
# be the torch.nn.Linear created earlier in the notebook), and a local generator
# keeps the samples reproducible without reseeding the global RNG.
NUM_SAMPLES = 64
BATCH_SIZE = 16
sample_rng = torch.Generator().manual_seed(0)
dataset = torch.utils.data.TensorDataset(
    torch.randn(NUM_SAMPLES, model.in_features, generator=sample_rng),
    torch.randn(NUM_SAMPLES, model.out_features, generator=sample_rng),
)  # dataset for training
train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)
MAX_EPOCH = 50
loss_fn = torch.nn.MSELoss()

for epoch in range(MAX_EPOCH):

    for data, target in train_loader:
        # One optimization step per mini-batch.
        optimizer.zero_grad()
        prediction = model(data)
        loss = loss_fn(prediction, target)
        loss.backward()
        optimizer.step()

    # Step the scheduler once per epoch, after that epoch's optimizer updates.
    lr_policy_scheduler.step()  # applied after epoch


"""
Note: Most learning rate schedulers can be called back-to-back (also referred to
as chaining schedulers). The result is that each scheduler is applied one after
the other on the learning rate obtained by the one preceding it.
"""
# for example
# lr_policy_scheduler_1.step()
# lr_policy_scheduler_2.step()

In [None]:
# Chained Scheduler
model = torch.nn.Linear(20, 1)
# here, optimizer uses lr = 1 for all groups
optimizer = torch.optim.SGD(model.parameters(), lr=1, momentum=0.9)

# In the tables below "epoch == k" means "after k calls to step()"; epoch 0 is
# the state right after the scheduler has been constructed.

scheduler_1 = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=0.5, total_iters=2)
"""
if only scheduler_1 is applied
>>> # lr = 0.5   if epoch == 0
>>> # lr = 0.5   if epoch == 1
>>> # lr = 1     if epoch >= 2
"""

scheduler_2 = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
"""
when only scheduler_2 is applied (lr = 1 * 0.9 ** epoch)
>>> # lr = 1.0     if epoch == 0
>>> # lr = 0.9     if epoch == 1
>>> # lr = 0.81    if epoch == 2
>>> # lr = 0.729   if epoch == 3
>>> # lr = 0.6561  if epoch == 4
... and so on
"""

chained_scheduler = torch.optim.lr_scheduler.ChainedScheduler([scheduler_1, scheduler_2])
"""
when chained_scheduler is applied (both factors multiply)
>>> # lr = 0.5     if epoch == 0   (0.5 * 0.9 ** 0)
>>> # lr = 0.45    if epoch == 1   (0.5 * 0.9 ** 1)
>>> # lr = 0.81    if epoch == 2   (1.0 * 0.9 ** 2)
>>> # lr = 0.729   if epoch == 3
>>> # lr = 0.6561  if epoch == 4
... and so on
"""


# Return the last learning rate computed by the scheduler
# (a list with one entry per parameter group).
chained_scheduler.get_last_lr()



In [None]:
# Another important learning rate scheduler: ReduceLROnPlateau
plateau_lr_sh = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=2,
)
"""
Reduce learning rate when a metric has stopped improving. Models often benefit
from reducing the learning rate by a factor of 2-10 once learning stagnates.
This scheduler reads a metrics quantity and if no improvement is seen for a
‘patience’ number of epochs, the learning rate is reduced.

mode : min or max
In min mode, lr will be reduced when the quantity monitored has stopped decreasing.
In max mode it will be reduced when the quantity monitored has stopped increasing.

patience : Number of epochs with no improvement after which learning rate will be reduced.
"""


# Learning rate decay with warm restarts:
# see torch.optim.lr_scheduler.CosineAnnealingWarmRestarts