# Learning rate schedules

In [None]:
# Notebook setup: enable the IPython autoreload extension in mode 2
# (reload imported modules before each cell runs) and show matplotlib
# figures inline in the cell output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
# Add '..' (resolved relative to the working directory) to the module search
# path. NOTE(review): no import in the visible cells uses it — presumably it
# is for project-local modules; confirm it is still needed.
sys.path.append('..')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    get_constant_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup
)

In [None]:
# number of training steps the schedules are evaluated over
num_training_steps = 1000

# base learning rate handed to the optimizer
lr = 1e-4

# minimal single-input, single-output linear model; it exists only so that
# the optimizer has parameters to manage
model = nn.Linear(in_features=1, out_features=1)

# Adam optimizer over the model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# Build one scheduler per schedule type, keyed by the label used in the plot
# legend. Every scheduler wraps the same `optimizer` instance.
# NOTE(review): constructing a scheduler may already adjust the optimizer's
# learning rate; that is harmless while only the scaling lambdas are read,
# but give each scheduler its own optimizer if they are ever stepped — confirm.
schedulers = {}

# linear ramp-up over 200 steps, constant afterwards
schedulers['constant with warmup'] = get_constant_schedule_with_warmup(
    optimizer, num_warmup_steps=200
)

# 300 warmup steps, then linear decay until the last training step
schedulers['linear with warmup'] = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=300, num_training_steps=num_training_steps
)

# 100 warmup steps, then cosine decay until the last training step
schedulers['cosine with warmup'] = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=num_training_steps
)

# 400 warmup steps, then two cosine cycles with hard restarts
schedulers['cosine with hard restarts and warmup'] = (
    get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer,
        num_warmup_steps=400,
        num_training_steps=num_training_steps,
        num_cycles=2,
    )
)

In [None]:
# step indices 0 .. num_training_steps - 1 at which each schedule is sampled
steps = np.arange(num_training_steps)

# For every scheduler, evaluate its first LR lambda at each step and store
# the resulting multiplicative factors as an array under the same label.
lr_scalings = {
    name: np.array(list(map(scheduler.lr_lambdas[0], steps)))
    for name, scheduler in schedulers.items()
}

In [None]:
# Draw every schedule's scaling factor against the training step on one axis.
fig, ax = plt.subplots(figsize=(6, 4))

for schedule_name, scaling in lr_scalings.items():
    ax.plot(steps, scaling, label=schedule_name, alpha=0.7)

# axis labels and limits: the x-range spans exactly the sampled steps
ax.set_xlabel('training step')
ax.set_ylabel('LR scaling factor')
ax.set_xlim(steps.min(), steps.max())

# faint grid kept behind the curves
ax.grid(True, which='both', color='gray', alpha=0.2, linestyle='-')
ax.set_axisbelow(True)

ax.legend()
fig.tight_layout()