# Alrao (All learning rates at once) : a tutorial

We show in this notebook how to use Alrao in practice.

### Imports

In [1]:
from __future__ import print_function
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import os
import argparse
import time
from tqdm import tqdm
import numpy as np

from alrao import AlraoModel
from alrao import SGDAlrao, AdamAlrao
from alrao import lr_sampler_generic, generator_randomlr_neurons, generator_randomlr_weights

# CUDA: detect once whether a GPU is usable; the flag is reused below to move
# the model and every batch onto the GPU.
use_cuda = torch.cuda.is_available()

### Data Loading
We use the CIFAR10 dataset. We also use some data augmentation.

In [2]:
# Mini-batch size used for training (the test loader uses its own size below).
batch_size = 32

# Per-channel (mean, std) values handed to transforms.Normalize in both pipelines.
normalize_mean = (0.4914, 0.4822, 0.4465)
normalize_std = (0.2023, 0.1994, 0.2010)

# Folder in which torchvision stores / looks for the CIFAR10 files.
dataset_root = './datasets'

# Training pipeline: random crop + horizontal flip (data augmentation),
# then conversion to a tensor and normalisation.
train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(normalize_mean, normalize_std),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: no augmentation, only tensor conversion and normalisation.
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(normalize_mean, normalize_std),
]
transform_test = transforms.Compose(test_steps)

# Training split, reshuffled at every epoch.
trainset = torchvision.datasets.CIFAR10(
    root=dataset_root, train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2)

# Test split, read in a fixed order with batches of 100 images.
testset = torchvision.datasets.CIFAR10(
    root=dataset_root, train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


### Defining the preclassifier model
We define a pre-classifier model. This model can be defined exactly like any usual model. Only two things are specific to Alrao:
* First, there is no classifier. The classifier layer will be added later.
* The model needs to have a `linearinputdim` attribute, which is the dimension of the pre-classifier's output.

In [3]:
class VGG(nn.Module): # identical to models.VGG
    """VGG-style convolutional feature extractor without a classification layer.

    Args:
        cfg: sequence describing the layers. An integer ``c`` adds a
            3x3 convolution with ``c`` output channels followed by batch
            normalisation and ReLU; the string ``'M'`` adds a 2x2 max-pooling.
    """

    def __init__(self, cfg):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg)
        # Dimension of the pre-classifier's output; it has to be specified by
        # hand and is later passed to AlraoModel as the classifiers' input size.
        self.linearinputdim = 512

    def forward(self, x):
        """Apply the convolutional stack and flatten each sample to a vector."""
        feature_maps = self.features(x)
        # No classifier layer here: the flattened features are returned as is.
        return feature_maps.view(feature_maps.size(0), -1)

    def _make_layers(self, cfg):
        """Translate ``cfg`` into an ``nn.Sequential`` of layers."""
        modules = []
        channels = 3  # the first convolution takes 3 input channels
        for entry in cfg:
            if entry == 'M':
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            modules.extend([
                nn.Conv2d(channels, entry, kernel_size=3, padding=1),
                nn.BatchNorm2d(entry),
                nn.ReLU(inplace=True),
            ])
            channels = entry
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)

# Layer configuration of the pre-classifier: integers are convolution output
# channels, 'M' marks a max-pooling layer.
preclassifier_cfg = [
    64, 64, 'M',
    128, 128, 'M',
    256, 256, 256, 256, 'M',
    512, 512, 512, 512, 'M',
    512, 512, 512, 512, 'M',
]
preclassifier = VGG(preclassifier_cfg)

### Define the classifier
Here, we define our own classifier class. In practice, we do not need to redefine it, since it can be found in `alrao.custom_layers.LinearClassifier`.
We redefine it here to show how any classifier (with a log-softmax output) can be used.

In [4]:
class Classifier(nn.Module): # identical to alrao.custom_layers.LinearClassifier
    """Linear classifier returning log-probabilities.

    Args:
        in_features: size of each input feature vector.
        n_classes: number of output classes.
    """

    def __init__(self, in_features, n_classes):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(in_features, n_classes)

    def forward(self, x):
        """Return the log-softmax (over dimension 1) of the linear layer's output."""
        logits = self.fc(x)
        return nn.functional.log_softmax(logits, dim=1)
    
# Negative log-likelihood loss: it is applied to log-probabilities, which is
# what the Classifier defined in this cell outputs (log_softmax).
criterion = nn.NLLLoss()

### Build the modified architecture for Alrao
We define the new architecture, with the parallel classifiers.
<img src="img/newalrao.png" width="400"></img>

Here there are 10 categories, and we decide to use 10 classifiers.

In [5]:
# nb_classifiers is the number of classifiers averaged by Alrao;
# nb_categories is the number of classes to predict.
nb_classifiers = 10
nb_categories = 10

# The last two arguments are the classifier's input dimension and its number
# of output categories.
net = AlraoModel(
    preclassifier,
    nb_classifiers,
    Classifier,
    preclassifier.linearinputdim,
    nb_categories,
)

# Move the whole model to the GPU when one is available.
if use_cuda:
    net.cuda()

### Sampling the learning rates
We choose an interval (`minLR`, `maxLR`) in which the learning rates are chosen.
For the pre-classifier, the learning rates are sampled from the log-uniform distribution $\log-U(\cdot ; \eta_{\min}, \eta_{\max})$ :
namely, if $\eta \sim \log-U(\cdot ; \eta_{\min},
\eta_{\max})$, then $\log \eta$ is uniformly distributed between $\log
\eta_{\min}$ and $\log \eta_{\max}$.
Its
density function is
$$\log-U(\eta; \eta_{\min}, \eta_{\max}) = \frac{1_{\eta_{\min} \leq \eta \leq \eta_{\max}}}{\log \eta_{\max} - \log \eta_{\min}}\times\frac{1}{\eta}$$

The learning rates of the classifiers are log-uniformly spread over the interval: for $j = 1, \dots, N_{\mathrm{cl}}$, where $N_{\mathrm{cl}}$ is the number of classifiers,
$\log \eta_{j} = \log \eta_{\min} +
\frac{j-1}{N_{\mathrm{cl}}-1}\log(\eta_{\max}/ \eta_{\min})$

In [6]:
# We define the interval in which the learning rates are sampled
minlr = 10 ** (-5)
maxlr = 10 ** 1

# We spread the classifiers learning rates log-uniformly on the interval:
# classifier k gets exp(log(minlr) + k / (nb_classifiers - 1) * log(maxlr / minlr)).
if nb_classifiers == 1:
    # The spacing formula divides by (nb_classifiers - 1); with a single
    # classifier there is nothing to spread, so it simply gets minlr.
    classifiers_lr = [minlr]
else:
    log_minlr = np.log(minlr)
    log_span = np.log(maxlr) - log_minlr
    classifiers_lr = [np.exp(log_minlr + k / (nb_classifiers - 1) * log_span)
                      for k in range(nb_classifiers)]

# We define the sampler for the preclassifier's features.
lr_sampler = lr_sampler_generic(minlr, maxlr)
lr_preclassifier = generator_randomlr_neurons(net.preclassifier, lr_sampler)

### Define the optimizer
We define the Alrao optimizer. This includes : 
* A single (usual) SGD optimizer for each classifier
* A modified SGD optimizer for the pre-classifier, which allows one learning rate per neuron.
* The switch model averaging method, with its own update procedure.

In [7]:
# Parameters of the pre-classifier, paired below with lr_preclassifier.
preclassifier_params = net.parameters_preclassifier()
# One parameter group per classifier, paired below with classifiers_lr.
classifiers_params = net.classifiers_parameters_list()

optimizer = SGDAlrao(preclassifier_params, lr_preclassifier,
                     classifiers_params, classifiers_lr)

### Training procedure
We define the train procedure.

In [8]:
def train(epoch):
    """Train the Alrao model for one epoch over `trainloader`.

    For each batch: the model-averaging weights are passed to the optimizer,
    the averaged model's loss is back-propagated, the classifiers' gradients
    are recomputed independently, an optimizer step is taken and the switch is
    updated. A progress bar shows the running loss / accuracy, and the
    per-classifier performance is printed at the end of the epoch.

    Args:
        epoch: epoch index, only used to label the progress bar.
    """
    train_loss = 0
    correct = 0
    total = 0
    # The context manager closes the bar even if an iteration raises.
    with tqdm(total=len(trainloader.dataset),
              bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} {postfix}') as pbar:
        pbar.set_description("Epoch %d" % epoch)
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            net.train()
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # We update the model averaging weights in the optimizer
            optimizer.update_posterior(net.posterior())
            optimizer.zero_grad()

            # Forward pass of the Alrao model
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            # We compute the gradient of all the model's weights
            loss.backward()

            # We reset all the classifiers' gradients, and re-compute them
            # as if each classifier were the only output of the network.
            # The features are detached so that these extra backward passes
            # do not touch the pre-classifier's gradients.
            optimizer.classifiers_zero_grad()
            newx = net.last_x.detach()
            for classifier in net.classifiers():
                loss_classifier = criterion(classifier(newx), targets)
                loss_classifier.backward()

            # Then, we can run an update step of the gradient descent.
            optimizer.step()

            # Finally, we update the model averaging weights
            net.update_switch(targets, catch_up=False)

            # Update loss
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            n_samples = targets.size(0)
            total += n_samples
            correct += predicted.eq(targets).sum().item()

            # Update progression bar. Advance by the actual number of samples:
            # the last batch can be smaller than batch_size, and over-counting
            # pushes the bar past its total.
            pbar.update(n_samples)
            postfix = OrderedDict([("LossTrain", "{:.4f}".format(train_loss/(batch_idx+1))),
                                   ("AccTrain", "{:.3f}".format(100.*correct/total))])
            postfix["PostSw"] = net.repr_posterior()
            pbar.set_postfix(postfix)

    # Print performance of the classifiers
    cl_perf = net.switch.get_cl_perf()
    for k in range(len(cl_perf)):
        print("Classifier {}\t LossTrain:{:.6f}\tAccTrain:{:.4f}".format(
            k, cl_perf[k][0], cl_perf[k][1]))

In [9]:
def test(epoch):
    """Evaluate the Alrao model on `testloader` and print the results.

    Args:
        epoch: epoch index (unused; kept so that `train` and `test` are called
            the same way).

    Returns:
        A tuple ``(mean loss per batch, accuracy)`` where accuracy is a
        fraction in [0, 1].

    Raises:
        ValueError: if `testloader` yields no batch.
    """
    net.eval()
    net.switch.reset_cl_perf()
    test_loss = 0
    correct = 0
    total = 0
    nb_batches = 0
    # No gradient is needed for evaluation: disabling autograd avoids building
    # a computation graph for every test batch.
    with torch.no_grad():
        for inputs, targets in testloader:
            net.eval()
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Forward pass of the Alrao model
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            # Update loss
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            nb_batches += 1

    if nb_batches == 0:
        raise ValueError("testloader yielded no batch: nothing to evaluate")

    print('\tLossTest: %.4f\tAccTest: %.3f' % (test_loss/nb_batches, 100.*correct/total))
    print(("Posterior : "+"{:.1e}, " * nb_classifiers).format(*net.posterior()))

    return test_loss / nb_batches, correct / total

In [10]:
# Alternate one training epoch and one evaluation on the test set.
nb_epochs = 5
for epoch in range(nb_epochs):
    train(epoch)
    test(epoch)

Epoch 0: : 50016it [01:07, 742.92it/s, LossTrain=1.4900, AccTrain=45.114, PostSw=|   █      |]

Classifier 0	 LossTrain:2.173622	AccTrain:0.1930
Classifier 1	 LossTrain:2.163998	AccTrain:0.2271
Classifier 2	 LossTrain:1.804787	AccTrain:0.3675
Classifier 3	 LossTrain:1.492775	AccTrain:0.4513
Classifier 4	 LossTrain:1.494896	AccTrain:0.4512
Classifier 5	 LossTrain:1.563066	AccTrain:0.4236
Classifier 6	 LossTrain:2.965732	AccTrain:0.3495
Classifier 7	 LossTrain:15.103052	AccTrain:0.3258
Classifier 8	 LossTrain:71.228097	AccTrain:0.3261
Classifier 9	 LossTrain:336.374353	AccTrain:0.3198



Epoch 1:   0%|          | 0/50000 

	LossTest: 1.9365	AccTest: 40.080
Posterior : 1.0e-04, 1.3e-04, 3.1e-04, 9.5e-01, 5.1e-02, 1.0e-03, 9.2e-05, 6.3e-05, 6.3e-05, 6.3e-05, 


Epoch 1: : 50016it [01:07, 736.07it/s, LossTrain=1.0356, AccTrain=63.354, PostSw=|   █      |]

Classifier 0	 LossTrain:2.044145	AccTrain:0.2570
Classifier 1	 LossTrain:1.714333	AccTrain:0.5242
Classifier 2	 LossTrain:1.236024	AccTrain:0.6049
Classifier 3	 LossTrain:1.035507	AccTrain:0.6336
Classifier 4	 LossTrain:1.046654	AccTrain:0.6290
Classifier 5	 LossTrain:1.099860	AccTrain:0.6112
Classifier 6	 LossTrain:1.919717	AccTrain:0.5359
Classifier 7	 LossTrain:8.849318	AccTrain:0.5079
Classifier 8	 LossTrain:41.872616	AccTrain:0.5052
Classifier 9	 LossTrain:193.838003	AccTrain:0.5055



Epoch 2:   0%|          | 0/50000 

	LossTest: 0.9920	AccTest: 65.720
Posterior : 4.3e-05, 5.7e-05, 2.0e-04, 1.0e+00, 2.5e-03, 5.4e-04, 9.2e-05, 5.1e-05, 3.1e-05, 3.1e-05, 


Epoch 2: : 50016it [01:07, 741.45it/s, LossTrain=0.8494, AccTrain=70.306, PostSw=|   █      |]

Classifier 0	 LossTrain:1.879200	AccTrain:0.3608
Classifier 1	 LossTrain:1.381986	AccTrain:0.6621
Classifier 2	 LossTrain:0.972673	AccTrain:0.6900
Classifier 3	 LossTrain:0.849395	AccTrain:0.7031
Classifier 4	 LossTrain:0.861581	AccTrain:0.6990
Classifier 5	 LossTrain:0.912557	AccTrain:0.6842
Classifier 6	 LossTrain:1.632264	AccTrain:0.6169
Classifier 7	 LossTrain:7.417791	AccTrain:0.5905
Classifier 8	 LossTrain:35.361829	AccTrain:0.5900
Classifier 9	 LossTrain:164.323780	AccTrain:0.5884



Epoch 3:   0%|          | 0/50000 

	LossTest: 1.0819	AccTest: 64.330
Posterior : 4.2e-05, 7.9e-05, 3.5e-04, 1.0e+00, 1.8e-03, 2.8e-04, 2.2e-05, 2.0e-05, 2.0e-05, 2.0e-05, 


Epoch 3: : 50016it [01:07, 744.65it/s, LossTrain=0.7438, AccTrain=74.348, PostSw=|   █      |]

Classifier 0	 LossTrain:1.697140	AccTrain:0.5044
Classifier 1	 LossTrain:1.150226	AccTrain:0.7131
Classifier 2	 LossTrain:0.820588	AccTrain:0.7345
Classifier 3	 LossTrain:0.743760	AccTrain:0.7434
Classifier 4	 LossTrain:0.755788	AccTrain:0.7387
Classifier 5	 LossTrain:0.803319	AccTrain:0.7257
Classifier 6	 LossTrain:1.449954	AccTrain:0.6628
Classifier 7	 LossTrain:6.610083	AccTrain:0.6379
Classifier 8	 LossTrain:31.033267	AccTrain:0.6384
Classifier 9	 LossTrain:145.715637	AccTrain:0.6370



Epoch 4:   0%|          | 0/50000 

	LossTest: 0.9804	AccTest: 67.940
Posterior : 2.3e-05, 4.6e-05, 2.5e-04, 1.0e+00, 1.4e-03, 4.0e-04, 7.2e-05, 2.1e-05, 1.5e-05, 1.5e-05, 


Epoch 4: : 50016it [01:07, 741.40it/s, LossTrain=0.6640, AccTrain=77.040, PostSw=|   █      |]

Classifier 0	 LossTrain:1.571790	AccTrain:0.6018
Classifier 1	 LossTrain:0.979954	AccTrain:0.7528
Classifier 2	 LossTrain:0.716525	AccTrain:0.7655
Classifier 3	 LossTrain:0.664020	AccTrain:0.7704
Classifier 4	 LossTrain:0.674792	AccTrain:0.7672
Classifier 5	 LossTrain:0.720209	AccTrain:0.7528
Classifier 6	 LossTrain:1.314522	AccTrain:0.6977
Classifier 7	 LossTrain:6.121748	AccTrain:0.6753
Classifier 8	 LossTrain:28.275020	AccTrain:0.6742
Classifier 9	 LossTrain:131.609205	AccTrain:0.6744





	LossTest: 0.8283	AccTest: 72.440
Posterior : 2.1e-05, 4.7e-05, 2.8e-04, 1.0e+00, 1.2e-03, 2.7e-04, 4.3e-05, 1.2e-05, 1.2e-05, 1.2e-05, 
