# Alrao (All learning rates at once) : a tutorial

We show in this notebook how to use Alrao in practice.

### Imports

In [1]:
from __future__ import print_function
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import os
import argparse
import time
from tqdm import tqdm
import numpy as np

from alrao import AlraoModel
from alrao import SGDAlrao, AdamAlrao
from alrao import lr_sampler_generic, generator_randomlr_neurons, generator_randomlr_weights

# CUDA
# Record once whether a CUDA device is available; later cells test this flag
# before moving the model and the mini-batches to the GPU.
use_cuda = torch.cuda.is_available()

### Data Loading
We use the CIFAR10 dataset. We also use some data augmentation.

In [2]:
# Mini-batch size of the training loader (the test loader uses a fixed size of 100).
batch_size = 32

# Per-channel (mean, std) constants handed to transforms.Normalize for both splits.
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop and horizontal flip (data augmentation),
# then conversion to a tensor and normalization.
train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: no augmentation, only tensor conversion and normalization.
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_test = transforms.Compose(test_steps)

# Training split, shuffled at every epoch.
trainset = torchvision.datasets.CIFAR10(
    root='./datasets',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# Test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./datasets',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=100,
    shuffle=False,
    num_workers=2,
)

Files already downloaded and verified
Files already downloaded and verified


### Defining the preclassifier model
We define a pre-classifier model. This model can be defined exactly as any usual model. Only two things are specific with alrao : 
* First, there is no classifier. The classifier layer will be added later
* The model needs to have a `linearinputdim` attribute, which is the dimension of the pre-classifier's output

In [3]:
class VGG(nn.Module): # identical to models.VGG
    """VGG-style convolutional feature extractor without a classification head.

    The network is described by ``cfg``, a list whose entries are either an
    integer (a 3x3 convolution with that many output channels, followed by
    batch normalization and ReLU) or the string ``'M'`` (a 2x2 max-pooling).
    The forward pass returns the flattened feature maps; no classifier layer
    is applied.

    Attributes:
        features: ``nn.Sequential`` holding the layers built from ``cfg``.
        linearinputdim: declared size of the flattened output (fixed to 512).
    """

    def __init__(self, cfg):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg)
        # The pre-classifier has to announce the dimension of its output;
        # this attribute is read when the Alrao model is assembled.
        self.linearinputdim = 512

    def forward(self, x):
        """Return the feature maps of ``x`` flattened to (batch, -1)."""
        feature_maps = self.features(x)
        # No classifier layer here: the flattened features are the output.
        return feature_maps.view(feature_maps.size(0), -1)

    def _make_layers(self, cfg):
        """Translate ``cfg`` into an ``nn.Sequential`` of layers."""
        modules = []
        channels = 3  # the first convolution takes 3 input channels
        for entry in cfg:
            if entry != 'M':
                # Convolution block: conv -> batch norm -> ReLU.
                modules.append(nn.Conv2d(channels, entry, kernel_size=3, padding=1))
                modules.append(nn.BatchNorm2d(entry))
                modules.append(nn.ReLU(inplace=True))
                channels = entry
                continue
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)

# Layer specification of the pre-classifier: integers are the output channels
# of a convolution block, 'M' stands for a max-pooling layer.
vgg_cfg = [
    64, 64, 'M',
    128, 128, 'M',
    256, 256, 256, 256, 'M',
    512, 512, 512, 512, 'M',
    512, 512, 512, 512, 'M',
]
preclassifier = VGG(vgg_cfg)

### Define the classifier
Here, we define our own classifier class. In practice, we do not need to redefine it, it can be found in `alrao.custom_layers.LinearClassifier`.
We redefine it here to show how any classifier (with a log-softmax output) can be used.

In [4]:
class Classifier(nn.Module): # identical to alrao.custom_layers.LinearClassifier
    """Linear layer followed by a log-softmax over dimension 1.

    Args:
        in_features: size of each input feature vector.
        n_classes: number of output classes.
    """

    def __init__(self, in_features, n_classes):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(in_features, n_classes)

    def forward(self, x):
        """Return the log-probabilities of each class for the batch ``x``."""
        logits = self.fc(x)
        return nn.functional.log_softmax(logits, dim=1)
    
# Negative log-likelihood loss; it is applied to log-probabilities, which is
# what the log-softmax output of the classifier defined above provides.
criterion = nn.NLLLoss()

### Build the modified architecture for Alrao
We define the new architecture, with the parallel classifiers.
<img src="img/newalrao.png" width="400">

Here there are 10 categories, and we decide to use 10 classifiers.

In [5]:
# nb_categories is the number of classes to predict;
# nb_classifiers is the number of classifiers averaged by Alrao.
nb_categories = 10
nb_classifiers = 10

# Wrap the pre-classifier together with nb_classifiers copies of the classifier class.
net = AlraoModel(
    preclassifier,
    nb_classifiers,
    Classifier,
    preclassifier.linearinputdim,
    nb_categories,
)
if use_cuda:
    net.cuda()

### Sampling the learning rates
We choose an interval (`minLR`, `maxLR`) in which the learning rates are chosen.
For the pre-classifier, the learning rates are sampled from the log-uniform distribution $\log-U(\cdot ; \eta_{\min}, \eta_{\max})$ :
namely, if $\eta \sim \log-U(\cdot ; \eta_{\min},
\eta_{\max})$, then $\log \eta$ is uniformly distributed between $\log
\eta_{\min}$ and $\log \eta_{\max}$.
Its
density function is
\begin{equation}
  \label{eq:logunif}
  \log-U(\eta; \eta_{\min}, \eta_{\max}) = \frac{1_{\eta_{\min} \leq \eta \leq \eta_{\max}}}{\log \eta_{\max} - \log \eta_{\min}}\times\frac{1}{\eta}
\end{equation}

The learning rates of the classifiers are spread log-uniformly over the interval:
$\log \eta_{j} = \log \eta_{\min} +
\frac{j-1}{N_{\mathrm{cl}}-1}\log(\eta_{\max}/ \eta_{\min})$

In [6]:
# We define the interval in which the learning rates are sampled
minlr = 10 ** (-5)
maxlr = 10 ** 1

# We spread the classifiers learning rates log-uniformly on the interval:
# the logarithms of the rates are evenly spaced between log(minlr) and
# log(maxlr), both endpoints included. np.linspace is used instead of a
# hand-written k / (nb_classifiers - 1) ratio so that a single classifier
# (nb_classifiers == 1) yields [minlr] instead of a division by zero, and so
# that the spacing never depends on integer-division semantics.
classifiers_lr = list(np.exp(np.linspace(np.log(minlr), np.log(maxlr), nb_classifiers)))

# We define the sampler for the preclassifier's features.
# NOTE(review): both helpers come from the alrao package. Presumably
# lr_sampler_generic builds a sampler over [minlr, maxlr] and
# generator_randomlr_neurons draws one learning rate per neuron of the
# pre-classifier with it -- confirm against the alrao sources.
lr_sampler = lr_sampler_generic(minlr, maxlr)
lr_preclassifier = generator_randomlr_neurons(net.preclassifier, lr_sampler)

### Define the optimizer
We define the Alrao optimizer. This includes : 
* A single (usual) SGD optimizer for each classifier
* A modified SGD optimizer for the pre-classifier, which allows using one learning rate per neuron.
* The switch model averaging method, with its own update procedure.

In [7]:
# Build the Alrao SGD optimizer from two groups: the pre-classifier parameters
# with their learning rates (lr_preclassifier), and the per-classifier
# parameter lists with their learning rates (classifiers_lr).
optimizer = SGDAlrao(net.parameters_preclassifier(),
                     lr_preclassifier,
                     net.classifiers_parameters_list(),
                     classifiers_lr)

### Training procedure
We define the train procedure.

In [8]:
def train(epoch):
    """Run one training epoch of the Alrao model over ``trainloader``.

    Uses the notebook-level ``net``, ``optimizer``, ``criterion``,
    ``trainloader`` and ``use_cuda``. A progress bar shows the running mean
    loss, the running accuracy and the switch posterior; once the epoch is
    over, the per-classifier statistics returned by
    ``net.switch.get_cl_perf()`` are printed.

    Args:
        epoch: index of the epoch, used only to label the progress bar.
    """
    train_loss = 0
    correct = 0
    total = 0
    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=len(trainloader.dataset),
              bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} {postfix}') as pbar:
        pbar.set_description("Epoch %d" % epoch)
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            net.train()
            if use_cuda: inputs, targets = inputs.cuda(), targets.cuda()

            # We update the model averaging weights in the optimizer
            optimizer.update_posterior(net.posterior())
            optimizer.zero_grad()

            # Forward pass of the Alrao model
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            # We compute the gradient of all the model's weights
            loss.backward()

            # We reset all the classifiers gradients, and re-compute them
            # as if each classifier were the only output of the network.
            # The features are detached so that these backward passes do not
            # reach the pre-classifier.
            optimizer.classifiers_zero_grad()
            newx = net.last_x.detach()
            for classifier in net.classifiers():
                loss_classifier = criterion(classifier(newx), targets)
                loss_classifier.backward()

            # Then, we can run an update step of the gradient descent.
            optimizer.step()

            # Finally, we update the model averaging weights
            net.update_switch(targets, catch_up=False)

            # Update loss
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            # Update progression bar. Advance by the actual number of samples
            # in this batch: the last batch can be smaller than batch_size,
            # and advancing by batch_size would overshoot the total.
            pbar.update(targets.size(0))
            postfix = OrderedDict([("LossTrain","{:.4f}".format(train_loss/(batch_idx+1))),
                                   ("AccTrain", "{:.3f}".format(100.*correct/total))])
            postfix["PostSw"] = net.repr_posterior()
            pbar.set_postfix(postfix)

    # Print performance of the classifiers
    cl_perf = net.switch.get_cl_perf()
    for k in range(len(cl_perf)):
        print("Classifier {}\t LossTrain:{:.6f}\tAccTrain:{:.4f}".format(
            k, cl_perf[k][0], cl_perf[k][1]))

In [9]:
def test(epoch):
    """Evaluate the Alrao model on ``testloader``.

    Uses the notebook-level ``net``, ``criterion``, ``testloader``,
    ``nb_classifiers`` and ``use_cuda``. Prints the mean test loss, the test
    accuracy (in percent) and the current posterior over the classifiers.

    Args:
        epoch: index of the epoch; not used by the evaluation itself.

    Returns:
        A tuple ``(mean loss per batch, accuracy as a fraction in [0, 1])``.
    """
    net.eval()
    net.switch.reset_cl_perf()
    test_loss = 0
    correct = 0
    total = 0
    # Evaluation needs no gradients: disabling autograd avoids building the
    # computation graph for every test batch.
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            net.eval()
            if use_cuda: inputs, targets = inputs.cuda(), targets.cuda()

            # Forward pass of the Alrao model
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            # Update loss
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    print('\tLossTest: %.4f\tAccTest: %.3f' % (test_loss/(batch_idx+1), 100.*correct/total))
    print(("Posterior : "+"{:.1e}, " * nb_classifiers).format(*net.posterior()))

    return test_loss / (batch_idx + 1), correct / total

In [10]:
# Alternate one training pass and one evaluation pass for each epoch.
num_epochs = 50
for epoch in range(num_epochs):
    train(epoch)
    test(epoch)

Epoch 0: : 50016it [01:08, 734.15it/s, LossTrain=1.4912, AccTrain=44.960, PostSw=|  █       |]

Classifier 0	 LossTrain:2.337866	AccTrain:0.1637
Classifier 1	 LossTrain:2.177651	AccTrain:0.2126
Classifier 2	 LossTrain:1.491529	AccTrain:0.4495
Classifier 3	 LossTrain:1.593359	AccTrain:0.4239
Classifier 4	 LossTrain:1.511902	AccTrain:0.4416
Classifier 5	 LossTrain:1.584111	AccTrain:0.4145
Classifier 6	 LossTrain:3.459628	AccTrain:0.3357
Classifier 7	 LossTrain:17.862085	AccTrain:0.3163
Classifier 8	 LossTrain:84.200117	AccTrain:0.3188
Classifier 9	 LossTrain:385.173223	AccTrain:0.3209



Epoch 1:   0%|          | 0/50000 

	LossTest: 1.6407	AccTest: 48.350
Posterior : 1.0e-04, 1.2e-04, 9.9e-01, 1.3e-03, 2.8e-03, 6.4e-04, 7.5e-05, 6.2e-05, 6.2e-05, 6.2e-05, 


Epoch 1: : 50016it [01:08, 677.93it/s, LossTrain=1.0398, AccTrain=63.140, PostSw=|  █       |]

Classifier 0	 LossTrain:2.190358	AccTrain:0.2145
Classifier 1	 LossTrain:1.724247	AccTrain:0.4703
Classifier 2	 LossTrain:1.039695	AccTrain:0.6314
Classifier 3	 LossTrain:1.073514	AccTrain:0.6238
Classifier 4	 LossTrain:1.061550	AccTrain:0.6235
Classifier 5	 LossTrain:1.136726	AccTrain:0.6007
Classifier 6	 LossTrain:2.577442	AccTrain:0.5183
Classifier 7	 LossTrain:12.325342	AccTrain:0.5012
Classifier 8	 LossTrain:57.199303	AccTrain:0.4994
Classifier 9	 LossTrain:265.525891	AccTrain:0.5019



Epoch 2:   0%|          | 0/50000 

	LossTest: 0.9826	AccTest: 65.570
Posterior : 4.3e-05, 6.4e-05, 1.0e+00, 1.4e-03, 1.5e-03, 3.1e-04, 5.2e-05, 3.1e-05, 3.0e-05, 3.0e-05, 


Epoch 2: : 50016it [01:07, 738.08it/s, LossTrain=0.8426, AccTrain=70.528, PostSw=|  █       |]

Classifier 0	 LossTrain:1.990107	AccTrain:0.2853
Classifier 1	 LossTrain:1.378167	AccTrain:0.6083
Classifier 2	 LossTrain:0.842532	AccTrain:0.7054
Classifier 3	 LossTrain:0.864112	AccTrain:0.6977
Classifier 4	 LossTrain:0.862863	AccTrain:0.6969
Classifier 5	 LossTrain:0.940143	AccTrain:0.6752
Classifier 6	 LossTrain:2.238621	AccTrain:0.6050
Classifier 7	 LossTrain:10.572438	AccTrain:0.5899
Classifier 8	 LossTrain:48.966449	AccTrain:0.5907
Classifier 9	 LossTrain:230.978295	AccTrain:0.5883



Epoch 3:   0%|          | 0/50000 

	LossTest: 0.8048	AccTest: 71.720
Posterior : 3.6e-05, 6.6e-05, 1.0e+00, 1.3e-03, 9.3e-04, 1.6e-04, 2.1e-05, 2.0e-05, 2.0e-05, 2.0e-05, 


Epoch 3:   3%|▎         | 1408/50000 , LossTrain=0.7753, AccTrain=73.366, PostSw=|  █       |Process Process-13:
Process Process-14:
  File "/home/parietal/lblier/miniconda3/envs/test/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/parietal/lblier/miniconda3/envs/test/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/parietal/lblier/miniconda3/envs/test/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.5/dist-packages/torch/utils/data/dataloader.py", line 96, in _worker_loop
    r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
  File "/home/parietal/lblier/miniconda3/envs/test/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/parietal/lblier/miniconda3/envs/test/

KeyboardInterrupt: 

Epoch 3:   3%|▎         | 1408/50000 , LossTrain=0.7753, AccTrain=73.366, PostSw=|  █       |