# Setup instructions

We provide a ready-to-use notebook to reproduce our results for the Deep Frank-Wolfe algorithm. The cells below move the model and the data to `cuda:0`, so a GPU runtime is required; we strongly recommend running the notebook on Google Colab with a GPU runtime. To run this notebook on Google Colab, please upload the following files from the **Frank_Wolfe** directory:

1. architectures.py
2. MultiClassHingeLoss.py
3. DFW.py
4. utils.py

In [None]:
#@title Import and utilities 

from utils import *
from DFW import *
from architectures import *
from MultiClassHingeLoss import *
!pip install barbar
from barbar import Bar
import torch
import torch.nn as nn
import torchvision
import os
import time
import pickle
import sys
from torch.optim.lr_scheduler import StepLR
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# NOTE(review): `device` is not read by any of the cells visible in this file;
# they move the model, the loss and the batches to the hard-coded "cuda:0".
device = "cpu"

In [None]:
#@title Choose dataset name and architecture of the network

#@markdown Please run this cell before choosing a new optimizer, so that the model is reinitialized

# Select the dataset and the architecture

dataset_name = 'CIFAR10' #@param ['CIFAR10', 'CIFAR100']
model_type = 'DenseNet' #@param ['DenseNet', 'WideResNet', 'GoogLeNet']

# Number of target classes, shared by every architecture so that the
# classifier head always matches the selected dataset.
num_classes = 10 if dataset_name == "CIFAR10" else 100

# build a freshly initialised model
if model_type == 'GoogLeNet':
    model = GoogleNet(num_class=num_classes)
elif model_type == 'DenseNet':
    # torchvision's DenseNet defaults to a 1000-way (ImageNet) classifier
    # unless num_classes is given, so it is passed explicitly here, exactly
    # like in the other two branches.
    model = torchvision.models.densenet121(pretrained=False,
                                           num_classes=num_classes)
elif model_type == 'WideResNet':
    model = WideResNet(num_classes=num_classes)
else:
    raise ValueError("Please, select an available architecture")

# setting dataset attributes, dictionary useful to normalize the images
datasetDict = setDatasetAttributes(dataset_name)

# transform operations, one dictionary per split, indexed by dataset name
trainTransformDict, testTransformDict = setTrainAndTest(dataset_name)
root = f"{dataset_name}-dataset"

# prepare train and test datasets; only the train split requests the
# download, the test split is read from the same root folder
trainData = datasetDict['datasetDict'](root=root, train=True, download=True,
                                       transform=trainTransformDict[dataset_name])
testData = datasetDict['datasetDict'](root=root, train=False,
                                      transform=testTransformDict[dataset_name])
# move the model to GPU
model = model.to(device="cuda:0")

In [None]:
#@title Choose optimizer and parameters

# Choice of the optimizer and the parameters.

# The parameters used in our experiments can be found
# in the report and at the end of the notebook.

optimizer_name = "DFW multistep" #@param  ['DFW', 'Adam', 'SGD with scheduler', 'DFW multistep']
eta =   0.1 #@param {type:"number"}
momentum = 0.9 #@param {type:"number"}
lr = 0.001 #@param {type:"number"}
beta_1 = 0.9 #@param {type:"number"}
beta_2 = 0.999 #@param {type:"number"}
initial_prox_steps = 2 #@param {type: "number"}

# only the multistep variant starts with more than one proximal step
if optimizer_name != "DFW multistep":
    initial_prox_steps = 1

uses_dfw = optimizer_name in ("DFW", "DFW multistep")

# Validate the hyper-parameters BEFORE building the optimizer, and with
# explicit exceptions rather than asserts (asserts are stripped under -O).
if uses_dfw:
    if not initial_prox_steps > 0:
        raise ValueError("initial_prox_steps must be strictly positive")
    if not eta > 0:
        raise ValueError("eta must be strictly positive")
if uses_dfw or optimizer_name == "SGD with scheduler":
    if not 0 <= momentum <= 1:
        raise ValueError("momentum must lie in [0, 1]")

# define the optimizer

if uses_dfw:
    optimizer = DFW(params=model.parameters(), eta=eta, momentum=momentum,
                    prox_steps=initial_prox_steps)

elif optimizer_name == "SGD with scheduler":
    optimizer = torch.optim.SGD(params=model.parameters(), lr=lr,
                                momentum=momentum)
    # multiply the learning rate by 0.2 every 20 scheduler steps
    scheduler = StepLR(optimizer, step_size=20, gamma=0.2)

elif optimizer_name == "Adam":
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr,
                                 betas=(beta_1, beta_2))

else:
    # without this branch an unknown name would silently leave `optimizer`
    # undefined (or pointing at the one built by a previous run of the cell)
    raise ValueError(f"Please, select an available optimizer (got {optimizer_name!r})")

if uses_dfw:
    # we consider a convex and piece-wise linear objective for DFW
    loss_criterion = MultiClassHingeLoss().to(device="cuda:0")
    # flag read by the training cell, which applies the smoothing of the
    # loss only when the dataset is CIFAR100
    smoothing = True
else:
    # cross entropy otherwise
    loss_criterion = nn.CrossEntropyLoss().to(device="cuda:0")
    smoothing = False

In [None]:
#@title Train the network

# per-epoch history of the training statistics
train_losses = []
test_losses = []
train_acc = []
test_acc = []

# parameters for the training phase
nepochs =  5 #@param {type:"integer"}
batch_size = 128  #@param {type:"integer"}

# Loaders: the training split is reshuffled, the test split keeps its order
trainLoader = torch.utils.data.DataLoader(
    trainData,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=torch.cuda.is_available(),
    num_workers=2,
)
testLoader = torch.utils.data.DataLoader(
    testData,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=torch.cuda.is_available(),
    num_workers=2,
)

# metric accumulators: they are called with (value, batch size) pairs,
# read through .result() and cleared through .reset() further below
train_loss = AverageMeter()
train_accuracy = AverageMeter()
test_loss = AverageMeter()
test_accuracy = AverageMeter()

# clear the four global metric accumulators
def reset_metrics():
    """Reset the train/test loss and accuracy accumulators, in that order."""
    for meter in (train_loss, train_accuracy, test_loss, test_accuracy):
        meter.reset()

@torch.no_grad()
def evaluate_model(data="train"):
    """Evaluate the global ``model`` on one split and accumulate its metrics.

    Every batch of the selected loader is moved to ``cuda:0`` and passed
    through the model in evaluation mode, with gradient tracking disabled by
    the decorator. The batch loss and accuracy are fed to the matching pair
    of global metric objects; the metrics are NOT reset here.

    Args:
        data: ``"train"`` to iterate ``trainLoader`` and update
            ``train_loss`` / ``train_accuracy``; ``"test"`` to iterate
            ``testLoader`` and update ``test_loss`` / ``test_accuracy``.

    Raises:
        ValueError: if ``data`` is neither ``"train"`` nor ``"test"``.
    """
    if data == "train":
        loader = trainLoader
        mean_loss, mean_accuracy = train_loss, train_accuracy
    elif data == "test":
        loader = testLoader
        mean_loss, mean_accuracy = test_loss, test_accuracy
    else:
        # fail early with a clear message instead of an UnboundLocalError
        # on `loader` further down
        raise ValueError(f"data must be 'train' or 'test', got {data!r}")

    sys.stdout.write(f"Evaluation of {data} data:\n")

    # switch to evaluation mode once, not at every batch
    net = model.eval()

    # iteration over the dataset
    for x_input, y_target in Bar(loader):
        x_input, y_target = x_input.to(device="cuda:0"), y_target.to(device="cuda:0") # we move to GPU
        output = net(x_input)
        loss = loss_criterion(output, y_target)

        # update metrics, weighting each batch by its number of samples
        mean_loss(loss.item(), len(y_target))
        mean_accuracy(Utilities.categorical_accuracy(y_true=y_target, output=output), len(y_target))

    
# Training
# Epoch 0 only evaluates the untrained model; epochs 1..nepochs train it.
for epoch in range(nepochs + 1):

    start = time.time() # start to time
    reset_metrics() # reset the metrics from the previous epoch
    sys.stdout.write(f"\n\nEpoch {epoch}/{nepochs}\n")

    if epoch == 0:
        # Evaluate the model once to get the initial training metrics
        evaluate_model(data='train')
    else:
        if epoch > int(0.2 * nepochs) and optimizer_name == "DFW multistep" and initial_prox_steps > 1:

            # past the first int(0.2 * nepochs) epochs, continue with single steps
            optimizer.prox_steps = 1

        sys.stdout.write("Training:\n")
        # training mode is set once per epoch: evaluate_model() leaves the
        # model in evaluation mode at the end of the previous epoch
        model.train()
        for x_input, y_target in Bar(trainLoader):
            x_input, y_target = x_input.to(device="cuda:0"), y_target.to(device="cuda:0")
            optimizer.zero_grad()  # Zero the gradient buffers
            output = model(x_input) # compute the output
            if dataset_name == "CIFAR100" and smoothing:
                # smoothing of the loss for DFW
                with set_smoothing_enabled(True):
                    loss = loss_criterion(output, y_target) # compute the loss
            else:
                loss = loss_criterion(output, y_target) # without smoothing
            loss.backward()  # Backpropagation
            if optimizer_name in ("DFW", "DFW multistep"):
                # the DFW step is given a closure returning the loss value
                optimizer.step(lambda: float(loss))
            else:
                optimizer.step()
            train_loss(loss.item(), len(y_target))
            train_accuracy(Utilities.categorical_accuracy(y_true=y_target, output=output), len(y_target))

        # Advance the learning-rate schedule only after an epoch that really
        # trained: stepping it at epoch 0 (no optimizer step yet) would shift
        # the decay one epoch early and trigger PyTorch's ordering warning.
        if optimizer_name == "SGD with scheduler":
            scheduler.step()

    # evaluate the model on the test set
    evaluate_model(data='test')
    sys.stdout.write(f"\n Finished epoch {epoch}/{nepochs}: Train Loss {train_loss.result()} | Test Loss {test_loss.result()} | Train Acc {train_accuracy.result()} | Test Acc {test_accuracy.result()}\n")

    # collect training statistics of the current epoch
    train_losses.append(train_loss.result())
    test_losses.append(test_loss.result())
    train_acc.append(train_accuracy.result())
    test_acc.append(test_accuracy.result())
    # overwritten at every epoch: after the loop it holds the duration of
    # the last epoch only
    elapsed_time = time.time() - start

In [None]:
#@title Save training statistics from the previous cell

results = {'epochs': nepochs, 'train_losses': train_losses,
           'train_acc': train_acc, 'test_losses': test_losses,
           'test_acc': test_acc, 'elapsed_time': elapsed_time}

# Key under which the statistics are stored and tag used in the file name.
# They live in their own variables so that `optimizer_name` keeps the value
# selected in the form: the training cell compares it against
# "DFW multistep" / "SGD with scheduler", and overwriting it here would make
# a later re-run of that cell take the wrong branches.
if optimizer_name == "SGD with scheduler":
    stats_key = file_tag = "SGD"
elif optimizer_name == "DFW multistep":
    stats_key = "DFW" # just for compatibility in the dictionary to be saved
    file_tag = "DFW_multistep"
else:
    stats_key = file_tag = optimizer_name

stats_dict = {stats_key: results}

# <cwd>/Frank_Wolfe/results/<dataset>/<architecture>/stats_dict_<tag>.pkl
output_folder = os.path.join(os.getcwd(), 'Frank_Wolfe', 'results', dataset_name, model_type)  # set the folder
os.makedirs(output_folder, exist_ok=True)
fname = os.path.join(output_folder, 'stats_dict_' + file_tag + '.pkl')
with open(fname, 'wb') as handle:
    pickle.dump(stats_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#@title Plot only latest training trends obtained in the previous cell

# One row, two panels: test accuracy (left) and training loss (right)
fig, ax = plt.subplots(1, 2, figsize=(13.4, 4.8), squeeze=False)
fig.tight_layout(pad=7.)
fig.subplots_adjust(top=0.9, left=0.1, right=0.9, bottom=0.12)

acc_axis, loss_axis = ax[0, 0], ax[0, 1]
epoch_ticks = np.arange(nepochs + 1)  # epoch 0 is the pre-training evaluation
legend_labels = ["{}".format(optimizer_name)]

# left panel: test accuracy, with a tighter y-range for CIFAR100
acc_axis.plot(epoch_ticks, test_acc)
if dataset_name == "CIFAR10":
    acc_axis.set_ylim([0, 1])
else:
    acc_axis.set_ylim([0, 0.8])
acc_axis.set_xlabel('Epoch', fontsize='x-large')
acc_axis.set_ylabel('Test accuracy', fontsize='xx-large')
acc_axis.legend(legend_labels)

# right panel: training loss
loss_axis.plot(epoch_ticks, train_losses)
loss_axis.set_xlabel('Epoch', fontsize='x-large')
loss_axis.set_ylabel('Training loss', fontsize='xx-large')
loss_axis.legend(legend_labels)

# Produce complete training plots

### NOTE: here you can select the training results collected above to produce plots similar to the ones shown in the report. Please select the dataset, the architecture and the optimizers of your choice. For the plotting function called in the cell below to work properly, make sure that the saved dictionaries are in the corresponding results folder.

### Of course, you can select as many optimizers as you want for the same plot. We remark that the training loss for SGD and Adam is the cross-entropy loss, while it is the multi-class Hinge loss for DFW algorithms.

In [None]:
#@title Plot results presented in the report

# NOTE: this cell reassigns dataset_name and model_type
dataset_name = 'CIFAR10' #@param ['CIFAR10', 'CIFAR100']
model_type = 'DenseNet' #@param ['DenseNet', 'WideResNet', 'GoogLeNet']

show_Adam = False #@param {type: "boolean"}
show_SGD = False #@param {type: "boolean"}
show_DFW = True #@param {type: "boolean"}
show_DFW_multistep = True #@param {type: "boolean"}

# keep the optimizers whose checkbox is ticked, always in the same order
list_optimizers = [
    name
    for enabled, name in (
        (show_Adam, "Adam"),
        (show_SGD, "SGD"),
        (show_DFW, "DFW"),
        (show_DFW_multistep, "DFW_multistep"),
    )
    if enabled
]

# For the function below to work properly, the dictionaries of the selected
# optimizers should be stored in the results folder: make sure that you saved
# them with the cell immediately below the training one.
plot_stats(dataset_name, model_type, list_optimizers)

# Hyper-parameters used

In order to reproduce our results (i.e. the training trends shown in the report), the following set of parameters should be used.\
If not specified otherwise, other parameters (e.g. $\epsilon$ for numerical stability) are set to their default values.


$\text{Deep Frank Wolfe (single step and multistep)}$:
```python
eta = 0.1  # proximal coefficient
momentum = 0.9  # momentum parameter
optimizer = DFW(model.parameters(), eta=eta, 
            momentum=momentum, prox_steps=2) # or prox_steps=1
```

$\text{Stochastic Gradient Descent with scheduler}$:
```python
lr = 0.1  # learning rate
momentum = 0.9  # momentum parameter
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr,
                              momentum=momentum)
scheduler = StepLR(optimizer, step_size=20, gamma=0.2)  # define scheduler
```

$\text{Adam}$:
```python
lr = 0.001 # learning rate
beta_1 = 0.9
beta_2 = 0.999
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, 
                               betas=(beta_1, beta_2))
```