In [1]:
## If using colab run the following cell, otherwise do not run it
from google.colab import drive

drive.mount('/content/drive')
%cd /content/drive/MyDrive/Machine-Learning-Optimization_working

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Machine-Learning-Optimization_working


In [2]:
!pip install barbar

#@title Import and utilities 

from Frank_Wolfe.utils.utils import *
from Frank_Wolfe.DFW import *
from Frank_Wolfe.architectures import *
from Frank_Wolfe.MultiClassHingeLoss import *
from Frank_Wolfe.MultiClassHingeLoss import set_smoothing_enabled
from barbar import Bar
import torch
import torch.nn as nn
import torchvision
import os
import time
import pickle
import sys
from torch.optim.lr_scheduler import StepLR


device = "cpu"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# The following are flag useful for saving or loading figures/stats

save_stats = True
save_figs = True
load = False

In [4]:
#@title Choose dataset name and architecture of the network 

# Select the dataset and the architecture

dataset_name = 'CIFAR100' #@param ['CIFAR10', 'CIFAR100']
model_type = 'DenseNet' #@param ['DenseNet', 'WideResNet', 'GoogLeNet']

# load the model
if model_type == 'GoogLeNet':
    model = GoogleNet(num_class=10 if dataset_name == 'CIFAR10' else 100)
elif model_type == 'DenseNet':
    model = torchvision.models.densenet121(pretrained=False)
elif model_type == 'WideResNet':
    model =  WideResNet(num_classes=10 if dataset_name == 'CIFAR10' else 100)
else:
    raise ValueError("Please, select an available architecture")


datasetDict = setDatasetAttributes(dataset_name) # dictionary useful to normalize the images
trainTransformDict, testTransformDict = setTrainAndTest(dataset_name) # dict useful for the transform operation
root = f"{dataset_name}-dataset"

# prepare train and test datasets 
trainData = datasetDict['datasetDict'](root=root, train=True, download=True,
                                            transform=trainTransformDict[dataset_name])
testData = datasetDict['datasetDict'](root=root, train=False,
                                        transform=testTransformDict[dataset_name])
# move the model to GPU
model = model.to(device="cuda:0")

Files already downloaded and verified


In [5]:
#@title Choose optimizer and parameters 

# Choice of the optimizer and the parameters, the parameters used in our experiments can be found
# both in the report and at the end of the notebook

optimizer_name = "DFW multistep" #@param  ['DFW', 'Adam', 'SGD with scheduler', 'DFW multistep']
momentum = 0.9 #@param {type:"number"}
lr = 0.001 #@param {type:"number"}
eta =   0.1 #@param {type:"number"}
beta_1 = 0.9 #@param {type:"number"}
beta_2 = 0.999 #@param {type:"number"}
weight_decay = 0.00 #@param {type:"number"}
asymptotic_prox_steps_num = 2 #@param {type: "number"}

if optimizer_name != "DFW multistep":
    asymptotic_prox_steps_num = 1

# define the optimizer

if optimizer_name == "DFW" or optimizer_name == 'DFW multistep':
    optimizer = DFW(params=model.parameters(), eta=eta, momentum=momentum,
                  weight_decay = weight_decay, prox_steps=asymptotic_prox_steps_num)
    
    assert asymptotic_prox_steps_num >0
    assert eta > 0
    assert 0 <= momentum <= 1
elif optimizer_name == "SGD with scheduler":
    optimizer = torch.optim.SGD(params=model.parameters(), lr=lr,
                              momentum=momentum, weight_decay=weight_decay)
    scheduler = StepLR(optimizer, step_size=20, gamma=0.5)
    assert 0 <= momentum <= 1
elif optimizer_name == "Adam":
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, 
                               betas=(beta_1, beta_2), weight_decay=weight_decay)
    
if optimizer_name == "DFW" or optimizer_name == "DFW multistep":
    loss_criterion = MultiClassHingeLoss().to(device="cuda:0")

else: 
    loss_criterion = nn.CrossEntropyLoss().to(device="cuda:0")

i am in


In [6]:
#@title Train the network  

# we will append our results on these lists
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []
epochs_times = []

# parameters for the training phase
nepochs = 50 #@param {type:"integer"}
batch_size = 128  #@param {type:"integer"}
verbose = 0 #@param [0, 1]
smooth = False

if(dataset_name == 'CIFAR100' and (optimizer_name == 'DFW' or optimizer_name=='DFW multistep')):
    smooth = True # for the smoothing of the loss in case we use CIFAR100

# Loaders
trainLoader = torch.utils.data.DataLoader(trainData, batch_size=batch_size, shuffle=True,
                                      pin_memory=torch.cuda.is_available(), num_workers=2)
testLoader = torch.utils.data.DataLoader(testData, batch_size=batch_size, shuffle=False,
                                      pin_memory=torch.cuda.is_available(), num_workers=2)

# initialize necessary metrics objects
train_loss, train_accuracy = AverageMeter(), AverageMeter()
test_loss, test_accuracy = AverageMeter(), AverageMeter()

# function to reset metrics
def reset_metrics():
    train_loss.reset()
    train_accuracy.reset()
    test_loss.reset()
    test_accuracy.reset()

@torch.no_grad()
def evaluate_model(data="train"):
    if data == "train":
        loader = trainLoader
        mean_loss, mean_accuracy = train_loss, train_accuracy
    elif data == "test":
        loader = testLoader
        mean_loss, mean_accuracy = test_loss, test_accuracy
    
    sys.stdout.write(f"Evaluation of {data} data:\n")
    
    # iteration over the dataset
    for x_input, y_target in Bar(loader):
        x_input, y_target = x_input.to(device="cuda:0"), y_target.to(device="cuda:0") # we move to GPU
        output = model.eval()(x_input)
        loss = loss_criterion(output, y_target)
        
        # update metrics
        mean_loss(loss.item(), len(y_target)) 
        mean_accuracy(Utilities.categorical_accuracy(y_true=y_target, output=output), len(y_target))

    
# Training
for epoch in range(nepochs + 1):
    
    start = time.time() # start to time
    reset_metrics() # reset the metrics from the previous epoch
    sys.stdout.write(f"\n\nEpoch {epoch}/{nepochs}\n")
    
    if epoch == 0:
        # First pass through the network to evaluate the model once to get the metrics
        evaluate_model(data='train')
    else:
        if epoch > int(0.2 * nepochs) and optimizer_name == "DFW multistep" and asymptotic_prox_steps_num >1:
            
            # if we already finished the first 20% of the epochs we continue with single steps
            optimizer.prox_steps = 1
            
        sys.stdout.write(f"Training:\n")
        for x_input, y_target in Bar(trainLoader):
            x_input, y_target = x_input.to(device="cuda:0"), y_target.to(device="cuda:0")
            optimizer.zero_grad()  # Zero the gradient buffers
            output = model.train()(x_input) # compute the output
            if smooth == True:
                with set_smoothing_enabled(True):
                    loss = loss_criterion(output, y_target)
            else:
                loss = loss_criterion(output, y_target) # compute the loss

            loss.backward()  # Backpropagation
            if optimizer_name == "DFW" or optimizer_name == 'DFW multistep':
                optimizer.step(lambda: float(loss), model, x_input, y_target, smooth)
            else:
                optimizer.step() 
            train_loss(loss.item(), len(y_target))
            train_accuracy(Utilities.categorical_accuracy(y_true=y_target, output=output), len(y_target))

    if optimizer_name == "SGD with scheduler":
        scheduler.step()
    evaluate_model(data='test')
    sys.stdout.write(f"\n Finished epoch {epoch}/{nepochs}: Train Loss {train_loss.result()} | Test Loss {test_loss.result()} | Train Acc {train_accuracy.result()} | Test Acc {test_accuracy.result()}\n")
    
    train_losses.append(train_loss.result())
    train_accuracies.append(train_accuracy.result())
    test_losses.append(test_loss.result())
    test_accuracies.append(test_accuracy.result())


    elapsed_time = time.time()-start
    sys.stdout.write(f"Time elapsed for the current epoch {elapsed_time}")
    epochs_times.append(elapsed_time)

in


Epoch 0/50
Evaluation of train data:
Evaluation of test data:

 Finished epoch 0/50: Train Loss 1.9938795345306397 | Test Loss 1.8983999757766723 | Train Acc 0.0 | Test Acc 0.0
Time elapsed for the current epoch 16.54431962966919

Epoch 1/50
Training:
Evaluation of test data:

 Finished epoch 1/50: Train Loss 0.7150843365287781 | Test Loss 1.4758526119232178 | Train Acc 0.01106 | Test Acc 0.01
Time elapsed for the current epoch 115.50006246566772

Epoch 2/50
Training:
Evaluation of test data:

 Finished epoch 2/50: Train Loss 0.6907602533721924 | Test Loss 1.2718890270233154 | Train Acc 0.01048 | Test Acc 0.01
Time elapsed for the current epoch 116.50871229171753

Epoch 3/50
Training:
Evaluation of test data:

 Finished epoch 3/50: Train Loss 0.7046723214912415 | Test Loss 1.1122491161346435 | Train Acc 0.00968 | Test Acc 0.01
Time elapsed for the current epoch 116.02266931533813

Epoch 4/50
Training:
Evaluation of test data:

 Finished epoch 4/50: Train Loss 0.7106945220756531 | 

KeyboardInterrupt: ignored

In [None]:
#@title Save training results and plot

if load:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_' + model_type + '.pkl'
    with open(fname, 'rb') as handle:
        stats_dict = pickle.load(handle)

# define dictionary of the results
results = {'epochs': nepochs, 'train_losses': train_losses, 
           'train_acc': train_accuracies, 'test_losses': test_losses, 
           'test_acc': test_accuracies, 'elapsed_time': epochs_times}
stats_dict = {}
stats_dict.update({optimizer_name: results})

# save everything onto file
if save_stats: 
    output_folder = os.path.join(os.getcwd(), 'results')  # set the folder
    os.makedirs(output_folder, exist_ok=True)
    fname = output_folder + '/stats_dict_' + model_type + '.pkl'
    with open(fname, 'wb') as handle:
        pickle.dump(stats_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Parameters used in the report

In order to reproduce our results, the following set of parameters should be used.\
If not specified, the remaining parameters (e.g. $\epsilon$ for Adam and Adagrad) are set to their default values.

Deep Frank Wolfe:\
$η = 0.1$, $μ = 0.9$, $w_d = 0$

Stochastic Gradient Descent with scheduler:\
$\gamma = 0.01$, $\mu = 0.9$, $w_d = 0$

Adam:\
$\gamma = 0.001$, $\mu = 0.9$, $\beta_1 = 0.9$, $\beta_2 = 0.999$

Deep Frank Wolfe multistep:\
$η = 0.1$, $μ = 0.9$, $w_d = 0$,  n_steps $= 2$