In [None]:
from Frank_Wolfe.utils.utils import *
from Frank_Wolfe.SFW import *
from Frank_Wolfe.constraints.constraints import *
import os
import pickle

# Device handle returned by `is_cuda_available`, which is not defined in this notebook
# (presumably supplied by one of the star imports above).
# NOTE(review): looks like this yields a torch device / device string that prefers CUDA
# when present - confirm against Frank_Wolfe.utils. `device` is not referenced again in
# the cells visible here.
device = is_cuda_available()

In [None]:
# Output toggles: `save_stats` gates pickling of the collected statistics,
# `save_figs` is the matching switch for figures.
save_stats, save_figs = True, True

In [None]:
def _conv_bn_relu(in_channels, out_channels, kernel_size, padding=0):
    """Return the [Conv2d, BatchNorm2d, ReLU] triple used throughout the network.

    The layers are returned as a list so callers can splice several triples into
    one flat ``nn.Sequential`` (keeping the same submodule indices as a
    hand-written sequence would have).
    """
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding),
        # Only `num_features` is passed: the second positional argument of
        # BatchNorm2d is `eps`, so it must not receive a channel count.
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]


class _Inception(nn.Module):
    """Inception module: four parallel branches concatenated along the channel axis.

    Args:
        input_channels: channels of the incoming feature map.
        n1x1: output channels of the 1x1 branch.
        n3x3_reduce: channels of the 1x1 reduction in front of the 3x3 conv.
        n3x3: output channels of the 3x3 branch.
        n5x5_reduce: channels of the 1x1 reduction in front of the "5x5" branch.
        n5x5: output channels of the "5x5" branch.
        pool_proj: output channels of the 1x1 projection after max-pooling.

    The output has ``n1x1 + n3x3 + n5x5 + pool_proj`` channels; every branch
    keeps the spatial size of its input.
    """

    def __init__(self, input_channels, n1x1, n3x3_reduce, n3x3, n5x5_reduce, n5x5, pool_proj):
        super().__init__()

        # 1x1conv branch
        self.b1 = nn.Sequential(*_conv_bn_relu(input_channels, n1x1, kernel_size=1))

        # 1x1conv -> 3x3conv branch
        self.b2 = nn.Sequential(
            *_conv_bn_relu(input_channels, n3x3_reduce, kernel_size=1),
            *_conv_bn_relu(n3x3_reduce, n3x3, kernel_size=3, padding=1),
        )

        # 1x1conv -> 5x5conv branch
        # two stacked 3x3 convs are used instead of a single 5x5 conv to obtain
        # the same receptive field with fewer parameters
        self.b3 = nn.Sequential(
            *_conv_bn_relu(input_channels, n5x5_reduce, kernel_size=1),
            *_conv_bn_relu(n5x5_reduce, n5x5, kernel_size=3, padding=1),
            *_conv_bn_relu(n5x5, n5x5, kernel_size=3, padding=1),
        )

        # 3x3pooling -> 1x1conv (stride 1 / padding 1 keeps the spatial size)
        self.b4 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            *_conv_bn_relu(input_channels, pool_proj, kernel_size=1),
        )

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate them on dim 1 (channels)."""
        return torch.cat([self.b1(x), self.b2(x), self.b3(x), self.b4(x)], dim=1)


class GoogleNet(nn.Module):
    """GoogLeNet-style classifier with a single 3x3 conv stem.

    Args:
        num_class: size of the final linear layer's output (default 100).

    The stem expects 3-channel images. Two stride-2 max-pools reduce the
    resolution by a factor of 4 overall, and an adaptive average pool makes the
    classifier head independent of the remaining spatial size.
    """

    def __init__(self, num_class=100):
        super().__init__()

        self.prelayer = nn.Sequential(*_conv_bn_relu(3, 192, kernel_size=3, padding=1))

        # although we only use 1 conv layer as prelayer,
        # we still use the names a3, b3, ...
        self.a3 = _Inception(192, 64, 96, 128, 16, 32, 32)
        self.b3 = _Inception(256, 128, 128, 192, 32, 96, 64)

        # "In general, an Inception network is a network consisting of
        # modules of the above type stacked upon each other, with occasional
        # max-pooling layers with stride 2 to halve the resolution of the grid"
        # (the pool has no parameters, so one instance is shared by both uses)
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.a4 = _Inception(480, 192, 96, 208, 16, 48, 64)
        self.b4 = _Inception(512, 160, 112, 224, 24, 64, 64)
        self.c4 = _Inception(512, 128, 128, 256, 24, 64, 64)
        self.d4 = _Inception(512, 112, 144, 288, 32, 64, 64)
        self.e4 = _Inception(528, 256, 160, 320, 32, 128, 128)

        self.a5 = _Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = _Inception(832, 384, 192, 384, 48, 128, 128)

        # input feature size: 8*8*1024 when the network is fed 32x32 images
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout2d(p=0.4)
        self.linear = nn.Linear(1024, num_class)

    def forward(self, x):
        """Map a batch of 3-channel images to class scores of shape (batch, num_class)."""
        output = self.prelayer(x)
        output = self.a3(output)
        output = self.b3(output)

        output = self.maxpool(output)

        output = self.a4(output)
        output = self.b4(output)
        output = self.c4(output)
        output = self.d4(output)
        output = self.e4(output)

        output = self.maxpool(output)

        output = self.a5(output)
        output = self.b5(output)

        # "It was found that a move from fully connected layers to
        # average pooling improved the top-1 accuracy by about 0.6%,
        # however the use of dropout remained essential even after
        # removing the fully connected layers."
        output = self.avgpool(output)
        output = self.dropout(output)
        # flatten (batch, 1024, 1, 1) -> (batch, 1024) for the linear head
        output = output.view(output.size(0), -1)
        output = self.linear(output)

        return output

In [None]:
#@title Select dataset and model
#@markdown While the code also supports the ImageNet dataset, only CIFAR-10 and CIFAR-100 are selectable.
#@markdown - [DenseNet-121](https://arxiv.org/pdf/1608.06993.pdf)
#@markdown - [WideResNet-28x10](https://arxiv.org/pdf/1605.07146v2.pdf)
#@markdown - [GoogLeNet](https://arxiv.org/pdf/1409.4842v1.pdf)
#@markdown - [ResNext50](https://arxiv.org/pdf/1611.05431.pdf)

# select hyperparameters
dataset_name = 'CIFAR10' #@param ['CIFAR10', 'CIFAR100']
model_type = 'mlp' #@param ['mlp', 'DenseNet', 'WideResNet', 'GoogLeNet', 'ResNeXt']
# NOTE(review): `model_type` is not consulted in this cell - a GoogleNet is always
# built below, whatever is selected in the form.

# Size the classifier head from the selected dataset so that switching the form to
# CIFAR100 does not leave a 10-way output layer facing 100-class labels.
num_classes_by_dataset = {'CIFAR10': 10, 'CIFAR100': 100}
if dataset_name not in num_classes_by_dataset:
    raise ValueError(f"Unsupported dataset_name {dataset_name!r}; "
                     f"expected one of {sorted(num_classes_by_dataset)}")
model = GoogleNet(num_class=num_classes_by_dataset[dataset_name])

# Project helpers: dataset attributes and the train/test transform tables.
datasetDict = setDatasetAttributes(dataset_name)
trainTransformDict, testTransformDict = setTrainAndTest(dataset_name)

# Datasets are stored under (and downloaded into) a per-dataset folder.
root = f"{dataset_name}-dataset"

# NOTE(review): datasetDict['datasetDict'] is used as the dataset constructor -
# presumably a torchvision dataset class; confirm against setDatasetAttributes.
trainData = datasetDict['datasetDict'](root=root, train=True, download=True,
                                            transform=trainTransformDict[dataset_name])
# No download flag here: the training call above already requested the download into `root`.
testData = datasetDict['datasetDict'](root=root, train=False,
                                        transform=testTransformDict[dataset_name])

In [None]:
#@title Choosing Lp-Norm constraints
#@markdown The following cell allows you to set Lp-norm constraints for the chosen network. For exact parameters both for the constraints and the optimizer see the last cell of this notebook.
# The order is entered as a string so that 'inf' can be selected, then converted
# with float() ("inf" -> float('inf')).
# NOTE: `ord` shadows the Python builtin of the same name for the rest of the notebook.
ord =  "2" #@param [1, 2, 5, 'inf']
ord = float(ord)
# `value` and `mode` are later forwarded to create_lp_constraints; the norm-sweep cell
# further down re-assigns `ord`, `value` and `mode` before using them.
value = 10 #@param {type:"number"}
mode = 'initialization' #@param ['initialization', 'radius', 'diameter']

# Reject non-positive constraint values early.
assert value > 0

In [None]:
#@title Configuring the Frank-Wolfe Algorithm
#@markdown Choose momentum and learning rate rescaling, see Section 3.1 of [arXiv:2010.07243](https://arxiv.org/pdf/2010.07243.pdf).
momentum = 0.9 #@param {type:"number"}
rescale = 'gradient' #@param ['gradient', 'diameter', 'None']
# The form can only deliver strings, so the literal 'None' is mapped to a real None.
rescale = None if rescale == 'None' else rescale

#@markdown Choose a learning rate for SFW. You can activate the learning rate scheduler which automatically multiplies the current learning rate by `lr_decrease_factor` every `lr_step_size` epochs.
learning_rate = 0.1 #@param {type:"number"}
lr_scheduler_active = True #@param {type:"boolean"}
lr_decrease_factor = 0.1 #@param {type:"number"}
lr_step_size = 60 #@param {type:"integer"}

#@markdown You can also enable retraction of the learning rate, i.e., if enabled the learning rate is increased and decreased automatically depending on the two moving averages of different length of the train loss over the epochs.
retraction = True #@param {type:"boolean"}

# Sanity checks on the form values before they reach the optimizer.
assert learning_rate > 0
assert 0 <= momentum <= 1
assert lr_decrease_factor > 0
assert lr_step_size > 0

# Select optimizer
# NOTE(review): only learning_rate, momentum and rescale are passed to SFW here;
# lr_scheduler_active, lr_decrease_factor, lr_step_size and retraction are not used
# in this cell - confirm whether they are picked up elsewhere.
optimizer = SFW(params=model.parameters(), learning_rate=learning_rate, momentum=momentum, rescale=rescale)

### We now analyse how the performance depends on the choice of norm used for the constraint, for a fixed `value`

In [None]:
# Sweep over the order p of the Lp-norm constraint at a fixed `value` / `mode`,
# training once per norm and collecting the resulting learning curves.
nepochs = 10
batch_size = 64
value = 10
mode = 'initialization'
orders = ["1", "2", "inf"]


def _reset_parameters(module):
    """Re-initialise `module` in place if it provides a `reset_parameters` method."""
    reset = getattr(module, 'reset_parameters', None)
    if callable(reset):
        reset()


different_norms_dict = {}
# NOTE: the loop variable keeps the name `ord` (shadowing the builtin) because the
# constraint form cell above already uses that name for the norm order.
for ord in orders:
    # print() instead of sys.stdout.write(): terminates the banner line and does not
    # rely on `sys` being brought in by a star import.
    print(f"------------ Collecting results with L_{ord} constraints ---------")
    ord = float(ord)  # "inf" -> float('inf'); the float is also the result key

    # Every norm must start from a freshly initialised network and a clean optimizer;
    # otherwise each run would continue from the weights and momentum left behind by
    # the previous norm and the comparison would be meaningless.
    model.apply(_reset_parameters)
    optimizer = SFW(params=model.parameters(), learning_rate=learning_rate, momentum=momentum, rescale=rescale)

    constraints = create_lp_constraints(model, ord=ord, value=value, mode=mode)
    train_losses, test_losses, train_accuracies, test_accuracies, elapsed_time = train_network(
        nepochs, batch_size, model, constraints, trainData, testData, optimizer)

    different_norms_dict[ord] = {
        'epochs': nepochs,
        'train_losses': train_losses,
        'test_losses': test_losses,
        'train_accuracies': train_accuracies,
        'test_accuracies': test_accuracies,
        'elapsed_time': elapsed_time,
    }

In [None]:
# Persist the per-norm statistics to <cwd>/results as a pickle (only when `save_stats` is set).
if save_stats:
    output_folder = os.path.join(os.getcwd(), 'results')  # set the folder
    os.makedirs(output_folder, exist_ok=True)
    # Build the path with os.path.join rather than a hard-coded '/' separator.
    fname = os.path.join(output_folder, 'stats_dict_different_norms_list.pkl')
    with open(fname, 'wb') as handle:
        pickle.dump(different_norms_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)