# Setup instructions

This notebook provides a ready-to-use interface for reproducing our results on the Block Coordinate Descent (BCD) algorithm. The cells below are built as Colab forms, so we strongly recommend running the training on Google Colab.
To run this notebook on Google Colab, please upload the following files from the **Block_Coordinate_Descent** directory:

1. CD_utilities.py
2. Torch_architectures.py
3. Train_functions.py
4. layers.py
5. utilities.py

In [None]:
#@title Import and Utilities

import numpy as np
import pandas as pd
import torch
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms, utils
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import os
import copy
from torch.optim.lr_scheduler import MultiStepLR

from utilities import *
from Torch_architectures import *
from Train_functions import *
from CD_utilities import *
from layers import *

# Report the installed library versions and whether a CUDA GPU is visible.
print(f"PyTorch Version: {torch.__version__}")
print(f"Torchvision Version: {torchvision.__version__}")
print(f"GPU is available? {torch.cuda.is_available()}")

# Floating-point dtype and compute device: first GPU when CUDA is available,
# CPU otherwise.
dtype = torch.float
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

In [None]:
#@title  Choose dataset name and optimizer 

# Convert images to tensors; Normalize with mean 0 and std 1 leaves values unchanged.
ts = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0,), (1,))])

# change the flag to choose the dataset to work with
dataset_flag = "MNIST" #@param ['MNIST','FMNIST','CIFAR10']
batch_size = 256 #@param {type:"integer"}

# Map every supported flag to its torchvision dataset class so the three
# datasets share a single loading path.
dataset_classes = {
  'MNIST': datasets.MNIST,
  'FMNIST': datasets.FashionMNIST,
  'CIFAR10': datasets.CIFAR10,
}
if dataset_flag not in dataset_classes:
  raise ValueError(
      f"Unknown dataset_flag {dataset_flag!r}; expected one of {sorted(dataset_classes)}")
dataset_cls = dataset_classes[dataset_flag]

trainset = dataset_cls('../data', train=True, download=True, transform=ts)
testset = dataset_cls(root='../data', train=False, download=True, transform=ts)

# The training loader must wrap the training split and the test loader the
# test split (they were previously swapped). The form's batch_size drives
# training; evaluation keeps the fixed batch size of 128.
dataset_train = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
dataset_test = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=True)

# Full-dataset tensors for the BCD trainer; 10 is passed as the third argument
# (presumably the number of classes -- confirm against load_dataset).
x_train, y_train, x_test, y_test, y_train_one_hot, y_test_one_hot, I1, I2 = load_dataset(trainset, testset, 10)

# we move to device to use GPU
x_train = x_train.to(device=device)
x_test = x_test.to(device=device)
y_train = y_train.to(device=device)
y_test = y_test.to(device=device)
y_train_one_hot = y_train_one_hot.to(device)
y_test_one_hot = y_test_one_hot.to(device)

# Layer sizes. NOTE(review): assumes load_dataset returns x_train with the
# feature dimension first (features x samples) -- confirm.
input_size = x_train.shape[0]
hidden_size = int(1.5*input_size)
output_size = 10

In [None]:
#@title Model Selection
from torch.optim.lr_scheduler import MultiStepLR

model_name = 'Multilayer-Perceptron' #@param ['Multilayer-Perceptron']
optimizer_name = "Coordinate-Descent" #@param ['SGD','Adam','Coordinate-Descent','Coordinate-Descent+SGD','Coordinate-Descent+Adam']
momentum = 0.9 #@param {type:"number"}
lr = 0.001 #@param {type:"number"}
weight_decay = 0.00 #@param {type:"number"}
beta_1 = 0.9 #@param {type:"number"}
beta_2 = 0.999 #@param {type:"number"}
gamma = 0.1 #@param {type:"number"}
alpha = 4 #@param {type:"number"}
epochs =  10#@param {type:"integer"}
#the ratio of the epochs for coordinate descent for mixed classifiers
ratio =  0.6#@param {type:"number"}
GD_Update = False #@param {type:"boolean"}
linear_extension = True #@param {type:"boolean"}
cross_entropy = nn.CrossEntropyLoss()

# Build the network; fail early with a clear message on an unknown model name
# instead of a NameError on `model` further down.
if model_name == 'Multilayer-Perceptron':
  model = MultiLayerPerceptron(input_size, hidden_size, output_size)
else:
  raise ValueError(f"Unsupported model_name: {model_name!r}")

# Build the gradient-based optimizer, if the selected method needs one.
# Pure 'Coordinate-Descent' uses neither an optimizer nor a scheduler.
if optimizer_name in ("SGD", "Coordinate-Descent+SGD"):
  # Validate the hyper-parameters before they are used to build the optimizer.
  if lr <= 0:
    raise ValueError(f"lr must be positive, got {lr}")
  if not 0 <= momentum <= 1:
    raise ValueError(f"momentum must be in [0, 1], got {momentum}")
  optimizer = torch.optim.SGD(params=model.parameters(), lr=lr,
                              momentum=momentum, weight_decay=weight_decay)

elif optimizer_name in ("Adam", "Coordinate-Descent+Adam"):
  optimizer = torch.optim.Adam(params=model.parameters(), lr=lr,
                               betas=(beta_1, beta_2), weight_decay=weight_decay)

elif optimizer_name != 'Coordinate-Descent':
  raise ValueError(f"Unsupported optimizer_name: {optimizer_name!r}")

# Step-decay schedule for every method that has a gradient-based phase:
# the learning rate is multiplied by 0.2 every 15 scheduler steps.
if optimizer_name != 'Coordinate-Descent':
  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

In [None]:
#@title Train the network

# Per-epoch statistics collected over the whole run (BCD phase first, then the
# gradient-based phase when there is one).
train_losses = []
test_losses = []
accuracy_train = []
accuracy_test = []
epochs_times = []
start = time.time()

hybrid_names = ('Coordinate-Descent+SGD', 'Coordinate-Descent+Adam')
is_hybrid = optimizer_name in hybrid_names
uses_bcd = optimizer_name == 'Coordinate-Descent' or is_hybrid
uses_gradient_phase = optimizer_name != 'Coordinate-Descent'

# Split the epoch budget between the two phases WITHOUT overwriting `epochs`:
# the plotting cell reads `epochs` as the total, and re-running this cell must
# start from the same budget every time.
total_epochs = epochs
if is_hybrid:
  bcd_epochs = int(total_epochs * ratio)
  gd_epochs = total_epochs - bcd_epochs
elif uses_bcd:
  bcd_epochs, gd_epochs = total_epochs, 0
else:
  bcd_epochs, gd_epochs = 0, total_epochs

if uses_bcd:
  print('training BCD')
  #Train using BCD
  train_losses, test_losses , accuracy_train, accuracy_test,epochs_times,Ws,bs = execute_training([["Perceptron",hidden_size,1],["Perceptron",hidden_size,1]], input_size, hidden_size, output_size, x_train, x_test, y_train, y_test, y_train_one_hot, y_test_one_hot,
                                         GD_Update, linear_extension, I1 = hidden_size,I2=1, niter = bcd_epochs, gamma = gamma, alpha = alpha)
  # Convert to plain lists so the gradient-phase results can be appended.
  train_losses = list(train_losses)
  test_losses = list(test_losses)
  accuracy_train = list(accuracy_train)
  accuracy_test = list(accuracy_test)

if uses_gradient_phase:
  model = model.to(device)
  #train using sgd or adam
  if is_hybrid:
    # Warm-start the network from the BCD solution.
    # NOTE(review): assumes model.parameters() alternates weight, bias per
    # layer in the same order as Ws / bs -- confirm against the model class.
    for i, param in enumerate(model.parameters()):
      layer = i // 2
      if i % 2 == 0:
        param.data = Ws[layer]
      else:
        param.data = torch.flatten(bs[layer])
  train_loss, test_loss, acc_train, acc_test, times = train_model(model, dataset_train, dataset_test, optimizer, cross_entropy, gd_epochs,scheduler,optimizer_name)
  train_losses = list(train_losses) + train_loss
  test_losses = list(test_losses) + test_loss
  accuracy_train = list(accuracy_train) + acc_train
  accuracy_test = list(accuracy_test) + acc_test
  epochs_times = list(epochs_times) + times

# Wall-clock duration of the whole run, in seconds.
elapsed_time = time.time() - start


In [None]:
#@title Save training statistics from the previous cell

# Statistics of the latest run, stored under the name of the optimizer used.
results = {
    'epochs': epochs_times,
    'train_losses': train_losses,
    'train_acc': accuracy_train,
    'test_losses': test_losses,
    'test_acc': accuracy_test,
    'elapsed_time': elapsed_time,
}
stats_dict = {optimizer_name: results}

save_stats = True

# The file-name suffix records which update variant was active;
# GD_Update takes precedence over linear_extension.
suffix = '-Entropy.pkl' if GD_Update else ('-linear_prox.pkl' if linear_extension else '.pkl')

# save everything onto file
if save_stats:
    # One folder per dataset inside the current working directory.
    output_folder = os.path.join(os.getcwd(), dataset_flag)
    os.makedirs(output_folder, exist_ok=True)
    fname = f"{output_folder}/stats_dict_{model_name}_{optimizer_name}{suffix}"
    with open(fname, 'wb') as handle:
        pickle.dump(stats_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#@title Plot only latest training trends obtained in the previous cell

# Test accuracy and training loss plots
# The name passed to plot_stats is the optimizer name plus the variant suffix
# (GD_Update takes precedence over linear_extension; no suffix when both are off).
full_name = optimizer_name
if GD_Update or linear_extension:
  suffix = '-Entropy' if GD_Update else '-linear_prox'
  full_name = full_name + suffix
plot_stats(dataset_flag, [full_name], [optimizer_name], "MLP", epochs)

# Produce complete training plots

### NOTE: here you can select the training results collected above to produce plots similar to the ones shown in the report. Select the dataset and the optimizers you want to compare. For the plotting function called below to work, make sure that the saved statistics dictionaries are in the corresponding dataset folder.

1. In the following cell select the dataset for which you want to plot the results.
2. Select the optimizers for which you have produced the results:
  - For `Block Coordinate Descent` select `plot_BCD`
  - For `Block Coordinate Descent + GD update` select `plot_BCD_GD_update`
  - For `Block Coordinate Descent prox linear` select `plot_BCD_linear_prox`
  - For `SGD` select `plot_SGD`
  - For `Adam` select `plot_Adam`
  - For `Block Coordinate Descent + Adam` select `plot_BCD_Adam`
  - For `Block Coordinate Descent + SGD` select `plot_BCD_SGD`

Of course you can select as many optimizers as you want for the same plot.

### NOTE: we remark that `Block Coordinate Descent + GD update` measures a different loss.

In [None]:
#@title Produce complete plots of training trends

dataset_name = "MNIST" #@param ["MNIST","FMNIST","CIFAR10"]
n_epochs =  10#@param {type:"integer"}
plot_BCD = True #@param {type:"boolean"}
plot_BCD_GD_update = False #@param {type:"boolean"}
plot_BCD_linear_prox = True #@param {type:"boolean"}
plot_SGD = False #@param {type:"boolean"}
plot_Adam = False #@param {type:"boolean"}
plot_BCD_Adam = False #@param {type:"boolean"}
plot_BCD_SGD = False #@param {type:"boolean"}

# One row per selectable curve: (form flag, full name, optimizer name).
# The row order fixes the order of the entries passed to plot_stats.
plot_selection = [
  (plot_BCD, "Coordinate-Descent", "Coordinate-Descent"),
  (plot_BCD_GD_update, "Coordinate-Descent-Entropy", "Coordinate-Descent"),
  (plot_BCD_linear_prox, "Coordinate-Descent-linear_prox", "Coordinate-Descent"),
  (plot_SGD, "SGD", "SGD"),
  (plot_Adam, "Adam", "Adam"),
  (plot_BCD_Adam, "Coordinate-Descent+Adam", "Coordinate-Descent+Adam"),
  (plot_BCD_SGD, "Coordinate-Descent+SGD", "Coordinate-Descent+SGD"),
]

# Keep only the entries whose checkbox is ticked.
full_names = [full for enabled, full, _ in plot_selection if enabled]
opt_names = [opt for enabled, _, opt in plot_selection if enabled]

plot_stats(dataset_name, full_names, opt_names, "MLP", n_epochs)

# Hyper-parameters used

In order to reproduce our results (i.e. the training trends shown in the report), the following set of parameters should be used. If not specified otherwise, other parameters (e.g. for numerical stability) are set to their default values.

$\text{Block Coordinate Descent}$:\
$\gamma = 0.1$, $\alpha = 4$

$\text{Stochastic Gradient Descent (with scheduler)}:$\
$\gamma = 0.01$, $\mu = 0.9$\
$\text{Scheduler:}$\
$\text{Step size = 15,}\gamma = 0.2$

Adam:\
$\gamma = 0.001$, $\mu = 0.9$, $\beta_1 = 0.9$, $\beta_2 = 0.999$

For hybrid variants, we took $\text{ratio} = 0.6$

For the hybrid Block Coordinate Descent + SGD optimizer, decrease the learning rate to $\gamma = 0.001$

$\text{Block Coordinate Descent + GD update for } V_N:$\
`GD_Update = True`

$\text{Prox Linear update for } V_N:$\
`linear_extension = True`