In [11]:
#@title Import and Utilities

import numpy as np
import pandas as pd
import torch
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms, utils
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import os
import copy

from utilities import *
from Torch_architectures import *
from Train_functions import *
from CD_utilities import *
from layers import *

# Report the library versions and CUDA availability for reproducibility.
cuda_available = torch.cuda.is_available()
for label, value in (
    ("PyTorch Version:", torch.__version__),
    ("Torchvision Version:", torchvision.__version__),
    ("GPU is available?", cuda_available),
):
  print(label, value)

# Default floating-point dtype and compute device used by the rest of the notebook:
# the first CUDA device when one is available, otherwise the CPU.
dtype = torch.float
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

PyTorch Version: 1.11.0+cu113
Torchvision Version: 0.12.0+cu113
GPU is available? True


# Imported datasets
For testing and comparing our algorithms, we use one of the following datasets (selected with `dataset_flag` below):

1. MNIST
2. Fashion-MNIST
3. CIFAR-10

# Train - test split

The code for Block Coordinate Descent is mostly based on https://github.com/timlautk/BCD-for-DNNs-PyTorch/blob/master/bcd_dnn_mlp_mnist.ipynb

In [12]:
#@title Dataset & Optimizer Selection
# Convert images to tensors. Normalize with mean 0 and std 1 leaves the values
# unchanged; it is kept so the preprocessing pipeline stays as it was.
ts = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0,), (1,))])

# change the flag to choose the dataset to work with
dataset_flag = "CIFAR10" #@param ['MNIST','FMNIST','CIFAR10']
batch_size = 256 #@param {type:"integer"}
# Batch size of the evaluation loader (was an inline magic number).
test_batch_size = 128

# One torchvision dataset class per flag, so all datasets share a single code path.
dataset_classes = {
  'MNIST': datasets.MNIST,
  'FMNIST': datasets.FashionMNIST,
  'CIFAR10': datasets.CIFAR10,
}
if dataset_flag not in dataset_classes:
  # Fail early with a clear message instead of a NameError on `trainset` below.
  raise ValueError(
      "Unknown dataset_flag %r; expected one of %s" % (dataset_flag, sorted(dataset_classes)))
dataset_cls = dataset_classes[dataset_flag]

trainset = dataset_cls('../data', train=True, download=True, transform=ts)
testset = dataset_cls(root='../data', train=False, download=True, transform=ts)

# The training loader must iterate over the training split and the test loader
# over the test split (they were previously assigned the other way round).
dataset_train = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
dataset_test = torch.utils.data.DataLoader(testset, batch_size=test_batch_size, shuffle=True)

# Full-dataset tensors (plus one-hot labels) used by the coordinate-descent path.
x_train, y_train, x_test, y_test,y_train_one_hot, y_test_one_hot, I1, I2 = load_dataset(trainset, testset,10)

# we move to device to use GPU

x_train = x_train.to(device = device)
x_test = x_test.to(device = device)
y_train = y_train.to(device = device)
y_test = y_test.to(device = device)
y_train_one_hot = y_train_one_hot.to(device)
y_test_one_hot = y_test_one_hot.to(device)

# NOTE(review): assumes load_dataset returns x_train with the feature dimension
# first (features x samples) - confirm against utilities.load_dataset.
input_size = x_train.shape[0]
hidden_size = int(1.5*input_size)
output_size = 10

Files already downloaded and verified
Files already downloaded and verified


In [13]:
from torch.optim.lr_scheduler import MultiStepLR
#@title Model Selection
model_name = 'Multilayer-Perceptron' #@param ['Multilayer-Perceptron']
optimizer_name = "Coordinate-Descent" #@param ['SGD','Adam','Coordinate-Descent','Coordinate-Descent+SGD','Coordinate-Descent+Adam']
momentum = 0.9 #@param {type:"number"}
lr = 0.001 #@param {type:"number"}
weight_decay = 0.00 #@param {type:"number"}
beta_1 = 0.9 #@param {type:"number"}
beta_2 = 0.999 #@param {type:"number"}
epochs = 50 #@param {type:"integer"}
#the ratio of the epochs for coordinate descent for mixed classifiers
ratio =  0.6#@param {type:"number"}
use_entropy = True #@param {type:"boolean"}
linear_extension = False #@param {type:"boolean"}
cross_entropy = nn.CrossEntropyLoss()

if model_name == 'Multilayer-Perceptron':
  model = MultiLayerPerceptron(input_size,hidden_size,output_size)
else:
  # Fail here rather than with a NameError on `model` further down.
  raise ValueError("Unknown model_name %r" % (model_name,))

# Reset both names on every run so that a stale optimizer/scheduler left in the
# kernel by an earlier selection can never be picked up by a later cell.
optimizer = None
scheduler = None

if optimizer_name in ("SGD", "Coordinate-Descent+SGD"):
  # Validate the hyperparameters before they are handed to the optimizer.
  assert lr > 0
  assert 0 <= momentum <= 1
  optimizer = torch.optim.SGD(params=model.parameters(), lr=lr,
                              momentum=momentum, weight_decay=weight_decay)
elif optimizer_name in ("Adam", "Coordinate-Descent+Adam"):
  optimizer = torch.optim.Adam(params=model.parameters(), lr=lr,
                               betas=(beta_1, beta_2), weight_decay=weight_decay)
elif optimizer_name != 'Coordinate-Descent':
  raise ValueError("Unknown optimizer_name %r" % (optimizer_name,))

# Pure coordinate descent uses no gradient optimizer, hence no LR schedule.
if optimizer is not None:
  # Multiply the learning rate by 0.2 every 15 epochs.
  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

# Optimizers & Loss functions Definitions

1. SGD or Adam from PyTorch (`torch.optim`)
2. `nn.CrossEntropyLoss` as the loss criterion

# Training

**TODO:** Refactor so that the training function below moves everything to the device itself, and so that the label/sample split is done here.

In [14]:
# Per-epoch metrics, filled by the BCD phase and/or the gradient-based phase.
train_losses = []
test_losses = []
accuracy_train = []
accuracy_test = []
epochs_times = []
start = time.time()

# Which training phases the selected optimizer requires.
uses_bcd = optimizer_name in ('Coordinate-Descent', 'Coordinate-Descent+SGD', 'Coordinate-Descent+Adam')
uses_gradient = optimizer_name != 'Coordinate-Descent'
is_mixed = uses_bcd and uses_gradient

# Split the epoch budget WITHOUT overwriting `epochs`: mutating it made this
# cell non-idempotent (each re-run in a mixed mode shrank the budget further).
total_epochs = epochs
if is_mixed:
  # The first `ratio` share of the epochs goes to BCD, the remainder to SGD/Adam.
  bcd_epochs = int(total_epochs * ratio)
  gradient_epochs = total_epochs - bcd_epochs
else:
  bcd_epochs = total_epochs
  gradient_epochs = total_epochs

if uses_bcd:
  print('training BCD')
  #Train using BCD
  train_losses, test_losses , accuracy_train, accuracy_test,epochs_times,Ws,bs = execute_training([["Perceptron",hidden_size,1],["Perceptron",hidden_size,1]], input_size, hidden_size, output_size, x_train, x_test, y_train, y_test, y_train_one_hot, y_test_one_hot,
                                         use_entropy, linear_extension, I1 = hidden_size,I2=1, niter = bcd_epochs, gamma = 0.1, alpha = 4)

if uses_gradient:
  model = model.to(device)
  #train using sgd or adam
  if is_mixed:
    # Warm-start the network from the BCD solution.
    # NOTE(review): assumes model.parameters() yields (weight, bias) pairs in the
    # same layer order as Ws / bs - confirm against MultiLayerPerceptron.
    for index, param in enumerate(model.parameters()):
      layer = index // 2
      if index % 2 == 0:
        param.data = Ws[layer]
      else:
        param.data = torch.flatten(bs[layer])
  train_loss, test_loss, acc_train, acc_test, times = train_model(model, dataset_train, dataset_test, optimizer, cross_entropy, gradient_epochs,scheduler,optimizer_name)
  # Append the gradient-phase metrics after any BCD-phase metrics.
  train_losses = list(train_losses) + train_loss
  test_losses = list(test_losses) + test_loss
  accuracy_train = list(accuracy_train) + acc_train
  accuracy_test = list(accuracy_test) + acc_test
  epochs_times = list(epochs_times) + times
elapsed_time = time.time() - start


training BCD
Epoch 1 / 50 
 - time: 18.31203818321228 - sq_loss: 114895.984375 - tot_loss: 2202.223332252548 - loss_class: 22980.58203125 - acc: 0.1429 - val_acc: 0.1409
Epoch 2 / 50 
 - time: 19.155765771865845 - sq_loss: 114172.953125 - tot_loss: 1429.0080787355082 - loss_class: 22839.056640625 - acc: 0.29094 - val_acc: 0.2848
Epoch 3 / 50 
 - time: 19.191007375717163 - sq_loss: 113490.421875 - tot_loss: 1240.6976878490302 - loss_class: 22707.8359375 - acc: 0.38468 - val_acc: 0.376
Epoch 4 / 50 
 - time: 18.98080015182495 - sq_loss: 112862.25 - tot_loss: 1068.905744433962 - loss_class: 22588.060546875 - acc: 0.42182 - val_acc: 0.4053
Epoch 5 / 50 
 - time: 18.73792004585266 - sq_loss: 112278.7421875 - tot_loss: 913.109833765775 - loss_class: 22477.453125 - acc: 0.44308 - val_acc: 0.4259
Epoch 6 / 50 
 - time: 19.125234365463257 - sq_loss: 111726.71875 - tot_loss: 773.0521360978485 - loss_class: 22372.943359375 - acc: 0.453 - val_acc: 0.4345
Epoch 7 / 50 
 - time: 18.93863296508789 - 

In [15]:
#Replace this with the same function as DFW
# Gather the metrics of this run and key them by the optimizer that produced them.
results = dict([
    ('epochs', epochs_times),
    ('train_losses', train_losses),
    ('train_acc', accuracy_train),
    ('test_losses', test_losses),
    ('test_acc', accuracy_test),
    ('elapsed_time', elapsed_time),
])
stats_dict = {optimizer_name: results}
save_stats = True
print(optimizer_name)
# Persist the statistics as a pickle under ./results (created on demand).
if save_stats:
    output_folder = os.path.join(os.getcwd(), 'results')
    os.makedirs(output_folder, exist_ok=True)
    fname = f"{output_folder}/stats_dict_{model_name}_{optimizer_name}-Entropy.pkl"
    with open(fname, 'wb') as handle:
        pickle.dump(stats_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Coordinate-Descent
