In [1]:
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import time
import os
import pickle
import copy
import math
from torch.nn.functional import cross_entropy

from utilities import *
from layers import *
from CD_utilities import *

print("PyTorch Version:", torch.__version__)
print("Torchvision Version:", torchvision.__version__)
print("GPU is available?", torch.cuda.is_available())

dtype = torch.float
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch Version: 1.11.0+cu113
Torchvision Version: 0.12.0+cu113
GPU is available? True


# Imported datasets
For the testing and comparison of our algorithms we will use the following datasets:

1. MNIST
2. FashionMNIST
3. CIFAR10

In [2]:
ts = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0,), (1,))])

# change the flag to choose the dataset to work with
dataset_flag = 0

if dataset_flag ==0:
  trainset = datasets.MNIST('../data', train=True, download=True, transform=ts)
  testset = datasets.MNIST(root='../data', train=False, download=True, transform=ts)
elif dataset_flag ==1:
  trainset = datasets.FashionMNIST('../data', train=True, download=True, transform=ts)
  testset = datasets.FashionMNIST(root='../data', train=False, download=True, transform=ts)
else:
  trainset = datasets.CIFAR10('../data', train=True, download=True, transform=ts)
  testset = datasets.CIFAR10(root='../data', train=False, download=True, transform=ts)

# Dataset preprocessing

In [3]:
x_train, y_train, x_test, y_test, y_train_one_hot, y_test_one_hot, I1, I2 = load_dataset(trainset, testset, 10)

# We move to GPU
x_train = x_train.to(device = device)
x_test = x_test.to(device = device)
y_train = y_train.to(device = device)
y_test = y_test.to(device = device)
y_train_one_hot = y_train_one_hot.to(device = device)
y_test_one_hot = y_test_one_hot.to(device = device)

cross_entropy = nn.CrossEntropyLoss()

In [4]:
# def shift_right(l):
#   """
#   Shifts right a python list by one element.
#   """
#   return l[-1:]+l[:-1]

# def filter_conv(W,I1,I2,size = 2):
#   """
#   This function filters the entries of the matrix W so that it behaves like a Convolution.
#   :param W: The weight matrix W that contains weights from one layer to the next.
#   :param I1: The first dimension of the original 2D matrix
#   :param I2: The first dimension of the original 2D matrix
#   :return: The filtered weight matrix W
#   """
#   device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#   mask_list = []
#   for i in range(size):
#     mask_list += [1]*size+[0]*(I2-size)
#   mask_list +=[0]*(I1-size)*I2
#   full_mask = [mask_list]
#   counter = I2-size
#   for i in range((I2-size+1)*(I1-size+1)-1):
#     next_mask=shift_right(full_mask[-1])
#     #print(counter)
#     if(counter==0):
#       counter = I2-size
#       for j in range(size-1):
#         next_mask=shift_right(next_mask)
#     else:
#       counter -=1
#     full_mask.append(next_mask)
#   if(torch.tensor(full_mask).shape[0]!=W.shape[0]):
#     print(torch.tensor(full_mask).shape[0],W.shape[0])
#   return torch.mul(torch.tensor(full_mask).to(device),W)

# class Layer():
#   """
#   A simple layer class for the three different types of layers of our network,
#   Perceptron, Convolution, Average Pooling this class is not equivalent to a
#   pytorch default layer since the update is done by the function that calculates
#   the closed form solution for the optimal weights. This class is also
#   constructed in such a way to be able to process input of up to 2 dimensions.
#   """
#   device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#   def __init__(self,col_size,row_size,col_out,row_out,layer_type=["Perceptron"]):
#     """
#     The init function initializes the basic parameters of the layer:
#     :param col_size: the column size of the input
#     :param row_size: the row size of the input
#     :param col_out: the column size of the output
#     :param row_out: the row size of the output
#     :param layer_type: the type of the layer,
#                        if the layer is a Perceptron we expect a single string
#                        that says that the layer is a perceptron
#                        if the layer is a Convolution we expect a s string that
#                        specifies it as a convolution and 1 parameter which is
#                        the size of the convolution
#                        if the layer is an Average Pooling layer then we expect
#                        analogous paramaters as the convolution layer
#     Note that apart from the average pooling, for the other types of layers the initial weight is assigned a random value.
#     """
#     self.col_size = col_size
#     self.row_size = row_size
#     self.col_out = col_out
#     self.row_out = row_out
#     self.layer_type = layer_type
#     std = math.sqrt(1/(row_size*col_size))
#     if(self.layer_type[0] =="Average Pooling"):
#       self.weights = torch.add(torch.FloatTensor(self.row_out*self.col_out,self.row_size*self.col_size),1).to(self.device)
#       print(torch.norm(self.weights))
#       self.weights = filter_conv(self.weights,self.col_size,self.row_size,self.layer_type[1])
#       print(torch.norm(self.weights))
#       self.weights = torch.mul(self.weights,1/(self.layer_type[1]*self.layer_type[1]))
#       self.bias = torch.FloatTensor(row_out*col_out,1).uniform_(-std, std).to(self.device)
#     elif(self.layer_type[0]=="Convolution"):
#       self.weights = torch.FloatTensor(self.row_out*self.col_out,self.row_size*self.col_size).uniform_(-std, std).to(self.device)
#       self.weights = filter_conv(self.weights,self.col_size,self.row_size,self.layer_type[1])
#     else:
#       self.weights = torch.FloatTensor(row_out*col_out,row_size*col_size).uniform_(-std, std).to(self.device)
#     self.bias = torch.FloatTensor(row_out*col_out,1).uniform_(-std, std).to(self.device)

#   def forward_pass(self,input,N):
#     """
#     A function that does a forward pass from the layer, is capable of doing it
#     for N samples simultaniously.
#     :param input: The input that we do the forward pass on, should have dimension
#                   input_dimension_layer x number_of_samples
#     :param N: The number of samples that are in the input
#     """
#     return torch.addmm(self.bias.repeat(1, N), self.weights, input)

#   def update_layer(self,output,input,alpha,rho):
#     """
#     A function that applies the closed form solution, for all different types of layers
#     :param output: The output of the layer with respect to which we should compute the closed form solution
#     :param input: The input of the layer with respect to which we should compute the closed form solution
#     :param alpha: The parameter alpha of the algorithm that is described on the paper
#     :param rho: The parameter rho of the algorithm that is described on the paper
#     """
#     if(self.layer_type[0]!="Average Pooling"):
#       self.weights,self.bias = update_wb_js(output,input,self.weights,self.bias,alpha,rho)
#       if(self.layer_type[0]=="Convolution"):
#         self.weights = filter_conv(self.weights,self.col_size,self.row_size,self.layer_type[1])

#   def get_weights(self):
#     """
#     This function returns the current weights of the layer
#     """
#     return self.weights

#   def get_bias(self):
#     """
#     This function returns the bias of the layer
#     """
#     return self.bias

#   def get_type(self):
#     """
#     This function returns the type of the layer
#     """
#     return self.layer_type


In [5]:
def prox_linear(true_labels,V,U,gamma,alpha):
  """
  """
  V_opt = (gamma*V+alpha*U+true_labels - V)/(gamma+alpha)
  return V_opt

# Architecture initialization

For the MultiLayerPerceptron we have the parameters **input_size** , **hidden_size**,**output_size** corresponding to the size of the input layer, the hidden layer and the output layer, respectively.

The MLP only has 3 layers like https://github.com/timlautk/BCD-for-DNNs-PyTorch/blob/master/bcd_dnn_mlp_mnist.ipynb as a starting point.

Also we use ReLU currently for the same reason.

In [6]:
input_size = x_train.shape[0]
hidden_size = 1600
output_size = 10

# Training

Note: Fix it so that it moves everything to device in the following function and that it does the label sample split here

In [7]:
def execute_training(layers, input_size, hidden_size, output_size, train_set, val_set,
                     train_labels, val_labels, y_train_one_hot, y_test_one_hot, use_gradient, I1 = 40, I2 = 40,
                     niter = 100, gamma = 1, alpha = 5):
  """
  The function takes the following arguements and produces a list of weights and biases with which
  you can use the make_pred function to get a list of predictions
  :param layers: The total number of layers of the network
  :param input_size: The total size of the input layer
  :param hidden_size: The size of the hidden layer
  :param output_size: The size of the output layer (usefull for multiclass classification)
  :param train_set: The training set
  :param val_set: The validation set
  :param train_labels: The training labels
  :param val labels: The validation labels
  :param use_gradient: True if the first update of V is carried out without linearization but using the gradient
  :param niter: The default number of epochs to train the network
  :param gamma: The gamma parameter of the algorithm
  :param alpha: The alpha parameter of the algorithm
  :return Ws,bs: Returns two lists that go in order from the input to the output layer of the weights and the biases of each layer
  """

  N = len(train_labels)
  N_test = len(val_labels)

  # weight initialization (we replicate pytorch weight initialization)

  std = math.sqrt(1/input_size)
  Layer1 = Layer(input_size,1,hidden_size,1,["Perceptron"])
  W = Layer1.get_weights()
  b = Layer1.get_bias()
  #W = torch.FloatTensor(hidden_size, input_size).uniform_(-std, std)
  #b = torch.FloatTensor(hidden_size, 1).uniform_(-std, std)

  #b = b.to(device = device)
  #W = W.to(device = device)

  U = torch.addmm(b.repeat(1, N), W, train_set) # equivalent to W1@train_set+b1.repeat(1,N)
  V = nn.ReLU()(U)

  Ws = [W]
  bs = [b]
  Us = [U]
  Vs = [V]
  Layers = [Layer1]
  row = [I1]
  col = [I2]

  cr_row_size = I1
  cr_col_size = I2
  size = 4
  avg_size = 2
  for cr_layer in layers:
    std = math.sqrt(1/hidden_size)
    if(cr_layer[0] !="Perceptron"):
      Layer_i = Layer(cr_col_size,cr_row_size,cr_col_size-cr_layer[1]+1,cr_row_size-cr_layer[1]+1,cr_layer)
      W = Layer_i.get_weights()
      b = Layer_i.get_bias()
      Layers.append(Layer_i)
      row.append(cr_row_size)
      col.append(cr_col_size)
      cr_row_size = cr_row_size - cr_layer[1]+1
      cr_col_size = cr_col_size - cr_layer[1]+1
    else:
      Layer_i = Layer(cr_col_size,cr_row_size,cr_layer[1],cr_layer[2],[cr_layer[0]])
      W = Layer_i.get_weights()
      b = Layer_i.get_bias()
      Layers.append(Layer_i)
      row.append(cr_row_size)
      col.append(cr_col_size)
      cr_row_size = cr_layer[1]
      cr_col_size = cr_layer[2]
    if(cr_layer[0] != "Average Pooling"):
      #print(W.shape,Vs[-1].shape)
      U = torch.addmm(b.repeat(1, N), W, Vs[-1])
      V = nn.ReLU()(U)
    else:
      U = torch.addmm(b.repeat(1, N), W, Vs[-1])
      V = U
    Ws.append(W)
    bs.append(b)
    Us.append(U)
    Vs.append(V)
  for i in range(len(bs)):
    print("Layer ",i," W ", Ws[i].shape," Layer W ",Layers[i].get_weights().shape)
    print(Layers[i].get_type())

  row.append(cr_row_size)
  col.append(cr_col_size)
  std = math.sqrt(1/hidden_size)
  Layer_out = Layer(cr_col_size,cr_row_size,output_size,1,["Perceptron",10])
  W = Layer_out.get_weights()
  b = Layer_out.get_bias()
  #print(W.shape)
  # we move them to GPU
  #b = b.to(device = device)
  #W = W.to(device = device)
  #U = torch.addmm(b.repeat(1, N), W, Vs[-1])
  #V = U
  #print(U.shape)
  #Ws.append(W)
  Layers.append(Layer_out)
  #W = torch.FloatTensor(output_size, cr_row_size*cr_col_size).uniform_(-std, std)
  #b = torch.FloatTensor(output_size, 1).uniform_(-std, std)

  # we move them to GPU
  #b = b.to(device = device)
  #W = W.to(device = device)
  print(cr_col_size,cr_row_size)
  print(W.shape,Vs[-1].shape)
  U = torch.addmm(b.repeat(1, N), W, Vs[-1])
  V = U
  Ws.append(W)
  bs.append(b)
  Us.append(U)
  Vs.append(V)

  for i in range(len(bs)):
    print("Layer ",i," W ", bs[i].shape," Layer W ",Layers[i].get_bias().shape)
    print(Layers[i].get_type())

  # constant initialization

  gamma1 = gamma2 = gamma3 = gamma4 = gamma

  rho = gamma
  rho1 = rho2 = rho3 = rho4 = rho

  alpha1 = alpha2 = alpha3 = alpha4 = alpha5 = alpha6 = alpha7 \
  = alpha8 = alpha9 = alpha10 = alpha

  # vector of performance initialization

  loss1 = np.empty(niter)
  loss2 = np.empty(niter)
  loss_class = np.empty(niter)
  accuracy_train = np.empty(niter)
  accuracy_test = np.empty(niter)
  time1 = np.empty(niter)

  opt_accuracy = 0
  print(len(Ws),len(Layers))
  early_Ws = Ws
  early_bs = bs
  print('Train on', N, 'samples, validate on', N_test, 'samples')
  for k in range(niter):

    start = time.time()
    Last_layer = Layers[-1]
    W = Last_layer.get_weights()
    b = Last_layer.get_bias()
    # update V3
    if use_gradient == True:
      if (k == 1):
        Vs[-1] = (y_train_one_hot + gamma3*Us[-1] + alpha1*Vs[-1])/(1+ gamma3 + alpha1)
      else:
        for i in range(250):
          #Vs[-1] = (alpha1*Vs[-1] + gamma3*Us[-1] - (torch.exp(Vs[-1])/torch.sum(torch.exp(Vs[-1]),dim=0)-y_train_one_hot))/(gamma3+alpha1)
          #Vs[-1] = Vs[-1] - (alpha1*Vs[-1] + gamma3*Us[-1] + (torch.exp(Vs[-1])/torch.sum(torch.exp(Vs[-1]),dim=0)-y_train_one_hot))/(gamma3+alpha1)*0.01/(i+1)
          Vs[-1] = Vs[-1] - (gamma3*(Vs[-1]-Us[-1])+torch.exp(Vs[-1])/torch.sum(torch.exp(Vs[-1]),dim=0)-y_train_one_hot) * 0.01/(i+1)
    else:
      #Vs[-1] = (gamma*Vs[-1]+alpha*Us[-1]+ y_train_one_hot - Vs[-1])/(gamma+alpha)
      #(y_train_one_hot + gamma3*Us[-1] + alpha1*Vs[-1])/(1+ gamma3 + alpha1)
      Vs[-1] = (y_train_one_hot + gamma3*Us[-1] + alpha1*Vs[-1])/(1+ gamma3 + alpha1)

    # update U3
    Us[-1] = (gamma3*Vs[-1] + rho3*(torch.mm(W,Vs[-2]) + b.repeat(1,N)))/(gamma3 + rho3)

    # update W3 and b3
    W, b = update_wb_js(Us[-1],Vs[-2],Ws[-1],bs[-1],alpha1, rho3)
    Ws[-1] = W
    bs[-1] = b
    Layers[-1].update_layer(Us[-1],Vs[-2],alpha1, rho3)

    for i in range(len(Vs)-2,0,-1):
      Layer_next = Layers[i+1]
      Layer_cur = Layers[i]
      L_next_type = Layer_next.get_type()
      W_next = Layer_next.get_weights()
      W_cur = Layer_cur.get_weights()
      b_next = Layer_next.get_bias()
      b_cur = Layer_cur.get_bias()
      if(L_next_type[0]=="Average Pooling"):
        Vs[i] = update_no_activation(Us[i],Us[i+1],W_next,b_next,rho3,gamma2)
        Us[i] = Vs[i]
      else:
        Vs[i] = update_v_js(Us[i],Us[i+1],W_next,b_next,rho3,gamma2)
        Us[i] = relu_prox(Vs[i],(rho2*torch.addmm(b_cur.repeat(1,N), W_cur, Vs[i-1]) +
                                alpha2*Us[i])/(rho2 + alpha2),(rho2 + alpha2)/gamma2, row[i+1]*col[i+1], N)
        W,b = update_wb_js(Us[i],Vs[i-1],W_cur,b_cur,alpha3,rho2)
        Layers[i].update_layer(Us[i],Vs[i-1],alpha3,rho2)

    # update V1
    Vs[0] = update_v_js(Us[0],Us[1],Ws[1],bs[1],rho2,gamma1)

    # update U1
    Us[0] = relu_prox(Vs[0],(rho1*torch.addmm(bs[0].repeat(1,N), Ws[0], train_set) +
                             alpha7*Us[0])/(rho1 + alpha7),(rho1 + alpha7)/gamma1, hidden_size, N)

    # update W1 and b1
    W, b = update_wb_js(Us[0],train_set,Ws[0],bs[0],alpha8,rho1)
    Ws[0] = W
    bs[0] = b
    Layers[0].update_layer(Us[0],train_set,alpha8,rho1)

    #a1_train = nn.ReLU()(torch.addmm(b1.repeat(1, N), W1, train_set))
    #a1_train = train_set
    #for i in range(len(Vs)-1,0,-1):
    #  a1_train = nn.ReLU()(torch.addmm(bs[i].repeat(1, N), Ws[i], a1_train))
    #pred = torch.argmax(torch.addmm(bs[0].repeat(1, N), Ws[0], a1_train), dim=0)
    #pred,_ = make_pred(Ws,bs,train_set,N)

    #a1_test = val_set
    #a1_test = nn.ReLU()(torch.addmm(b1.repeat(1, N_test), W1, val_set))
    #for i in range(len(Vs)-1,0,-1):
    #  a1_test = nn.ReLU()(torch.addmm(bs[i].repeat(1, N_test), Ws[i], a1_test))
    #pred_test = torch.argmax(torch.addmm(bs[0].repeat(1, N_test), Ws[0], a1_test), dim=0)
    #pred_test, prob_test = make_pred(Ws,bs,val_set,N_test)
    pred_Ws = [l.get_weights() for l in Layers]
    pred_bs = [l.get_bias() for l in Layers]
    pred,_ = make_pred(pred_Ws,pred_bs,x_train,N)

    #a1_test = x_test
    #a1_test = nn.ReLU()(torch.addmm(b1.repeat(1, N_test), W1, x_test))
    #for i in range(len(Vs)-1,0,-1):
    #  a1_test = nn.ReLU()(torch.addmm(bs[i].repeat(1, N_test), Ws[i], a1_test))
    #pred_test = torch.argmax(torch.addmm(bs[0].repeat(1, N_test), Ws[0], a1_test), dim=0)
    pred_test, prob_test = make_pred(pred_Ws,pred_bs,x_test,N_test)

    loss_class[k] = torch.sum(- y_test_one_hot * torch.log(prob_test))

    loss1[k] = gamma/2*torch.pow(torch.dist(Vs[-1],y_train_one_hot,2),2).cpu().numpy()
    loss2[k] = loss1[k] + gamma/2 * torch.pow(torch.dist(torch.addmm(bs[0].repeat(1,N), Ws[0], train_set),Us[0],2),2).cpu().numpy()

    for i in range(1,len(layers)):
      loss2[k] = loss2[k] + gamma/2 * torch.pow(torch.dist(torch.addmm(bs[i].repeat(1,N), Ws[i], Vs[i-1]),Us[i],2),2).cpu().numpy()

    #loss2[k] = loss1[k] + rho1/2*torch.pow(torch.dist(torch.addmm(b1.repeat(1,N), W1, train_set),U1,2),2).cpu().numpy() \
    #+rho2/2*torch.pow(torch.dist(torch.addmm(b2.repeat(1,N), W2, V1),U2,2),2).cpu().numpy() \
    #+rho3/2*torch.pow(torch.dist(torch.addmm(b3.repeat(1,N), W3, V2),U3,2),2).cpu().numpy()

    # compute training accuracy
    correct_train = pred == train_labels
    accuracy_train[k] = np.mean(correct_train.cpu().numpy())

    # compute validation accuracy
    correct_test = pred_test == val_labels
    accuracy_test[k] = np.mean(correct_test.cpu().numpy())

    # compute training time
    stop = time.time()
    duration = stop - start
    time1[k] = duration

    # print results
    print('Epoch', k + 1, '/', niter, '\n',
          '-', 'time:', time1[k], '-', 'sq_loss:', loss1[k], '-', 'tot_loss:',
          loss2[k], '-', 'loss_class:', loss_class[k], '-', 'acc:',
          accuracy_train[k], '-', 'val_acc:', accuracy_test[k])
    if(accuracy_test[k]>opt_accuracy):
      early_Ws = Ws
      early_bs = bs
      opt_accuracy = accuracy_test[k]

  print('The total time spent is:', np.sum(time1), 's')
  print('\n\n')
  print('Early stopping accuracy:',opt_accuracy)
  return loss1,loss_class,accuracy_train,accuracy_test,time1,early_Ws,early_bs

In [8]:
_,_,_,_,_,_,_ = execute_training([["Convolution",4],["Average Pooling",3]], input_size, hidden_size, output_size, x_train, x_test, y_train, y_test,y_train_one_hot,y_test_one_hot,
                                         False, niter = 100, gamma = 0.1, alpha = 4)

tensor(1295., device='cuda:0')
tensor(105., device='cuda:0')
Layer  0  W  torch.Size([1600, 784])  Layer W  torch.Size([1600, 784])
['Perceptron']
Layer  1  W  torch.Size([1369, 1600])  Layer W  torch.Size([1369, 1600])
['Convolution', 4]
Layer  2  W  torch.Size([1225, 1369])  Layer W  torch.Size([1225, 1369])
['Average Pooling', 3]
35 35
torch.Size([10, 1225]) torch.Size([1225, 60000])
Layer  0  W  torch.Size([1600, 1])  Layer W  torch.Size([1600, 1])
['Perceptron']
Layer  1  W  torch.Size([1369, 1])  Layer W  torch.Size([1369, 1])
['Convolution', 4]
Layer  2  W  torch.Size([1225, 1])  Layer W  torch.Size([1225, 1])
['Average Pooling', 3]
Layer  3  W  torch.Size([10, 1])  Layer W  torch.Size([10, 1])
['Perceptron', 10]
4 4
Train on 60000 samples, validate on 10000 samples
Epoch 1 / 100 
 - time: 1.3868930339813232 - sq_loss: 1957.901171875 - tot_loss: 2007.0082885830664 - loss_class: 22999.5390625 - acc: 0.09863333333333334 - val_acc: 0.0958
Epoch 2 / 100 
 - time: 1.3784492015838623 

In [9]:
_,_,_,_,_,_,_ = execute_training([["Perceptron",37,37],["Perceptron",35,35]], input_size, hidden_size, output_size, x_train, x_test, y_train, y_test,y_train_one_hot,y_test_one_hot,
                                         False, niter = 100, gamma = 0.1, alpha = 4)

Layer  0  W  torch.Size([1600, 784])  Layer W  torch.Size([1600, 784])
['Perceptron']
Layer  1  W  torch.Size([1369, 1600])  Layer W  torch.Size([1369, 1600])
['Perceptron']
Layer  2  W  torch.Size([1225, 1369])  Layer W  torch.Size([1225, 1369])
['Perceptron']
35 35
torch.Size([10, 1225]) torch.Size([1225, 60000])
Layer  0  W  torch.Size([1600, 1])  Layer W  torch.Size([1600, 1])
['Perceptron']
Layer  1  W  torch.Size([1369, 1])  Layer W  torch.Size([1369, 1])
['Perceptron']
Layer  2  W  torch.Size([1225, 1])  Layer W  torch.Size([1225, 1])
['Perceptron']
Layer  3  W  torch.Size([10, 1])  Layer W  torch.Size([10, 1])
['Perceptron', 10]
4 4
Train on 60000 samples, validate on 10000 samples
Epoch 1 / 100 
 - time: 2.2707924842834473 - sq_loss: 1980.4478515625 - tot_loss: 1980.4479200002836 - loss_class: 22580.66796875 - acc: 0.7233666666666667 - val_acc: 0.7354
Epoch 2 / 100 
 - time: 2.3186275959014893 - sq_loss: 1287.5687500000001 - tot_loss: 1287.5688229401585 - loss_class: 21957.033