<a href="https://colab.research.google.com/github/jdowner212/cs577_addernet/blob/main/AdderNet_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Base directory from which the CIFAR data path is built later in the
# notebook. The current working directory is only a default -- point this
# wherever you want.
root = os.getcwd()

In [2]:
import numpy as np
import torch
import json
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()


In [3]:
'''
Keeping track of the equations described in the paper -- not explicitly called
in subsequent code but want to make sure the functionality is present
'''

####################################
# defined in paper but not needed: #
####################################

'''Equation 1'''
# ignore -- updated with Equation 2

'''Equation 3'''
# ignore -- CNN formula

'''Equation 4'''
# ignore -- updated with Equation 5

'''Equation 8'''
# ignore -- CNN formula


####################################
# explicitly used in code -- keep: #
####################################
def L1(a,b):
    """Negative absolute difference between ``a`` and ``b``.

    Returns ``-|a - b|``, evaluated element-wise when the inputs are arrays.
    The result is 0 where the inputs are equal and becomes more negative as
    they move apart.
    """
    distance = np.abs(a - b)
    return -1 * distance

'''Equation 7'''
def hard_tanh(array):
    """Clamp every element of ``array`` to the closed interval [-1, 1].

    Values below -1 become -1, values above 1 become 1, and everything in
    between is passed through unchanged.
    """
    below_floor = array < -1
    clamped = np.where(below_floor, -1, array)
    above_ceiling = clamped > 1
    return np.where(above_ceiling, 1, clamped)

######################################################
# implicitly used in code -- can technically delete: #
######################################################

'''Equation 2'''
def Y_adder(image, F, m, n, t, similarity_f=L1): # image, group of filters, row#, col#, filter#, similarity function
    """Output of filter ``t`` at position ``(m, n)`` of ``image`` (Equation 2).

    Adds ``similarity_f(image[k, m+i, n+j], F[t, k, i, j])`` over every depth
    index ``k``, column offset ``j`` and row offset ``i`` of the filter.
    ``F.shape`` is unpacked as (num_filters, depth, height, width).
    """
    _, depth, height, width = F.shape
    return sum(
        similarity_f(image[k, m + i, n + j], F[t, k, i, j])
        for k in range(depth)
        for j in range(width)
        for i in range(height)
    )

'''Equation 5'''
def dY_dF_element(image,filters,m,n,i,j,k,t):
    """Partial derivative of the adder output w.r.t. one filter weight (Equation 5).

    image   -- indexed as image[k, m+i, n+j]
    filters -- indexed as filters[t, k, i, j]
    m, n    -- row / column of the output position
    i, j, k -- row offset, column offset and depth index inside the filter
    t       -- filter index

    Returns ``image[k, m+i, n+j] - filters[t, k, i, j]``.
    """
    # Index the ``filters`` argument: the bare name ``filter`` is Python's
    # builtin and cannot be subscripted.
    return image[k, m + i, n + j] - filters[t, k, i, j]

'''Equation 6''' # clipped, full-precision gradient
def dY_dImage_element(image,filters,m,n,i,j,k,t):
    """Clipped partial derivative of the adder output w.r.t. one input value (Equation 6).

    image   -- indexed as image[k, m+i, n+j]
    filters -- indexed as filters[t, k, i, j]
    m, n    -- row / column of the output position
    i, j, k -- row offset, column offset and depth index inside the filter
    t       -- filter index

    Returns ``hard_tanh(filters[t, k, i, j] - image[k, m+i, n+j])``, i.e. the
    difference clamped to [-1, 1].
    """
    # Index the ``filters`` argument: the bare name ``filter`` is Python's
    # builtin and cannot be subscripted.
    return hard_tanh(filters[t, k, i, j] - image[k, m + i, n + j])


###################################################
# written last week, haven't gotten to these yet: #
###################################################

'''Equation 9'''
def var_Y_adder(X,F,variance_f=K.var):
    """Variance of an adder layer's output (Equation 9).

    Computes ``sqrt(pi/2) * d**2 * c_in * (variance_f(X) + variance_f(F))``
    where ``c_in`` and ``d`` are the second and third entries of the
    four-entry ``F.shape``.
    """
    # NOTE(review): open question carried over from the original author --
    # confirm that K.var is the right variance function here and whether an
    # axis needs to be specified.
    input_variance = variance_f(X)
    filter_variance = variance_f(F)

    _, c_in, d, _ = F.shape
    scale = np.sqrt(np.pi / 2) * (d ** 2) * c_in
    return scale * (input_variance + filter_variance)

'''Equation 10'''
def batch_norm(minibatch, gamma, beta, eps=0.0):
    """Batch-normalize ``minibatch`` along its first axis (Equation 10).

    minibatch -- array-like whose first axis enumerates the examples
    gamma     -- scale applied after normalization
    beta      -- shift applied after normalization
    eps       -- optional constant added to the variance before the square
                 root; the default 0.0 reproduces the equation exactly, a
                 small positive value avoids division by zero when every
                 example in the batch is identical

    Returns ``gamma * (minibatch - mean) / std + beta`` where ``mean`` and
    ``std`` are the per-feature mean and standard deviation over the batch.

    Raises ValueError if the minibatch contains no examples.
    """
    batch = np.asarray(minibatch, dtype=float)
    if batch.ndim == 0 or batch.shape[0] == 0:
        raise ValueError("batch_norm needs at least one example in the minibatch")

    mean = batch.mean(axis=0)
    variance = ((batch - mean) ** 2).mean(axis=0)
    # The mean squared deviation is the variance; the normalization divides
    # by its square root (the standard deviation).
    std = np.sqrt(variance + eps)
    return gamma * (batch - mean) / std + beta

'''Equation 11'''
def dL_dMinibatch_i(minibatch,dL_dy,i,L,gamma): # confused by this notation and need to revisit paper
    """Gradient of the loss w.r.t. element ``i`` of ``minibatch`` (Equation 11).

    minibatch -- sequence of batch elements
    dL_dy     -- loss gradient w.r.t. ``y``, where ``y`` is the result of
                 applying batch_norm to the minibatch
    i         -- index of the element being differentiated
    L         -- accepted but not used by the computation
    gamma     -- batch-norm scale
    """
    m = len(minibatch)
    mean = (1/m)*sum(minibatch)
    # NOTE(review): this is the mean squared deviation (no square root is
    # taken) -- confirm against the paper whether sigma here should be the
    # standard deviation instead.
    spread = (1/m)*sum([(x - mean)**2 for x in minibatch])
    scale = gamma/((m**2)*spread)

    x_i = minibatch[i]
    g_i = dL_dy[i]

    total = 0
    for j, x_j in enumerate(minibatch):
        x_term = (x_i - x_j)*(x_j - mean)/spread
        total += (g_i - dL_dy[j]*(1 + x_term))
    total *= scale

    return total

'''Equation 12'''
# update rule for F
def delta_F_l(adaptive_lr_l, dL_dF_l, gamma):
    """Update step for the filters of layer ``l`` (Equation 12).

    adaptive_lr_l -- adaptive learning rate of the layer
    dL_dF_l       -- loss gradient w.r.t. the layer's filters
    gamma         -- multiplier applied to the whole step

    Returns ``gamma * adaptive_lr_l * dL_dF_l``.
    """
    step_scale = gamma * adaptive_lr_l
    return step_scale * dL_dF_l

'''Equation 13'''
def adaptive_lr_l(dL_dF_l, eta, k=None):
    """Adaptive learning rate for layer ``l`` (Equation 13).

    dL_dF_l -- loss gradient w.r.t. the layer's filters (any shape)
    eta     -- multiplier applied to the result
    k       -- number of elements in the layer's filters; defaults to the
               number of elements in ``dL_dF_l``

    Returns ``eta * sqrt(k) / ||dL_dF_l||_2`` as a scalar.  An all-zero
    gradient has zero norm, so the division yields ``inf`` in that case.
    """
    grad = np.asarray(dL_dF_l, dtype=float)
    if k is None:
        k = grad.size

    # L2 norm = square root of the SUM of squares.  Taking the square root of
    # each squared entry separately would only give element-wise absolute
    # values.
    l2_norm = np.sqrt(np.sum(grad ** 2))

    return eta * np.sqrt(k) / l2_norm

# Layer definitions

<h2><strong><font color='red'>Still needed:</font></strong></h2>
<h4><font color='red'>- fully-connected<br>- batchnorm<br>- adaptive learning rate<br>- ?</font></h4>


### `Layer` parent class

In [4]:
class Layer:
    """Base class for network layers.

    Holds the most recent ``input`` and ``output`` (both ``None`` until a
    subclass sets them) and declares the ``forward`` / ``backward`` methods
    that subclasses must override.
    """

    def __init__(self):
        self.input = self.output = None

    def forward(self, X):
        """Compute the layer output for ``X``; not implemented here."""
        raise NotImplementedError()

    def backward(self, upstream_g, learning_rate):
        """Propagate ``upstream_g`` back through the layer; not implemented here."""
        raise NotImplementedError()

### `Adder` layer

In [7]:
'''
Dimension convention: N x H x W x C (batch, height, width, channels). It feels
unintuitive to me, but I saw it in some TF documentation and thought it might
make our lives easier later. Definitely not married to it.
'''


def adder_single_step(window, filter, b=None, similarity_f=L1):
    """
    Apply one filter to one window of the input.

    window       -- k_h x k_w x k_d
    filter       -- k_h x k_w x k_d
    b            -- optional bias holding exactly one value (a scalar or a
                    1x1x1 array); treated as 0 when None
    similarity_f -- called as similarity_f(window_value, filter_value) for
                    every position of the filter

    Returns a scalar: the sum of the similarities over all filter positions
    plus the bias.
    """
    # Unpack into the names the loops below actually use.
    H_k, W_k, D_k = filter.shape

    out = 0.0
    for h in range(H_k):
        for w in range(W_k):
            for d in range(D_k):
                out += similarity_f(window[h,w,d], filter[h,w,d])

    # Compare against None explicitly: a zero-valued bias array is falsy, and
    # the truth value of a multi-element array is ambiguous.
    # ``.item()`` extracts the single bias value so the result stays a scalar
    # (it raises ValueError if ``b`` holds more than one element).
    bias_value = 0.0 if b is None else np.asarray(b, dtype=float).item()

    return out + bias_value

class adder_layer(Layer):
    """Adder layer: like a convolution, but each output is the summed negative
    absolute difference (``L1``) between an input window and a filter.

    Tensors follow the N x H x W x C convention used in this notebook.
    """

    def __init__(self,input_channels,output_channels,kernel_size=3,stride=1,padding=0,bias=None):
        """
        input_channels  -- c_in, channel count of the input
        output_channels -- c_out, number of filters
        kernel_size     -- side length k of the square filters
        stride          -- step between neighbouring windows
        padding         -- number of zero rows/columns added on every side
        bias            -- optional initial bias with c_out values; zeros when None
        """
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.kernel_size = kernel_size
        self.k = kernel_size  # short alias kept from the original attribute name
        self.filters = np.random.normal(loc=0, scale=1,
                                        size=(output_channels, kernel_size, kernel_size, input_channels))
        self.stride = stride
        self.padding = padding
        # ``bias is None`` rather than ``not bias``: the truth value of an
        # array is ambiguous.  A supplied bias is copied so that in-place
        # updates during training never modify the caller's array.
        if bias is None:
            self.bias = np.zeros((output_channels, 1, 1, 1))
        else:
            self.bias = np.array(bias, dtype=float).reshape(output_channels, 1, 1, 1)
        self.cache = None

    def forward(self,X):
        """
        X       -- n_tensors x H x W x c_in
        filters -- c_out x k_H x k_W x c_in
        b       -- c_out x 1 x 1 x 1
        Z       -- n_tensors x H_new x W_new x c_out

        Stores (X, filters, bias, stride, padding) in ``self.cache`` for the
        backward pass and returns Z.
        """
        self.input = X
        filters, stride, padding, bias = self.filters, self.stride, self.padding, self.bias
        n_tensors, H, W, c_in = X.shape
        c_out, H_k, W_k, c_in_filters = filters.shape
        if c_in != c_in_filters:
            raise ValueError(f"input has {c_in} channels but the filters expect {c_in_filters}")

        X_padded = np.pad(X, ((0,0), (padding,padding), (padding,padding), (0,0)),
                          'constant', constant_values=0)
        H_new = (H + 2*padding - H_k)//stride + 1
        W_new = (W + 2*padding - W_k)//stride + 1

        Z = np.zeros((n_tensors, H_new, W_new, c_out))

        for i in range(n_tensors):               # traverse batch
            this_img = X_padded[i]
            for f in range(c_out):               # traverse filters
                this_filter = filters[f]
                this_bias = bias[f].item()       # single bias value of this filter
                for h in range(H_new):           # traverse height
                    v0 = h*stride
                    for w in range(W_new):       # traverse width
                        h0 = w*stride
                        this_window = this_img[v0:v0+H_k, h0:h0+W_k, :]
                        # Sum of the element-wise L1 similarities over the window.
                        Z[i, h, w, f] = np.sum(L1(this_window, this_filter)) + this_bias

        self.output = Z
        self.cache = X, filters, bias, stride, padding

        return self.output

    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (saved by forward) -- (X, filters, bias, stride, padding)

        Updates ``self.filters`` and ``self.bias`` in place with a plain
        gradient step of size ``learning_rate`` and returns
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down.
        """
        X, filters, _, stride, padding = self.cache

        n_tensors, H_down, W_down, c_down = X.shape
        n_filters, H_k, W_k, _ = filters.shape
        _, H_up, W_up, c_up = upstream_g.shape

        dfilters = np.zeros_like(filters)
        dbias    = np.zeros((n_filters, 1, 1, 1))

        X_padded  = np.pad(X, ((0,0), (padding,padding), (padding,padding), (0,0)),
                           'constant', constant_values=0)
        # Accumulate in floating point regardless of the dtype of X.
        dX_padded = np.zeros(X_padded.shape, dtype=float)

        for i in range(n_tensors):
            x = X_padded[i]
            dx = dX_padded[i]                       # view: writes land in dX_padded

            for h in range(H_up):                   # traverse height
                v0 = h*stride                       # same window origin as the forward pass
                v1 = v0 + H_k
                for w in range(W_up):               # traverse width
                    h0 = w*stride
                    h1 = h0 + W_k
                    x_window = x[v0:v1, h0:h1, :]
                    for c in range(c_up):           # traverse filters
                        f_window = filters[c]
                        g = upstream_g[i, h, w, c]

                        # Equation 6: clipped gradient w.r.t. the input.
                        dx[v0:v1, h0:h1, :] += hard_tanh(f_window - x_window) * g
                        # Equation 5: full-precision gradient w.r.t. the filter.
                        dfilters[c]         += (x_window - f_window) * g
                        dbias[c]            += g

        # Strip the padding with explicit end indices; a ``padding:-padding``
        # slice would be empty when padding is 0.
        dX = dX_padded[:, padding:padding+H_down, padding:padding+W_down, :]

        assert dX.shape == (n_tensors, H_down, W_down, c_down)

        self.filters -= learning_rate*dfilters
        self.bias    -= learning_rate*dbias

        return dX

### Fully-connected layer

In [None]:
class fully_connected(Layer):
    """Fully-connected (affine) layer: ``output = X . weights + bias``."""

    def __init__(self,input_channels,output_channels):
        """
        input_channels  -- number of input features
        output_channels -- number of output features
        """
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.weights = np.random.normal(loc=0, scale=1, size=(input_channels, output_channels))
        self.bias = np.zeros(output_channels)

    def forward(self, X):
        """
        X -- a single example of shape (input_channels,) or a batch of shape
             (n_examples, input_channels)

        Returns ``X . weights + bias`` with ``output_channels`` as last axis.
        """
        self.input = X
        self.output = np.dot(X, self.weights) + self.bias
        return self.output

    def backward(self, upstream_g, learning_rate):
        """
        upstream_g -- dL/d(output), same shape as the output of ``forward``

        Updates ``weights`` and ``bias`` in place with a plain gradient step
        and returns dL/dX, shaped like the input of ``forward``.
        """
        # Promote single examples to one-row batches so the same formulas
        # cover both cases.
        X_rows = np.atleast_2d(self.input)
        g_rows = np.atleast_2d(upstream_g)

        # Compute dL/dX before the weights are modified.
        dX = np.dot(upstream_g, self.weights.T)
        dweights = np.dot(X_rows.T, g_rows)
        dbias = g_rows.sum(axis=0)

        self.weights -= learning_rate * dweights
        self.bias    -= learning_rate * dbias
        return dX


# inherit from base class Layer
class FCLayer(Layer):
    """Fully-connected layer operating on row vectors.

    input_size  -- number of input neurons
    output_size -- number of output neurons

    ``forward`` / ``backward`` implement the ``Layer`` interface by delegating
    to ``forward_propagation`` / ``backward_propagation``.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Weights and bias start uniformly in [-0.5, 0.5).
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    # returns output for a given input
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    # computes dE/dW, dE/dB for a given output_error=dE/dY. Returns input_error=dE/dX.
    def backward_propagation(self, output_error, learning_rate):
        # input_error must be computed before the weights are updated
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)
        # dE/dB is the output error summed over the rows, which keeps the
        # (1, output_size) bias shape even when several rows are passed
        bias_error = np.atleast_2d(output_error).sum(axis=0, keepdims=True)

        # update parameters
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * bias_error
        return input_error

    def forward(self, X):
        """``Layer`` interface: same as ``forward_propagation``."""
        return self.forward_propagation(X)

    def backward(self, upstream_g, learning_rate):
        """``Layer`` interface: same as ``backward_propagation``."""
        return self.backward_propagation(upstream_g, learning_rate)

### Activation layers

In [8]:
def relu_fwd(X):
    """ReLU: keep the entries of ``X`` that are >= 0 and replace the rest with 0."""
    non_negative = X >= 0
    return np.where(non_negative, X, 0)
def relu_bwd(X):
    """ReLU derivative: 1 where ``X`` is >= 0 and 0 elsewhere."""
    non_negative = X >= 0
    return np.where(non_negative, 1, 0)
def softmax_fwd(X):
    """Softmax over all entries of ``X``: ``exp(X) / sum(exp(X))``.

    The largest entry is subtracted before exponentiating.  This leaves the
    result unchanged mathematically but keeps ``exp`` from overflowing (and
    the result from becoming NaN) for large inputs.
    """
    if np.size(X) == 0:
        # Nothing to normalize; np.max would raise on an empty input.
        return np.exp(X)
    exp_ = np.exp(X - np.max(X))
    return exp_ / np.sum(exp_)
def softmax_bwd(X):
    """Jacobian of the softmax at ``X``.

    Returns an n x n matrix (n = number of entries of ``X``) whose entry
    [i, j] is ``s[i] * (delta_ij - s[j])`` with ``s = softmax_fwd(X)``.
    """
    # Work on a column vector so the broadcast below forms the outer product
    # s[i] * s[j] regardless of whether X was 1-D, a row or a column.
    s = np.reshape(softmax_fwd(X), (-1, 1))
    I = np.eye(s.shape[0])
    return s*(I - s.T)
def sig_fwd(X):
    """Logistic sigmoid: ``1 / (1 + exp(-X))``, element-wise for arrays."""
    denominator = 1 + np.exp(-X)
    return 1 / denominator
def sig_bwd(X):
    """Derivative of the sigmoid at ``X``: ``s * (1 - s)`` with ``s = sig_fwd(X)``."""
    s = sig_fwd(X)
    return s * (1 - s)


class Activation(Layer):
    """Layer that applies an activation function and its derivative.

    act_forward  -- callable computing the activation of an input
    act_backward -- callable computing the local gradient at an input
    """

    def __init__(self,act_forward,act_backward):
        # Zero-argument super() runs Layer.__init__, which sets up
        # self.input / self.output.
        super().__init__()
        self.fwd=act_forward
        self.bwd=act_backward

    def forward(self, X):
        """Remember ``X`` and return ``act_forward(X)``."""
        self.input = X
        self.output = self.fwd(X)
        return self.output

    def backward(self, upstream_g, learning_rate):
        """Return dL/dX given ``upstream_g`` = dL/d(output).

        ``learning_rate`` is unused: an activation has no parameters.
        """
        local_g = self.bwd(self.input)

        local_arr = np.asarray(local_g)
        upstream_arr = np.asarray(upstream_g)
        # A square n x n local gradient paired with an n-element upstream
        # gradient is a Jacobian (as returned by softmax_bwd): the chain rule
        # is then a matrix-vector product, not an element-wise one.
        is_jacobian = (
            local_arr.ndim == 2
            and local_arr.shape != upstream_arr.shape
            and local_arr.shape[0] == local_arr.shape[1] == upstream_arr.size
        )
        if is_jacobian:
            return np.dot(local_arr.T, upstream_arr.reshape(-1)).reshape(upstream_arr.shape)

        return local_g*upstream_g

### `Model` class

In [9]:
# Lookup of loss functions by name.  Every 'forward' / 'backward' entry is
# still a None placeholder.
loss_dict = {
    loss_name: {'forward': None, 'backward': None}
    for loss_name in ('sigmoid', 'softmax')
}

def single_example_accuracy(y_real,y_pred):
    """Return 1 when the prediction matches the label for one example, else 0.

    * last axis of ``y_pred`` longer than 1 (one-hot): compare the argmax of
      label and prediction;
    * last axis of length 1 and ``max(y_pred) > 1`` (multi-category integer
      labels): compare the raw values;
    * otherwise (binary): threshold both label and prediction at 0.5 first.
    """
    is_one_hot = y_pred.shape[-1] != 1
    if is_one_hot:
        matched = np.argmax(y_real) == np.argmax(y_pred)
    elif np.max(y_pred) > 1:
        matched = y_real == y_pred
    else:
        real_label = 1 if y_real.astype(float) >= 0.5 else 0
        pred_label = 1 if y_pred.astype(float) >= 0.5 else 0
        matched = real_label == pred_label

    return 1 if matched else 0



class Model:
    """Sequential stack of layers trained one example at a time.

    loss_name selects the entry of ``loss_dict`` whose 'forward' / 'backward'
    callables compute the loss and its gradient.
    """

    def __init__(self,loss_name): # planning on creating a dictionary so we can get a loss function (forward + backward) from its name
        self.layers = []
        self.loss_fwd = loss_dict[loss_name]['forward']
        self.loss_bwd = loss_dict[loss_name]['backward']

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def _forward(self, example):
        """Run one example through every layer in order and return the result."""
        Z = example
        for layer in self.layers:
            Z = layer.forward(Z)
        return Z

    def predict(self, input_data):
        """Return a list with the model output for each entry along the first
        axis of ``input_data``."""
        return [self._forward(input_data[i]) for i in range(input_data.shape[0])]

    def fit(self, x_train, y_train, epochs, learning_rate, x_val=None, y_val=None):
        """Train for ``epochs`` passes over the training data.

        Every example is pushed forward through the layers, then the loss
        gradient is propagated backward through them in reverse order with
        the given ``learning_rate``.  Validation metrics are computed only
        when both ``x_val`` and ``y_val`` are supplied.

        Returns the history dict with per-epoch 'accuracy', 'loss',
        'val_accuracy' and 'val_loss' lists (the validation lists stay empty
        without validation data).  All metrics are averages over the
        examples.
        """
        history = {'accuracy': [],
                   'loss': [],
                   'val_accuracy': [],
                   'val_loss': []}

        # ``is not None`` rather than truthiness: the truth value of an array
        # with more than one element is ambiguous.
        has_validation = x_val is not None and y_val is not None
        n_train = x_train.shape[0]

        for e in range(epochs):
            loss = 0
            acc  = 0
            for j in range(n_train):
                # forward
                y_real = y_train[j]
                y_pred = self._forward(x_train[j])

                # compute loss and accuracy
                loss += self.loss_fwd(y_real, y_pred)
                acc  += single_example_accuracy(y_real, y_pred)

                # backward
                error = self.loss_bwd(y_real, y_pred)
                for layer in reversed(self.layers):
                    error = layer.backward(error, learning_rate)

            loss /= n_train
            acc  /= n_train
            history['accuracy'].append(acc)
            history['loss'].append(loss)

            if not has_validation:
                print(f'Epoch: {e}   loss = {str(round(loss,3))}   acc = {str(round(acc,3))}')
                continue

            val_loss = 0
            val_acc = 0
            n_val = x_val.shape[0]
            for k in range(n_val):
                # index validation data with its own counter
                y_real_val = y_val[k]
                y_pred_val = self._forward(x_val[k])

                val_loss += self.loss_fwd(y_real_val, y_pred_val)
                val_acc  += single_example_accuracy(y_real_val, y_pred_val)

            # average like the training metrics (skipped for an empty set)
            if n_val:
                val_loss /= n_val
                val_acc  /= n_val
            history['val_accuracy'].append(val_acc)
            history['val_loss'].append(val_loss)

            print(f'Epoch: {e}   loss = {str(round(loss,3))}   acc = {str(round(acc,3))}   val_loss = {str(round(val_loss,3))}   val_accuracy = {str(round(val_acc,3))}')

        return history


In [None]:
# Model requires the name of a loss; 'softmax' is one of the keys of loss_dict.
this_model = Model('softmax')

# Layers are added as instances, not classes.
# TODO(review): 3 input channels assumes RGB images and 16 output channels is
# only a placeholder -- set both to the intended architecture.
this_model.add(adder_layer(input_channels=3, output_channels=16))

## Data collection

### P.S. — I've been having a hard time finding an ImageNet dataset/subset that is a reasonable size and easy to download

### CIFAR10

In [None]:
import tarfile
import os
import pickle
import tensorflow.keras.utils as np_utils


#!wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
# data_zip = os.path.join(root,'cifar-10-python.tar.gz')
# f = tarfile.open(data_zip)
# f.extractall(root) 
# f.close()
# os.remove(data_zip)

def _load_cifar_batch(path):
    """Unpickle one batch file and return its ``(data, labels)`` entries."""
    # NOTE(security): unpickling can execute arbitrary code -- only load batch
    # files obtained from a trusted source.
    with open(path, 'rb') as batch_file:
        batch = pickle.load(batch_file, encoding='latin-1')
    return batch['data'], batch['labels']


def _cifar_rows_to_images(rows):
    """Reshape flat rows to N x 32 x 32 x 3 float32 images divided by 255."""
    images = rows.reshape(len(rows), 3, 32, 32)
    images = np.rollaxis(images, 1, 4)      # channel axis moves to the end
    return images.astype('float32')/255.0


def load_cifar_data(folder):
    """Load the CIFAR-10 python batches stored in ``folder``.

    Training data comes from every file whose name contains 'batch_' (read in
    sorted name order so the result is reproducible); test data comes from
    'test_batch'.  The last 10% of the training examples are split off as the
    validation set.  Labels are one-hot encoded with 10 classes.

    Returns X_trn, y_trn, X_tst, y_tst, X_val, y_val (in that order).

    Raises FileNotFoundError if ``folder`` contains no training batch.
    """
    train_batches = sorted(f'{folder}/{f}' for f in os.listdir(folder) if 'batch_' in f)
    test_batch    = f'{folder}/test_batch'
    if not train_batches:
        raise FileNotFoundError(f"no training batches ('batch_' files) found in {folder}")

    # Get train data
    train_rows = []
    y_trn = []
    for batch_path in train_batches:
        rows, labels = _load_cifar_batch(batch_path)
        train_rows.append(rows)
        y_trn += labels
    X_trn = _cifar_rows_to_images(np.vstack(train_rows))
    y_trn = np_utils.to_categorical(np.asarray(y_trn),10)

    # Get test data
    test_rows, test_labels = _load_cifar_batch(test_batch)
    X_tst = _cifar_rows_to_images(test_rows)
    # Fix the class count so train and test labels always have the same width.
    y_tst = np_utils.to_categorical(np.asarray(test_labels),10)

    # Hold out the last 10% of the training examples for validation
    n_90 = int(0.9*len(X_trn))
    X_trn, X_val = X_trn[:n_90], X_trn[n_90:]
    y_trn, y_val = y_trn[:n_90], y_trn[n_90:]

    return X_trn, y_trn, X_tst, y_tst, X_val, y_val

# Folder with the extracted CIFAR-10 python batches, located under ``root``.
task_2_data_dir = f'{root}/cifar-10-batches-py'

# load_cifar_data returns the splits in the order train, test, validation.
(
    X_trn_c10, y_trn_c10,
    X_tst_c10, y_tst_c10,
    X_val_c10, y_val_c10,
) = load_cifar_data(task_2_data_dir)