<a href="https://colab.research.google.com/github/jdowner212/cs577_addernet/blob/main/AdderNet_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
root = os.getcwd() # base directory; later cells look for data under it -- point it wherever you want

In [4]:
import numpy as np
import torch
import json
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()


In [6]:
'''
Keeping track of the equations described in the paper -- not explicitly called
in subsequent code but want to make sure the functionality is present
'''

####################################
# defined in paper but not needed: #
####################################

'''Equation 1'''
# ignore -- updated with Equation 2

'''Equation 3'''
# ignore -- CNN formula

'''Equation 4'''
# ignore -- updated with Equation 5

'''Equation 8'''
# ignore -- CNN formula


####################################
# explicitly used in code -- keep: #
####################################
def L1(a, b):
    """Return the negative absolute difference of `a` and `b` (elementwise).

    Serves as the default similarity measure for the adder filters: the
    result is 0 when the inputs are equal and grows more negative as they
    move apart.
    """
    distance = np.abs(a - b)
    return -1 * distance

'''Equation 7'''
def hard_tanh(array):
    """Clamp every value of `array` to the interval [-1, 1] (Equation 7).

    Values below -1 become -1, values above 1 become 1, everything else is
    passed through unchanged.
    """
    floor_applied = np.where(array < -1, -1, array)
    return np.where(floor_applied > 1, 1, floor_applied)

######################################################
# implicitly used in code -- can technically delete: #
######################################################

'''Equation 2'''
def Y_adder(image, F, m, n, t, similarity_f=L1):
    """Compute one output element of an adder filter (Equation 2).

    image        -- indexed as image[channel, row, col]
    F            -- group of filters, shape num_filters x depth x height x width
    m, n         -- row and column of the window's top-left corner in `image`
    t            -- index of the filter to apply
    similarity_f -- similarity applied to each (pixel, weight) pair

    Returns the sum of similarity_f(pixel, weight) over the whole window.
    """
    _, depth, height, width = F.shape
    total = 0
    for k in range(depth):
        for j in range(width):
            for i in range(height):
                pixel = image[k, m + i, n + j]
                weight = F[t, k, i, j]
                total += similarity_f(pixel, weight)
    return total

'''Equation 5'''
def dY_dF_element(image,filters,m,n,i,j,k,t):
    """Derivative of output Y[m, n, t] w.r.t. filter weight F[t, k, i, j] (Equation 5).

    image   -- indexed as image[channel, row, col]
    filters -- indexed as filters[filter, channel, row, col]
    m, n    -- row and column of the window's top-left corner in `image`
    i, j, k -- row, column and channel offsets inside the window
    t       -- filter index

    Returns the full-precision difference (input pixel - filter weight).
    """
    # Index the `filters` argument; the bare name `filter` is the Python builtin.
    return image[k, m + i, n + j] - filters[t, k, i, j]

'''Equation 6''' # clipped, full-precision gradient
def dY_dImage_element(image,filters,m,n,i,j,k,t):
    """Derivative of output Y[m, n, t] w.r.t. input pixel image[k, m+i, n+j] (Equation 6).

    image   -- indexed as image[channel, row, col]
    filters -- indexed as filters[filter, channel, row, col]
    m, n    -- row and column of the window's top-left corner in `image`
    i, j, k -- row, column and channel offsets inside the window
    t       -- filter index

    Returns (filter weight - input pixel) clipped to [-1, 1] by hard_tanh.
    """
    # Index the `filters` argument; the bare name `filter` is the Python builtin.
    return hard_tanh(filters[t, k, i, j] - image[k, m + i, n + j])


###################################################
# written last week, haven't gotten to these yet: #
###################################################

'''Equation 9'''
def var_Y_adder(X, F, variance_f=K.var):
    """Variance of an adder layer's output as given by Equation 9.

    X          -- layer input
    F          -- filters; c_in is read from F.shape[1] and d from F.shape[2]
    variance_f -- callable returning the variance of its argument

    Returns sqrt(pi/2) * d^2 * c_in * (variance_f(X) + variance_f(F)).
    """
    # NOTE(review): unclear whether variance_f needs an explicit axis here -- confirm.
    input_variance = variance_f(X)
    filter_variance = variance_f(F)
    _, c_in, d, _ = F.shape
    scale = np.sqrt(np.pi / 2) * (d ** 2) * (c_in)
    return scale * (input_variance + filter_variance)

'''Equation 10'''
def batch_norm(minibatch, gamma, beta):
    """Batch-normalise `minibatch` along its first axis (Equation 10).

    minibatch -- array-like whose first axis indexes the samples
    gamma     -- learned scale
    beta      -- learned shift

    Returns gamma * (minibatch - mean) / std + beta, where mean and std are
    the per-feature mean and standard deviation over the batch axis.
    No epsilon is added, so a feature with zero variance divides by zero.
    """
    minibatch = np.asarray(minibatch)
    mean = minibatch.mean(axis=0)
    variance = ((minibatch - mean) ** 2).mean(axis=0)
    # The normaliser is the standard deviation, i.e. the square root of the
    # mean squared deviation -- not the variance itself.
    std = np.sqrt(variance)
    return gamma * (minibatch - mean) / std + beta

'''Equation 11'''
def dL_dMinibatch_i(minibatch, dL_dy, i, L, gamma):
    """Gradient of the loss w.r.t. sample `i` of a batch-normalised minibatch (Equation 11).

    minibatch -- the inputs that were batch-normalised
    dL_dy     -- gradient of the loss w.r.t. each normalised output y
    i         -- index of the sample whose gradient is wanted
    L         -- accepted but not used by this function
    gamma     -- batch-norm scale

    NOTE(review): `spread` below is the mean squared deviation (no square
    root is taken) -- confirm against the paper's notation.
    """
    batch_size = len(minibatch)
    mean = (1 / batch_size) * sum(minibatch)
    spread = (1 / batch_size) * sum([(x - mean) ** 2 for x in minibatch])

    x_i = minibatch[i]
    g_i = dL_dy[i]

    total = 0
    for j, x_j in enumerate(minibatch):
        cross_term = (x_i - x_j) * (x_j - mean) / spread
        total += (g_i - dL_dy[j] * (1 + cross_term))
    total *= gamma / ((batch_size ** 2) * spread)

    return total

'''Equation 12'''
# update rule for F
def delta_F_l(adaptive_lr_l, dL_dF_l, gamma):
    """Update step for the filters of layer l (Equation 12).

    adaptive_lr_l -- the layer's adaptive learning rate
    dL_dF_l       -- gradient of the loss w.r.t. the layer's filters
    gamma         -- multiplier applied on top of the adaptive rate

    Returns gamma * adaptive_lr_l * dL_dF_l.
    """
    scaled_rate = gamma * adaptive_lr_l
    return scaled_rate * dL_dF_l

'''Equation 13'''
def adaptive_lr_l(dL_dF_l, eta, k=None):
    """Adaptive learning rate for layer l (Equation 13): eta * sqrt(k) / ||dL_dF_l||_2.

    dL_dF_l -- gradient of the loss w.r.t. the layer's filters (any shape)
    eta     -- hyper-parameter scaling the adaptive rate
    k       -- number of elements in the filters; defaults to the number of
               elements in dL_dF_l when omitted

    Returns a scalar learning rate.
    """
    grad = np.asarray(dL_dF_l, dtype=float)
    if k is None:
        k = grad.size

    # L2 norm = square root of the SUM of squared entries, taken over every
    # element of the gradient (an elementwise sqrt is not a norm).
    l2_norm = np.sqrt(np.sum(np.square(grad)))
    # Same floor as the clamp(min=1e-12) used by the torch `adder.backward`
    # in this file, so an all-zero gradient does not divide by zero.
    l2_norm = max(l2_norm, 1e-12)

    return eta * np.sqrt(k) / l2_norm

In [845]:
class Layer:
    """Abstract base for network layers; subclasses supply forward and backward."""

    def __init__(self):
        # Placeholders; nothing in this base class fills them in.
        self.input = self.output = None

    def forward(self, input):
        """Forward pass -- must be overridden by subclasses."""
        raise NotImplementedError

    def backward(self, loss, learning_rate):
        """Backward pass -- must be overridden by subclasses."""
        raise NotImplementedError

In [846]:
'''
dimension format/convention: NxHxWxC -- this is really unintuitive to me but i saw it in some tf 
documentation and thought it might make our lives easier later. Def not married to it
'''

def relu(array):
    """Rectified linear unit: keep values that are >= 0, replace the rest with 0."""
    keep = array >= 0
    return np.where(keep, array, 0)

def adder_single_step(window, filter, b=None, similarity_f=L1):
    """
    Apply one adder filter to one window of the input.

    window       -- k_h x k_w x k_d
    filter       -- k_h x k_w x k_d
    b            -- bias, 1x1x1 (or a scalar); treated as zero when None
    similarity_f -- similarity applied to each (window, filter) element pair

    Returns the sum of similarities over the window plus the bias. Because
    the bias is added as an array, the result has the bias's shape
    (1x1x1 for the default bias).
    """
    k_h, k_w, k_d = filter.shape
    out = 0
    for h in range(k_h):
        for w in range(k_w):
            for d in range(k_d):
                out += similarity_f(window[h, w, d], filter[h, w, d])

    # Test for "no bias supplied" explicitly: the truth value of an array is
    # ambiguous and raises for anything with more than one element.
    if b is None:
        b = np.zeros((1, 1, 1))
    out += np.asarray(b).astype(float)

    return out

class adder_layer(Layer):
    """Adder layer on NHWC numpy arrays: filter responses are sums of L1 similarities.

    F          -- filters, c_out x k_H x k_W x c_in
    X          -- input batch, n_tensors x H x W x c_in
    stride     -- step between neighbouring windows
    padding    -- zero padding added on each side of H and W
    activation -- stored on the instance as `act`; forward() does not apply it
    B          -- biases, c_out x 1 x 1 x 1 (zeros when None)
    """

    def __init__(self,F,X,stride=1,padding=0,activation=relu,B=None):
        # Zero-argument super() runs Layer.__init__ so `input`/`output` exist;
        # super(Layer, self) would start the lookup *after* Layer and skip it.
        super().__init__()
        self.F = F
        self.X = X
        self.s = stride
        self.p = padding
        self.act = activation
        # `B is None`, not `not B`: truthiness of a multi-element array raises.
        self.B = np.zeros((F.shape[0],1,1,1)) if B is None else B

    def forward(self):
        """
        X -- n_tensors x H x W x c_in
        F -- c_out x k_H x k_W x c_in
        B -- c_out x 1 x 1 x 1

        Returns (Z, cache):
        Z     -- n_tensors x H_new x W_new x c_out, the raw filter responses
                 (the stored activation is not applied here)
        cache -- (X, F, B, s, p), the values backward() needs
        """
        X, F, s, p, B = self.X, self.F, self.s, self.p, self.B
        n_tensors, H, W, _ = X.shape
        n_filters, k_H, k_W, _ = F.shape

        # Floor division: only windows that fit entirely inside the padded
        # input produce an output position.
        H_new = (H + 2*p - k_H)//s + 1
        W_new = (W + 2*p - k_W)//s + 1

        Z = np.zeros([n_tensors, H_new, W_new, n_filters])
        X_padded = np.pad(X, ((0,0), (p,p), (p,p), (0,0)), 'constant', constant_values = (0,0))

        for i in range(n_tensors):           # traverse batch
            this_img = X_padded[i,:,:,:]     # select ith image in batch
            for f in range(n_filters):       # traverse filters
                this_filter = F[f,:,:,:]
                this_B = B[f,:,:,:]
                for h in range(H_new):       # traverse height
                    for w in range(W_new):   # traverse width
                        v0 = h*s
                        v1 = v0 + k_H
                        h0 = w*s
                        h1 = h0 + k_W

                        this_window = this_img[v0:v1,h0:h1,:]

                        # adder_single_step returns a 1x1x1 array; squeeze it
                        # to a scalar before storing a single element of Z.
                        step = adder_single_step(this_window, this_filter, this_B)
                        Z[i, h, w, f] = float(np.squeeze(step))

        cache = (X, F, B, s, p)
        return Z, cache

    def backward(self,upstream_g, cache):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (from forward) -- (X, F, B, s, p)

        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down (same as X)
        dF -- dL/dF, same shape as F
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """
        X, F, B, s, p = cache
        n_tensors, H_down, W_down, c_down = X.shape
        # Height and width of the kernel are read separately so non-square
        # kernels and c_in != k work.
        n_filters, k_H, k_W, _ = F.shape
        _, H_up, W_up, c_up = upstream_g.shape

        dX_down = np.zeros((n_tensors, H_down, W_down, c_down))
        dF = np.zeros(F.shape)
        dB = np.zeros((n_filters, 1, 1, 1))

        X_padded = np.pad(X, ((0,0), (p,p), (p,p), (0,0)), 'constant', constant_values = (0,0))
        dX_down_padded = np.zeros(X_padded.shape)

        for i in range(n_tensors):
            x = X_padded[i]
            dx = dX_down_padded[i]          # view: accumulates in place

            for h in range(H_up):           # loop over vertical axis of the output volume
                for w in range(W_up):       # loop over horizontal axis of the output volume
                    # Same window placement as forward(): the stride scales
                    # the output index into input coordinates.
                    v0 = h*s
                    v1 = v0 + k_H
                    h0 = w*s
                    h1 = h0 + k_W
                    x_window = x[v0:v1, h0:h1, :]

                    for c in range(c_up):   # loop over the channels of the output volume
                        g = upstream_g[i, h, w, c]
                        this_filter = F[c,:,:,:]

                        # Equation 6: the gradient w.r.t. the input is the
                        # clipped (filter - input) difference.
                        dx[v0:v1, h0:h1, :] += hard_tanh(this_filter - x_window) * g
                        # Equation 5: the gradient w.r.t. the filter is the
                        # full-precision (input - filter) difference.
                        dF[c,:,:,:] += (x_window - this_filter) * g
                        dB[c,:,:,:] += g

            # Strip the padding with explicit bounds; `p:-p` selects nothing
            # when p == 0.
            dX_down[i, :, :, :] = dx[p:p + H_down, p:p + W_down, :]

        return dX_down, dF, dB

In [269]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Function
import math

def adder2d_function(X, W, stride=1, padding=0):
    """Apply adder filters `W` to the batch `X` via an unfold-to-columns formulation.

    X       -- 4-D tensor, sizes read as (n_x, d_x, h_x, w_x)
    W       -- 4-D tensor, sizes read as (n_filters, d_filter, h_filter, w_filter)
    stride  -- window stride passed to unfold
    padding -- padding passed to unfold

    Returns a contiguous tensor of shape (n_x, n_filters, h_out, w_out).
    NOTE(review): only h_filter is passed to unfold as the kernel size, so
    this presumably expects square kernels -- confirm.
    """
    n_filters, _d_filter, h_filter, w_filter = W.size()
    n_x, _d_x, h_x, w_x = X.size()

    h_out = int((h_x - h_filter + 2 * padding) / stride + 1)
    w_out = int((w_x - w_filter + 2 * padding) / stride + 1)

    # Fold the batch into the channel axis, extract every sliding window,
    # then split the batch back out.
    patches = torch.nn.functional.unfold(
        X.reshape(1, -1, h_x, w_x),
        h_filter,
        dilation=1,
        padding=padding,
        stride=stride,
    )
    X_col = patches.view(n_x, -1, h_out * w_out)
    patch_len = X_col.size(1)
    X_col = X_col.permute(1, 2, 0).contiguous().view(patch_len, -1)
    W_col = W.view(n_filters, -1)

    responses = adder.apply(W_col, X_col)

    responses = responses.view(n_filters, h_out, w_out, n_x)
    return responses.permute(3, 0, 1, 2).contiguous()

class adder(Function):
    """Autograd function: negative L1 distance between filter rows and patch columns."""

    @staticmethod
    def forward(ctx, W_col, X_col):
        """Return out[f, p] = -sum_k |W_col[f, k] - X_col[k, p]|."""
        ctx.save_for_backward(W_col, X_col)
        diff = W_col.unsqueeze(2) - X_col.unsqueeze(0)
        return -diff.abs().sum(1)

    @staticmethod
    def backward(ctx, grad_output):
        """Return (grad_W_col, grad_X_col) for the saved forward inputs.

        grad_W_col uses the full-precision (X - W) difference and is then
        rescaled by sqrt(number of weights) / (5 * its L2 norm);
        grad_X_col uses the (X - W) difference clamped to [-1, 1], negated.
        """
        W_col, X_col = ctx.saved_tensors
        upstream = grad_output.unsqueeze(1)
        diff = X_col.unsqueeze(0) - W_col.unsqueeze(2)

        grad_W_col = (diff * upstream).sum(2)
        norm = grad_W_col.norm(p=2).clamp(min=1e-12)
        n_weights = W_col.size(1) * W_col.size(0)
        grad_W_col = grad_W_col / norm * math.sqrt(n_weights) / 5

        grad_X_col = (-diff.clamp(-1, 1) * upstream).sum(0)

        return grad_W_col, grad_X_col
    
class adder2d(nn.Module):
    """2-D adder layer (torch module) built on adder2d_function.

    input_channel, output_channel, kernel_size -- filter dimensions; randomly
        initialised filters have shape
        (output_channel, input_channel, kernel_size, kernel_size)
    stride, padding -- forwarded to adder2d_function
    bias    -- when true, a learnable per-output-channel bias is added
    filters -- optional initial filter values (array-like or tensor); when
               None the filters are drawn from a standard normal distribution
    """

    def __init__(self,input_channel,output_channel,kernel_size, stride=1, padding=0, bias = False, filters=None):
        super().__init__()
        self.stride = stride
        self.padding = padding
        self.input_channel = input_channel
        self.output_channel = output_channel
        self.kernel_size = kernel_size

        # Initial weights come from the explicit `filters` argument rather
        # than from a module-level global that may not exist.
        if filters is None:
            weights = nn.init.normal_(torch.randn(output_channel,input_channel,kernel_size,kernel_size))
        else:
            # clone() so the Parameter never shares storage with the caller's data
            weights = torch.as_tensor(filters).clone().detach()
            if not weights.is_floating_point():
                # Parameters must be floating point to receive gradients
                weights = weights.float()
        self.adder = torch.nn.Parameter(weights)

        self.bias = bias
        if bias:
            self.b = torch.nn.Parameter(nn.init.uniform_(torch.zeros(output_channel)))

    def forward(self, x):
        """Return the adder response for `x`, plus the bias when enabled."""
        output = adder2d_function(x,self.adder, self.stride, self.padding)
        if self.bias:
            # Broadcast the per-channel bias over batch, height and width
            output = output + self.b.unsqueeze(0).unsqueeze(2).unsqueeze(3)

        return output

In [770]:
# Random integer test batch in N x H x W x C layout, values in [0, 10)
X = np.random.randint(low=0, high=10, size=(2, 4, 4, 3))
# Same data as a float64 torch tensor with the channel axis moved to position 1
X2 = torch.tensor(X.transpose(0, 3, 1, 2), dtype=torch.float64)
# Random integer filters, values in [-2, 2)
W = np.random.randint(low=-2, high=2, size=(2, 3, 3, 3))
# Same filters as a float64 torch tensor (axes left in their original order)
W2 = torch.tensor(W, dtype=torch.float64)

In [101]:
import tarfile
import os
import pickle
import tensorflow.keras.utils as np_utils


#!wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
# data_zip = os.path.join(root,'cifar-10-python.tar.gz')
# f = tarfile.open(data_zip)
# f.extractall(root) 
# f.close()
# os.remove(data_zip)

def _load_cifar_batch(path):
    """Unpickle one CIFAR batch file and return its dict."""
    # SECURITY: pickle can execute arbitrary code while loading -- only point
    # this at dataset files obtained from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh, encoding='latin-1')


def _cifar_rows_to_images(rows):
    """Turn flat image rows into float32 N x 32 x 32 x 3 arrays scaled by 1/255."""
    images = rows.reshape(len(rows), 3, 32, 32)
    images = np.rollaxis(images, 1, 4)       # channels-first -> channels-last
    return images.astype('float32')/255.0


def load_cifar_data(folder):
    """Load CIFAR-10 from `folder` and split off a validation set.

    Training batches are the files in `folder` whose name contains 'batch_';
    the test set is the file `test_batch`. Labels are one-hot encoded with
    10 classes. The last 10% of the training samples become the validation
    set.

    Returns X_trn, y_trn, X_tst, y_tst, X_val, y_val.
    Raises FileNotFoundError when `folder` holds no training batch files.
    """
    # Sort the names: os.listdir order is arbitrary, and the split below
    # depends on the order in which the batches are stacked.
    batch_names = sorted(f for f in os.listdir(folder) if 'batch_' in f)
    if not batch_names:
        raise FileNotFoundError(f'no CIFAR training batches found in {folder}')

    # Get train data
    train_rows = []
    y_trn = []
    for name in batch_names:
        batch = _load_cifar_batch(os.path.join(folder, name))
        train_rows.append(batch['data'])
        y_trn += batch['labels']
    # One concatenation at the end instead of re-copying the stack per batch
    X_trn = _cifar_rows_to_images(np.concatenate(train_rows, axis=0))
    y_trn = np_utils.to_categorical(np.asarray(y_trn), 10)

    # Get test data
    test_batch = _load_cifar_batch(os.path.join(folder, 'test_batch'))
    X_tst = _cifar_rows_to_images(test_batch['data'])
    # Pass the class count explicitly so the width always matches y_trn
    y_tst = np_utils.to_categorical(np.asarray(test_batch['labels']), 10)

    n_90 = int(0.9*len(X_trn))
    X_trn, X_val = X_trn[:n_90], X_trn[n_90:]
    y_trn, y_val = y_trn[:n_90], y_trn[n_90:]

    return X_trn, y_trn, X_tst, y_tst, X_val, y_val

# The extracted CIFAR-10 python batches are expected under <root>/cifar-10-batches-py
task_2_data_dir = f'{root}/cifar-10-batches-py'
# Load train / test / validation arrays as soon as this cell runs
X_trn_c10, y_trn_c10, X_tst_c10, y_tst_c10, X_val_c10, y_val_c10 = load_cifar_data(task_2_data_dir)

In [104]:
X_trn_c10.shape

(45000, 32, 32, 3)