<a href="https://colab.research.google.com/github/jdowner212/cs577_addernet/blob/main/AdderNet_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

## Before running:

1. Set `root` (below) to chosen directory

In [2]:
root = os.getcwd() # whatever you want

2. Download CIFAR10 data

In [3]:
import tarfile


!wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
data_zip = os.path.join(root,'cifar-10-python.tar.gz')
f = tarfile.open(data_zip)
f.extractall(root) 
f.close()
os.remove(data_zip)

--2022-11-16 19:23:08--  https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170498071 (163M) [application/x-gzip]
Saving to: ‘cifar-10-python.tar.gz’


2022-11-16 19:23:17 (21.4 MB/s) - ‘cifar-10-python.tar.gz’ saved [170498071/170498071]



In [15]:
import numpy as np
import tensorflow as tf
#import tensorflow.keras.backend as K
from tensorflow.python.ops.numpy_ops import np_config
import pickle
import sys
import tensorflow.keras.utils as np_utils
np_config.enable_numpy_behavior()
from scipy.sparse import diags
import matplotlib.pyplot as plt

In [5]:
# def L1(a,b):
#     return np.abs(a-b)

def hard_tanh(array):
    return np.clip(array,-1,1)

def eps():
    return np.random.uniform(1e-07,1e-06)


# Layer definitions

### `Layer` parent class

In [19]:
class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    def forward(self, X, init_weights):
        raise NotImplementedError

    def backward(self, upstream_g, learning_rate):
        raise NotImplementedError

### `Adder` layer

In [492]:
# def adder_single_step(window, filter_, similarity_f=L1):
#     """
#     window -- k_h x k_w x k_d
#     filter -- k_h x k_w x k_d
#     b      -- 1x1x1
#     Z      -- scalar
#     """
#     #H_k,W_k,D_k = filter_.shape
#     #out=0
#     #for h in range(H_k):
#     #    for w in range(W_k):
#     #        for d in range(D_k):
#     #            out += similarity_f(window[h,w,d], filter_[h,w,d])
#     return np.abs(window-filter_).sum()
#     #return out

class adder_layer(Layer):
    def __init__(self,output_channels,kernel_size=3,stride=1,padding=0,adaptive_eta=0):
        self.output_channels = output_channels
        self.output_channels = output_channels
        self.adaptive_eta=adaptive_eta
        self.kernel_size=kernel_size        
        self.stride = stride
        self.padding = padding
        self.filters = np.ones((self.output_channels,self.kernel_size,self.kernel_size,1))
        self.bias = np.zeros((self.output_channels,1,1,1))

    def get_adaptive_lr(self, k, dfilters, eta):
        """    
        k           -- n_tensors 
        dfilters    -- c_out x k_H x k_W x c_in
        eta         -- scalar
        """
        norm = np.linalg.norm(dfilters, ord=2, axis=0)
        return (eta * np.sqrt(k)) / (norm+eps())
    def forward(self,X, init_weights=False):
        """    
        X       -- n_tensors x H x W x c_in
        filters -- c_out x k_H x k_W x c_in
        b       -- c_out x 1 x 1 x 1
        Z       -- n_tensors x H_new x W_new, c_out
        cache   -- info needed for backward pass
        """
        self.input = X
        self.input_channels = X.shape[-1]
        if init_weights==True:
            self.filters = np.random.normal(loc=0,scale=1,size=(self.output_channels, self.kernel_size, self.kernel_size, self.input_channels))
            self.bias = np.zeros((self.output_channels,1,1,self.input_channels))

        filters, bias, stride,padding = self.filters, self.bias, self.stride, self.padding

        # single images:
        # H, W, c_in = X.shape
        # batches:
        n_tensors, H,   W,   c_in = X.shape
        c_out,     H_k, W_k, c_in = filters.shape
        n_filters = c_out


        # single images:
        # X_padded = np.pad(X, ((padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        # batches:
        X_padded = np.pad(X, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        H_new = int((H + 2*padding - H_k)/stride)+1
        W_new = int((W + 2*padding - W_k)/stride)+1


        # batches: 
        Z = np.zeros([n_tensors, H_new, W_new, c_out])
        # single images:
        # Z = np.zeros([H_new,W_new,c_out])

        # batches:
        for i in range(n_tensors):           # traverse batch
            # batches: 
            this_img = X_padded[i,:,:,:]     # select ith image in batch
            for f in range(n_filters):       # traverse filters
                this_filter = filters[f,:,:,:]
                this_bias = bias[f,:,:,:]
                for h in range(H_new-H_k):   # traverse height
                    for w in range(W_new):   # traverse width
                        v0,v1 = h*stride, h*stride + H_k
                        h0,h1 = w*stride, w*stride + W_k
                        this_window = this_img[v0:v1,h0:h1,:]

                        # batches:
                        Z[i, h, w, f] = np.abs(this_window-this_filter).sum()
                        # single images
                        # Z[h,w,f] = np.abs(this_window-this_filter).sum() 

        # batches:
        assert Z.shape == (n_tensors, H_new, W_new, n_filters)
        # single images:
        # assert Z.shape == (H_new, W_new, n_filters)

        self.output = Z
        self.cache = X, filters, bias, stride, padding
        
        return self.output

    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (values from previous layers) -- (X, W, B, s, p)               
        
        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down
        dF -- dL/dW, shape n_filters x k x k x k
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """
        
        X, filters, bias, stride, padding = self.cache

        # single images:
        #H_down, W_down, c_down = X.shape
        # batches:
        n_tensors, H_down, W_down, c_down = X.shape

        n_filters, H_k,    W_k,    c_down = filters.shape
        n_tensors, H_up,   W_up,   c_up   = upstream_g.shape
        
        dX       = np.zeros_like(X)                           
        dfilters = np.zeros_like(filters)
        dbias    = np.zeros_like(bias)


        # batches:
        X_padded  = np.pad(X,  ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        dX_padded = np.pad(dX, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        
        # single images:
        # X_padded  = np.pad(X,  ((padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        # dX_padded = np.pad(dX, ((padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        

        for i in range(n_tensors):                       
            x = X_padded[i]
            dx = dX_padded[i]
            
        #x, dx = X_padded, dX_padded
            for h in range(H_up):                   # traverse height
                for w in range(W_up):               # traverse width
                    for c in range(c_up):           # traverse filters
                        
                        v0,v1 = h,h+H_k
                        h0,h1 = w,w+W_k
                        
                        x_window = x[v0:v1, h0:h1, :]
                        f_window = filters[c,:,:,:]

                        dx_local = hard_tanh(f_window-x_window)
                        df_local = x_window-f_window

                        # single images:
                        # g = upstream_g[h,w,c]
                        # batches:
                        g = upstream_g[i, h, w, c]

                        dx[v0:v1, v0:v1, :] += dx_local * g
                        dfilters[c,:,:,:]   += df_local * g
                        dbias[c,:,:,:]      += g

            # single images:        
            # dX = dx[padding:-padding, padding:-padding,:]
            # batches:
            dX[i, :, :, :] = dx[padding:-padding, padding:-padding, :]

        #print('adder_layer:')
        #print(f'dX min, max = {str(round(np.min(dX.flatten()),3))},{str(round(np.max(dX.flatten()),3))}')
        #print(f'dF min, max = {str(round(np.min(dfilters.flatten()),3))},{str(round(np.max(dfilters.flatten()),3))}')

        # single images:
        # assert (dX.shape == (H_down, W_down, c_down))
        # batches:
        assert(dX.shape == (n_tensors, H_down, W_down, c_down))


        adaptive_lr = self.get_adaptive_lr(n_filters, dfilters, self.adaptive_eta)

        self.filters -= learning_rate*adaptive_lr*dfilters
        self.bias    -= learning_rate*dbias

        #plt.plot(range(len(dfilters.flatten())),dfilters.flatten())
        #plt.title('adder_layer.backward: dfilters values')
        #plt.show()
        return dX

### Fully-connected layer

In [493]:
class FullyConnected(Layer):
    def __init__(self,output_channels):
        super(Layer, self).__init__()
        self.output_channels = output_channels
        self.weights=np.ones((1,self.output_channels))
        self.bias=np.zeros((1,self.output_channels))

    def forward(self, X,init_weights):
        self.input = X
        self.input_channels = X.shape[-1]
        if init_weights==True:
            self.weights = np.random.normal(loc=0,scale=1,size=(self.input_channels,self.output_channels))
            self.bias    = np.random.normal(loc=0,scale=1,size=(self.output_channels))

        self.output = np.dot(self.input, self.weights)
        for i in range(self.output.shape[0]):
            self.output[i] += self.bias
        return self.output

    def backward(self, upstream_g, learning_rate):
        dX    = np.dot(upstream_g, self.weights.T)
        dW    = np.dot(self.input.T, upstream_g)
        dbias = np.mean(upstream_g)
        self.weights -= learning_rate*dW
        self.bias    -= learning_rate*dbias


        #print('FullyConnected:')
        #print(f'dX min, max = {str(round(np.min(dX.flatten()),3))},{str(round(np.max(dX.flatten()),3))}')
        #print(f'dW min, max = {str(round(np.min(dW.flatten()),3))},{str(round(np.max(dW.flatten()),3))}')


        #plt.plot(range(len(dW.flatten())),dW.flatten())
        #plt.title('FullyConnected.backward: dW values')
        #plt.show()
        return dX

'''
     25 
     26         self.weights -= learning_rate*dW
---> 27         self.bias    -= learning_rate*dbias
     28 
     29         print('FullyConnected:')

ValueError: non-broadcastable output operand with shape (10,) doesn't match the broadcast shape (16,10)
''';

### Flatten layer

In [494]:
class Flatten(Layer):
    def forward(self, X,init_weights):
        self.original_shape = X.shape
        # single images:
        # self.output = X
        # batches:
        self.output = X.reshape(X.shape[0], np.product(X.shape[1:]))
        return self.output

    def backward(self, upstream_g, learning_rate):
        dX = upstream_g.reshape(self.original_shape)
        #print('Flatten:')
        #print(f'dX min, max = {str(round(np.min(dX.flatten()),3))},{str(round(np.max(dX.flatten()),3))}')
        return dX

### BatchNorm Layer

In [495]:
class batch_norm_layer(Layer):
    def __init__(self, gamma=None,beta=None):

        gamma = None if not gamma else gamma
        beta = None if not beta else beta
        self.gamma = 1 #np.ones((1,1,1,1))
        self.beta = 0  #np.zeros((1,1,1,1))


    def forward(self, X,init_weights):
        """    
        X       -- n_tensors x H x W x c_in
        gamma   -- n_tensors x 1 x 1 x 1
        beta    -- n_tensors x 1 x 1 x 1
        cache   -- info needed for backward pass
        """

        self.input = X

        if init_weights==True:
            # single images:
            # self.gamma = np.ones((1,1,1))
            # self.beta = np.zeros((1,1,1))
            # batch:
            self.gamma = np.ones((1,1,1))
            self.beta = np.zeros((1,1,1))

        mean = np.mean(X,axis=(0, 1, 2), keepdims=True)
        var = np.mean(((X-mean)**2), axis=(0, 1, 2), keepdims=True)
        std = np.sqrt(var)
        
        X_center = X - mean
        X_norm = X_center/(std+eps())

        #out = X_norm*self.gamma
        
        self.output = X_norm*self.gamma + self.beta
        self.cache = X, X_center, X_norm

        return self.output 
        '''
        ---> 36         self.output = X_norm*self.gamma + self.beta
            37         self.cache = X, X_center, X_norm
            38 

        ValueError: operands could not be broadcast together with shapes (33,16,16,8) (3,16,16,8) 
        '''
    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (values from previous layers) -- (X, X_norm)               
        
        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down
        dF -- dL/dW, shape n_filters x k x k x k
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """

        X, X_center, X_norm = self.cache

        dGamma = np.sum(upstream_g * X_norm, axis=0)
        dBeta  = np.sum(upstream_g, axis=0)

        m = len(X)
        mean = np.mean(X)
        std = np.std(X)
        
        dX = np.zeros_like(X)

        for i in range(m):
            for j in range(m):
                dX[i] += (upstream_g[i] - upstream_g[j]*(1 + (X[i]-X[j])*(X[j]-mean)/std))

        dX *= self.gamma/((m**2)*std)
        
        self.gamma = self.gamma - learning_rate*dGamma
        self.beta  = self.beta  - learning_rate*dBeta


       # print('batch_norm_layer:')
       # print(f'dX min, max = {str(round(np.min(dX.flatten()),3))},{str(round(np.max(dX.flatten()),3))}')

        return dX

### Maxpool layer

In [496]:
class MaxPool(Layer):
    def __init__(self,pool_size=2):
        self.pool_size=pool_size
        self.stride = pool_size

    def forward(self,X,init_weights):
        # single images:
        # H,W,c_in = X.shape 
        # batches: 
        n_tensors, H, W, c_in = X.shape



        H_new = int(1 + (H - self.pool_size) / self.stride)
        W_new = int(1 + (W - self.pool_size) / self.stride)
        c_out = c_in
        
        # single images:
        # Z = np.zeros((H_new,W_new,c_out))
        # batches: 
        Z = np.zeros((n_tensors, H_new, W_new, c_out))              
        
        for i in range(n_tensors):                     # loop over the training examples
            for h in range(H_new):                     # loop on the vertical axis of the output volume
                for w in range(W_new):                 # loop on the horizontal axis of the output volume
                    for c in range(c_out):             # loop over the channels of the output volume
                        
                        v0,v1 = h*self.stride, h*self.stride + self.pool_size
                        h0,h1 = w*self.stride, w*self.stride + self.pool_size
                        
                        # single images:
                        # window = X[v0:v1, h0:h1, c]
                        # Z[h,w,c] = np.max(window)
                        # batches:
                        window = X[i, v0:v1, h0:h1,c]
                        Z[i, h, w, c] = np.max(window)
                    

        self.output = Z
        self.cache = X, self.pool_size, self.stride
        
        return self.output

    def backward(self, upstream_g,learning_rate):
        X, pool_size, stride = self.cache

        # batches: 
        n_tensors, H_down, W_down, c_down = X.shape
        # single images:
        # H_down, W_down, c_down = X.shape 
        n_tensors, H_up,   W_up,   c_up   = upstream_g.shape


        dX = np.zeros(X.shape)
        
        for i in range(n_tensors):                       
            x = X[i]
            for h in range(H_up):       
                for w in range(W_up):    
                    for c in range(c_up):       
                        v0,v1 = h, h+pool_size
                        h0,h1 = w, w+pool_size

                        x_window = x[v0:v1, h0:h1, c]
                        
                        local_g = np.where(x_window==np.max(x_window),1,0)
                        # single images:
                        # g = upstream_g[h,w,c] 
                        # batches: 
                        g = upstream_g[i, h, w, c]

                        # single images:  
                        # dX[v0:v1, h0:h1, c] += local_g*g  
                        # batches: 
                        dX[i, v0:v1, h0:h1, c] += local_g * g

        assert(dX.shape == X.shape)
     #   print('MaxPool:')
     #   print(f'dX min, max = {str(round(np.min(dX.flatten()),3))},{str(round(np.max(dX.flatten()),3))}')

  
        return dX


### Activation layers

In [497]:
def relu_fwd(X):
    return np.where(X>=0,X,0)
def relu_bwd(X):
    return np.where(X>=0,1,0)


def softmax_fwd(x):
    soft = tf.nn.softmax(x)
    return soft.numpy()

def softmax_bwd(X): 
    s = softmax_fwd(X)
    J = np.zeros_like(s)

    for i in range(len(s)):
        for j in range(len(s[i])):
            indicator_ij = 1 if i==j else 0
            J[i][j] = s[i][j]*(indicator_ij - s[i][j])
    return J



activation_dict = {'relu':    {'forward':  relu_fwd,
                               'backward': relu_bwd},
                   'softmax': {'forward':  softmax_fwd,
                               'backward': softmax_bwd}}

class Activation(Layer):
    def __init__(self,activation_name):
        self.activation_name = activation_name
        super(Layer, self).__init__()
    def forward(self, X, init_weights=False):
        self.input = X
        if self.activation_name == 'relu':
            self.output = np.where(X>=0,X,0)
            return self.output
        elif self.activation_name == 'softmax':
            self.output = tf.nn.softmax(X).numpy()
            return self.output
    def backward(self, upstream_g, learning_rate):

        local_g = None     
        if self.activation_name == 'relu':
            local_g = np.where(self.input>=0,1,0)

        elif self.activation_name == 'softmax':
            s = self.output
            J = np.zeros_like(s)
            for i in range(len(s)):
                for j in range(len(s[i])):
                    indicator_ij = 1 if i==j else 0
                    J[i][j] = s[i][j]*(indicator_ij - s[i][j])
            local_g = J

        #print('local_g:',local_g)
        #print('upstream_g:',upstream_g)
        dX = np.zeros_like(self.input)
        for i, (l,u) in enumerate(zip(local_g, upstream_g)):
            dX[i]=learning_rate*l*u
        
        #learning_rate*np.dot(local_g,upstream_g.T)
  
        # print(f'Activation ({self.activation_name}):')
        # print(f'local_g min, max = {str(round(np.min(local_g.flatten()),3))},{str(round(np.max(local_g.flatten()),3))}')
        # print(f'up_g min, max = {str(round(np.min(upstream_g.flatten()),3))},{str(round(np.max(upstream_g.flatten()),3))}')
        # print(f'dX min, max = {str(round(np.min(dX.flatten()),3))},{str(round(np.max(dX.flatten()),3))}')
        return dX

        #print(f'{self.activation_name} activation layer:  min dX = {str(round(np.min(out.flatten()),3))}, max dX = {str(round(np.max(out.flatten()),3))}')


### `Model` class

In [498]:
def cat_cross_entropy(y_true, y_pred):    
    out = -np.sum(np.multiply(y_true,np.log(y_pred+eps())))
    return out/y_true.shape[0]

# def cat_cross_entropy(y_true, y_pred):    
#     return tf.keras.losses.CategoricalCrossentropy()(y_true,y_pred).numpy()

def cat_cross_entropy_prime(y_true,y_pred):
    return np.sum([-y/(yhat+eps()) for (y,yhat) in zip(y_true,y_pred)])


loss_dict = {'cat_cross_entropy':    {'forward':  cat_cross_entropy,
                                      'backward': cat_cross_entropy_prime}}

def get_mini_batches(X,y,batch_size):
    mini_batches = []
    for i in range(0,len(X), batch_size):
        lower = i
        upper = np.min([len(X), i + batch_size])
        X_batch = X[lower:upper]
        y_batch = y[lower:upper]
        mini_batches.append((X_batch,y_batch))

    return mini_batches

class Model:
    def __init__(self,loss_name): 
        self.layers = []
        self.loss_fwd = loss_dict[loss_name]['forward']

    def add(self, layer):
        self.layers.append(layer)

    def predict(self, input_data):
        y_hat = []
        Z = input_data
        for layer in self.layers:
            Z = layer.forward(Z,init_weights=False)
        y_hat = Z
        return y_hat

    def fit(self, x_train, y_train, epochs, batch_size, learning_rate, x_val=None, y_val=None):
        history = {'accuracy': [],'loss': [],'val_accuracy': [],'val_loss': []}

        for e in range(epochs):
            print(e)

            loss_,acc,val_loss,val_acc=0,0,0,0

            mini_batches = get_mini_batches(x_train,y_train, batch_size)

            # single images:
            # for i,img in enumerate(x_train):
            # batches: 
            for i, mini_batch in enumerate(mini_batches):    
                print(i)

                # single images:
                # x_batch= img
                # y_batch= y_train[i] 
                # batches: 
                x_batch = mini_batch[0]
                y_batch = mini_batch[1]

                # forward
                Z = x_batch

                for layer in self.layers:
                    #print(layer)
                    init_weights=True if e==0 else False
                    Z = layer.forward(Z,init_weights=init_weights)
                y_real = y_batch
                y = np.argmax(y_real,axis=1)
                y_pred = Z

                # compute loss and accuracy
                # loss defined in line:
                m = len(y)
                #this_loss = -np.log(Z[range(m),y])
                #loss_ += this_loss
                #-np.sum(y_true * np.log(y_pred + 10**-100))
                # if calling loss function explicitly:
                this_loss = self.loss_fwd(y_real, y_pred)

                acc   += sum(np.where(np.argmax(y_real,axis=1)==np.argmax(y_pred,axis=1),1,0))
                print(f'batch: {i+1}/{len(mini_batches)+1}'.ljust(15) + f'this batch loss: {str(round(np.sum(this_loss),3))}','\r')#, flush=True)
                sys.stdout.flush()

                # backward - dCCE/dsoftmax
                # Z[:,y] -= 1
                # Z /= len(Z)
                # error = Z
                error = -y/(np.argmax(y_pred,axis=1) + eps())

                for layer in (self.layers)[::-1]:
                    #print(layer)
                    error = layer.backward(error, learning_rate)
              
            loss_ /= x_train.shape[0]
            acc  /= x_train.shape[0]
            
            history['loss'].append(loss_)
            history['accuracy'].append(acc)

            if x_val is None or y_val is None:
                print(f'Epoch: {e}   loss = {str(round(loss_,3))}   acc = {str(round(acc,3))}')
            else:
                ## single images:
                # val_acc=0
                # for i,img in enumerate(x_val):
                #     Z_val = img
                #     for layer in self.layers:
                #         Z_val = layer.forward(Z_val, init_weights=False)

                #     y_real_val = y_val[i]
                #     y_pred_val= Z_val
                #     val_loss = self.loss_fwd(y_real_val, y_pred_val)
                #     val_acc += 1 if np.argmax(y_real_val,axis=1)==np.argmax(y_pred_val,axis=1) else 0


                ## batches:
                Z_val = x_val
                for layer in self.layers:
                   Z_val = layer.forward(Z_val,init_weights=False)

                y_real_val = y_val
                y_pred_val = Z_val

                val_loss = self.loss_fwd(y_real_val, y_pred_val)
                val_acc = sum(np.where(np.argmax(y_real_val,axis=1)==np.argmax(y_pred_val,axis=1),1,0))
                val_acc  /= x_val.shape[0]                

                history['val_accuracy'].append(val_acc)
                history['val_loss'].append(val_loss)

                print(f'Epoch: {e}   loss = {str(round(loss_,3))}   acc = {str(round(acc,3))}   val_loss = {str(round(val_loss,3))}   val_accuracy = {str(round(val_acc,3))}')

        return history


## Data collection

### CIFAR10

In [499]:
def load_cifar_data(folder,tiny=False):
    train_batches = [f'{folder}/{f}' for f in os.listdir(folder) if 'batch_' in f]
    test_batch    =  f'{folder}/test_batch'

    # Get train data
    X_trn = None
    y_trn = []
    for i in range(len(train_batches)):
        train_data_dict = pickle.load(open(train_batches[i],'rb'), encoding='latin-1')
        if i+1 == 1:
            X_trn = train_data_dict['data']
        else:
            X_trn = np.vstack((X_trn, train_data_dict['data']))
        y_trn += train_data_dict['labels']
    X_trn = X_trn.reshape(len(X_trn),3,32,32)
    X_trn = np.rollaxis(X_trn,1,4)
    X_trn = X_trn.astype('float32')/255.0
    y_trn = np_utils.to_categorical(np.asarray(y_trn),10)

    # Get test data
    test_data_dict  = pickle.load(open(test_batch,'rb'), encoding='latin-1')
    X_tst = test_data_dict['data']
    X_tst = X_tst.reshape(len(X_tst),3,32,32)
    X_tst = np.rollaxis(X_tst,1,4)
    X_tst = X_tst.astype('float32')/255.0
    y_tst = np_utils.to_categorical(np.asarray(test_data_dict['labels']))
    
    n_90 = int(0.9*len(X_trn))
    X_trn, X_val = X_trn[:n_90], X_trn[n_90:]
    y_trn, y_val = y_trn[:n_90], y_trn[n_90:]

    if tiny:
        X_trn,y_trn,X_tst,y_tst,X_val,y_val = X_trn[:500],y_trn[:500],X_tst[:100],y_tst[:100],X_val[:50],y_val[:50]

    return X_trn, y_trn, X_tst, y_tst, X_val, y_val

data_dir = f'{root}/cifar-10-batches-py'
X_trn_c10, y_trn_c10, X_tst_c10, y_tst_c10, X_val_c10, y_val_c10 = load_cifar_data(data_dir,tiny=True)

In [500]:
this_model = Model(loss_name='cat_cross_entropy')

this_model.add(adder_layer(output_channels=8,kernel_size=3,stride=1,padding=1,adaptive_eta=0.1))
this_model.add(Activation('relu'))
this_model.add(MaxPool(pool_size=2))
this_model.add(batch_norm_layer())


this_model.add(Flatten())
this_model.add(FullyConnected(output_channels=64))
this_model.add(Activation('relu'))
this_model.add(FullyConnected(output_channels=10))
this_model.add(Activation('softmax'))

In [501]:
sum(np.array([-6.25000000e-02,  4.72255243e-09, -6.25000000e-02, -6.25000000e-02,
  -6.25000000e-02, -6.25000000e-02, -4.72255243e-09, -6.25000000e-02,
  -6.25000000e-02, -6.25000000e-02]))

-0.49999999999999994

In [None]:
history = this_model.fit(X_trn_c10,y_trn_c10,epochs=20,batch_size=16,learning_rate=1e-05,x_val=X_val_c10,y_val=y_val_c10)

0
0
batch: 1/33    this batch loss: 15.88 
1
batch: 2/33    this batch loss: 14.019 
2
batch: 3/33    this batch loss: 13.86 
3
batch: 4/33    this batch loss: 15.367 
4
batch: 5/33    this batch loss: 11.162 
5
batch: 6/33    this batch loss: 13.415 
6
batch: 7/33    this batch loss: 12.266 
7
batch: 8/33    this batch loss: 12.971 
8
batch: 9/33    this batch loss: 10.431 
9
batch: 10/33   this batch loss: 11.743 
10
batch: 11/33   this batch loss: 13.013 
11
batch: 12/33   this batch loss: 11.33 
12
batch: 13/33   this batch loss: 9.628 
13
batch: 14/33   this batch loss: 15.634 
14
batch: 15/33   this batch loss: 11.482 
15
batch: 16/33   this batch loss: 14.619 
16
batch: 17/33   this batch loss: 13.125 
17
batch: 18/33   this batch loss: 11.016 
18
batch: 19/33   this batch loss: 11.48 
19
batch: 20/33   this batch loss: 14.345 
20
batch: 21/33   this batch loss: 14.421 
21
batch: 22/33   this batch loss: 12.188 
22
batch: 23/33   this batch loss: 11.648 
23
batch: 24/33   this b

In [None]:
plt.figure(figsize=(8,5))
plt.plot(range(20),history['accuracy'],label='train')
plt.plot(range(20),history['val_accuracy'],label='val')
plt.legend()
plt.xlabel('Epochs')
plt.xticks(range(0,21,2))
plt.ylim(0,0.3)
plt.title('Accuracy -- our AdderNet implementation')
plt.show()

In [None]:
plt.figure(figsize=(8,5))
plt.plot(range(20),history['loss'],label='train')
plt.plot(range(20),history['val_loss'],label='val')
plt.legend()
plt.xlabel('Epochs')
plt.xticks(range(0,21,2))
plt.ylim(0,0.1)
plt.title('Loss -- our AdderNet implementation')
plt.show()



In [None]:
#this_model = Model(loss_name='cat_cross_entropy')
# l1 = X_trn_c10
# l1=adder_layer(output_channels=8,kernel_size=3,stride=1,padding=1).forward(l1)
# print(l1.shape)
# l2=Activation('relu').forward(l1)
# print(l2.shape)
# l3=MaxPool(pool_size=2).forward(l2)
# print(l3.shape)
# l4=batch_norm_layer().forward(l3)
# print(l4.shape)

# l5=Flatten().forward(l4)
# print(l5.shape)
# l6=FullyConnected(output_channels=64).forward(l5)
# print(l6.shape)
# l7=Activation('relu').forward(l6)
# print(l7.shape)
# l8=FullyConnected(output_channels=10).forward(l7)
# print(l8.shape)
# l9=Activation('softmax').forward(l8)
# print(l9.shape)

In [179]:
cnn = Model(loss_name='cat_cross_entropy')

cnn.add(conv_layer(output_channels=8,kernel_size=3,stride=1,padding=1))
cnn.add(Activation('relu'))
cnn.add(MaxPool(pool_size=2))
cnn.add(batch_norm_layer())


cnn.add(Flatten())

cnn.add(FullyConnected(output_channels=64))
cnn.add(batch_norm_layer())
cnn.add(Activation('relu'))
cnn.add(FullyConnected(output_channels=10))
cnn.add(batch_norm_layer())
cnn.add(Activation('softmax'))

NameError: ignored

In [180]:
cnn.fit(X_trn_c10,y_trn_c10,10,1e-05,X_val_c10,y_val_c10)

0
0


ValueError: ignored

In [181]:
def add2d(X, K):  
    """Compute 2D cross-correlation."""
    h, w = K.shape
    Y = tf.Variable(tf.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1)))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j].assign(tf.reduce_sum(
                X[i: i + h, j: j + w] + K))
    return Y


class adder_2d(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

    def build(self, kernel_size):
        initializer = tf.random_normal_initializer()
        self.weight = self.add_weight(name='w', shape=kernel_size,
                                      initializer=initializer)
        self.bias = self.add_weight(name='b', shape=(1, ),
                                    initializer=initializer)

    def call(self, inputs):
        return add2d(inputs, self.weight) + self.bias


class add_it(tf.keras.layers.Conv2D):
    def convolution_op(self, inputs, kernel):
        mean, var = tf.nn.moments(kernel, axes=[0, 1, 2], keepdims=True)
        return tf.nn.conv2d(
            inputs,
            (kernel - mean) / tf.sqrt(var + 1e-10),
            padding="VALID",
            strides=list(self.strides),
            name=self.__class__.__name__,
        )

In [378]:
def conv_single_step(window, filter_, bias):
    """
    window -- k_h x k_w x k_d
    filter_ -- k_h x k_w x k_d
    b      -- 1x1x1
    Z      -- scalar
    """
    out = np.sum((np.multiply(window,filter_) + bias.astype(float))).astype(float)
    
    return out

class conv_layer(Layer):
    def __init__(self,output_channels,kernel_size=3,stride=1,padding=0):#,similarity_f = L1):
        self.output_channels = output_channels


        self.output_channels = output_channels
        self.adaptive_eta=0

        self.kernel_size=kernel_size        
        self.stride = stride
        self.padding = padding



    def forward(self,X):
        """    
        X       -- n_tensors x H x W x c_in
        filters -- c_out x k_H x k_W x c_in
        b       -- c_out x 1 x 1 x 1
        Z       -- n_tensors x H_new x W_new, c_out
        cache   -- info needed for backward pass
        """
        self.input = X

        # in case input size not given
        self.input_channels = X.shape[-1]

        self.filters = np.random.normal(loc=0,scale=1,size=(self.output_channels, self.kernel_size, self.kernel_size, self.input_channels))
        self.bias    = np.random.normal(loc=0,scale=1,size=(self.output_channels, 1,1 ,self.input_channels))
        
        filters,stride,padding,bias = self.filters, self.stride, self.padding, self.bias
        n_tensors, H,   W,   c_in = X.shape
        c_out,     H_k, W_k, c_in = filters.shape
        n_filters = c_out

        X_padded = np.pad(X, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        H_new = int((H + 2*padding - H_k)/stride)+1
        W_new = int((W + 2*padding - W_k)/stride)+1

        Z = np.zeros([n_tensors, H_new, W_new, c_out])

        for i in range(n_tensors):           # traverse batch
            this_img = X_padded[i,:,:,:]     # select ith image in batch
            for f in range(n_filters):       # traverse filters
                this_filter = filters[f,:,:,:]
                this_bias   = bias[f,:,:,:]
                for h in range(H_new):       # traverse height
                    for w in range(W_new):   # traverse width
                        v0,v1 = h*stride, h*stride + H_k
                        h0,h1 = w*stride, w*stride + W_k
                        this_window = this_img[v0:v1,h0:h1,:]

                        Z[i, h, w, f] = conv_single_step(this_window, this_filter, this_bias) 

        assert Z.shape == (n_tensors, H_new, W_new, n_filters)

        self.output = Z
        self.cache = X, filters, bias, stride, padding
        
        return self.output

    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (values from previous layers) -- (X, W, B, s, p)               
        
        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down
        dF -- dL/dW, shape n_filters x k x k x k
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """
        X, filters, bias, stride, padding = self.cache

        n_tensors, H_down, W_down, c_down = X.shape
        n_filters, H_k,    W_k,    c_down = filters.shape
        n_tensors, H_up,   W_up,   c_up   = upstream_g.shape
        
        dX       = np.zeros_like(X)                           
        dfilters = np.zeros_like(filters)
        dbias    = np.zeros_like(bias)

        X_padded  = np.pad(X,  ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        dX_padded = np.pad(dX, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        
        for i in range(n_tensors):                       
            x = X_padded[i]
            dx = dX_padded[i]
            
            for h in range(H_up):                   # traverse height
                for w in range(W_up):               # traverse width
                    for c in range(c_up):           # traverse filters
                        
                        v0,v1 = h,h+H_k
                        h0,h1 = w,w+W_k
                        
                        x_window = x[v0:v1, h0:h1, :]
                        f_window = filters[c,:,:,:]

                        dx_local = hard_tanh(f_window-x_window)
                        df_local = x_window-f_window

                        g = upstream_g[i, h, w, c]

                        dx[v0:v1, v0:v1, :] += np.multiply(dx_local,g)
                        dfilters[c,:,:,:]   += np.multiply(df_local,g)
                        dbias[c,:,:,:]      += g
                        
            dX[i, :, :, :] = dx[padding:-padding, padding:-padding, :]
        
        assert(dX.shape == (n_tensors, H_down, W_down, c_down))

        self.filters -= learning_rate*dfilters
        self.bias    -= learning_rate*dbias

        return dX