<a href="https://colab.research.google.com/github/jdowner212/cs577_addernet/blob/main/AdderNet_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [805]:
import os
root = os.getcwd() # whatever you want

In [806]:
import numpy as np
import torch
import json
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()


In [945]:
'''
Keeping track of the equations described in the paper -- not explicitly called
in subsequent code but want to make sure the functionality is present
'''

####################################
# defined in paper but not needed: #
####################################

'''Equation 1'''
# ignore -- updated with Equation 2

'''Equation 3'''
# ignore -- CNN formula

'''Equation 4'''
# ignore -- updated with Equation 5

'''Equation 8'''
# ignore -- CNN formula


####################################
# explicitly used in code -- keep: #
####################################
def L1(a,b):
    return np.abs(a-b)

'''Equation 7'''
def hard_tanh(array):
    array = np.where(array<-1,-1,array)
    array = np.where(array>1, 1, array)
    return array

######################################################
# implicitly used in code -- can technically delete: #
######################################################

'''Equation 2'''
def Y_adder(image, F, m, n, t, similarity_f=L1): # image, group fo filters, row#, col#, filter#, similarity function
    sum_ = 0
    num_filters, k_depth, k_height, k_width = F.shape
    for k in range(k_depth):
        for j in range(k_width):
            for i in range(k_height):
                sum_ += similarity_f(image[k, m+i, n+j], F[t,k,i,j])
    return sum_

'''Equation 5'''
def dY_dF_element(image,filters,m,n,i,j,k,t):
    return image[k,m+i,n+j] - filter[t,k,i,j]

'''Equation 6''' # clipped, full-precision gradient
def dY_dImage_element(image,filters,m,n,i,j,k,t):
    return hard_tanh(filter[t,k,i,j] - image[k,m+i,n+j])


###################################################
# written last week, haven't gotten to these yet: #
###################################################

'''Equation 9'''
def var_Y_adder(X,F,variance_f=K.var):
    # Not sure K.var is the function we want here, if we need to specify axis, etc.
    var_X = variance_f(X)
    var_F = variance_f(F)
    ###
    _, c_in, d, _ = F.shape
    pi = np.pi
    return np.sqrt(pi/2)*(d**2)*(c_in)*(var_X + var_F)

'''Equation 10'''
def batch_norm(minibatch, gamma, beta):
    m = len(minibatch)
    mean = (1/m)*sum(minibatch)
    std = (1/m)*sum([(x_i-mean)**2 for x_i in minibatch])
    gamma*(minibatch-mean)/std + beta
    return gamma*(minibatch-mean)/std + beta

'''Equation 11'''
def dL_dMinibatch_i(minibatch,dL_dy,i,L,gamma): # confused by this notation and need to revisit paper
    # In dL_dy, y is the result of applying batch_norm to the minibatch
    m = len(minibatch)
    mean = (1/m)*sum(minibatch)
    std = (1/m)*sum([(x_i-mean)**2 for x_i in minibatch])
    
    sum_ = 0
    for j in range(m):
        x_term = (minibatch[i]-minibatch[j])*(minibatch[j]-mean)/std
        sum_ += (dL_dy[i] - dL_dy[j]*(1 + x_term))
    sum_ *= gamma/((m**2)*std)
    
    return sum_

'''Equation 12'''
# update rule for F
def delta_F_l(adaptive_lr_l, dL_dF_l, gamma):
    # the update delta for the filter in layer l
    return gamma*adaptive_lr_l*dL_dF_l

'''Equation 13'''
def adaptive_lr_l(dL_dF_l, eta, k):
    # k = number of elements in F_l -- I think equal to len(dL_dF_1)
    # in which case we don't need to explicitly provide it
    
    # l2_norm = torch.sqrt([g**2 for g in dL_dF_l])
    l2_norm = K.sqrt([g**2 for g in dL_dF_l]) # make sure torch.sqrt and K.sqrt are equivalents
    
    return eta*np.sqrt(k)/l2_norm

# Layer definitions

<h2><strong><font color='red'>still need:<br><h4><font color='red'>- fully-connected<br>- batchnorm<br>- adaptive learning rate<br>- ?</font></h4></font></strong></h2>


### `Layer` parent class

In [946]:
class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    def forward(self, X):
        raise NotImplementedError

    def backward(self, upstream_g, learning_rate):
        raise NotImplementedError

### `Adder` layer

In [997]:
'''
dimension format/convention: NxHxWxC -- this is really unintuitive to me but i saw it in some tf 
documentation and thought it might make our lives easier later. Def not married to it
'''


def adder_single_step(window, filter_, similarity_f=L1):
    """
    window -- k_h x k_w x k_d
    filter -- k_h x k_w x k_d
    b      -- 1x1x1
    Z      -- scalar
    """
    #H_k,W_k,D_k = filter_.shape
    #out=0
    #for h in range(H_k):
    #    for w in range(W_k):
    #        for d in range(D_k):
    #            out += similarity_f(window[h,w,d], filter_[h,w,d])
    return np.abs(-window-filter_).sum()
    #return out

class adder_layer(Layer):
    def __init__(self,output_channels,kernel_size=3,stride=1,padding=0,bias=None,similarity_f = L1,input_channels=None,adaptive_eta=0):
        self.output_channels = output_channels


        self.input_channels=None if not input_channels else input_channels
        self.output_channels = output_channels
        self.adaptive_eta=0

        # making weight instantiation optional in case we want to infer input channels from forward pass rather than defining explicitly
        self.kernel_size=kernel_size        
        self.stride = stride
        self.padding = padding
        bias = np.zeros((1,1,1,self.output_channels)) if not bias else bias
        self.bias = bias
        self.similarity_f = similarity_f


    def get_adaptive_lr(self, k, dfilters, eta):
        """    
        k           -- n_tensors 
        dfilters    -- c_out x k_H x k_W x c_in
        eta         -- scalar
        """
        norm = np.linalg.norm(dfilters, ord=2, axis=0)

        return (eta * np.sqrt(k)) / norm


    def forward(self,X):
        """    
        X       -- n_tensors x H x W x c_in
        filters -- c_out x k_H x k_W x c_in
        b       -- c_out x 1 x 1 x 1
        Z       -- n_tensors x H_new x W_new, c_out
        cache   -- info needed for backward pass
        """
        self.input = X

        # in case input size not given
        self.input_channels = X.shape[-1]
        #if self.filters==None:
        self.filters = np.random.normal(loc=0,scale=1,size=(self.output_channels, self.kernel_size, self.kernel_size, self.input_channels))

        filters,stride,padding,bias = self.filters, self.stride, self.padding, self.bias
        n_tensors, H,   W,   c_in = X.shape
        c_out,     H_k, W_k, c_in = filters.shape
        n_filters = c_out

        X_padded = np.pad(X, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        H_new = int((H + 2*padding - H_k)/stride)+1
        W_new = int((W + 2*padding - W_k)/stride)+1

        Z = np.zeros([n_tensors, H_new, W_new, c_out])

        for i in range(n_tensors):           # traverse batch
            this_img = X_padded[i,:,:,:]     # select ith image in batch
            for f in range(n_filters):       # traverse filters
                this_filter = filters[f,:,:,:]
                #this_bias = bias[f,:,:,:]
                for h in range(H_new):       # traverse height
                    for w in range(W_new):   # traverse width
                        
                        v0,v1 = h*stride, h*stride + H_k
                        h0,h1 = w*stride, w*stride + W_k
                        
                        this_window = this_img[v0:v1,h0:h1,:]

                        Z[i, h, w, f] = adder_single_step(this_window, this_filter)#, this_bias) 

        assert Z.shape == (n_tensors, H_new, W_new, n_filters)

        self.output = Z
        self.cache = X, filters, bias, stride, padding
        
        return self.output

    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (values from previous layers) -- (X, W, B, s, p)               
        
        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down
        dF -- dL/dW, shape n_filters x k x k x k
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """
        
        X, filters, bias, stride, padding = self.cache

        n_tensors, H_down, W_down, c_down = X.shape
        n_filters, H_k,    W_k,    c_down = filters.shape
        n_tensors, H_up,   W_up,   c_up   = upstream_g.shape
        
        dX       = np.zeros_like(X)                           
        dfilters = np.zeros_like(filters)
        #dbias    = np.zeros((n_filters, 1,1,c_down))

        X_padded  = np.pad(X,  ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        dX_padded = np.pad(dX, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        
        for i in range(n_tensors):                       
            x = X_padded[i]
            dx = dX_padded[i]
            
            for h in range(H_up):                   # traverse height
                for w in range(W_up):               # traverse width
                    for c in range(c_up):           # traverse filters
                        
                        v0,v1 = h,h+H_k
                        h0,h1 = w,w+W_k
                        
                        x_window = x[v0:v1, h0:h1, :]
                        f_window = filters[c,:,:,:]

                        dx_local = x_window-f_window
                        df_local = hard_tanh(f_window-x_window)

                        g = upstream_g[i, h, w, c]

                        dx[v0:v1, v0:v1, :] += dx_local * g
                        dfilters[c,:,:,:]   += df_local * g
                        #dbias[c,:,:,:]      += g
                        
            dX[i, :, :, :] = dx[padding:-padding, padding:-padding, :]
        
        assert(dX.shape == (n_tensors, H_down, W_down, c_down))


        adaptive_lr = self.get_adaptive_lr(n_filters, dfilters, self.adaptive_eta)

        self.filters -= learning_rate*adaptive_lr*dfilters
        #self.bias    -= learning_rate*dbias

        return dX

In [998]:
'''
dimension format/convention: NxHxWxC -- this is really unintuitive to me but i saw it in some tf 
documentation and thought it might make our lives easier later. Def not married to it
'''


def conv_single_step(window, filter_, bias):
    """
    window -- k_h x k_w x k_d
    filter_ -- k_h x k_w x k_d
    b      -- 1x1x1
    Z      -- scalar
    """
    out = np.sum((np.multiply(window,filter_) + bias.astype(float))).astype(float)
    
    return out

class conv_layer(Layer):
    def __init__(self,output_channels,kernel_size=3,stride=1,padding=0,similarity_f = L1,input_channels=None):
        self.output_channels = output_channels


        self.input_channels=None if not input_channels else input_channels
        self.output_channels = output_channels
        self.adaptive_eta=0

        # making weight instantiation optional in case we want to infer input channels from forward pass rather than defining explicitly
        self.kernel_size=kernel_size        
        self.stride = stride
        self.padding = padding



    def forward(self,X):
        """    
        X       -- n_tensors x H x W x c_in
        filters -- c_out x k_H x k_W x c_in
        b       -- c_out x 1 x 1 x 1
        Z       -- n_tensors x H_new x W_new, c_out
        cache   -- info needed for backward pass
        """
        self.input = X

        # in case input size not given
        self.input_channels = X.shape[-1]

        self.filters = np.random.normal(loc=0,scale=1,size=(self.output_channels, self.kernel_size, self.kernel_size, self.input_channels))
        self.bias    = np.random.normal(loc=0,scale=1,size=(self.output_channels, 1,1,1))
        
        filters,stride,padding,bias = self.filters, self.stride, self.padding, self.bias
        n_tensors, H,   W,   c_in = X.shape
        c_out,     H_k, W_k, c_in = filters.shape
        n_filters = c_out

        X_padded = np.pad(X, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        H_new = int((H + 2*padding - H_k)/stride)+1
        W_new = int((W + 2*padding - W_k)/stride)+1

        Z = np.zeros([n_tensors, H_new, W_new, c_out])

        for i in range(n_tensors):           # traverse batch
            this_img = X_padded[i,:,:,:]     # select ith image in batch
            for f in range(n_filters):       # traverse filters
                this_filter = filters[f,:,:,:]
                this_bias   = bias[f,:,:,:]
                for h in range(H_new):       # traverse height
                    for w in range(W_new):   # traverse width
                        
                        v0,v1 = h*stride, h*stride + H_k
                        h0,h1 = w*stride, w*stride + W_k
                        
                        this_window = this_img[v0:v1,h0:h1,:]

                        Z[i, h, w, f] = conv_single_step(this_window, this_filter, this_bias) 

        assert Z.shape == (n_tensors, H_new, W_new, n_filters)

        self.output = Z
        self.cache = X, filters, bias, stride, padding
        
        return self.output

    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (values from previous layers) -- (X, W, B, s, p)               
        
        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down
        dF -- dL/dW, shape n_filters x k x k x k
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """
        
        X, filters, bias, stride, padding = self.cache

        n_tensors, H_down, W_down, c_down = X.shape
        n_filters, H_k,    W_k,    c_down = filters.shape
        n_tensors, H_up,   W_up,   c_up   = upstream_g.shape
        
        dX       = np.zeros_like(X)                           
        dfilters = np.zeros_like(filters)
        dbias    = np.zeros((n_filters, 1,1,1))

        X_padded  = np.pad(X,  ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        dX_padded = np.pad(dX, ((0,0), (padding,padding), (padding,padding), (0,0)), 'constant', constant_values = (0,0))
        
        for i in range(n_tensors):                       
            x = X_padded[i]
            dx = dX_padded[i]
            
            for h in range(H_up):                   # traverse height
                for w in range(W_up):               # traverse width
                    for c in range(c_up):           # traverse filters
                        
                        v0,v1 = h,h+H_k
                        h0,h1 = w,w+W_k
                        
                        x_window = x[v0:v1, h0:h1, :]
                        f_window = filters[c,:,:,:]

                        dx_local = x_window-f_window
                        df_local = hard_tanh(f_window-x_window)

                        g = upstream_g[i, h, w, c]

                        dx[v0:v1, v0:v1, :] += dx_local * g
                        dfilters[c,:,:,:]   += df_local * g
                        dbias[c,:,:,:]      += g
                        
            dX[i, :, :, :] = dx[padding:-padding, padding:-padding, :]
        
        assert(dX.shape == (n_tensors, H_down, W_down, c_down))

        self.filters -= learning_rate*dfilters
        self.bias    -= learning_rate*dbias

        return dX

### Fully-connected layer

In [999]:
class FullyConnected(Layer):
    def __init__(self,output_channels,input_channels=None):
        super(Layer, self).__init__()

        self.input_channels=None if not input_channels else input_channels
        self.output_channels = output_channels

        # making weight instantiation optional in case we want to infer input channels from forward pass rather than defining explicitly
        #self.weights = None
        #if input_channels:
        #    self.weights = np.random.normal(loc=0,scale=1,size=(input_channels,output_channels))

        # self.bias = np.random.normal(loc=0,scale=1,size=(1, output_channels))

    def forward(self, X):
        self.input = X
        self.input_channels = X.shape[-1]
        self.weights = np.random.normal(loc=0,scale=1,size=(self.input_channels,self.output_channels))
        self.bias = np.random.normal(loc=0,scale=1,size=(X.shape[0], self.output_channels))
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    def backward(self, upstream_g, learning_rate):
        dX    = np.dot(upstream_g, self.weights.T)
        dW    = np.dot(self.input.T, upstream_g)
        dbias = upstream_g

        self.weights -= learning_rate*dW
        self.bias    -= learning_rate*dbias

        return dX

### Flatten layer

In [1000]:
class Flatten(Layer):
    def forward(self, X):
        self.original_shape = X.shape
        self.output = X.reshape(X.shape[0],np.product(X.shape[1:]))
        return self.output

    def backward(self, upstream_g, learning_rate):
        return upstream_g.reshape(self.original_shape)

### BatchNorm Layer

In [1001]:
class batch_norm_layer(Layer):
    def __init__(self, input_channels=None, gamma=None,beta=None):
    # def __init__(self, input_channels, gamma=None, beta=None):

        '''
        I think we need n_tensors rather than input_channels for gamma and beta dimensions
        Also i think gamma shoudl be ones and beta should be zeros right? because then the initial
        values do nothing rather than multiply by 0 and add 1
        '''
        gamma = None if not gamma else gamma
        self.gamma = gamma
        beta = None if not beta else beta
        self.beta = beta

    def forward(self, X):
        """    
        X       -- n_tensors x H x W x c_in
        gamma   -- n_tensors x 1 x 1 x 1
        beta    -- n_tensors x 1 x 1 x 1
        cache   -- info needed for backward pass
        """

        self.input = X


        self.gamma = np.ones((X.shape[0],1,1,1))
        self.beta = np.zeros((X.shape[0],1,1,1))


        mean = np.mean(X,axis=(0, 1, 2), keepdims=True)
        var = np.mean(((X-mean)**2), axis=(0, 1, 2), keepdims=True)
        std = np.sqrt(var)
        
        X_center = X - mean
        X_norm = X_center/std

        self.output = X_norm*self.gamma + self.beta
        
        self.cache = X, X_center, X_norm

        return self.output 


    def backward(self, upstream_g, learning_rate):
        """
        upstream_g (dL/dZ) -- n_tensors x H_up x W_up x c_up
        cache (values from previous layers) -- (X, X_norm)               
        
        Output:
        dX -- dL/dX, shape n_tensors x H_down x W_down x c_down
        dF -- dL/dW, shape n_filters x k x k x k
        dB -- dL/dB, shape n_filters x 1 x 1 x 1
        """

        X, X_center, X_norm = self.cache


        dGamma = np.sum(upstream_g * X_norm, axis=0)
        dBeta = np.sum(upstream_g, axis=0)

        m = len(X)
        mean = np.mean(X)
        std = np.std(X)
        
        dX = np.zeros_like(X)

        for i in range(m):
            for j in range(m):
                dX[i] += (upstream_g[i] - upstream_g[j]*(1 + (X[i]-X[j])*(X[j]-mean)/std))
        dX *= self.gamma/((m**2)*std)
        
        self.gamma = self.gamma+learning_rate*dGamma
        self.beta = self.beta + learning_rate*dBeta

        return dX

### Maxpool layer

In [1002]:
class MaxPool(Layer):
    def __init__(self,pool_size=2):
        self.pool_size=pool_size
        self.stride = pool_size

    def forward(self,X):
        n_tensors, H, W, c_in = X.shape

        H_new = int(1 + (H - self.pool_size) / self.stride)
        W_new = int(1 + (W - self.pool_size) / self.stride)
        c_out = c_in
        
        Z = np.zeros((n_tensors, H_new, W_new, c_out))              
        
        for i in range(n_tensors):                     # loop over the training examples
            for h in range(H_new):                     # loop on the vertical axis of the output volume
                for w in range(W_new):                 # loop on the horizontal axis of the output volume
                    for c in range(c_out):             # loop over the channels of the output volume
                        
                        v0,v1 = h*self.stride, h*self.stride + self.pool_size
                        h0,h1 = w*self.stride, w*self.stride + self.pool_size
                        
                        window = X[i, v0:v1, h0:h1,c]
                    
                        Z[i, h, w, c] = np.max(window)

        self.output = Z
        self.cache = X, self.pool_size, self.stride
        
        return self.output

    def backward(self, upstream_g,learning_rate):
        X, pool_size, stride = self.cache

        n_tensors, H_down, W_down, c_down = X.shape
        n_tensors, H_up,   W_up,   c_up   = upstream_g.shape


        dX = np.zeros(X.shape)
        
        for i in range(n_tensors):                       
            x = X[i]
            for h in range(H_up):       
            
                for w in range(W_up):    
                    for c in range(c_up):       
                        v0,v1 = h, h+pool_size
                        h0,h1 = w, w+pool_size

                        x_window = x[v0:v1, h0:h1, c]
                        
                        local_g = np.where(x_window==np.max(x_window),1,0)
                        g       = upstream_g[i, h, w, c]
                         
                        dX[i, v0:v1, h0:h1, c] += local_g * g

        assert(dX.shape == X.shape)
        
        return dX


### Activation layers

In [1003]:
def relu_fwd(X):
    return np.where(X>=0,X,0)
def relu_bwd(X):
    return np.where(X>=0,1,0)
def softmax_fwd(X):
    x_max = np.amax(X, axis=0, keepdims=True)
    exp_ = np.exp(X - x_max)
    return exp_ / np.sum(exp_, axis=1, keepdims=True)
def softmax_bwd(X):
    s = softmax_fwd(X)
    si_sj = - s * s.transpose(0,1)
    s_prime = np.diag(s) + si_sj
    return s_prime

def sig_fwd(X):
    #return tf.math.sigmoid(X)
    return 1/(1 + np.exp(-X))
def sig_bwd(X):
    return sig_fwd(X) * (1 - sig_fwd(X))


activation_dict = {'relu':    {'forward':  relu_fwd,
                               'backward': relu_bwd},
                   'softmax': {'forward':  softmax_fwd,
                               'backward': softmax_bwd},
                   'sigmoid': {'forward':  sig_fwd,
                               'backward': sig_bwd}}


class Activation(Layer):
    def __init__(self,activation_name):
        super(Layer, self).__init__()
        self.fwd=activation_dict[activation_name]['forward']
        self.bwd=activation_dict[activation_name]['backward']

    def forward(self, X):
        self.input = X
        self.output = self.fwd(X)
        return self.output

    def backward(self, upstream_g, learning_rate):
        local_g = self.bwd(self.input)
        return local_g*upstream_g

### `Model` class

In [1024]:
np.where(np.argmax(y_val_c10,axis=1)==np.argmax(y_pred,axis=1),1,0)#.shape

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [1017]:
single_example_accuracy(y_pred,y_val_c10)

0

In [1025]:
def binary_cross_entropy(y_true, y_pred):    
    eps1=np.random.uniform(1e-07,1e-06)
    eps2=np.random.uniform(1e-07,1e-06)
    log_likelihood =  y_true * np.log(y_pred+eps1) + (1 - y_true) * np.log(1 - y_pred+eps2)
    return -np.mean(log_likelihood)

def binary_cross_entropy_prime(y_true,y_pred):
    eps1=np.random.uniform(1e-07,1e-06)
    eps2=np.random.uniform(1e-07,1e-06)

    # return np.mean(-y_true/(y_pred+eps) + (1-y_true)/(1-y_pred+eps))
    return np.mean(-y_true/(y_pred+eps1) + (1-y_true)/(1-y_pred+eps2))


def cat_cross_entropy(y_true, y_pred):    
    eps=np.random.uniform(1e-07,1e-06)
    out=-np.mean(y_true*np.log(y_pred+eps))
    return out/float(y_pred.shape[0])


def cat_cross_entropy_prime(y_true,y_pred):
    eps=np.random.uniform(1e-07,1e-06)
    return np.mean([-y/(yhat+eps) for (y,yhat) in zip(y_true,y_pred)])



loss_dict = {'binary_cross_entropy': {'forward':  binary_cross_entropy,
                                      'backward': binary_cross_entropy_prime},
             'cat_cross_entropy':    {'forward':  cat_cross_entropy,
                                      'backward': cat_cross_entropy_prime}}

def single_example_accuracy(y_real,y_pred):
    if y_pred.shape[-1] == 1: # i.e. if not one-hot encoded
        if np.max(y_pred)>1:  # multi-category integer labels
            pass
        else:
            y_real=1 if y_real.astype(float)>=0.5 else 0
            y_pred=1 if y_pred.astype(float)>=0.5 else 0
    else:
        y_real = np.argmax(y_real)
        y_pred = np.argmax(y_pred)
    
    return 1 if y_real == y_pred else 0



class Model:
    def __init__(self,loss_name): # planning on creating a dictionary so we can get a loss function (forward + backward) from its name
        self.layers = []
        self.loss_fwd = loss_dict[loss_name]['forward']
        self.loss_bwd = loss_dict[loss_name]['backward']

    def add(self, layer):
        self.layers.append(layer)

    def predict(self, input_data):
        y_hat = []

        #for i in range(input_data.shape[0]):
        Z = input_data
        for layer in self.layers:
            Z = layer.forward(Z)
        y_hat = Z

        return y_hat

    def fit(self, x_train, y_train, epochs, learning_rate, x_val=None, y_val=None):

        history = {'accuracy': [],
                   'loss': [],
                   'val_accuracy': [],
                   'val_loss': []}

        for e in range(epochs):
            print(e)
            # val_loss = 0
            # val_acc = 0
            #for j in range(x_train.shape[0]):

            # forward
            Z = x_train#[j]
            for layer in self.layers:
                print(layer)
                Z = layer.forward(Z)

            y_real = y_train#[j]
            y_pred = Z

            # compute loss and accuracy
            loss = self.loss_fwd(y_real, y_pred)
            loss /= x_train.shape[0]
            acc = sum(np.where(np.argmax(y_real,axis=1)==np.argmax(y_pred,axis=1),1,0))#single_example_accuracy(y_real, y_pred)
            acc  /= x_train.shape[0]

            # backwward
            error = self.loss_bwd(y_real, y_pred)
            for layer in (self.layers)[::-1]:
                error = layer.backward(error, learning_rate)



            history['accuracy'].append(acc)
            history['loss'].append(loss)

            if x_val is None or y_val is None:
                print(f'Epoch: {e}   loss = {str(round(loss,3))}   acc = {str(round(acc,3))}')

            else:
                #for k in range(x_val.shape[0]):
                Z_val = x_val#[k]
                for layer in self.layers:
                    Z_val = layer.forward(Z_val)

                y_real_val = y_val#[k]
                y_pred_val = Z_val

                #val_loss += self.loss_fwd(y_real_val, y_pred_val)
                #val_acc  += single_example_accuracy(y_real_val, y_pred_val)

                val_loss = self.loss_fwd(y_real_val, y_pred_val)
                val_loss /= x_val.shape[0]
                val_acc = sum(np.where(np.argmax(y_real_val,axis=1)==np.argmax(y_pred_val,axis=1),1,0))#single_example_accuracy(y_real, y_pred)
                val_acc  /= x_val.shape[0]                

                history['val_accuracy'].append(val_acc)
                history['val_loss'].append(val_loss)

                print(f'Epoch: {e}   loss = {str(round(loss,3))}   acc = {str(round(acc,3))}   val_loss = {str(round(val_loss,3))}   val_accuracy = {str(round(val_acc,3))}')


## Data collection

### p.s. - been having a hard time finding an imagenet dataset/subset that is a reasonable size and easy to download

### CIFAR10

In [1012]:
import tarfile
import os
import pickle
import tensorflow.keras.utils as np_utils


# !wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
# data_zip = os.path.join(root,'cifar-10-python.tar.gz')
# f = tarfile.open(data_zip)
# f.extractall(root) 
# f.close()
# os.remove(data_zip)

def load_cifar_data(folder,tiny=False):
    train_batches = [f'{folder}/{f}' for f in os.listdir(folder) if 'batch_' in f]
    test_batch    =  f'{folder}/test_batch'

    # Get train data
    X_trn = None
    y_trn = []
    for i in range(len(train_batches)):
        train_data_dict = pickle.load(open(train_batches[i],'rb'), encoding='latin-1')
        if i+1 == 1:
            X_trn = train_data_dict['data']
        else:
            X_trn = np.vstack((X_trn, train_data_dict['data']))
        y_trn += train_data_dict['labels']
    X_trn = X_trn.reshape(len(X_trn),3,32,32)
    X_trn = np.rollaxis(X_trn,1,4)
    X_trn = X_trn.astype('float32')/255.0
    y_trn = np_utils.to_categorical(np.asarray(y_trn),10)

    # Get test data
    test_data_dict  = pickle.load(open(test_batch,'rb'), encoding='latin-1')
    X_tst = test_data_dict['data']
    X_tst = X_tst.reshape(len(X_tst),3,32,32)
    X_tst = np.rollaxis(X_tst,1,4)
    X_tst = X_tst.astype('float32')/255.0
    y_tst = np_utils.to_categorical(np.asarray(test_data_dict['labels']))
    
    n_90 = int(0.9*len(X_trn))
    X_trn, X_val = X_trn[:n_90], X_trn[n_90:]
    y_trn, y_val = y_trn[:n_90], y_trn[n_90:]

    if tiny:
        X_trn,y_trn,X_tst,y_tst,X_val,y_val = X_trn[:1000],y_trn[:1000],X_tst[:100],y_tst[:100],X_val[:100],y_val[:100]

    return X_trn, y_trn, X_tst, y_tst, X_val, y_val

data_dir = f'{root}/cifar-10-batches-py'
X_trn_c10, y_trn_c10, X_tst_c10, y_tst_c10, X_val_c10, y_val_c10 = load_cifar_data(data_dir,tiny=True)

In [1015]:
y_pred = this_model.predict(X_val_c10)

In [1026]:
this_model = Model(loss_name='cat_cross_entropy')

this_model.add(adder_layer(output_channels=8,kernel_size=3,stride=1,padding=1))
this_model.add(Activation('relu'))
this_model.add(MaxPool(pool_size=2))
this_model.add(batch_norm_layer())


this_model.add(Flatten())

this_model.add(FullyConnected(output_channels=64))
this_model.add(Activation('relu'))
this_model.add(FullyConnected(output_channels=10))
this_model.add(Activation('softmax'))

In [1027]:
this_model.fit(X_trn_c10,y_trn_c10,10,1e-03,X_val_c10,y_val_c10)

0
<__main__.adder_layer object at 0x7f18398c6fd0>


KeyboardInterrupt: ignored

In [None]:
cnn = Model(loss_name='cat_cross_entropy')

cnn.add(conv_layer(output_channels=8,kernel_size=3,stride=1,padding=1))
cnn.add(Activation('relu'))
cnn.add(MaxPool(pool_size=2))
cnn.add(batch_norm_layer())


cnn.add(Flatten())

cnn.add(FullyConnected(output_channels=64))
cnn.add(Activation('relu'))
cnn.add(FullyConnected(output_channels=10))
cnn.add(Activation('softmax'))

In [None]:
cnn.fit(X_trn_c10,y_trn_c10,10,1e-05,X_val_c10,y_val_c10)