<a href="https://colab.research.google.com/github/jdowner212/cs577_addernet/blob/main/AdderNet_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import torch
import tensorflow as tf
import tensorflow.keras.backend as K
import skimage
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# so we can run original addernet:
from torch.torch_version import TorchVersion
import numpy as np
import torch

In [6]:
'''
Functions described in paper
'''

def L1(a,b):
    """Return the negated absolute difference ``-|a - b|``.

    Used as the similarity measure of the adder layer: the value is 0 when
    the operands are equal and becomes more negative as they move apart.
    Works element-wise on anything ``np.abs`` accepts.
    """
    distance = np.abs(a - b)
    return -1 * distance

def hard_tanh(value):
    """Clip a scalar to the closed interval [-1, 1].

    Values above 1 map to 1, values below -1 map to -1, and everything else
    (including the boundaries -1 and 1 themselves) is returned unchanged.

    Args:
        value: a scalar number. Arrays are not supported because the
            comparisons below need an unambiguous truth value.

    Returns:
        The clipped value.
    """
    if value > 1:
        return 1
    if value < -1:
        return -1
    # Covers -1 <= value <= 1; the strict comparisons used previously let the
    # two boundary values fall through every branch and return None.
    return value

'''Equation 1'''
# modified by Equation 2


'''Equation 2'''
def Y_adder(X, F, m, n, t, similarity_f=L1):
    """Compute one output value of the adder layer (Equation 2).

    Accumulates ``similarity_f`` between filter ``t`` and the input patch
    whose top-left corner is at row ``m``, column ``n``.

    Args:
        X: input indexed as ``X[channel, row, column]``.
        F: filter bank indexed as ``F[filter, channel, row, column]``;
            ``F.shape`` must unpack into exactly four values.
        m: row offset of the patch in ``X``.
        n: column offset of the patch in ``X``.
        t: index of the filter to apply.
        similarity_f: callable applied to each (input, weight) pair.

    Returns:
        The sum of ``similarity_f`` over every channel and kernel position.
    """
    _, c_in, k_rows, k_cols = F.shape
    # F is indexed [t, k, i, j] to match the documented layout and the
    # indexing used by dY_dF / dY_dX; i walks kernel rows (paired with m)
    # and j walks kernel columns (paired with n).
    return sum(
        similarity_f(X[k, m + i, n + j], F[t, k, i, j])
        for k in range(c_in)
        for i in range(k_rows)
        for j in range(k_cols)
    )

'''Equation 3'''
# ignore -- CNN formula

'''Equation 4'''
# ignore -- updated with equation 5

'''Equation 5'''
def dY_dF(X,F,m,n,i,j,k,t):
    """Gradient of the adder output with respect to one filter weight (Equation 5).

    Returns the plain difference between the input element at
    ``X[k, m+i, n+j]`` and the weight ``F[t, k, i, j]``.
    """
    return X[k, m + i, n + j] - F[t, k, i, j]

'''Equation 6'''
def dY_dX(X,F,m,n,i,j,k,t):
    """Gradient of the adder output with respect to one input element (Equation 6).

    Same difference as ``dY_dF`` -- ``X[k, m+i, n+j] - F[t, k, i, j]`` --
    but passed through ``hard_tanh`` before being returned.
    """
    difference = X[k, m + i, n + j] - F[t, k, i, j]
    return hard_tanh(difference)

'''Equation 7'''
# ignore -- hard_tanh implemented previously

'''Equation 8'''
# ignore -- CNN formula

'''Equation 9'''
# def var_Y_adder(X,F,variance_f=torch.var):
    # check torch.var documentation: https://pytorch.org/docs/stable/generated/torch.var.html
    # not sure if we can call torch.var(X) with default parameters
    # or if we need to specify. Does this output a scalar or a tensor?
# Trying K.var as tensorflow substitute for torch.var -- make sure they work the same
# or tf.var?
def var_Y_adder(X,F,variance_f=K.var):
    """Evaluate the output-variance expression labelled Equation 9.

    Computes ``sqrt(pi/2) * d**2 * c_in * (Var[X] + Var[F])``, where ``c_in``
    and ``d`` are the second and third entries of ``F.shape`` and both
    variances come from ``variance_f``.

    Args:
        X: input passed to ``variance_f``.
        F: filter bank; ``F.shape`` must unpack into exactly four values.
        variance_f: callable returning the variance of its argument.
    """
    input_variance = variance_f(X)
    filter_variance = variance_f(F)
    _n_filters, c_in, d, _width = F.shape
    scale = np.sqrt(np.pi / 2) * (d ** 2) * c_in
    return scale * (input_variance + filter_variance)

'''Equation 10'''
def batch_norm(minibatch, gamma, beta):
    """Normalise a minibatch and apply a scale and shift (Equation 10).

    Computes ``gamma * (x - mean) / std + beta``, where ``mean`` and ``std``
    are the mean and (population) standard deviation taken over the first
    axis of ``minibatch``.

    Args:
        minibatch: array-like supporting ``len``, iteration over its first
            axis, and element-wise arithmetic (e.g. a NumPy array).
        gamma: scale applied after normalisation.
        beta: shift applied after scaling.

    Returns:
        The normalised, scaled and shifted minibatch. No epsilon is added, so
        a minibatch with zero variance divides by zero.
    """
    m = len(minibatch)
    mean = sum(minibatch) / m
    variance = sum((x_i - mean) ** 2 for x_i in minibatch) / m
    # The normaliser is the standard deviation; dividing by the variance
    # (as was done before) leaves the output with variance 1/variance.
    std = variance ** 0.5
    return gamma * (minibatch - mean) / std + beta

'''Equation 11'''
def dL_dMinibatch_i(minibatch,dL_dy,i,L,gamma):
    """Gradient of the loss with respect to minibatch element ``i`` (Equation 11).

    ``dL_dy`` holds the loss gradients with respect to the batch-normalised
    outputs, one per minibatch element. ``L`` is accepted but not used.

    Note that the local named ``std`` holds the mean squared deviation of the
    minibatch (no square root is taken).
    """
    m = len(minibatch)
    mean = (1/m)*sum(minibatch)
    std = (1/m)*sum([(x_j-mean)**2 for x_j in minibatch])

    x_i = minibatch[i]
    g_i = dL_dy[i]

    total = 0
    for j, x_j in enumerate(minibatch):
        coupling = (x_i - x_j) * (x_j - mean) / std
        total += g_i - dL_dy[j] * (1 + coupling)

    return total * (gamma / ((m ** 2) * std))

'''Equation 12'''
def delta_F_l(adaptive_lr_l, dL_dF_l, gamma):
    """Update step for the filters of layer ``l`` (Equation 12).

    Returns ``gamma * adaptive_lr_l * dL_dF_l``: the filter gradient scaled
    by the layer's adaptive learning rate and by ``gamma``.
    """
    step_size = gamma * adaptive_lr_l
    return step_size * dL_dF_l

'''Equation 13'''
def adaptive_lr_l(dL_dF_l, eta, k=None):
    """Adaptive learning rate for layer ``l`` (Equation 13).

    Computes ``eta * sqrt(k) / ||dL_dF_l||_2``, where the norm is the L2 norm
    taken over every element of the gradient.

    Args:
        dL_dF_l: gradient of the loss with respect to the layer's filters;
            anything ``np.asarray`` can convert to a float array.
        eta: scalar controlling the overall learning rate.
        k: number of elements in the filter tensor. Defaults to the number of
            elements in ``dL_dF_l`` when omitted.

    Returns:
        The scalar learning-rate multiplier.
    """
    grad = np.asarray(dL_dF_l, dtype=float)
    if k is None:
        k = grad.size

    # A true L2 norm is sqrt(sum(g**2)); taking the square root of each
    # squared element separately only yields |g| element-wise.
    l2_norm = float(np.linalg.norm(grad.ravel()))
    # Same lower bound as the clamp in adder.backward, so an all-zero
    # gradient does not divide by zero.
    l2_norm = max(l2_norm, 1e-12)

    return eta * np.sqrt(k) / l2_norm

In [42]:
'''
Most of the original AdderNet code -- I modified it a bit while trying
to get it to work with TensorFlow, so it is not fully the original.
'''


# import torch
# import torch.nn as nn
# import numpy as np
# from torch.autograd import Function
# import math



# loss           = mean_(layer_output[:,:,:,filter_index])
# grads          = K.gradients(loss,model_input)[0]
# grads          = grads / (sqrt_(mean_(square_(grads))) + 1e-5)
# iterate        = K.function([model_input], [loss,grads])
# input_img_data = np.random.random((1,size,size,3))*20+128

# class adder(Function):

# NOTE(review): tf.function is applied to a class here rather than to a
# function -- confirm this is intended; the method bodies below are the same
# as those of the torch `adder` autograd Function defined later in this file.
# NOTE(review): `math` is used in backward(), but the `import math` above is
# commented out; it is only imported in a later cell.
@tf.function
class my_adder:
    """Work-in-progress copy of the adder forward/backward pair.

    Both methods take a ``ctx`` object that must provide
    ``save_for_backward`` and ``saved_tensors``.
    The tensor methods used (``unsqueeze``, ``abs``, ``norm``, ``clamp``)
    look like torch.Tensor methods -- TODO confirm they exist on the tensors
    actually passed in.
    """
    @staticmethod
    def forward(ctx, W_col, X_col):
        # presumably W_col is (n_filters, K) and X_col is (K, n_positions),
        # matching how adder2d_function builds them -- TODO confirm
        ctx.save_for_backward(W_col,X_col)
        # Negated sum over axis 1 of the absolute pairwise differences.
        output = -(W_col.unsqueeze(2)-X_col.unsqueeze(0)).abs().sum(1)
        return output

    @staticmethod
    def backward(ctx,grad_output):
        W_col,X_col = ctx.saved_tensors
        # Weight gradient: (X - W) weighted by the upstream gradient, summed over axis 2.
        grad_W_col = ((X_col.unsqueeze(0)-W_col.unsqueeze(2))*grad_output.unsqueeze(1)).sum(2)
        # Rescale: divide by the gradient's own L2 norm (clamped to at least
        # 1e-12), then multiply by sqrt(size(1) * size(0)) of W_col and divide by 5.
        grad_W_col = grad_W_col/grad_W_col.norm(p=2).clamp(min=1e-12)*math.sqrt(W_col.size(1)*W_col.size(0))/5
        # Input gradient: negated (X - W) clipped to [-1, 1], weighted by the
        # upstream gradient and summed over axis 0.
        grad_X_col = (-(X_col.unsqueeze(0)-W_col.unsqueeze(2)).clamp(-1,1)*grad_output.unsqueeze(1)).sum(0)
        
        return grad_W_col, grad_X_col

def adder2d_function(X, W, stride=1, padding=0):
    """Apply the adder operation to a batch of inputs as a 2-D sliding window.

    Args:
        X: 4-D tensor whose ``size()`` unpacks as (batch, depth, height, width).
        W: 4-D tensor whose ``size()`` unpacks as
            (n_filters, depth, kernel height, kernel width). Only the kernel
            height is passed to ``unfold`` as the kernel size.
        stride: window step forwarded to ``unfold``.
        padding: padding forwarded to ``unfold``.

    Returns:
        A contiguous tensor viewed as (batch, n_filters, out_height, out_width).
    """
    n_filters, _, kernel_h, kernel_w = W.size()
    batch, _, in_h, in_w = X.size()

    # Output extent, truncated to an integer after the float division.
    out_h = int((in_h - kernel_h + 2 * padding) / stride + 1)
    out_w = int((in_w - kernel_w + 2 * padding) / stride + 1)

    # Fold the batch into the channel axis, extract every patch, then
    # separate the batch again.
    patches = torch.nn.functional.unfold(
        X.view(1, -1, in_h, in_w),
        kernel_h,
        dilation=1,
        padding=padding,
        stride=stride,
    )
    patches = patches.view(batch, -1, out_h * out_w)

    # One column per (output position, image) pair; one row per filter.
    X_col = patches.permute(1, 2, 0).contiguous().view(patches.size(1), -1)
    W_col = W.view(n_filters, -1)

    flat = adder.apply(W_col, X_col)

    shaped = flat.view(n_filters, out_h, out_w, batch)
    return shaped.permute(3, 0, 1, 2).contiguous()


    
# NOTE(review): this class derives from a Keras layer but stores its weights
# as torch.nn.Parameter objects and delegates to the torch-based
# adder2d_function -- it looks like an unfinished port; confirm before use.
# NOTE(review): `nn` is referenced below, but the `import torch.nn as nn`
# above is commented out; it is only imported in a later cell.
class adder2d(tf.keras.layers.Layer):
    """Adder layer wrapper holding the filter bank and an optional bias."""
    def __init__(self,input_channel,output_channel,kernel_size, stride=1, padding=0, bias = False):
        super(adder2d, self).__init__()
        self.stride = stride
        self.padding = padding
        self.input_channel = input_channel
        self.output_channel = output_channel
        self.kernel_size = kernel_size
        # Filter bank of shape (output_channel, input_channel, kernel_size, kernel_size).
        self.adder = torch.nn.Parameter(nn.init.normal_(torch.randn(output_channel,input_channel,kernel_size,kernel_size)))
        self.bias = bias
        if bias:
            # One bias value per output channel; only created when bias is truthy.
            self.b = torch.nn.Parameter(nn.init.uniform_(torch.zeros(output_channel)))

    # NOTE(review): Keras layers normally implement `call`; presumably
    # `forward` has to be invoked explicitly here -- confirm.
    def forward(self, x):
        output = adder2d_function(x,self.adder, self.stride, self.padding)
        if self.bias:
            # Reshape the bias to (1, output_channel, 1, 1) and add it in place.
            output += self.b.unsqueeze(0).unsqueeze(2).unsqueeze(3)
        
        return output
    

In [44]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Function
import math

def adder2d_function(X, W, stride=1, padding=0):
    """Slide the adder filters ``W`` over the input batch ``X``.

    ``X.size()`` is unpacked as (batch, depth, height, width) and
    ``W.size()`` as (n_filters, depth, kernel height, kernel width). The
    kernel height alone is handed to ``unfold`` as the kernel size.
    ``stride`` and ``padding`` are forwarded to ``unfold`` unchanged.

    Returns a contiguous tensor viewed as
    (batch, n_filters, out_height, out_width).
    """
    n_filters, _, kernel_h, kernel_w = W.size()
    batch, _, in_h, in_w = X.size()

    # int() truncates the float result of the division.
    out_h = int((in_h - kernel_h + 2 * padding) / stride + 1)
    out_w = int((in_w - kernel_w + 2 * padding) / stride + 1)

    # Treat the whole batch as extra channels of a single image so that one
    # unfold call extracts the patches of every image.
    stacked = X.view(1, -1, in_h, in_w)
    patches = torch.nn.functional.unfold(
        stacked, kernel_h, dilation=1, padding=padding, stride=stride
    ).view(batch, -1, out_h * out_w)

    patch_len = patches.size(1)
    X_col = patches.permute(1, 2, 0).contiguous().view(patch_len, -1)
    W_col = W.view(n_filters, -1)

    flat = adder.apply(W_col, X_col)

    # Rows are filters and columns are (position, image) pairs; restore the
    # spatial layout and move the batch axis to the front.
    shaped = flat.view(n_filters, out_h, out_w, batch)
    return shaped.permute(3, 0, 1, 2).contiguous()

class adder(Function):
    """Autograd function computing negative L1 distances with custom gradients.

    ``forward`` returns, for every (row of ``W_col``, column of ``X_col``)
    pair, the negated sum of absolute differences along the shared axis.
    ``backward`` returns a norm-rescaled gradient for ``W_col`` and a
    clipped gradient for ``X_col``.
    """

    @staticmethod
    def forward(ctx, W_col, X_col):
        ctx.save_for_backward(W_col, X_col)
        pairwise = W_col.unsqueeze(2) - X_col.unsqueeze(0)
        return -pairwise.abs().sum(1)

    @staticmethod
    def backward(ctx, grad_output):
        W_col, X_col = ctx.saved_tensors
        diff = X_col.unsqueeze(0) - W_col.unsqueeze(2)
        upstream = grad_output.unsqueeze(1)

        # Weight gradient uses the raw difference, then is rescaled by its
        # own L2 norm (bounded below by 1e-12) times sqrt(#weights) / 5.
        grad_W_col = (diff * upstream).sum(2)
        norm = grad_W_col.norm(p=2).clamp(min=1e-12)
        n_weights = W_col.size(1) * W_col.size(0)
        grad_W_col = grad_W_col / norm * math.sqrt(n_weights) / 5

        # Input gradient uses the difference clipped to [-1, 1], negated.
        grad_X_col = (-diff.clamp(-1, 1) * upstream).sum(0)

        return grad_W_col, grad_X_col
    
class adder2d(nn.Module):
    """Module wrapping ``adder2d_function`` with learnable filters.

    Holds a filter bank ``adder`` of shape
    (output_channel, input_channel, kernel_size, kernel_size) and, when
    ``bias`` is truthy, a per-output-channel bias ``b``.
    """

    def __init__(self,input_channel,output_channel,kernel_size, stride=1, padding=0, bias = False):
        super().__init__()
        self.stride = stride
        self.padding = padding
        self.input_channel = input_channel
        self.output_channel = output_channel
        self.kernel_size = kernel_size

        weight_shape = (output_channel, input_channel, kernel_size, kernel_size)
        initial_weights = nn.init.normal_(torch.randn(*weight_shape))
        self.adder = torch.nn.Parameter(initial_weights)

        self.bias = bias
        if bias:
            initial_bias = nn.init.uniform_(torch.zeros(output_channel))
            self.b = torch.nn.Parameter(initial_bias)

    def forward(self, x):
        """Run the adder operation on ``x`` and add the bias if one exists."""
        output = adder2d_function(x, self.adder, self.stride, self.padding)
        if not self.bias:
            return output

        # Broadcast the bias over the batch and both spatial axes, in place.
        output += self.b.unsqueeze(0).unsqueeze(2).unsqueeze(3)
        return output

In [10]:
''' Template '''


# @tf.custom_gradient
# def custom_op(inputs, activation, weights, biases):
    
#     # forward computation
#     z = tf.matmul(inputs, weights) + biases
#     if activation is not None:
#         z = activation(result)
    
#     # backward computation
#     def grad(upstream):
#         inputs_gradient = tf.matmul(upstream, tf.transpose(weights))
#         weights_gradient = tf.matmul(tf.transpose(inputs), upstream)
#         bias_gradient = upstream
#         return inputs_gradient, weights_gradient, bias_gradient
    
#     return z, grad

In [11]:
# Random fixtures for comparing the torch and TensorFlow code paths: the same
# NumPy arrays are wrapped once as torch tensors and once as TF tensors.

# Filter bank dimensions (named after count, depth, height, width).
f_n, f_d, f_h, f_w = 5,  3, 3, 3
# Image batch dimensions (named after count, depth, height, width).
X_n, X_d, X_h, X_w = 10, 3, 5, 5

# Filters are drawn before images so the random stream is consumed in the
# same order as before.
F_rand = np.random.random(size=(f_n, f_d, f_h, f_w))
X_rand = np.random.random(size=(X_n, X_d, X_h, X_w))

F_torch, X_torch = torch.tensor(F_rand), torch.tensor(X_rand)
F_tf, X_tf = tf.convert_to_tensor(F_rand), tf.convert_to_tensor(X_rand)

In [172]:
def get_windows(tensor_of_images,k_d=3,k_w=3,k_h=3,stride=1,padding=0):
    """Extract every sliding window from a batch of images.

    Args:
        tensor_of_images: 4-D array-like laid out as
            (images, channels, height, width).
        k_d: window extent along the channel axis.
        k_w: window extent along the width (last) axis.
        k_h: window extent along the height (third) axis.
        stride: step between consecutive windows, applied to every axis.
        padding: number of zeros added on each side of the two spatial axes.

    Returns:
        A read-only 8-D view. The first four axes index the window position
        along (images, channels, height, width); the last four are the window
        itself with shape ``(n_images, k_d, k_h, k_w)``.
    """
    imgs = np.asarray(tensor_of_images)
    if padding:
        # Pad only height and width. A scalar pad_width would also pad the
        # batch and channel axes, inventing all-zero images and channels.
        spatial = (padding, padding)
        imgs = np.pad(imgs, pad_width=((0, 0), (0, 0), spatial, spatial))

    n = len(imgs)
    # Window extents follow the array's axis order, so height comes before width.
    windows = np.lib.stride_tricks.sliding_window_view(imgs, (n, k_d, k_h, k_w))
    # sliding_window_view always steps by 1; subsample the position axes to
    # honour the requested stride.
    return windows[::stride, ::stride, ::stride, ::stride]


# For a 4-D input, the output of get_windows() has dimensions (A,B,C,D,E,F,G,H),
# e.g. (1,1,3,3,1,3,3,3), which is what an input of shape (1,3,5,5) gives
# with the default arguments.
#
# The first four axes index the window position; the last four are the window itself.
# A: window positions along the image axis -- always 1, because the window
#    spans all n images
#     B: window positions along the channel axis -- greater than 1 only when
#        the (padded) input has more channels than k_d
#         C: window positions along the third input axis
#             D: window positions along the fourth input axis
#                 E: images contained in each window (n); each entry is one
#                    image's 3-D window
#                     F: channels in a 3-D window (k_d); each entry is one
#                        2-D channel of the window
#                         G: rows in a channel of a 3-D window
#                             H: values in a row of a 3-D window



In [225]:
'''
scratch paper -- can probably ignore
'''

# NOTE(review): this cell is not self-contained. new_images, W, H, k,
# n_filters, n_groups, final_image, i and weight are not defined anywhere in
# the visible source -- presumably left over from earlier notebook state.
n_images, n_values, n_channels_in, new_H, new_W = new_images.shape

# Padding and stride used for the output-size arithmetic below.
p = 0
s = 1
W_ = int((W+2*p-k)/s+1)
H_ = int((H+2*p-k)/s+1)
# n_channels_in cancels out here, so this evaluates to 2*p + 1 (1 when p = 0).
n_channels_out = int((n_channels_in + 2*p - n_channels_in)+1)


# NOTE(review): n_filters is passed to len() here and iterated below, so it
# must be a sequence despite the count-like name -- confirm.
# `out` is allocated but not written to in the lines that follow.
out = np.zeros((len(n_filters),n_channels_out, W_, H_))


# The loop variables f and c are not used inside the loop body.
for f in n_filters:
    for c in range(n_channels_out):
        #print(f'Image {i+1}\n')
        for group in range(n_groups):
            image_group = final_image[i][group]
            added = (image_group + weight)
            # Sum each entry of `added` separately, then print the grand total.
            all_added=[]
            for a in added:
                all_added.append(K.sum(a))
            print(sum(all_added))

            #print(added.shape)
            #print(added)
            #print(K.sum(added,axis=2))
            print('***')
            #added = group + weights



TypeError: ignored