In [160]:
import numpy as np

# define the number of iterations.
num_itr = 1000

# define batch size.
batchSize = 5

# define the input data dimension.
inputSize = 4

# define the output dimension.
outputSize = 1

# define the dimension of the hidden layer.
hiddenSize = 10



class Neural_Network():
    def __init__(self):     
        #weights
        self.U = np.random.randn(inputSize, hiddenSize) 
        self.W = np.random.randn(hiddenSize, outputSize) 
        self.e = np.random.randn(hiddenSize) 
        self.f = np.random.randn(outputSize) 


    def fully_connected(self, X, U, e):
        '''
        fully connected layer.
        inputs:
            U: weight 
            e: bias
        outputs:
            X * U + e
        '''
        return np.dot(X, U) + e


    def sigmoid(self, s):
        '''
        sigmoid activation function. 
        inputs: s
        outputs: sigmoid(s)  
        '''
        return 1/(1+np.exp(-s))


    def sigmoidPrime(self, s):
        '''
        derivative of sigmoid (Written section, Part a).
        inputs: 
            s = sigmoid(x)
        outputs: 
            derivative sigmoid(x) as a function of s 
        '''
        d_sigmoid = s*(1-s)
        return d_sigmoid


    def forward(self, X):
        '''
        forward propagation through the network.
        inputs:
            X: input data (batchSize, inputSize) 
        outputs:
            c: output (batchSize, outputSize)
        '''
        self.X = X
        self.z = self.fully_connected(self.X, self.U, self.e)
        self.b = self.sigmoid(self.z)
        self.h = self.fully_connected(self.b, self.W, self.f)
        self.c = self.sigmoid(self.h)
        return self.c


    def d_loss_o(self, gt, o):
        '''
        computes the derivative of the L2 loss with respect to 
        the network's output.
        inputs:
            gt: ground-truth (batchSize, outputSize)
            o: network output (batchSize, outputSize)
        outputs:
            d_o: derivative of the L2 loss with respect to the network's 
            output o. (batchSize, outputSize)
        '''
        d_o = o-gt
        return d_o


    def error_at_layer2(self, d_o, o):
        '''
        computes the derivative of the loss with respect to layer2's output
        (Written section, Part b).
        inputs:
            d_o: derivative of the loss with respect to the network output (batchSize, outputSize)
            o: the network output (batchSize, outputSize)
        returns 
            delta_k: the derivative of the loss with respect to the output of the second
            fully connected layer (batchSize, outputSize).
        '''
        delta_k = np.dot(d_o, self.sigmoidPrime(o).T)
        return delta_k


    def error_at_layer1(self, delta_k, W, b):
        '''
        computes the derivative of the loss with respect to layer1's output (Written section, Part e).
        inputs:
            delta_k: derivative of the loss with respect to the output of the second
            fully connected layer (batchSize, outputSize). 
            W: the weights of the second fully connected layer (hiddenSize, outputSize).
            b: the input to the second fully connected layer (batchSize, hiddenSize).
        returns:
            delta_j: the derivative of the loss with respect to the output of the second
            fully connected layer (batchSize, hiddenSize).
        '''
        delta_j = np.dot(delta_k, np.dot(self.sigmoidPrime(b), self.W))
        return delta_j


    def derivative_of_w(self, b, delta_k):
        '''
        computes the derivative of the loss with respect to W (Written section, Part c).
        inputs:
            b: the input to the second fully connected layer (batchSize, hiddenSize).
            delta_k: the derivative of the loss with respect to the output of the second
            fully connected layer's output (batchSize, outputSize).
        returns:
            d_w: the derivative of loss with respect to W  (hiddenSize ,outputSize).
        '''
        d_w = np.dot(delta_k.T,b)
        d_w = np.array(np.sum(d_w, axis=0)).reshape((-1,1))
        return d_w


    def derivative_of_u(self, X, delta_j):
        '''
        computes the derivative of the loss with respect to U (Written section, Part f).
        inputs:
            X: the input to the network (batchSize, inputSize).
            delta_j: the derivative of the loss with respect to the output of the first
            fully connected layer's output (batchSize, hiddenSize).
        returns:
            d_u: the derivative of loss with respect to U (inputSize, hiddenSize).
        '''
        #print(delta_j.shape)
        d_u = np.dot(X.T,delta_j)
        #print(d_u.shape)
        return d_u


    def derivative_of_e(self, delta_j):
        '''
        computes the derivative of the loss with respect to e (Written section, Part g).
        inputs:
            delta_j: the derivative of the loss with respect to the output of the first
            fully connected layer's output (batchSize, hiddenSize).
        returns:
            d_e: the derivative of loss with respect to e (hiddenSize).
        '''
        d_e = np.sum(delta_j, axis=0)
        return d_e


    def derivative_of_f(self, delta_k):
        '''
        computes the derivative of the loss with respect to f (Written section, Part d).
        inputs:
            delta_k: the derivative of the loss with respect to the output of the second
            fully connected layer's output (batchSize, outputSize).
        returns:
            d_f: the derivative of loss with respect to f (outputSize).
        '''
        d_f = np.sum(delta_k)
        return d_f


    def backward(self, X, gt, o):
        '''
        backpropagation through the network.
        Task: perform the 8 steps required below.
        inputs: 
            X: input data (batchSize, inputSize)
            y: ground truth (batchSize, outputSize)
            o: network output (batchSize, outputSize)        
        '''

        # 1. Compute the derivative of the loss with respect to c.
        # Call: d_loss_o
        d_o = self.d_loss_o( gt, o)
        
        # 2. Compute the error at the second layer (Written section, Part b).
        # Call: error_at_layer2
        delta_k = self.error_at_layer2(d_o, o)

        # 3. Compute the derivative of W (Written section, Part c).
        # Call: derivative_of_w
        d_W = self.derivative_of_w(self.b, delta_k)

        # 4. Compute the derivative of f (Written section, Part d).
        # Call: derivative_of_f
        d_f = self.derivative_of_f(delta_k)
        

        # 5. Compute the error at the first layer (Written section, Part e).
        # Call: error_at_layer1 
        delta_j = self.error_at_layer1(delta_k, self.W, self.b)
        

        # 6. Compute the derivative of U (Written section, Part f).
        # Call: derivative_of_u
        d_U = self.derivative_of_u(self.X, delta_j)
        

        # 7. Compute the derivative of e (Written section, Part g).
        # Call: derivative_of_e
        d_e =  self.derivative_of_e(delta_j)
           

        # 8. Update the parameters
        lr = 0.2
        #print(self.W.shape, d_W.shape)
        self.W -= lr*d_W
        self.f -= lr*d_f
        self.U -= lr*d_U
        self.e -= lr*d_e
        
        

    def train (self, X, y):
        o = self.forward(X)
        self.backward(X, y, o)
      


In [161]:

def main():
    """ Main function """
    # generate random input data of dimension (batchSize, inputSize). 
    a = np.random.randint(0, high=10, size=[5,4], dtype='l')

    # generate random ground truth.
    t = np.random.randint(0, high=100, size=[5,1], dtype='l')

    # scale the input and output data.
    a = a/np.amax(a, axis=0) 
    t = t/100 
    error = []

    # create an instance of Neural_Network.
    NN = Neural_Network()
    for i in range(num_itr): 
        print("Input: \n" + str(a)) 
        print("Actual Output: \n" + str(t))
        print("Predicted Output: \n" + str(NN.forward(a)))
        print("Loss: \n" + str(np.mean(np.square(t - NN.forward(a)))))
        print("\n")
        NN.train(a, t)
        error.append(np.mean(np.square(t - NN.forward(a))))
    return error

                     
if __name__ == "__main__":
    error = main()
    

Input: 
[[ 1.          0.125       1.          0.        ]
 [ 0.85714286  0.25        1.          0.88888889]
 [ 0.          0.375       0.66666667  0.33333333]
 [ 1.          0.25        0.66666667  1.        ]
 [ 0.71428571  1.          0.66666667  0.        ]]
Actual Output: 
[[ 0.84]
 [ 0.35]
 [ 0.04]
 [ 0.78]
 [ 0.84]]
Predicted Output: 
[[ 0.90734361]
 [ 0.85524264]
 [ 0.91174142]
 [ 0.82216657]
 [ 0.88583215]]
Loss: 
0.204723399169


Input: 
[[ 1.          0.125       1.          0.        ]
 [ 0.85714286  0.25        1.          0.88888889]
 [ 0.          0.375       0.66666667  0.33333333]
 [ 1.          0.25        0.66666667  1.        ]
 [ 0.71428571  1.          0.66666667  0.        ]]
Actual Output: 
[[ 0.84]
 [ 0.35]
 [ 0.04]
 [ 0.78]
 [ 0.84]]
Predicted Output: 
[[ 0.86101972]
 [ 0.78434812]
 [ 0.85944689]
 [ 0.7437862 ]
 [ 0.82439444]]
Loss: 
0.172429660346


Input: 
[[ 1.          0.125       1.          0.        ]
 [ 0.85714286  0.25        1.          0.88888889]


In [162]:
error

[0.17242966034588192,
 0.14173048855789291,
 0.12366589176577476,
 0.11694262828086613,
 0.11464311199962215,
 0.11346626874839279,
 0.11253337712056091,
 0.1116423732967172,
 0.11074264472236198,
 0.10981803100522902,
 0.10886143244035995,
 0.10786864832055201,
 0.10683656550360568,
 0.10576252405437103,
 0.10464407731118874,
 0.10347890735300658,
 0.10226480789538579,
 0.10099969866791247,
 0.099681655948580833,
 0.098308952573703853,
 0.096880104343289725,
 0.095393921135298515,
 0.093849561451489732,
 0.092246589081328795,
 0.090585030330600183,
 0.088865429934250617,
 0.087088903425152908,
 0.085257183412030341,
 0.083372656977716036,
 0.081438391290885148,
 0.079458144577094963,
 0.077436359859272136,
 0.075378139381832993,
 0.073289198384129811,
 0.071175797868299431,
 0.069044657162690537,
 0.06690284833220525,
 0.064757675722620839,
 0.062616545024395134,
 0.060486827081509618,
 0.058375722151405476,
 0.056290130378654213,
 0.054236533861843114,
 0.052220894906893536,
 0.05024