In [142]:
import numpy as np

# define the number of iterations.
num_itr = 1000

# define batch size.
batchSize = 3.

# define the input data dimension.
inputSize = 2

# define the output dimension.
outputSize = 1

# define the dimension of the hidden layer.
hiddenSize = 3



class Neural_Network():
    def __init__(self):     
        #weights
        self.U = np.random.randn(inputSize, hiddenSize) 
        self.W = np.random.randn(hiddenSize, outputSize) 
        self.e = np.random.randn(hiddenSize) 
        self.f = np.random.randn(outputSize) 


    def fully_connected(self, X, U, e):
        '''
        fully connected layer.
        inputs:
            U: weight 
            e: bias
        outputs:
            X * U + e
        '''
        return np.dot(X, U) + e


    def sigmoid(self, s):
        '''
        sigmoid activation function. 
        inputs: s
        outputs: sigmoid(s)  
        '''
        return 1/(1+np.exp(-s))


    def sigmoidPrime(self, s):
        '''
        derivative of sigmoid (Written section, Part a).
        inputs: 
            s = sigmoid(x)
        outputs: 
            derivative sigmoid(x) as a function of s 
        '''
        d_sigmoid = s*(1-s)
        return d_sigmoid


    def forward(self, X):
        '''
        forward propagation through the network.
        inputs:
            X: input data (batchSize, inputSize) 
        outputs:
            c: output (batchSize, outputSize)
        '''
        self.X = X
        self.z = self.fully_connected(self.X, self.U, self.e)
        self.b = self.sigmoid(self.z)
        self.h = self.fully_connected(self.b, self.W, self.f)
        self.c = self.sigmoid(self.h)
        return self.c


    def d_loss_o(self, gt, o):
        '''
        computes the derivative of the L2 loss with respect to 
        the network's output.
        inputs:
            gt: ground-truth (batchSize, outputSize)
            o: network output (batchSize, outputSize)
        outputs:
            d_o: derivative of the L2 loss with respect to the network's 
            output o. (batchSize, outputSize)
        '''
        d_o = o-gt
        return d_o


    def error_at_layer2(self, d_o, o):
        '''
        computes the derivative of the loss with respect to layer2's output
        (Written section, Part b).
        inputs:
            d_o: derivative of the loss with respect to the network output (batchSize, outputSize)
            o: the network output (batchSize, outputSize)
        returns 
            delta_k: the derivative of the loss with respect to the output of the second
            fully connected layer (batchSize, outputSize).
        '''
        delta_k = np.dot(d_o, self.sigmoidPrime(o).T)
        return delta_k


    def error_at_layer1(self, delta_k, W, b):
        '''
        computes the derivative of the loss with respect to layer1's output (Written section, Part e).
        inputs:
            delta_k: derivative of the loss with respect to the output of the second
            fully connected layer (batchSize, outputSize). 
            W: the weights of the second fully connected layer (hiddenSize, outputSize).
            b: the input to the second fully connected layer (batchSize, hiddenSize).
        returns:
            delta_j: the derivative of the loss with respect to the output of the second
            fully connected layer (batchSize, hiddenSize).
        '''
        delta_j = np.dot(delta_k, np.dot(self.sigmoidPrime(b), self.W))
        return delta_j


    def derivative_of_w(self, b, delta_k):
        '''
        computes the derivative of the loss with respect to W (Written section, Part c).
        inputs:
            b: the input to the second fully connected layer (batchSize, hiddenSize).
            delta_k: the derivative of the loss with respect to the output of the second
            fully connected layer's output (batchSize, outputSize).
        returns:
            d_w: the derivative of loss with respect to W  (hiddenSize ,outputSize).
        '''
        d_w = np.dot(delta_k.T,b)
        d_w = np.array(np.sum(d_w, axis=0)).reshape((-1,1))
        return d_w


    def derivative_of_u(self, X, delta_j):
        '''
        computes the derivative of the loss with respect to U (Written section, Part f).
        inputs:
            X: the input to the network (batchSize, inputSize).
            delta_j: the derivative of the loss with respect to the output of the first
            fully connected layer's output (batchSize, hiddenSize).
        returns:
            d_u: the derivative of loss with respect to U (inputSize, hiddenSize).
        '''
        #print(delta_j.shape)
        d_u = np.dot(X.T,delta_j)
        #print(d_u.shape)
        return d_u


    def derivative_of_e(self, delta_j):
        '''
        computes the derivative of the loss with respect to e (Written section, Part g).
        inputs:
            delta_j: the derivative of the loss with respect to the output of the first
            fully connected layer's output (batchSize, hiddenSize).
        returns:
            d_e: the derivative of loss with respect to e (hiddenSize).
        '''
        d_e = np.sum(delta_j, axis=0)
        return d_e


    def derivative_of_f(self, delta_k):
        '''
        computes the derivative of the loss with respect to f (Written section, Part d).
        inputs:
            delta_k: the derivative of the loss with respect to the output of the second
            fully connected layer's output (batchSize, outputSize).
        returns:
            d_f: the derivative of loss with respect to f (outputSize).
        '''
        d_f = np.sum(delta_k)
        return d_f


    def backward(self, X, gt, o):
        '''
        backpropagation through the network.
        Task: perform the 8 steps required below.
        inputs: 
            X: input data (batchSize, inputSize)
            y: ground truth (batchSize, outputSize)
            o: network output (batchSize, outputSize)        
        '''

        # 1. Compute the derivative of the loss with respect to c.
        # Call: d_loss_o
        d_o = self.d_loss_o( gt, o)
        
        # 2. Compute the error at the second layer (Written section, Part b).
        # Call: error_at_layer2
        delta_k = self.error_at_layer2(d_o, o)

        # 3. Compute the derivative of W (Written section, Part c).
        # Call: derivative_of_w
        d_W = self.derivative_of_w(self.b, delta_k)

        # 4. Compute the derivative of f (Written section, Part d).
        # Call: derivative_of_f
        d_f = self.derivative_of_f(delta_k)
        

        # 5. Compute the error at the first layer (Written section, Part e).
        # Call: error_at_layer1 
        delta_j = self.error_at_layer1(delta_k, self.W, self.b)
        

        # 6. Compute the derivative of U (Written section, Part f).
        # Call: derivative_of_u
        d_U = self.derivative_of_u(self.X, delta_j)
        

        # 7. Compute the derivative of e (Written section, Part g).
        # Call: derivative_of_e
        d_e =  self.derivative_of_e(delta_j)
           

        # 8. Update the parameters
        lr = 0.2
        #print(self.W.shape, d_W.shape)
        self.W -= lr*d_W
        self.f -= lr*d_f
        self.U -= lr*d_U
        self.e -= lr*d_e
        
        

    def train (self, X, y):
        o = self.forward(X)
        self.backward(X, y, o)
      


In [143]:

def main():
    """ Main function """
    # generate random input data of dimension (batchSize, inputSize). 
    a = np.random.randint(0, high=10, size=[3,2], dtype='l')

    # generate random ground truth.
    t = np.random.randint(0, high=100, size=[3,1], dtype='l')

    # scale the input and output data.
    a = a/np.amax(a, axis=0) 
    t = t/100 
    error = []

    # create an instance of Neural_Network.
    NN = Neural_Network()
    for i in range(num_itr): 
        print("Input: \n" + str(a)) 
        print("Actual Output: \n" + str(t))
        print("Predicted Output: \n" + str(NN.forward(a)))
        print("Loss: \n" + str(np.mean(np.square(t - NN.forward(a)))))
        print("\n")
        NN.train(a, t)
        error.append(np.mean(np.square(t - NN.forward(a))))
    return error

                     
if __name__ == "__main__":
    error = main()
    

Input: 
[[ 1.          0.25      ]
 [ 0.5         0.25      ]
 [ 0.33333333  1.        ]]
Actual Output: 
[[ 0.41]
 [ 0.17]
 [ 0.2 ]]
Predicted Output: 
[[ 0.98310018]
 [ 0.97990754]
 [ 0.97343041]]
Loss: 
0.527529544388


Input: 
[[ 1.          0.25      ]
 [ 0.5         0.25      ]
 [ 0.33333333  1.        ]]
Actual Output: 
[[ 0.41]
 [ 0.17]
 [ 0.2 ]]
Predicted Output: 
[[ 0.98200091]
 [ 0.97868502]
 [ 0.97183129]]
Loss: 
0.525626682919


Input: 
[[ 1.          0.25      ]
 [ 0.5         0.25      ]
 [ 0.33333333  1.        ]]
Actual Output: 
[[ 0.41]
 [ 0.17]
 [ 0.2 ]]
Predicted Output: 
[[ 0.98076816]
 [ 0.97732137]
 [ 0.97005158]]
Loss: 
0.523507838591


Input: 
[[ 1.          0.25      ]
 [ 0.5         0.25      ]
 [ 0.33333333  1.        ]]
Actual Output: 
[[ 0.41]
 [ 0.17]
 [ 0.2 ]]
Predicted Output: 
[[ 0.97937819]
 [ 0.97579254]
 [ 0.96806133]]
Loss: 
0.521137113516


Input: 
[[ 1.          0.25      ]
 [ 0.5         0.25      ]
 [ 0.33333333  1.        ]]
Actual Output: 
[[

In [144]:
error

[0.5256266829189431,
 0.52350783859142469,
 0.52113711351572578,
 0.51847075817368915,
 0.51545505250234824,
 0.51202351576963334,
 0.50809320827774562,
 0.50355980186541804,
 0.49829098434599378,
 0.49211762792111857,
 0.48482201463380631,
 0.47612234411701398,
 0.46565294192335155,
 0.45294050768638217,
 0.43737944333187895,
 0.41821593470967294,
 0.39456473145974535,
 0.36550842049394211,
 0.33036159205186194,
 0.28918027071050434,
 0.24344604644302384,
 0.19645430007884068,
 0.15262957995351781,
 0.11570670471315836,
 0.087200973437971105,
 0.066475563762364229,
 0.051869366742697126,
 0.041668416516556415,
 0.034511318270989112,
 0.029430967726539969,
 0.025771624207383284,
 0.023094988276715186,
 0.02110752182988293,
 0.019610666892760142,
 0.018468275936524985,
 0.017585596167668298,
 0.016895690972233465,
 0.016350588775814551,
 0.015915441141333562,
 0.015564612580760794,
 0.015279025213916414,
 0.01504432963450286,
 0.014849627436215254,
 0.014686567261792722,
 0.014548697219