# In [6]:
import numpy as np

class NeuralNetwork:
    """Fully connected feed-forward network trained with backpropagation.

    Every layer uses the sigmoid activation. The bias is handled with the
    "bias trick": a constant 1 is appended to the input features and each
    hidden layer carries one extra unit, so the bias weights live inside
    the regular weight matrices stored in ``self.W``.
    """

    def __init__(self, layers, alpha=0.1):
        """Build the weight matrices for the given architecture.

        Args:
            layers: sequence of layer sizes, e.g. ``[2, 2, 1]`` for two
                inputs, one hidden layer of two units and one output.
            alpha: learning rate used for the gradient-descent updates.

        Raises:
            ValueError: if fewer than two layer sizes are given.
        """
        if len(layers) < 2:
            raise ValueError(
                "layers must contain at least an input and an output size"
            )

        # list of weight matrices, network architecture and learning rate
        self.W = []
        self.layers = layers
        self.alpha = alpha

        # every connection except the one into the output layer: both sides
        # get one extra node for the bias
        for i in range(len(layers) - 2):
            w = np.random.randn(layers[i] + 1, layers[i + 1] + 1)
            # scale by the square root of the fan-in of the layer
            self.W.append(w / np.sqrt(layers[i]))

        # the last two layers are a special case: the input side needs a
        # bias term but the output side does not. This has to happen exactly
        # once, AFTER the loop above, whatever the depth of the network.
        w = np.random.randn(layers[-2] + 1, layers[-1])
        self.W.append(w / np.sqrt(layers[-2]))

    def __repr__(self):
        """Return a string describing the network architecture."""
        return "NeuralNetwork: {}".format("_".join(str(l) for l in self.layers))

    def sigmoid(self, x):
        """Return the sigmoid activation ``1 / (1 + exp(-x))`` of ``x``."""
        return 1.0 / (1 + np.exp(-x))

    def sigmoid_deriv(self, x):
        """Return the sigmoid derivative.

        ASSUMES that ``x`` has already been passed through ``sigmoid``.
        """
        return x * (1 - x)

    def fit(self, X, y, epochs=1000, displayUpdate=100):
        """Train the network one sample at a time.

        Args:
            X: training samples, one row per sample (without bias column).
            y: targets, iterated in lockstep with the rows of ``X``.
            epochs: number of passes over the training data.
            displayUpdate: the loss is printed after the first epoch and
                then every ``displayUpdate`` epochs; ``0`` disables the
                periodic reports.
        """
        # accept array-likes, then append a column of ones (bias trick)
        X = np.asarray(X)
        X = np.c_[X, np.ones((X.shape[0]))]

        for epoch in range(epochs):
            # loop over each individual data point and train on it
            for (x, target) in zip(X, y):
                self.fit_partial(x, target)

            # check to see if we should display a training update
            if epoch == 0 or (
                displayUpdate and (epoch + 1) % displayUpdate == 0
            ):
                loss = self.calculate_loss(X, y)
                print("[INFO] epoch={}, loss={:.7f}".format(epoch + 1, loss))

    def fit_partial(self, x, y):
        """Run one forward/backward pass for a single sample and update ``W``.

        Args:
            x: one input sample with the bias entry already appended.
            y: the target for that sample.
        """
        # FEEDFORWARD
        # list of activations per layer; the first activation is just the
        # input feature vector itself
        A = [np.atleast_2d(x)]
        for weights in self.W:
            # net input = previous activation . weights, then squash it
            A.append(self.sigmoid(A[-1].dot(weights)))

        # BACKPROPAGATION
        # error between the prediction and the target
        error = A[-1] - y

        # the first delta is the output error times the derivative of the
        # activation function at the output value
        D = [error * self.sigmoid_deriv(A[-1])]

        # walk backwards over the hidden layers applying the chain rule:
        # the delta of a layer is the most recently computed delta dotted
        # with the transposed weights leaving that layer, times the
        # derivative of the layer's activation
        for layer in range(len(A) - 2, 0, -1):
            delta = D[-1].dot(self.W[layer].T)
            delta = delta * self.sigmoid_deriv(A[layer])
            D.append(delta)

        # deltas were collected output-first; reverse them ONCE, after all of
        # them are known, so that D[layer] lines up with W[layer]
        D = D[::-1]

        # WEIGHT UPDATE PHASE (only after every delta has been computed)
        for layer in range(len(self.W)):
            self.W[layer] += -self.alpha * A[layer].T.dot(D[layer])

    def predict(self, X, addBias=True):
        """Forward-propagate ``X`` and return the output activations.

        Args:
            X: one sample or a matrix with one sample per row.
            addBias: append the bias column of ones to ``X`` first; pass
                ``False`` when ``X`` already contains it.
        """
        p = np.atleast_2d(X)

        # check to see if the bias column should be added
        if addBias:
            p = np.c_[p, np.ones((p.shape[0]))]

        # propagate through every layer from first to last
        for weights in self.W:
            p = self.sigmoid(np.dot(p, weights))

        return p

    def calculate_loss(self, X, targets):
        """Return half the sum of squared errors of the predictions on ``X``.

        Args:
            X: samples that ALREADY contain the bias column.
            targets: expected outputs for the rows of ``X``.
        """
        predictions = self.predict(X, addBias=False)
        targets = np.asarray(targets)
        if targets.ndim == 1 and predictions.shape[1] == 1:
            # a flat vector of N targets for a single-output network must be
            # a column; as a (1, N) row it would broadcast against the
            # (N, 1) predictions into an (N, N) matrix and inflate the loss
            targets = targets.reshape(-1, 1)
        else:
            targets = np.atleast_2d(targets)
        loss = 0.5 * np.sum((predictions - targets) ** 2)

        return loss
            