In [21]:
import numpy as np
import numpy.random as npr

#one hidden layer

class NNet(object):
    """Feed-forward neural network with one hidden layer and sigmoid units.

    Weights are not stored on the instance: ``initializeWeight`` creates them
    and ``forward`` / ``w_gradient`` / ``L2`` receive them as arguments.
    """

    def __init__(self, n_features, n_hidden, n_output, eta=0.01):
        '''
        n_features: number of features of the very first layer, constant
                    not included

        n_hidden:   number of features of the hidden layer

        n_output:   number of output (classification) of the last layer

        eta:        stored on the instance; presumably the learning rate
                    (it is not used by any method of this class)
        '''
        # The layer sizes must be kept: initializeWeight reads them.
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.eta = eta

    #####################################################
    #
    #         Forward computation utility functions
    #
    #####################################################

    def initializeWeight(self):
        """Draw both weight matrices uniformly from [-1, 1).

        Returns:
            w1: (n_hidden, n_features + 1) input -> hidden weights
            w2: (n_output, n_hidden + 1)   hidden -> output weights

        Column 0 of each matrix multiplies the constant (bias) unit.
        """
        # Request the 2-D shape directly; ndarray.reshape returns a new array
        # and does not modify in place, so a bare ``w.reshape(...)`` is a no-op.
        w1 = npr.uniform(-1.0, 1.0,
                         size=(self.n_hidden, self.n_features + 1))
        w2 = npr.uniform(-1.0, 1.0,
                         size=(self.n_output, self.n_hidden + 1))
        return w1, w2

    # phi function at each cell
    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
        return 1.0 / (1.0 + np.exp(-z))

    def forward(self, X, w1, w2):
        """Propagate a batch of samples through the network.

        Args:
            X:  (N, n_features) data matrix, one sample per row.
            w1: (n_hidden, n_features + 1) input -> hidden weights.
            w2: (n_output, n_hidden + 1) hidden -> output weights.

        Returns:
            a_input:      (N, n_features + 1) X with a leading column of ones
            z_hidden:     (n_hidden, N) hidden pre-activations
            new_a_hidden: (n_hidden + 1, N) hidden activations with a
                          leading row of ones
            z_output:     (n_output, N) output pre-activations
            a_output:     (n_output, N) output activations
        """
        # Prepend the constant 1 as the first column of every sample.
        # np.ones takes the shape as ONE tuple argument.
        a_input = np.ones((X.shape[0], X.shape[1] + 1))
        a_input[:, 1:] = X

        # Hidden layer: samples are columns from here on.
        z_hidden = w1.dot(a_input.T)
        a_hidden = self.sigmoid(z_hidden)

        # Prepend the constant 1 as the first row of the hidden activations.
        # The size is taken from a_hidden so it always agrees with w1.
        new_a_hidden = np.ones((a_hidden.shape[0] + 1, a_hidden.shape[1]))
        new_a_hidden[1:, :] = a_hidden

        # Output layer.
        z_output = w2.dot(new_a_hidden)
        a_output = self.sigmoid(z_output)

        # return result of every layer
        return a_input, z_hidden, new_a_hidden, z_output, a_output

    #######################################################
    #
    #         Backward computation utility functions
    #
    ########################################################

    def sigmoid_gradient(self, z):
        """Derivative of the sigmoid: sigmoid(z) * (1 - sigmoid(z))."""
        s = self.sigmoid(z)  # evaluate once instead of twice
        return s * (1.0 - s)

    def w_gradient(self, w1, w2, y, a_input, new_a_hidden, a_output, z_hidden):
        """Back-propagate the error and return the weight gradients.

        The gradients are those of the summed (not averaged) cross-entropy
        cost  -sum(y*log(a) + (1-y)*log(1-a))  with sigmoid output units,
        for which dCost/dz_output = a_output - y.

        Args:
            w1:           input -> hidden weights (unused; kept so the
                          signature is unchanged).
            w2:           (n_output, n_hidden + 1) hidden -> output weights.
            y:            targets; must have the same shape as a_output
                          (presumably one-hot, (n_output, N) -- confirm
                          against the caller).
            a_input, new_a_hidden, a_output, z_hidden:
                          the corresponding values returned by ``forward``.

        Returns:
            grad1: gradient w.r.t. w1, shape (n_hidden, n_features + 1)
            grad2: gradient w.r.t. w2, shape (n_output, n_hidden + 1)
        """
        # delta = dCost/dz, so the output error is (prediction - target).
        delta3 = a_output - y

        # z_hidden is needed because the chain rule multiplies the
        # back-propagated error by the activation derivative at z_hidden.
        # The bias column of w2 is skipped: the constant unit has no
        # incoming weights, so no error flows back through it.
        delta2 = w2[:, 1:].T.dot(delta3) * self.sigmoid_gradient(z_hidden)

        # Sum over the N samples via the matrix products.
        grad1 = delta2.dot(a_input)
        grad2 = delta3.dot(new_a_hidden.T)
        return grad1, grad2

    ########################################
    #
    #         Regularization
    #
    #######################################
    def L2(self, w1, w2):
        """Return the sum of squares of all entries of w1 and w2.

        Note that the bias columns are included and no regularization
        strength is applied here.
        """
        return np.sum(w1 * w1) + np.sum(w2 * w2)
        
        

The pipeline is
$$a \xrightarrow{\;w\;} z \xrightarrow{\;\tanh\;} a \xrightarrow{\;w\;} z \cdots$$

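To do backpropagation, we need to compute
$$\delta ^{(l)}_j := \frac{\partial \mathrm{Error}}{\partial z^{(l)}_j}
= \tanh'\left(z^{(l)}_j\right) \sum_k \delta ^{(l+1)}_k \, w _{jk}^{(l+1)}$$

Here $\tanh$ stands for the activation function of the layer; the `NNet` class uses the logistic sigmoid, so its derivative takes the place of $\tanh'$ there.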

[ 0.          0.88079708  0.95257413  0.98201379  0.99330715]


  from ipykernel import kernelapp as app
