# In [3]:
import numpy as np

# In [27]:
class Network(object):
    """Fully connected sigmoid network trained with mini-batch gradient descent.

    Every layer applies a sigmoid activation. The cost is the mean
    cross-entropy of a softmax taken over the activations of the last layer,
    so the size of the last layer equals the number of classes.
    """

    def __init__(
        self,
        nodes,
        step_size=1
    ):
        """
        nodes is the number of nodes per layer, input layer included.
        eg: [2,3,5] is a 2 feature input,
        3 neurons in the first layer,
        5 neurons in the final layer.
        5 is therefore also the number of classes (nodes[-1]).

        step_size is the learning rate applied in update_weights.

        Attributes:
        nb_layers counts both the input and the output layers.
        weights[l] has shape (n + 1, m): n inputs plus one bias row, m outputs.
        outputs[l] is the activation matrix of layer l, of shape (p, m),
            where p is the number of examples given to fforw. After fforw
            the network inputs are additionally stored as outputs[-1].
        derivatives[l] is the sigmoid derivative of layer l, same shape as
            outputs[l].
        error_ds is the derivative of the cost w.r.t. the final activations.
        gradients[l] is the derivative of the cost w.r.t. the pre-activations
            of layer l, of shape (p, m).
        """
        self.step_size = step_size

        self.nb_layers = len(nodes)
        self.nodes = nodes
        # NOTE(review): all-ones initialisation keeps the units of a layer
        # symmetric (they receive identical updates). Kept as-is so results
        # stay deterministic; consider a random initialisation.
        self.weights = [
            np.ones((n_in + 1, n_out), dtype=float)
            for n_in, n_out in zip(nodes[:-1], nodes[1:])
        ]
        self.outputs = [0 for _ in nodes[1:]]
        self.derivatives = [0 for _ in nodes[1:]]
        self.error_ds = np.zeros(nodes[-1])
        self.gradients = [0 for _ in nodes[1:]]

    @staticmethod
    def _with_bias(x):
        """Return a copy of the (p, n) matrix x with a column of ones appended."""
        rows, cols = np.shape(x)
        padded = np.ones((rows, cols + 1))
        padded[:, :-1] = x
        return padded

    def fforw(self, inputs, labels):
        """
        Feed inputs forward through the network and cache what backprop needs.

        inputs is of shape (p, n): p examples with n features each.
        labels holds the class index of each example, shape (p,).

        Fills outputs, derivatives and error_ds. Returns nothing.
        """
        inputs = np.asarray(inputs, dtype=float)

        # A previous call left its inputs at the end of outputs; drop that
        # entry so the list does not grow with every training step.
        del self.outputs[len(self.weights):]

        activation = inputs
        for l, w in enumerate(self.weights):
            activation = self.sigmoid(np.dot(self._with_bias(activation), w))
            self.outputs[l] = activation
            self.derivatives[l] = self.sigmoid(activation, True)

        # Derivative of the cost w.r.t. the final activations, for backprop
        self.error_ds = self.softmaxLoss(activation, labels)

        # Keep the inputs reachable as outputs[-1]: update_weights reads the
        # input of layer i from outputs[i - 1], which is outputs[-1] for i == 0.
        self.outputs.append(inputs)

    def backprop(self):
        """
        Compute gradients[l] for every layer from the values cached by fforw.

        Each gradient is of shape p x m,
            where p is the number of examples, m is the number of output nodes from the layer
        """
        self.gradients[-1] = self.derivatives[-1] * self.error_ds

        # Walk from the second-to-last layer down to the first one.
        for index in range(len(self.derivatives) - 2, -1, -1):
            # The bias row of the next layer's weights is dropped: the bias
            # input is a constant and does not depend on this layer's output.
            next_weights = self.weights[index + 1][:-1]
            back_signal = np.dot(self.gradients[index + 1], next_weights.T)
            self.gradients[index] = self.derivatives[index] * back_signal

    def update_weights(self, inputs, labels):
        """
        Run one gradient descent step on a mini-batch.

        inputs is the given input for the network.
        Shape of inputs should be (n x m)
            Where n is the number of examples,
            m is the number of features
        labels are the correct labels for each example of shape (n,)
        """
        self.fforw(inputs, labels)
        self.backprop()
        for i in range(len(self.weights)):
            # outputs[i - 1] is the input of layer i (outputs[-1] are the
            # network inputs); the extra column of ones feeds the bias row.
            hat_o = self._with_bias(self.outputs[i - 1])
            # hat_o.T . gradients sums the per-example outer products.
            self.weights[i] -= self.step_size * np.dot(hat_o.T, self.gradients[i])

    def sigmoid(self, x, derivative=False):
        """
        Logistic function of x.

        With derivative=True, x must already be a sigmoid output and the
        derivative x * (1 - x) is returned instead.
        """
        if derivative:
            return x*(1-x)
        return 1/(1+np.exp(-x))

    def softmax(self, x):
        """Row-wise softmax of the 2-D array x."""
        # Subtracting the row maximum avoids overflow in exp without
        # changing the result.
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def softmaxLoss(self, X, y):
        """
        Derivative of the mean softmax cross-entropy cost w.r.t. X.

        X is of shape (m, c): one row of class scores per example.
        y holds the correct class index of each example, shape (m,).

        The cost itself would be mean(-log(p[i, y[i]])) with p = softmax(X);
        only its derivative (p - onehot(y)) / m is returned.
        """
        y = np.asarray(y)
        m = y.shape[0]
        dx = self.softmax(X)  # fresh array, safe to modify in place
        dx[np.arange(m), y] -= 1
        dx /= m
        return dx

# In [28]:
# Demo: a network with one hidden layer (3 neurons) and one output layer.
# It takes 2 input features and distinguishes 5 classes.
n = Network([2,3,5])

# Run a single weight update on a mini-batch of two examples:
#   - features: [1,2] for the first example, [3,4] for the second
#   - labels:   0 -> the first example belongs to class 1,
#               4 -> the second example belongs to class 5
features = [[1,2], [3,4]]
labels = np.array([0,4])
n.update_weights(features, labels)