## An Explanation


In [43]:
import numpy as np
from scipy.special import expit as sigmoid
from scipy.optimize import fmin_cg

In [50]:
def forward(x, *Thetas):
    """Put a training sample through the network.
    
    Each layer's activation units will be calculated. Said another way,
    the data point will be fed into the NN as the input layer. The input
    will be "fed" into each unit in each layer by multiplying against the
    unit's weight vector. The result is the unit having an output 
    "activation" value. 
    
    This activation value roughly corresponds to its predictions for the 
    given sample. As the size of layers drifts from the original number of
    features this becomes more abstract; in fact, you could say each layer
    in the NN is a higher-level abstraction of the previous level. The
    output layer presents the _final_ abstraction, reducing _n_ features
    into a prediction across _K_ classes, in the case of a classification 
    problem.
    
    :arg vector x: (m x 1) row vector representing a data point with 
        _m_ features. It is assumed the bias value has _not_ been inserted.
    :arg [matrix] Theta: Matrices of weights for the connections between 
        layers. Size should be L-1, where L is the number of layers.
    :rtype: 2D array
    :return: a 2D array of activation values. Each row represents a layer, 
        and columns represent units in the array. Thus, the output layer's
        activations can be found at ``return_arr[-1]``.
    """
    assert x.ndim == 1, "Expected row vector, got {}".format(x)
    #: Activation units
    # Since 2D numpy arrays must have the same dimensions, we have to
    # our layer activation units in a normal Python list.
    a_units = list([x])
#     print(a_units)
    for i in range(len(Thetas)):
        # Add the bias value
        biased_a = np.append(np.ones(1), a_units[i])
#         assert biased_a.shape == (3,), "a_bias = {}".format(biased_a.shape)
#         print(biased_a)
        # Compute (a_i * Theta.T)
#         print(Thetas[i].T)
        z = biased_a.dot(Thetas[i].T)
#         print(z)
        a_units.append(sigmoid(z.T))
#         print(a_units)
    return a_units
    
    
def test_forward():
    # 2 features => 4 neuron => 4 outputs (classes)
    theta_1 = np.array([np.sin(np.arange(0, 5.9, 0.5))]).reshape((4, 3))
    theta_2 = np.array([np.sin(np.arange(0, 5.9, 0.3))]).reshape((4, 5))
    X = np.array([np.sin(np.arange(1,17))]).reshape([8, 2]);
    # Accumulate predictions
    preds = np.zeros((len(X), theta_2.shape[0]))
    for i, sample in enumerate(X):
        preds[i] = forward(sample, theta_1, theta_2)[-1]
    print(preds)
    # Select only the maxes
    max_p = np.argmax(preds, axis=1)
    exp_p = np.array([[4, 1, 1, 4, 4, 4, 4, 2]]).T
    assert np.array_equal(max_p+1, exp_p)

test_forward()

[[ 0.74086829  0.94067339  0.34084823  0.05432868]
 [ 0.79127816  0.92556804  0.27367472  0.06546035]
 [ 0.80134333  0.91853201  0.25884502  0.07100224]
 [ 0.74134848  0.94022222  0.34003037  0.05471739]
 [ 0.78070552  0.92968906  0.28812528  0.06239245]
 [ 0.80825702  0.91575985  0.2495236   0.07297472]
 [ 0.74414057  0.93918548  0.33618091  0.05554377]
 [ 0.77016492  0.93342417  0.30244605  0.05959466]]


AssertionError: 

In [None]:
def backprop(a_units, thetas, y, _lambda=0.01):
    """Perform backpropagation to determine layer gradients.
    
    :arg a_units: activated unit values, where each row represents a 
        layer.
    :arg thetas: Weight matrices.
    :arg y: Target values.
    :kwarg float _lambda: Regularization hyperparameter.
    :return: Gradients calculated per weight matrix. Same dimensions
        as the corresponding weight matrix, bias value included.
    """
    Deltas = list()
    # Calculate error between output layer and target values
    delta_output = a_units[-1, np.newaxis] - y
    # For each hidden layer, calculate gradient _going backwards_
    for i in range(len(a_units)-1, 0):
        
    
    return Deltas

In [10]:
class NeuralNetwork:
    
    def __init__(self, layers, _lambda=0.01):
        """Initialize the neural network's hyperparameters."""
        self._lambda = _lambda
        self.layers = layers
        self.thetas = None
        
    def init_layers(self, X, layers):
        """Initialize theta values for the layers inside the network."""
        # Determine theta matrice values
        # Randomly initialize weights
        return np.zeros((1,1))

    def train(self, X, y, verbose=False):
        """Train a neural network. 

        :arg matrix X: Training data points.
        :arg vector y: Training data outcomes.
        :arg [int] layers: Number of layers. The size of the first layer will
            be initialized from the # of columns in X. The size of y will not 
            be used, in case an array of predictions is returned.
        :kwarg float _lambda: Regularization parameter.
        :kwarg bool verbose: Verbosity.
        """
        if not self.thetas:
            self.thetas = init_layers(X, self.layers)
        for sample in X:
            # Forward propagation
            h = self.forward(sample)
            # Backward's propagation
            gradients = self.backward(h)
            # Gradient checking using grad_approx
            approx_grads = grad_approx(h) 
        # Gradient Descent at the end? Or every iteration?
        result = fmin_cg()
        k
    def forward(self, x):
        """Forward propagation for a single data point.
        
        :arg ndarray x: Data point as a row vector.
        :return: Column vector of predictions. Length is dependent
            on the number of rows in the last theta matrix.
        """
        return np.zeros((1,1))
    
    def backward(self, preds):
        """Backwards propagate a set of predictions through the network.
        
        The error of the predictions, expressed by gradients, will be used to 
        tune the weights for each layer.
        
        There is no output, as the model weights are updated in-place.
        """
        pass
        
    def predict(self, X):
        """Make predictions on the dataset.
        
        Very similar to forward propagation, except the prediction probabilities
        are translated into a vectorized binary outcome.
        """
        preds = np.zeros(X.shape)
        assert preds.shape == (X.shape[0], 1) # assert it's a column vector
        maxes = np.argmax(preds, axis=1)
        # Assert exactly one class was predicted 
        for p in maxes:
            assert p.sum() == 1
        return maxes