In [1]:
%matplotlib inline

# MNIST FFD NN
## Classifying the MNIST dataset with a feed-forward network

For this example we use the MNIST dataset, a standard benchmark for neural networks; even a small network can achieve good performance on it. MNIST is a collection of handwritten digits from 0 to 9, so this is a multiclass classification problem. Each instance is a 28 by 28 image unrolled into a vector of 784 pixels.

In [2]:
import numpy as np
import matplotlib.pyplot as plt

### Let's load our dataset

In [3]:
import _pickle as pkl
import gzip

Unpacks the MNIST data and chunks it into minibatches

In [4]:
def label_to_bit_vector(labels, nbits):
    """Return the one-hot ("bit vector") encoding of integer class labels.

    Parameters
    ----------
    labels : 1-D array-like of int
        One class index per example.
    nbits : int
        Width of each bit vector, i.e. the number of columns in the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(labels), nbits)`` that is 0.0 everywhere
        except for a 1.0 at ``[i, labels[i]]`` in every row ``i``.

    Raises
    ------
    IndexError
        If a label does not address a valid column of the result.
    """
    labels = np.asarray(labels)
    n = labels.shape[0]
    bv = np.zeros((n, nbits))
    # Guard the empty case: an empty list converts to a float array, which
    # NumPy refuses to use as an index.
    if n:
        # One fancy-indexed assignment sets bv[i, labels[i]] for every row at
        # once instead of looping over the rows in Python.
        bv[np.arange(n), labels] = 1.0
    return bv

In [5]:
def create_minibatches(data, labels, batch_size, create_bit_vector=False, nbits=10):
    """Split a dataset into consecutive, equally sized minibatches.

    Only full batches are produced: when ``batch_size`` does not evenly divide
    the number of examples a warning is printed and the trailing examples that
    do not fill a batch are left out.

    Parameters
    ----------
    data : 2-D array
        One example per row.
    labels : 1-D array
        One label per row of ``data``.
    batch_size : int
        Number of examples per minibatch; must be at least 1.
    create_bit_vector : bool, optional
        When True each batch of labels is converted with
        ``label_to_bit_vector``; otherwise the label slices are returned as-is.
    nbits : int, optional
        Bit-vector width used when ``create_bit_vector`` is True (default 10).

    Returns
    -------
    (list, list)
        The data batches and the matching label batches, in order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive step would never advance through the data.
        raise ValueError(
            "create_minibatches(): batch size must be >= 1, got {}".format(batch_size))

    N = data.shape[0]

    print("Total number of examples: {}".format(N))

    if N % batch_size != 0:
        print("create_minibatches(): batch size {} does not "
              "evenly divide number of examples {}".format(batch_size, N))

    chunked_data = []
    chunked_labels = []

    # The range stops at the last index where a complete batch still fits.
    for start in range(0, N - batch_size + 1, batch_size):
        stop = start + batch_size
        chunked_data.append(data[start:stop, :])

        batch_labels = labels[start:stop]
        if create_bit_vector:
            batch_labels = label_to_bit_vector(batch_labels, nbits)
        chunked_labels.append(batch_labels)

    return chunked_data, chunked_labels
        

In [6]:
# Load the pickled MNIST splits. The archive unpickles to three items, bound
# here to train_set, valid_set and test_set.
# NOTE: unpickling can execute arbitrary code, so only load an mnist.pkl.gz
# obtained from a trusted source.
# The with-block closes the file even if unpickling raises.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pkl.load(f, encoding='iso-8859-1')

In [7]:
# Split the training and validation splits into minibatches of
# `minibatch_size` examples, with the labels converted to bit vectors.
minibatch_size = 100
print("Creating minibatch of size {}".format(minibatch_size))
print("Training:")
train_data, train_labels = create_minibatches(train_set[0], train_set[1],
                                             minibatch_size,
                                             create_bit_vector=True)
# NOTE: the "Testing:" banner below is printed for the validation split
# (valid_set); test_set is not batched in this cell.
print("Testing:")
valid_data, valid_labels = create_minibatches(valid_set[0], valid_set[1],
                                             minibatch_size,
                                             create_bit_vector=True)

print("Minibatch of size {} created".format(minibatch_size))
# len(train_data) is the number of minibatches, not the number of examples.
print("Length of training data:",len(train_data))

Creating minibatch of size 100
Training:
Total number of examples: 50000
Testing:
Total number of examples: 10000
Minibatch of size 100 created
Length of training data: 500


### Let's plot our data

In [8]:
def bit_vector_to_label(bv):
    """Return the positions of the non-zero entries of ``bv`` as text.

    The positions are zero-based and joined with commas, so a one-hot vector
    yields a single number and an all-zero or empty vector yields ``""``.
    """
    hot_positions = [str(position) for position, bit in enumerate(bv) if bit != 0]
    return ",".join(hot_positions)
    
def show_a_single_mnist_digit(data):
    """Display one digit as a 28x28 grayscale image.

    ``data`` is copied before being reshaped to ``(28, 28)``, so it must hold
    exactly 784 values; the caller's array is left untouched.
    """
    image = data.copy()
    image = image.reshape((28, 28))
    plt.imshow(image, cmap='gray')
    plt.show()
    
def show_mnist_digits(data,label):
    """Plot a grid of digits, ten per row, each titled with its label.

    Input:
        a) data:
                type:     array
                contains: one entry per digit, each reshapeable to 28x28
        b) label:
                type:     array
                contains: one bit-vector label per entry of ``data``; it is
                          rendered in the subplot title via
                          ``bit_vector_to_label``
    """
    count = len(data)
    # The grid always has count // 10 + 1 rows, so a count that is a multiple
    # of 10 leaves the final row empty.
    n_rows = count // 10 + 1
    figure = plt.figure(figsize=(16, (count // 20 + 1) * 3))

    # Subplot positions are 1-based, hence start=1.
    for position, digit in enumerate(data, start=1):
        axes = figure.add_subplot(n_rows, 10, position)
        axes.imshow(digit.copy().reshape((28, 28)), cmap='gray')
        caption = "Predicted:{}".format(bit_vector_to_label(label[position - 1]))
        axes.set_title(caption)
        axes.axis('off')

    plt.show()

#show_mnist_digits(train_data[0],train_labels[0])

### Let's implement our neural network

#### Activation functions
During training we need to compute the activation function values for each layer, as well as their derivatives.

In [9]:
def f_sigmoid(X, deriv=False):
    """Element-wise logistic sigmoid, or its derivative.

    Parameters
    ----------
    X : array-like
        Pre-activation values.
    deriv : bool, optional
        When True, return the derivative ``s * (1 - s)`` with ``s = sigmoid(X)``
        instead of the sigmoid itself.

    Returns
    -------
    numpy.ndarray
        Result with the same shape as ``X``.
    """
    X = np.asarray(X)
    s = 1 / (1 + np.exp(-X))
    if not deriv:
        return s
    # Reuse the value computed above rather than evaluating the sigmoid twice.
    return s * (1 - s)

def f_softmax(X):
    """Row-wise softmax of a 2-D array.

    Parameters
    ----------
    X : 2-D array-like
        One row of scores per example.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X`` in which every row is non-negative
        and sums to 1.
    """
    X = np.asarray(X)
    # Subtracting each row's maximum leaves the softmax unchanged
    # mathematically but keeps np.exp from overflowing to inf (which would
    # turn the division into inf / inf = nan) for large scores.
    shifted = X - np.max(X, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

#### Initializing the network
For this case the only hyperparameters of the network itself are the network size and the minibatch size (default=100). For MNIST the input layer has 784 units and the output layer has 10 units, so the network size is specified in the format [784, A, B, 10], where A is the number of units in the first hidden layer and B is the number of units in the second. The choice of learning rate is also crucial.

In [10]:
class Layer:
    """One layer of the feed-forward network: its activations and outgoing weights.

    Parameters
    ----------
    size : sequence of two items
        ``size[0]`` is the number of units used to shape this layer's
        placeholder matrices. For non-output layers the whole ``size`` is also
        the shape of the outgoing weight matrix ``W``; for an output layer
        only ``size[0]`` is used.
    minibatch_size : int
        Number of examples per minibatch, used to shape the placeholders.
    is_input : bool, optional
        Marks the input layer, which applies no activation and keeps no
        ``S`` / ``D`` matrices.
    is_output : bool, optional
        Marks the output layer, which has no outgoing weights.
    activation : callable, optional
        Called as ``activation(S)`` and, for hidden layers, also as
        ``activation(S, deriv=True)``.
    """

    def __init__(self, size, minibatch_size, is_input=False, is_output=False, activation=f_sigmoid):
        self.is_input = is_input
        self.is_output = is_output

        # Z holds the output values of this layer, one row per example.
        self.Z = np.zeros((minibatch_size, size[0]))

        # The activation function is defined externally; hidden layers also
        # call it with deriv=True to obtain its derivative.
        self.activation = activation

        # W is the outgoing weight matrix for this layer (None on the output layer).
        self.W = None

        # S holds the inputs (pre-activations) for this layer (None on the input layer).
        self.S = None

        # D holds the deltas for this layer (None on the input layer).
        self.D = None

        # Fp holds the derivatives of the activation function. It is only
        # filled in for hidden layers, but it is defined on every layer so the
        # attribute always exists under the same name the rest of the code uses.
        self.Fp = None

        if not is_input:
            self.S = np.zeros((minibatch_size, size[0]))
            self.D = np.zeros((minibatch_size, size[0]))

        if not is_output:
            # Small zero-mean Gaussian initial weights.
            self.W = np.random.normal(size=size, scale=1E-4)

        if not is_input and not is_output:
            self.Fp = np.zeros((size[0], minibatch_size))

    def forward_propagate(self):
        """Compute this layer's output and return what feeds the next layer.

        Returns ``Z.dot(W)`` for input and hidden layers, and the activated
        output ``Z`` itself for the output layer. Hidden layers also refresh
        ``Fp`` with the transposed activation derivative.
        """
        if self.is_input:
            # The input layer's Z is set by the caller; no activation is applied.
            return self.Z.dot(self.W)

        self.Z = self.activation(self.S)
        if self.is_output:
            return self.Z
        else:
            # For hidden layers, we add the bias values here (a trailing
            # column of ones).
            self.Z = np.append(self.Z, np.ones((self.Z.shape[0], 1)), axis=1)
            # Stored transposed so it lines up with the transposed deltas.
            self.Fp = self.activation(self.S, deriv=True).T
            return self.Z.dot(self.W)

In [11]:
class MLP:
    """Feed-forward network built from ``Layer`` objects and trained per minibatch.

    Parameters
    ----------
    layer_config : sequence of int
        Number of units per layer, input layer first and output layer last,
        e.g. ``[784, 100, 100, 10]``. One bias unit is added to every
        non-output layer.
    minibatch_size : int, optional
        Number of examples per minibatch, forwarded to every ``Layer``.
    """

    def __init__(self, layer_config, minibatch_size=100):
        self.layers = []
        self.num_layers = len(layer_config)
        self.minibatch_size = minibatch_size

        for i in range(self.num_layers-1):
            if i == 0:
                print( "Initializing input layer with size {0}.".format(
                    layer_config[i]
                ))
                # Here, we add an additional unit at the input for the bias
                # weight.
                self.layers.append(Layer([layer_config[i]+1, layer_config[i+1]],
                                         minibatch_size,
                                         is_input=True))
            else:
                print( "Initializing hidden layer with size {0}.".format(
                    layer_config[i]
                ))
                # Here we add an additional unit in the hidden layers for the
                # bias weight.
                self.layers.append(Layer([layer_config[i]+1, layer_config[i+1]],
                                         minibatch_size,
                                         activation=f_sigmoid))

        print( "Initializing output layer with size {0}.".format(
            layer_config[-1]
        ))
        self.layers.append(Layer([layer_config[-1], None],
                                 minibatch_size,
                                 is_output=True,
                                 activation=f_softmax))
        print( "Done!")

    def forward_propagate(self, data):
        """Run one minibatch through the network and return the output layer's values.

        ``data`` holds one example per row; a column of ones is appended to it
        as the bias input.
        """
        # We need to be sure to add bias values to the input
        self.layers[0].Z = np.append(data, np.ones((data.shape[0], 1)), axis=1)

        for i in range(self.num_layers-1):
            self.layers[i+1].S = self.layers[i].forward_propagate()
        return self.layers[-1].forward_propagate()

    def backpropagate(self, yhat, labels):
        """Compute the deltas ``D`` of the output and hidden layers.

        ``yhat`` is the network output for the current minibatch and
        ``labels`` the matching bit-vector targets. Deltas are stored
        transposed, i.e. with one column per example.
        """
        # Output delta is prediction minus target. NOTE(review): this matches
        # the gradient of a cross-entropy loss on softmax outputs, which is
        # presumably the intended loss -- confirm.
        self.layers[-1].D = (yhat - labels).T
        for i in range(self.num_layers-2, 0, -1):
            # We do not calculate deltas for the bias values
            W_nobias = self.layers[i].W[0:-1, :]

            self.layers[i].D = W_nobias.dot(self.layers[i+1].D) * \
                               self.layers[i].Fp

    def update_weights(self, eta):
        """Take one gradient-descent step on every weight matrix with step size ``eta``.

        The gradient is the product of the next layer's deltas and this
        layer's outputs; it is summed over the minibatch, not averaged.
        """
        for i in range(0, self.num_layers-1):
            W_grad = -eta*(self.layers[i+1].D.dot(self.layers[i].Z)).T
            self.layers[i].W += W_grad

    def _error_rate(self, data_batches, label_batches):
        """Return the fraction of misclassified examples over the given minibatches.

        The examples are counted while iterating, so no assumption is made
        about batch sizes. Returns ``nan`` when there are no examples.
        """
        errs = 0
        total = 0
        for b_data, b_labels in zip(data_batches, label_batches):
            output = self.forward_propagate(b_data)
            yhat = np.argmax(output, axis=1)
            # b_labels[row, predicted class] is 1 for a hit and 0 for a miss,
            # so summing (1 - that) counts the misses.
            errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])
            total += len(b_labels)

        if total == 0:
            return float('nan')
        return float(errs)/total

    def evaluate(self, train_data, train_labels, test_data, test_labels,
                 num_epochs=5, eta=0.05, eval_train=False, eval_test=True):
        """Train the network and print the classification error after each epoch.

        Parameters
        ----------
        train_data, train_labels : lists of minibatches
            Training examples and their bit-vector labels.
        test_data, test_labels : lists of minibatches
            Held-out examples and labels; only read when ``eval_test`` is True.
        num_epochs : int, optional
            Number of passes over the training minibatches.
        eta : float, optional
            Learning rate passed to ``update_weights``.
        eval_train, eval_test : bool, optional
            Whether to report the error on the training / held-out data.
        """
        print( "Training for {0} epochs...".format(num_epochs))
        for t in range(0, num_epochs):
            out_str = "[{0:4d}] ".format(t)

            # One epoch: forward pass, backward pass and weight update for
            # every training minibatch.
            for b_data, b_labels in zip(train_data, train_labels):
                output = self.forward_propagate(b_data)
                self.backpropagate(output, b_labels)
                self.update_weights(eta=eta)

            if eval_train:
                out_str = "{0} Training error: {1:.5f}".format(
                    out_str, self._error_rate(train_data, train_labels))

            if eval_test:
                out_str = "{0} Test error: {1:.5f}".format(
                    out_str, self._error_rate(test_data, test_labels))

            print(out_str)


In [12]:
# Build a network with 784 inputs, two hidden layers of 100 units and 10
# outputs. Layer draws its initial weights with np.random.normal and no seed
# is set in the cells shown, so results will differ from run to run.
mlp = MLP(layer_config=[784, 100, 100, 10], minibatch_size=minibatch_size)

Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!


In [13]:
# Train with the defaults (5 epochs, eta=0.05) and report the error after
# each epoch. The validation minibatches are passed in the test_* positions,
# so the printed "Test error" is measured on the validation split.
mlp.evaluate(train_data, train_labels, valid_data, valid_labels, eval_train=True)

Training for 5 epochs...
[   0]  Training error: 0.61094 Test error: 0.61530
[   1]  Training error: 0.08876 Test error: 0.07980
[   2]  Training error: 0.05556 Test error: 0.05560
[   3]  Training error: 0.05538 Test error: 0.05440
[   4]  Training error: 0.03196 Test error: 0.03730


In [22]:
# Inspect the input layer's weight matrix. Its last row multiplies the column
# of ones that MLP.forward_propagate appends to the data, i.e. it holds the
# bias weights.
mlp.layers[0].W

array([[ -3.07780350e-05,   1.13844859e-04,  -1.23798734e-04, ...,
         -2.68208778e-06,  -1.84540163e-04,   5.47450227e-05],
       [ -9.88863253e-05,  -1.05963993e-04,   9.05792993e-05, ...,
         -1.46710669e-04,   1.58891427e-04,   1.16185031e-04],
       [  1.21174084e-04,   1.85879528e-05,   1.33748243e-05, ...,
          1.32356735e-04,  -2.43572419e-05,  -2.58250311e-05],
       ..., 
       [  1.11701816e-04,   1.61332145e-07,  -1.13474152e-04, ...,
         -3.21694462e-05,   6.76692134e-05,  -8.00345519e-05],
       [  6.62623873e-05,   2.52190236e-05,  -4.72522387e-05, ...,
         -2.81403650e-04,  -7.16287270e-05,   5.30416326e-05],
       [ -1.39383856e+00,  -8.01127133e-01,  -1.12604888e-01, ...,
         -3.75319786e-01,  -8.08160311e-01,   1.89651562e-02]])