In [19]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist

# Load the dataset.
# The result is unpacked into two (images, labels) pairs: the processing cell turns
# X1/Y1 into the training arrays and X2/Y2 into the test arrays.
(X1, Y1), (X2, Y2) = mnist.load_data()


In [20]:

# Process the dataset
m_train, m_test = X1.shape[0], X2.shape[0]

# Inputs: one flattened image per column, pixel values divided by 255
X_train = X1.reshape(m_train, -1).T / 255
X_test = X2.reshape(m_test, -1).T / 255

# Labels: flatten, then select rows of the 10x10 identity matrix to one-hot encode them;
# the transpose puts one example per column, matching the layout of X_train / X_test
Y_train_temp = Y1.reshape(m_train)
Y_train = np.eye(10)[Y_train_temp].T
Y_test_temp = Y2.reshape(m_test)
Y_test = np.eye(10)[Y_test_temp].T

In [21]:

# Defining activation functions
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise to z."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def leakyrelu(z):
    """
    Leaky ReLU: entries greater than zero pass through, all others are multiplied by 0.01.

    Returns:
    s -- the activated values
    activation_cache -- z itself, kept for the backward pass
    """
    is_positive = z > 0
    activated = np.where(is_positive, z, z * 0.01)
    return activated, z

def softmax(z):
    """
    Softmax along axis 0 (each column of z is normalised independently).

    Arguments:
    z -- pre-activation values

    Returns:
    s -- exp(z) normalised so that the entries along axis 0 sum to 1, same shape as z
    activation_cache -- z itself, kept for the backward pass
    """
    # Subtracting the per-column maximum leaves the result mathematically unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from turning into nan) for large inputs.
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    s = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)
    activation_cache = z
    return s, activation_cache

In [22]:
def initialize_parameters(X, Y, layer_sizes=(100, 50, 25, 10)):
    """
    Create the weights and biases of a fully connected network.

    Arguments:
    X -- input data; only X.shape[0] (the size of the input layer) is used
    Y -- labels; accepted for interface compatibility but not used
    layer_sizes -- number of units in each layer, from the first hidden layer to the output layer.
                   The default (100, 50, 25, 10) is the architecture this notebook trains.

    Returns:
    parameters -- python dictionary containing, for every layer l = 1..len(layer_sizes):
                    W{l} -- weight matrix of shape (layer_sizes[l-1], size of previous layer),
                            drawn with np.random.randn and scaled by 0.01
                    b{l} -- bias vector of shape (layer_sizes[l-1], 1), all zeros
    """
    parameters = {}
    previous_size = X.shape[0]

    # Layers are created in order so the sequence of np.random.randn draws (and therefore the
    # values obtained for a given seed) matches writing W1, W2, ... out one after another.
    for l, units in enumerate(layer_sizes, start=1):
        parameters["W" + str(l)] = np.random.randn(units, previous_size) * 0.01
        parameters["b" + str(l)] = np.zeros((units, 1), dtype=float)
        previous_size = units

    return parameters


In [23]:
def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, np.dot(W, A) + b
    cache -- a python tuple containing "A", "W" and "b" ; stored for computing the backward pass efficiently
    """
    # The docstring has to sit at the same indentation as the statements below; a deeper
    # indent on its opening line makes the function body fail to parse.
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

In [24]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "leakyrelu" or "softmax"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently

    Raises:
    ValueError -- if activation is neither "leakyrelu" nor "softmax"
    """
    # Reject unsupported names before doing any work; otherwise the mistake would only show up
    # as an unbound local variable when the cache tuple is built.
    if activation not in ("leakyrelu", "softmax"):
        raise ValueError('activation must be "leakyrelu" or "softmax", got {!r}'.format(activation))

    # The linear step is the same for both activations.
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == "leakyrelu":
        A, activation_cache = leakyrelu(Z)
    else:
        A, activation_cache = softmax(Z)

    cache = (linear_cache, activation_cache)
    return A, cache


In [25]:
def L_forward(X, parameters):
    """
    Run the full forward pass: "leakyrelu" layers 1..L-1 followed by a "softmax" layer L.

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"; L is taken as len(parameters) // 2

    Returns:
    AL -- activation value from the output (last) layer
    caches -- list with the cache returned by linear_activation_forward() for every layer,
              in layer order (L entries, indexed from 0 to L-1)
    """
    num_layers = len(parameters) // 2
    caches = []
    activation = X

    # Hidden layers
    for idx in range(1, num_layers):
        suffix = str(idx)
        activation, layer_cache = linear_activation_forward(
            activation, parameters["W" + suffix], parameters["b" + suffix], "leakyrelu"
        )
        caches.append(layer_cache)

    # Output layer
    last = str(num_layers)
    AL, output_cache = linear_activation_forward(
        activation, parameters["W" + last], parameters["b" + last], "softmax"
    )
    caches.append(output_cache)

    return AL, caches

In [26]:
def compute_cost(AL, Y):
    """
    Cross-entropy cost averaged over the examples.

    Arguments:
    AL -- predicted probabilities, same shape as Y
    Y -- true labels with one column per example (one-hot encoded in this notebook)

    Returns:
    cost -- cross-entropy cost, -sum(Y * log(AL)) / m with m = Y.shape[1]
    """
    m = Y.shape[1]

    # log(0) is -inf and 0 * -inf is nan, so a single probability of exactly zero would turn
    # the whole cost into nan. Clamp the probabilities away from zero before taking the log.
    eps = 1e-12
    safe_AL = np.maximum(AL, eps)

    # np.sum over the whole array already yields a scalar, so no squeeze is required.
    cost = -np.sum(Y * np.log(safe_AL)) / m

    return cost

In [27]:
def linear_backward(dZ , cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    # The docstring has to sit at the same indentation as the statements below; a deeper
    # indent on its opening line makes the function body fail to parse.
    A_prev, W, _ = cache  # the bias itself is not needed for any of the gradients
    m = A_prev.shape[1]

    # dW and db are averaged over the m examples (columns); dA_prev is not.
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [28]:
def softmax_backward(AL, Y):
    """Return AL - Y, the value linear_activation_backward uses as dZ for the "softmax" layer."""
    return AL - Y

In [29]:
def leakyrelu_backward(dA, activation_cache):
    """
    Backward pass of the leaky ReLU.

    Arguments:
    dA -- gradient with respect to the activation output
    activation_cache -- the pre-activation Z stored during the forward pass

    Returns:
    dZ -- dA multiplied by 1 where Z > 0 and by 0.01 everywhere else
    """
    local_slope = np.where(activation_cache > 0, 1, 0.01)
    return dA * local_slope

In [30]:
def linear_activation_backward(Y, AL, dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    Y -- true labels; only read when activation is "softmax"
    AL -- output of the last layer; only read when activation is "softmax"
    dA -- post-activation gradient for current layer l; only read when activation is "leakyrelu"
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation used in this layer, stored as a text string: "leakyrelu" or "softmax"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is neither "leakyrelu" nor "softmax"
    """
    # Reject unsupported names before doing any work; otherwise the mistake would only show up
    # as an unbound local variable at the return statement.
    if activation not in ("leakyrelu", "softmax"):
        raise ValueError('activation must be "leakyrelu" or "softmax", got {!r}'.format(activation))

    linear_cache, activation_cache = cache

    if activation == "leakyrelu":
        dZ = leakyrelu_backward(dA, activation_cache)
    else:
        # The softmax layer gets dZ straight from the network output and the labels.
        dZ = softmax_backward(AL, Y)

    # The linear step is the same for both activations.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db
        

In [31]:
def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->LEAKYRELU] * (L-1) -> LINEAR -> SOFTMAX network

    Arguments:
    AL -- probability array, output of the forward propagation (L_forward())
    Y -- true labels; reshaped to AL.shape before use
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "leakyrelu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "softmax" (it's caches[L-1])

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] for l = 0..L-1
             grads["dW" + str(l)] for l = 1..L
             grads["db" + str(l)] for l = 1..L
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)

    # Output layer. For "softmax", linear_activation_backward builds dZ from AL and Y and never
    # reads its dA argument, so None is passed instead of computing -Y/AL, which would only
    # cost time and emit divide-by-zero warnings whenever AL contains a zero.
    current_cache = caches[L-1]
    dA_prev_temp, dW_temp, db_temp = linear_activation_backward(Y, AL, None, current_cache, "softmax")
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp

    # Hidden layers, from L-1 down to 1; each step consumes the dA produced by the layer above.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(Y, AL, dA_prev_temp, current_cache, "leakyrelu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l+1)] = dW_temp
        grads["db" + str(l+1)] = db_temp

    return grads
    

In [32]:
def update_parameters(params,grads,learning_rate):
    """
    Apply one gradient-descent step to every weight matrix and bias vector.

    Arguments:
    params -- python dictionary containing the parameters "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary containing the gradients "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- factor applied to each gradient before it is subtracted

    Returns:
    parameters -- a shallow copy of params in which every "W" + str(l) and "b" + str(l)
                  has been replaced by its updated value; params itself is not modified
    """
    parameters = params.copy()
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = params[key] - learning_rate * grads["d" + key]

    return parameters

In [33]:
def random_mini_batches(X,Y, mini_batch_size = 64):
    """
    Shuffle the columns of X and Y with one shared random permutation and cut them into mini-batches.

    Arguments:
    X -- array whose columns are the examples
    Y -- array indexed with the same column permutation as X
    mini_batch_size -- number of columns in each complete mini-batch

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples; when the number of columns is
                    not a multiple of mini_batch_size, a final smaller batch holds the leftovers
    """
    m = X.shape[1]

    order = np.random.permutation(m)
    shuffled_X = X[:, order]
    shuffled_Y = Y[:, order]

    full_batches, leftover = divmod(m, mini_batch_size)

    # Complete batches
    bounds = [(k * mini_batch_size, (k + 1) * mini_batch_size) for k in range(full_batches)]
    mini_batches = [(shuffled_X[:, lo:hi], shuffled_Y[:, lo:hi]) for lo, hi in bounds]

    # Trailing partial batch
    if leftover != 0:
        tail_start = full_batches * mini_batch_size
        mini_batches.append((shuffled_X[:, tail_start:], shuffled_Y[:, tail_start:]))

    return mini_batches

In [34]:
def get_predictions(Y_hat):
    """Return, for every column of Y_hat, the row index holding its largest value."""
    return np.argmax(Y_hat, axis=0)

def get_accuracy(predictions,Y):
    """
    Count how many predicted classes hit a 1 in Y and report the share as a percentage string.

    Arguments:
    predictions -- 1-D integer array with one predicted row index per column of Y
    Y -- label array with one column per example; a prediction counts as correct when
         Y[prediction, column] == 1

    Returns:
    The percentage of correct predictions as a string ending in '%'.
    The number of correct predictions is also printed.
    """
    m = Y.shape[1]
    predictions = predictions.reshape(1,predictions.shape[0])
    predicted_rows = predictions[0, :m]

    # Without this check a length-1 predictions array would silently broadcast against the
    # column index below instead of failing.
    if predicted_rows.shape[0] < m:
        raise IndexError("predictions has fewer entries than Y has columns")

    # One fancy-indexing lookup replaces a Python-level loop over every example.
    hits = Y[predicted_rows, np.arange(m)] == 1
    ans = int(hits.sum())

    print(ans)
    return str((ans/m)*100) + '%'

In [35]:

def model(X, Y, learning_rate = 0.0075, num_iterations = 3000, print_cost = False):
    """
    Train the [LINEAR->LEAKYRELU]*(L-1)->LINEAR->SOFTMAX network with full-batch gradient descent.

    Arguments:
    X -- data, numpy array with one column per example
    Y -- true labels, numpy array with one column per example (one-hot encoded in this notebook)
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, the cost and accuracy are printed every 100 iterations.
                  The last iteration is always printed, whatever the value of this flag.

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    costs -- list with the cost of every 100th iteration and of the last iteration
    Y_predict -- one-hot numpy array, same shape as the network output, marking the arg-max
                 class of each column of the output of the last forward pass
    """
    costs = []
    parameters = initialize_parameters(X,Y)
    AL = None

    for i in range(0, num_iterations):
        AL, caches = L_forward(X,parameters)
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters,grads,learning_rate)

        # The loop index stops at num_iterations - 1, so that is the value identifying the
        # final iteration (comparing against num_iterations itself can never match).
        is_last = i == num_iterations - 1
        should_print = (print_cost and i % 100 == 0) or is_last
        should_record = i % 100 == 0 or is_last

        # The cost is only needed on iterations that print or record it.
        if should_print or should_record:
            cost = compute_cost(AL,Y)

        if should_print:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            print("accuracy : " , get_accuracy(get_predictions(AL),Y))

        if should_record:
            costs.append(cost)

    if AL is None:
        # No iteration ran; use the freshly initialised network so Y_predict is still defined.
        AL, _ = L_forward(X,parameters)

    # Built once after training instead of on every iteration. As in the loop, AL stems from
    # the forward pass that preceded the final parameter update.
    Y_predict = np.zeros(AL.shape)
    Y_predict[np.argmax(AL, axis = 0), np.arange(AL.shape[1])] = 1

    return parameters,costs, Y_predict

In [36]:
# Train on the full training set.
# NOTE(review): in the recorded run the cost jumps around (0.50 at iteration 1000, 3.21 at
# iteration 1100), which suggests a learning rate of 0.8 is on the high side - worth confirming.
parameters, costs, Y_predict = model(
    X_train,
    Y_train,
    learning_rate=0.8,
    num_iterations=2500,
    print_cost=True,
)

Cost after iteration 0: 2.302585642974819
5156
accuracy :  8.593333333333334%
Cost after iteration 100: 2.3011558163568706
6742
accuracy :  11.236666666666666%
Cost after iteration 200: 2.3011516013839075
6742
accuracy :  11.236666666666666%
Cost after iteration 300: 2.301141747687725
6742
accuracy :  11.236666666666666%
Cost after iteration 400: 2.3011033446568465
6742
accuracy :  11.236666666666666%
Cost after iteration 500: 2.300350834684254
6742
accuracy :  11.236666666666666%
Cost after iteration 600: 2.070144149579087
13211
accuracy :  22.018333333333334%
Cost after iteration 700: 2.251270035035066
7157
accuracy :  11.928333333333335%
Cost after iteration 800: 1.2424526680424275
28185
accuracy :  46.975%
Cost after iteration 900: 1.5757071058293326
20467
accuracy :  34.111666666666665%
Cost after iteration 1000: 0.5046285050816177
49035
accuracy :  81.72500000000001%
Cost after iteration 1100: 3.2116518083707457
28107
accuracy :  46.845%
Cost after iteration 1200: 0.2576492593134

In [49]:
def accuracy_test(parameters, X, Y):
    """
    Run a forward pass on X, print the accuracy against Y and return the predictions.

    Arguments:
    parameters -- dictionary of trained weights and biases, as returned by model()
    X -- data, numpy array with one column per example
    Y -- true labels for X, with one column per example

    Returns:
    AL -- output of the last layer for X
    Y_predict -- one-hot numpy array, same shape as AL, marking the arg-max class of each column of AL
    """
    AL, _ = L_forward(X,parameters)
    print("accuracy : " , get_accuracy(get_predictions(AL),Y))

    # Build the one-hot predictions from this forward pass. Falling back on a notebook-level
    # Y_predict would return whatever an earlier cell left there, i.e. predictions for other data.
    Y_predict = np.zeros(AL.shape)
    Y_predict[np.argmax(AL, axis = 0), np.arange(AL.shape[1])] = 1

    return AL, Y_predict

In [52]:
# Evaluate the trained parameters on the held-out test split.
AL, Y_predict = accuracy_test(parameters=parameters, X=X_test, Y=Y_test)



9551
accuracy :  95.50999999999999%
