In [3]:
# NN to recognize hand-written digits using the MNIST data
using DelimitedFiles
using StatsBase
using Distributions
using LinearAlgebra

# load the MNIST data: comma-separated integers, one observation per line.
# the files are read in the order test inputs, test labels,
# training inputs, training labels.
const testx, testy, trainx, trainy = map(
    path -> readdlm(path, ',', Int, '\n'),
    ("testx.csv", "testy.csv", "trainx.csv", "trainy.csv"),
)

# network architecture
const sizes = [784, 30, 10] # number of neurons in each layer
const L = length(sizes)     # number of layers including input and output

# the activation function: the logistic sigmoid 1/(1 + exp(-z)).
# works on a scalar or, elementwise, on an array.
f(z) = @. 1 / (1 + exp(-z))

# derivative of the sigmoid, f(z) * (1 - f(z)), scalar or elementwise.
# the sigmoid is evaluated once and reused rather than computed twice.
function fprime(z)
    s = f(z)
    return @. s * (1 - s)
end

# one-hot encode a digit d in 0..9 as a 10-element integer vector:
# position d+1 holds a 1 and every other position holds a 0.
# e.g. 6 becomes [0,0,0,0,0,0,1,0,0,0]
function digit2vector(d)
    leading = zeros(Int, d)        # d zeros before the 1
    trailing = zeros(Int, 9 - d)   # the remaining zeros after it
    return [leading; 1; trailing]
end

# forward pass through the network for a single example.
# W,b contain the weights, biases in the network: W[l], b[l] connect
# layer l to layer l+1, so the network has length(W)+1 layers.
# x is the input of a single example (a vector whose length matches
# the number of columns of W[1]).
# returns (a, z), each holding one vector per layer:
#   a[l] = activations of layer l, with a[1] = x stored as Float64
#   z[l] = weighted inputs to layer l; z[1] is an all-zero placeholder
#          so that the indices line up with a.
# both are returned so that they can be used during backpropagation.
function feedforward(W, b, x)
    nlayers = length(W) + 1
    a = Vector{Vector{Float64}}(undef, nlayers)
    z = Vector{Vector{Float64}}(undef, nlayers)

    a[1] = x                   # converted to Float64 on assignment
    z[1] = zeros(length(x))    # the input layer has no weighted input

    for l in 1:(nlayers - 1)
        z[l + 1] = W[l] * a[l] + b[l]
        a[l + 1] = f(z[l + 1])
    end
    return a, z
end

# given an input vector, return the predicted digit (0..9).
# the prediction is the output neuron with the largest activation;
# output neuron i stands for digit i-1. on ties the lowest digit wins,
# because argmax returns the first maximal index.
function classify(W, b, x)
    a, _ = feedforward(W, b, x)
    return argmax(a[end]) - 1
end

# helper function for backprop().
# this function computes the error for a single training example.
# W contains the weights in the network.
# a contains the activations (one vector per layer).
# z contains the weighted inputs (one vector per layer).
# y is the correct digit.
# returns δ = the error, one vector per layer with the same lengths as a
# (for the 784-30-10 network the sizes are [ 784, 30, 10 ]).
function compute_error(W, a, z, y)
    nlayers = length(a)
    δ = Vector{Vector{Float64}}(undef, nlayers)

    # note that δ[1] is junk. we put it there so that the indices make sense.
    δ[1] = zeros(length(a[1]))

    # at the output layer
    δ[nlayers] = -(digit2vector(y) .- a[nlayers]) .* fprime(z[nlayers])

    # for each earlier layer nlayers-1, nlayers-2, .., 2: propagate the error
    # of layer l+1 back through the weights W[l] that connect l to l+1
    for l in (nlayers - 1):-1:2
        δ[l] = W[l]' * δ[l + 1] .* fprime(z[l])
    end

    return δ
end

# helper function for backprop(). given the errors δ and the
# activations a for a single training example (one vector per layer),
# this function returns the gradient components ∇W and ∇b.
# this function implements the equations BP3 and BP4:
#   ∇W[l] = δ[l+1] * a[l]'   (outer product)
#   ∇b[l] = δ[l+1]
# for l = 1 .. length(a)-1. ∇b[l] is the same array as δ[l+1], not a copy.
function compute_gradients(δ, a)
    nconnections = length(a) - 1
    ∇W = [δ[l + 1] * transpose(a[l]) for l in 1:nconnections]
    ∇b = [δ[l + 1] for l in 1:nconnections]
    return ∇W, ∇b
end

# backpropagation for a single training example (input x, correct digit y):
# run the forward pass, compute the per-layer errors, and turn them
# into the gradient components. returns ∇W and ∇b.
function backprop(W, b, x, y)
    activations, weighted_inputs = feedforward(W, b, x)
    errors = compute_error(W, activations, weighted_inputs, y)
    return compute_gradients(errors, activations)
end

# gradient descent algorithm: one step on a single batch.
# W = weights in the network
# b = biases in the network
# batch = the indices of the observations in the batch, i.e. the rows of trainx
# α = step size
# λ = regularization parameter (applied to the weights only, not the biases)
# returns the updated W and b (the arguments are not mutated) together with
# ∇W, ∇b of the LAST example in the batch.
# throws an ArgumentError if the batch is empty.
function GD(W, b, batch; α=0.01, λ=0.01)
    m = length(batch)    # batch size
    # an empty batch would make 1/m infinite and turn every weight into NaN
    m > 0 || throw(ArgumentError("batch must contain at least one observation"))

    # data structure to accumulate the sum over the batch.
    # in the notes and in Ng's article sumW is ΔW and sumb is Δb.
    sumW = [zeros(size(w)) for w in W]
    sumb = [zeros(size(v)) for v in b]

    # declared here so that the gradients of the last example
    # are still visible after the loop
    local ∇W, ∇b

    # for each training example in the batch, use backprop
    # to compute the gradients and add them to the sum (in place)
    for idx in batch
        x = trainx[idx, :]
        y = trainy[idx]
        ∇W, ∇b = backprop(W, b, x, y)
        for l in eachindex(sumW)
            sumW[l] .+= ∇W[l]
            sumb[l] .+= ∇b[l]
        end
    end

    # make the update to the weights and biases and take a step
    # of gradient descent. note that we use the average gradient.
    W = W - α*((1/m)*sumW + λ*W)
    b = b - α*(1/m)*sumb

    # return the updated weights and biases. we also return the gradients
    return W, b, ∇W, ∇b
end

# classify the test data and compute the classification accuracy,
# i.e. the fraction of rows of testx whose predicted digit equals
# the corresponding entry of testy (the hit rate).
# testy is read with linear indexing, so the result does not depend on
# whether the labels were loaded as a column or as a row.
function accuracy(W, b)
    ntest = length(testy)
    hits = count(i -> classify(W, b, testx[i, :]) == testy[i], 1:ntest)
    return hits / ntest
end

# train the neural network using batch gradient descent.
# this is a driver function to repeatedly call GD().
# N = number of observations in the training data.
# m = batch size (a positive integer)
# epochs = number of complete passes through the training data
# α = learning rate / step size
# λ = regularization parameter
# returns the trained W and b, plus the ∇W, ∇b returned by the last
# call to GD() (all zeros if no batch was processed).
# the batches are the consecutive index blocks 1:m, m+1:2m, ... in the
# same order every epoch. when m does not divide N, the final shorter
# block is used as well, so that every observation is visited.
# throws an ArgumentError if m is not positive.
function BGD(N, m, epochs; α=0.01, λ=0.01)
    m > 0 || throw(ArgumentError("batch size m must be positive, got $m"))

    # random initialization of the weights and biases
    d = Normal(0, 1)
    nconnections = length(sizes) - 1
    # W[l] and b[l] connect layer l to layer l+1
    W = [rand(d, sizes[l + 1], sizes[l]) for l in 1:nconnections]
    b = [rand(d, sizes[l + 1]) for l in 1:nconnections]
    # gradients to return in case no batch is ever processed
    ∇W = [zeros(size(w)) for w in W]
    ∇b = [zeros(size(v)) for v in b]

    for epoch in 1:epochs
        for batch in Iterators.partition(1:N, m)
            W, b, ∇W, ∇b = GD(W, b, batch, α=α, λ=λ)
        end
        # monitor the progress of the training after each epoch
        println("epoch: ", epoch, ", accuracy: ", accuracy(W, b))
    end
    return W, b, ∇W, ∇b
end




BGD (generic function with 1 method)

In [5]:
# tuning parameters for the training run
m, epochs = 25, 20    # batch size, number of complete passes through the training data
α, λ = 0.01, 0.01     # learning rate / step size, regularization parameter
N = length(trainy)    # number of training observations

# train the network
W, b, ∇W, ∇b = BGD(N, m, epochs; α=α, λ=λ)


epoch: 1, accuracy: 0.1434
epoch: 2, accuracy: 0.1864
epoch: 3, accuracy: 0.2282
epoch: 4, accuracy: 0.2923
epoch: 5, accuracy: 0.367
epoch: 6, accuracy: 0.4481
epoch: 7, accuracy: 0.5342
epoch: 8, accuracy: 0.6093
epoch: 9, accuracy: 0.6882
epoch: 10, accuracy: 0.7352
epoch: 11, accuracy: 0.7876
epoch: 12, accuracy: 0.831
epoch: 13, accuracy: 0.8555
epoch: 14, accuracy: 0.8751
epoch: 15, accuracy: 0.8873
epoch: 16, accuracy: 0.893
epoch: 17, accuracy: 0.8978
epoch: 18, accuracy: 0.8966
epoch: 19, accuracy: 0.8982
epoch: 20, accuracy: 0.9015


([[-0.002054729257847839 -0.0002632413771160021 … 0.01168977000423505 0.00850796961506093; -0.0005258110341069333 -0.005487683134411914 … 0.011861372335399923 0.0022917182150787863; … ; 0.00574914029846047 0.003223744209855617 … -0.0009772940898019895 -0.0023555221446821613; -0.011725435317082217 -0.010350120709560602 … 0.0124116899185815 0.005708423289927708], [-0.34809282643010725 -0.21694735451044755 … 0.12623793043554943 -0.18152038986360894; 0.18608913957296672 0.31601205289281215 … -0.33871735026340083 0.3536741046885771; … ; -0.32535437566742675 -0.2587736006878181 … 0.07160674657452111 -0.23545397692809056; -0.3415061761207853 -0.16060706294450464 … 0.05821835203555815 -0.2647092838700584]], [[1.2373395949501236, -1.3066808761539621, 0.3235473836038311, -0.16792332170400903, -0.06018646674486312, -1.1635500622827033, -0.08906716505267216, -0.7541810832909471, 0.2458851777458032, 1.5127465730967131  …  0.8298779381547795, -1.1845556290483528, 0.3820014142331733, -0.4021178360284

In [None]:
# After iterating through the training data 20 times (20 epochs) in batches of 25 observations with the
# gradient descent algorithm, I achieved an accuracy of just over 90% on the test data.
# This means that the neural network with the calculated weights and biases predicts the correct digit
# for about 90% of the test images.