In [1]:
# NN to recognize hand-written digits using the MNIST data
using DelimitedFiles
using StatsBase
using Distributions
using LinearAlgebra

# read the MNIST data
const testx = readdlm("testx.csv", ',', Int, '\n')
const testy = readdlm("testy.csv", ',', Int, '\n')
const trainx = readdlm("trainx.csv", ',', Int, '\n')
const trainy = readdlm("trainy.csv", ',', Int, '\n')

const L = 3                 # number of layers including input and output
const sizes = [784, 30, 10] # number of neurons in each layer

# the activation function
@. f(z) = 1/(1 + exp(-z))      # sigmoid activation
@. fprime(z) = f(z) * (1-f(z))



fprime (generic function with 1 method)

In [2]:
# convert a digit d to a 10-element vector
# e.g. 6 is converted to [0,0,0,0,0,0,1,0,0,0]
function digit2vector(d)
    vcat( repeat([0], d), 1, repeat([0], 9-d) )
end



digit2vector (generic function with 1 method)

In [3]:
# a feedforward function that returns the activations
# from each layer and the weighted inputs to each layer
# so that they can be used during backpropagation.
# W,b contain the weights, biases in the network.
# x is the input of a single training example (a vector of length 784).
function feedforward(W, b, x)
    a = [x,zeros(30), zeros(10)]
    z = [x, zeros(30), zeros(10)]
    z[2] = W[1] * x + b[1]
    a[2] = f.(z[2])
    z[3] = W[2] * a[2] + b[2]
    a[3] = f.(z[3])
    return a, z
end

feedforward (generic function with 1 method)

In [None]:
Feed forward is calculated by hard coding the appropriate layers.

In [4]:
# given an input vector, return the predicted digit
function classify(W, b, x)
    find_max = findmax(x)[2]
    return find_max
end


classify (generic function with 1 method)

In [5]:
# helper function for backprop().
# this function computes the error for a single training example.
# W contains the weights in the network.
# a contains the activations.
# z contains the weighted inputs.
# y is the correct digit.
# returns δ = the error. the size of δ is [ 784, 30, 10 ]
function compute_error(W, a, z, y)
    δ = [ zeros(sizes[1]), zeros(sizes[2]), zeros(sizes[3]) ]
    # note that δ[1] is junk. we put it there so that the indices make sense.

    # at the output layer L
    δ[3] = -(digit2vector(y) .- a[3]) .* fprime(z[3])

    # for each earlier layer L-1,L-2,..,2 (for the HW, this means only layer 2)
    δ[2] = W[2]' * δ[3] .* fprime(z[2])

    return δ
end

compute_error (generic function with 1 method)

In [6]:
# helper function for backprop(). given the errors δ and the
# activations a for a single training example, this function returns
# the gradient components ∇W and ∇b.
# this function implements the equations BP3 and BP4.
function compute_gradients(δ, a)

    ∇W = δ[3] .* transpose(a[3])
    ∇b = δ[3]
    return ∇W, ∇b
end


compute_gradients (generic function with 1 method)

This computes the gradients following the BG3 and BG4 methods 

In [7]:
# backpropagation. returns ∇W and ∇b for a single training example.
function backprop(W, b, x, y)
    (a, z) = feedforward(W, b, x)
    δ = compute_error(W, a, z, y)
    (∇W, ∇b) = compute_gradients(δ, a)
    return ∇W, ∇b
end

backprop (generic function with 1 method)

In [20]:
# gradient descent algorithm.
# W = weights in the network
# b = biases in the network
# batch = the indices of the observations in the batch, i.e. the rows of trainx
# α = step size
# λ = regularization parameter
function GD(W, b, batch; α=0.01, λ=0.01)
    m = length(batch)    # batch size
    
    # data structure to accumulate the sum over the batch.
    # in the notes and in Ng's article sumW is ΔW and sumb is Δb.
    sumW = [ zeros(sizes[2], sizes[1]),
            zeros(sizes[3], sizes[2]) ]
    sumb = [ zeros(sizes[2]), zeros(sizes[3]) ]

    # for each training example in the batch, use backprop
    # to compute the gradients and add them to the sum

    
    for i in batch
        x = trainx[i, :]
        y = trainy[i]
        ∇W, ∇b = backprop(W, b, x, y)
        for j in 1:2
            sumW[j] .+= ∇W[j]
            sumb[j] .+= ∇b[j]
        end
    end
    
    # make the update to the weights and biases and take a step
    # of gradient descent. note that we use the average gradient.

    for k in 1:length(W)
    
        W[k] = W[k] .- α * ((1/m) * sumW[k] + λ*W[k])
        b[k] = b[k] .- α * (1/m) * sumb[k]
    end

    
    # return the updated weights and biases. we also return the gradients
    return W, b, sumW, sumb
end


GD (generic function with 1 method)

This is the gradient descent function. x and y are initialized through the MTRAIN data set that we imported in the beginning, and their values are summed for each element in the batch. I then iterate through the length of the weights in the network (the amount of weights), and once again sum the average gradient to each weight. These values are then returned.

In [21]:
# classify the test data and compute the classification accuracy
function accuracy(W, b)
    ntest = length(testy)
    yhat = zeros(Int, ntest)
    for i in 1:ntest
        yhat[i] = classify(W, b, testx[i,:])
    end
    sum(testy .== yhat)/ntest # hit rate
end


accuracy (generic function with 1 method)

In [22]:
function BGD(N, m, epochs; α=0.01, λ=0.01)
    # random initialization of the weights and biases
    d = Normal(0, 1)
    W = [ rand(d, sizes[2], sizes[1]),  # layer 1 to 2
          rand(d, sizes[3], sizes[2]) ] # layer 2 to 3
    b = [ rand(d, sizes[2]),   # layer 2
          rand(d, sizes[3]) ]  # layer 3
    ∇W = [ zeros(sizes[2], sizes[1]),  # layer 1 to 2
          zeros(sizes[3], sizes[2]) ] # layer 2 to 3
    ∇b = [ zeros(sizes[2]),   # layer 2
          zeros(sizes[3]) ]   # layer 3

    for e in epochs
        remaining = 1:N
        while length(remaining) > 0
            batch = sample(remaining, m, replace=false)
            remaining = setdiff(remaining, batch)
            W, b, ∇W, ∇b = GD(W,b,batch)
        end
        acc = accuracy(W,b)
        println(acc)
    end

    return W, b, ∇W, ∇b
end

# some tuning parameters
N = length(trainy)
m = 25       # batch size
epochs = 10  # number of complete passes through the training data
α = 0.01     # learning rate / step size
λ = 0.01     # regularization parameter
W, b, ∇W, ∇b = BGD(N, m, epochs, α=α, λ=λ)



LoadError: BoundsError: attempt to access 2-element Vector{Matrix{Float64}} at index [3]