# HW3, problem 4

Finite-difference gradient approximation, used to check the gradient of the cost function with respect to the weights and biases in the network.

In [1]:
include("hw3_q3.jl")

# Flatten the network parameters into one long vector: first the
# entries of W[1], then W[2], then b[1], then b[2] (each matrix is
# flattened with `vec`).  The same routine can be applied to the
# gradients (∇W, ∇b) since it only looks at the first two entries of
# each argument.
# NOTE: only W[1], W[2], b[1], b[2] are used, i.e. this is hard-coded
# for a 3-layer network.
function unroll(W, b)
    weights = (vec(W[1]), vec(W[2]))
    biases = (vec(b[1]), vec(b[2]))
    return vcat(weights..., biases...)
end

# Inverse of the unrolling step: split a flat parameter vector θ back
# into the two-element containers W and b (or ∇W and ∇b) that the
# backpropagation code works with.
# The layer widths are read from the global `sizes`; only sizes[1],
# sizes[2] and sizes[3] are used, so this is hard-coded for a 3-layer
# network.
# Layout of θ: W1 (n2 x n1), then W2 (n3 x n2), then b1 (n2 entries),
# then b2 (everything that remains up to length(θ)).
function reshape_params(θ)
    n1, n2, n3 = sizes[1], sizes[2], sizes[3]
    nW1 = n2 * n1  # number of entries in W1
    nW2 = n2 * n3  # number of entries in W2
    stopW2 = nW1 + nW2  # index of the last weight entry
    stopb1 = stopW2 + n2  # index of the last entry of b1

    W1 = reshape(θ[1:nW1], n2, n1)
    W2 = reshape(θ[(nW1 + 1):stopW2], n3, n2)
    b1 = θ[(stopW2 + 1):stopb1]
    b2 = θ[(stopb1 + 1):length(θ)]

    return [ W1, W2 ], [ b1, b2 ]
end

Epoch number 1, accuracy rate 0.1866
Epoch number 2, accuracy rate 0.2737
Epoch number 3, accuracy rate 0.3897
Epoch number 4, accuracy rate 0.5041
Epoch number 5, accuracy rate 0.5957
Epoch number 6, accuracy rate 0.6573
Epoch number 7, accuracy rate 0.7102
Epoch number 8, accuracy rate 0.7557
Epoch number 9, accuracy rate 0.8067
Epoch number 10, accuracy rate 0.8395
Epoch number 11, accuracy rate 0.8644
Epoch number 12, accuracy rate 0.8789
Epoch number 13, accuracy rate 0.8842
Epoch number 14, accuracy rate 0.8905
Epoch number 15, accuracy rate 0.868
Epoch number 16, accuracy rate 0.8863
Epoch number 17, accuracy rate 0.8968
Epoch number 18, accuracy rate 0.8958
Epoch number 19, accuracy rate 0.9001
Epoch number 20, accuracy rate 0.8882
Epoch number 21, accuracy rate 0.8958
Epoch number 22, accuracy rate 0.8992
Epoch number 23, accuracy rate 0.8948
Epoch number 24, accuracy rate 0.9018
Epoch number 25, accuracy rate 0.8984
Epoch number 26, accuracy rate 0.8956
Epoch number 27, accur

reshape_params (generic function with 1 method)

In [2]:
# Evaluate the regularized cost function on a batch of training examples.
#   θ     : unrolled vector of weights and biases (see reshape_params).
#   batch : collection of row indices into trainx / trainy that make up
#           the batch.
#   λ     : regularization strength.
# Returns the mean over the batch of 1/2 * ||a[3] - digit2vector(y)||^2
# plus λ/2 times the sum of squared weights.
# Relies on the globals trainx, trainy and on feedforward / digit2vector
# being defined elsewhere (presumably by hw3_q3.jl -- confirm).
function J(θ, batch, λ)
    m = length(batch)
    # we need to pass W, b to feedforward, so we re-create W, b from θ
    W, b = reshape_params(θ)

    sumJ = 0.0  # accumulates the un-normalized cost over the batch
    # iterate over the indices stored in `batch`; looping over 1:m would
    # silently ignore the batch and always use the first m examples.
    for idx in batch
        x_i = trainx[idx, :]
        y_i = trainy[idx]
        # feedforward to obtain a, z; a[3] is used as the network output
        (a, z) = feedforward(W, b, x_i)
        sumJ += 1/2 * norm(a[3] - digit2vector(y_i))^2
    end

    # the regularization term only applies to the weights, not the biases
    return 1/m * sumJ + λ/2 * (sum(abs2, W[1]) + sum(abs2, W[2]))
end

J (generic function with 1 method)

In [3]:
# Build the i-th standard basis vector: all zeros except a 1 in
# position i.  Its length is the total parameter count implied by the
# global `sizes` (two weight matrices plus two bias vectors).
function e(i)
    nparams = sizes[2]*sizes[1] + sizes[3]*sizes[2] + sizes[2] + sizes[3]
    basis = zeros(nparams)
    basis[i] = 1
    return basis
end

# Return a copy of v with ϵ added to its i-th entry (v itself is not
# modified).  Only entry i is touched, so no full basis vector has to be
# built and the length of v is not tied to the global `sizes`.
function θplus(v, i; ϵ=1e-4)
    # `v .+ zero(ϵ)` allocates a fresh array whose element type can hold ϵ
    shifted = v .+ zero(ϵ)
    shifted[i] += ϵ
    return shifted
end

# Return a copy of v with ϵ subtracted from its i-th entry (v itself is
# not modified).  Counterpart of θplus for central differencing.
function θminus(v, i; ϵ=1e-4)
    # `v .- zero(ϵ)` allocates a fresh array whose element type can hold ϵ
    shifted = v .- zero(ϵ)
    shifted[i] -= ϵ
    return shifted
end

θminus (generic function with 1 method)

In [4]:
# Gradient check for a single component.
# Returns ∇θ[i] minus the central-difference estimate
#     (J(θ + ϵ e_i) - J(θ - ϵ e_i)) / (2ϵ)
# of the i-th partial derivative of the cost J.  ∇θ is the unrolled
# gradient to be checked (e.g. the one produced by backpropagation); a
# value near zero means the two agree for component i.
#   i     : index into the unrolled parameter vector θ
#   batch : passed through to J
#   λ     : passed through to J
#   ϵ     : finite-difference step size
function compare1(i, θ, ∇θ, batch, λ; ϵ=1e-4)
    Jforward = J(θplus(θ, i, ϵ=ϵ), batch, λ)
    Jbackward = J(θminus(θ, i, ϵ=ϵ), batch, λ)
    estimate = (Jforward - Jbackward)/(2*ϵ)
    return ∇θ[i] - estimate
end

compare1 (generic function with 1 method)

In [5]:
# Compare randomly chosen elements of the gradient (∇W, ∇b) -- e.g. as
# computed from backpropagation -- to their estimates obtained from
# finite differencing, and report the mismatches.
#   W, b, ∇W, ∇b : parameters and gradients, unrolled internally
#   λ            : regularization strength, passed on to the cost J
# Keywords (defaults reproduce the original hard-coded values):
#   nbatch       : number of training indices drawn for evaluating J
#   ncomponents  : number of gradient components to check
#   tol          : absolute difference above which a component is reported
#   ϵ            : finite-difference step size
# Prints one line per component whose difference exceeds tol, followed by
# the number of such components.  Returns nothing.
function compare(W, b, ∇W, ∇b, λ; nbatch=5000, ncomponents=200, tol=0.001, ϵ=1e-4)
    θ = unroll(W, b)
    ∇θ = unroll(∇W, ∇b)
    m = length(trainy)

    # batch of training examples used to evaluate the cost function;
    # we really just need the indices of the batch.
    batch = sample(1:m, nbatch)

    # random sample of gradient components to check
    components = sample(1:length(θ), ncomponents)

    # for each sampled component, perform finite differencing via
    # compare1 and report it if the difference exceeds the tolerance.
    nexceeded = 0
    for component in components
        difference = compare1(component, θ, ∇θ, batch, λ; ϵ=ϵ)
        if abs(difference) > tol
            println("difference exceeding ", tol, ": ", difference)
            nexceeded += 1
        end
    end
    println("Number of components that exceeded the tolerance: ", nexceeded)
    return nothing
end

# Note: W, b, ∇W, ∇b have already been computed at this point.
# Use your code from problem 3 to do this.
# λ should be the same as the λ that was used for problem 3.
compare(W, b, ∇W, ∇b, λ)

difference exceeding 0.001: -0.0017222017197629736
difference exceeding 0.001: 0.09502166425766669
difference exceeding 0.001: -0.04651520520531268
difference exceeding 0.001: -0.006357655681201752
difference exceeding 0.001: 0.057783665731816924
difference exceeding 0.001: -0.0035090734959547106
difference exceeding 0.001: 0.017294729970658382
difference exceeding 0.001: 0.4971653724017286
difference exceeding 0.001: 0.0016031548483228962
difference exceeding 0.001: 0.001237423307966085
difference exceeding 0.001: -0.0020834912100179395
difference exceeding 0.001: 0.0010240464342660045
difference exceeding 0.001: -0.00151329751356005
difference exceeding 0.001: -0.002015308614627713
difference exceeding 0.001: -0.0039903406663976786
difference exceeding 0.001: -0.01249970254486283
difference exceeding 0.001: 0.2955195798302165
difference exceeding 0.001: 0.0015268001391765872
difference exceeding 0.001: -0.00264758153517917
difference exceeding 0.001: 0.0017482259022032801
difference 

Of the 200 sampled gradient components, 63 exceed the tolerance of 0.001, but the majority of those differences are not far above 0.001. This suggests that the backpropagation code computed the gradient of the cost function fairly accurately.