In [1]:
import numpy as np
import pandas as pd

import data
import neuralnet

from matplotlib import pyplot as plt

np.random.seed(100)  # seed NumPy's global RNG so reruns give the same numbers

## (a) Loading Data

In [2]:
# Load the train/validation split as an (images, labels) pair and show both shapes.
train_val = data.load_data()
(train_val[0].shape, train_val[1].shape)

((60000, 784), (60000,))

In [3]:
# Load the held-out split (train=False) as an (images, labels) pair and show both shapes.
test = data.load_data(train=False)
(test[0].shape, test[1].shape)

((10000, 784), (10000,))

### Pre-process
#### Normalization

In [4]:
# Z-score the train/validation images; the labels pass through untouched.
# The second value returned by z_score_normalize is discarded.
train_x, _ = data.z_score_normalize(train_val[0])
train_y = train_val[1]
train_val = (train_x, train_y)

# NOTE(review): the test images go through their own z_score_normalize call
# instead of reusing whatever the training call returned -- confirm that
# normalizing the two splits independently is intended.
test_x, _ = data.z_score_normalize(test[0])
test_y = test[1]
test_val = (test_x, test_y)

## (b) Check gradient implementation

We only need a small subset of the data for this part. The code below shuffles the train/validation set and keeps a single randomly chosen example (`imgs[:1]`, `labs[:1]`).

##### get a small subset

In [5]:
# Shuffle images and labels with one shared random permutation so that
# every image stays paired with its label.
imgs, labs = train_val
shuffled_idx = np.random.permutation(len(labs))
imgs, labs = imgs[shuffled_idx], labs[shuffled_idx]

# Keep only the first shuffled example as the subset for the gradient check.
small_set = (imgs[:1], labs[:1])
print(small_set[0].shape)
print(small_set[1].shape)

(1, 784)
(1,)


##### get some functions to find weight and gradient

In [6]:
# Build the network from the YAML configuration.
config = data.load_config("./config.yaml")
nn = neuralnet.NeuralNetwork(config)

# Inputs and labels of the small subset; get_loss reads these as globals.
small_x, small_y = small_set

# Call the network on the subset together with its targets
# (presumably the forward pass -- confirm against neuralnet.NeuralNetwork).
output = nn(small_x, targets=small_y)

# Backpropagate; presumably this fills the d_w / d_b attributes that
# get_weight_grad reads from each layer.
nn.backward()

In [7]:
# Step size for the numerical gradient check: get_loss perturbs one parameter
# by +/- epsilon and get_estimate divides the loss difference by 2 * epsilon.
epsilon = 0.01

def get_weight_grad(name, layer, idx, nn, is_bias=False):
    """Look up one parameter of ``nn.layers[layer]`` and its stored gradient.

    For a bias the value is ``b[0][idx]`` and the gradient ``d_b[idx]``;
    otherwise the value is ``w[0][idx]`` and the gradient ``d_w[0][idx]``.
    Both are printed, labelled with ``name``, and returned as
    ``(weight, grad)``.
    """
    target = nn.layers[layer]
    if not is_bias:
        weight, grad = target.w[0][idx], target.d_w[0][idx]
    else:
        weight, grad = target.b[0][idx], target.d_b[idx]

    print(name, ":(", "weight:", weight, "gradient:", grad, ")")
    return weight, grad

def get_loss(weight, layer, idx, nn, is_bias=False, inputs=None, targets=None, eps=None):
    """Evaluate the loss with one parameter nudged up and then down by ``eps``.

    The selected entry of ``nn.layers[layer]`` (``b[0][idx]`` when ``is_bias``,
    otherwise ``w[0][idx]``) is set to ``weight + eps`` and ``weight - eps`` in
    turn and ``nn.forward`` is run for each. The entry is always put back to
    ``weight`` afterwards, including when ``nn.forward`` raises, so a failed
    check cannot leave the network in a perturbed state.

    Args:
        weight: unperturbed value of the parameter; also the value restored.
        layer: index into ``nn.layers``.
        idx: position inside row 0 of the layer's ``w`` / ``b``.
        nn: network exposing ``layers`` and ``forward(x, targets=...)``, whose
            second return value is used as the loss.
        is_bias: perturb the bias instead of the weight.
        inputs: batch fed to ``nn.forward``; defaults to the notebook-level
            ``small_x``.
        targets: targets fed to ``nn.forward``; defaults to the notebook-level
            ``small_y``.
        eps: perturbation size; defaults to the notebook-level ``epsilon``.

    Returns:
        ``(higher_loss, lower_loss)``.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # which keeps the original five-argument call working unchanged.
    if inputs is None:
        inputs = small_x
    if targets is None:
        targets = small_y
    if eps is None:
        eps = epsilon

    attr = "b" if is_bias else "w"

    def _set_param(value):
        # Re-fetch the array on every write, exactly as the original code did.
        getattr(nn.layers[layer], attr)[0][idx] = value

    try:
        _set_param(weight + eps)
        _, higher_loss = nn.forward(inputs, targets=targets)
        _set_param(weight - eps)
        _, lower_loss = nn.forward(inputs, targets=targets)
    finally:
        # reset nn, even if a forward pass failed
        _set_param(weight)

    print("Higher loss is:", higher_loss)
    print("Lower loss is:", lower_loss)
    return higher_loss, lower_loss

def get_estimate(higher, lower, eps=None):
    """Central-difference gradient estimate ``(higher - lower) / (2 * eps)``.

    Args:
        higher: loss evaluated with the parameter increased by ``eps``.
        lower: loss evaluated with the parameter decreased by ``eps``.
        eps: perturbation size that produced the two losses; defaults to the
            notebook-level ``epsilon`` so existing two-argument calls behave
            exactly as before.

    Returns:
        The estimated gradient (also printed).
    """
    if eps is None:
        eps = epsilon
    est = (higher - lower) / (2 * eps)
    print("estimated gradient is:", est)
    return est

def diff_grad(grad, est):
    """Print and return the absolute gap between ``grad`` and ``est``."""
    gap = grad - est
    diff = np.abs(gap)
    print("difference between estimated and gradient is:", diff)
    return diff

def check_grad(name, layer, idx, nn, is_bias=False):
    """Run the full gradient check for one weight or bias.

    Reads the parameter and its stored gradient, evaluates the loss on both
    sides of the parameter, forms the numerical estimate, and compares the
    two. Returns ``(diff, grad, est)``.
    """
    weight, grad = get_weight_grad(name, layer, idx, nn, is_bias=is_bias)
    losses = get_loss(weight, layer, idx, nn, is_bias=is_bias)
    est = get_estimate(losses[0], losses[1])
    return diff_grad(grad, est), grad, est

##### bias of the output layer

In [8]:
# bias of output
b_o_diff, b_o_grad, b_o_est = check_grad("output bias", 2, 7, nn, is_bias=True)

output bias :( weight: 0.0 gradient: 0.012671104872593866 )
Higher loss is: 2.153077486743764
Lower loss is: 2.152824060581794
estimated gradient is: 0.0126713080984997
difference between estimated and gradient is: 2.0322590583467248e-07


##### bias of the hidden layer

In [9]:
b_h_diff, b_h_grad, b_h_est = check_grad("hidden bias", 0, 7, nn, is_bias=True)

hidden bias :( weight: 0.0 gradient: -0.1182257632568207 )
Higher loss is: 2.1517643388554943
Lower loss is: 2.1541287967614338
estimated gradient is: -0.11822289529697105
difference between estimated and gradient is: 2.8679598496478276e-06


##### weight of hidden to output

In [10]:
w_ho_1_diff, w_ho_1_grad, w_ho_1_est = check_grad("hidden-output weight_1", 2, 7, nn, is_bias=False)
w_ho_2_diff, w_ho_2_grad, w_ho_2_est = check_grad("hidden-output weight_2", 2, 8, nn, is_bias=False)

hidden-output weight_1 :( weight: 0.0959293411441136 gradient: 0.009273564165974427 )
Higher loss is: 2.153043219621119
Lower loss is: 2.152857746744473
estimated gradient is: 0.009273643832297118
difference between estimated and gradient is: 7.966632269151841e-08
hidden-output weight_2 :( weight: -0.058684298241869014 gradient: 0.009660359079762365 )
Higher loss is: 2.153047101390139
Lower loss is: 2.152853892551448
estimated gradient is: 0.009660441934533637
difference between estimated and gradient is: 8.285477127133178e-08


##### weight of input to hidden

In [11]:
w_ih_1_diff, w_ih_1_grad, w_ih_1_est = check_grad("input-hidden weight_1", 0, 7, nn, is_bias=False)
w_ih_2_diff, w_ih_2_grad, w_ih_2_est = check_grad("input-hidden weight_2", 0, 8, nn, is_bias=False)

input-hidden weight_1 :( weight: 0.03876130722157737 gradient: 0.005871529380401278 )
Higher loss is: 2.153008854589536
Lower loss is: 2.152891424008954
estimated gradient is: 0.00587152902908894
difference between estimated and gradient is: 3.513123377277272e-10
input-hidden weight_2 :( weight: -0.023712037277713923 gradient: -0.005488742979949136 )
Higher loss is: 2.1528952505826426
Lower loss is: 2.153005025436887
estimated gradient is: -0.005488742712222994
difference between estimated and gradient is: 2.677261424707811e-10


##### Report Table

In [12]:
# One row per checked parameter: (name, |gradient - estimate|, backprop gradient, numerical estimate).
report_rows = [
    ("output bias", b_o_diff, b_o_grad, b_o_est),
    ("hidden bias", b_h_diff, b_h_grad, b_h_est),
    ("hidden-output weight_1", w_ho_1_diff, w_ho_1_grad, w_ho_1_est),
    ("hidden-output weight_2", w_ho_2_diff, w_ho_2_grad, w_ho_2_est),
    ("input-hidden weight_1", w_ih_1_diff, w_ih_1_grad, w_ih_1_est),
    ("input-hidden weight_2", w_ih_2_diff, w_ih_2_grad, w_ih_2_est),
]
report_columns = ["name of weight", "difference", "actual gradient", "estimated gradient"]

# A plain np.array() over mixed str/float rows coerces every cell to a string,
# so the numeric columns would end up as text. dtype=object keeps the floats
# as numbers while report_table stays a 2-D array as before.
report_table = np.array(report_rows, dtype=object)

# Build the frame from the row tuples so pandas infers float64 for the
# three numeric columns.
report_df = pd.DataFrame(report_rows, columns=report_columns)
report_df

Unnamed: 0,name of weight,difference,actual gradient,estimated gradient
0,output bias,2.032259058346725e-07,0.0126711048725938,0.0126713080984997
1,hidden bias,2.867959849647828e-06,-0.1182257632568207,-0.118222895296971
2,hidden-output weight_1,7.966632269151842e-08,0.0092735641659744,0.0092736438322971
3,hidden-output weight_2,8.285477127133178e-08,0.0096603590797623,0.0096604419345336
4,input-hidden weight_1,3.513123377277272e-10,0.0058715293804012,0.0058715290290889
5,input-hidden weight_2,2.677261424707811e-10,-0.0054887429799491,-0.0054887427122229
