We import the data primarily for its shape parameters. The main focus of this particular notebook is otherwise on specifying a NN model parametrized to the data shape.

In [1]:
from fastai.vision.all import *

# Location of the cached, already-flattened arrays; the block below builds it only when missing.
pickle_path = URLs.path('mnist_png')/'mnist_png.pkl'
# NOTE(review): untar_data runs on every execution, even when the pickle cache already exists.
path = untar_data(URLs.MNIST)/'training'

if not pickle_path.exists():
    pickle_path.parent.mkdir(parents=True, exist_ok=True)
    # Grayscale images labelled by their parent folder name.
    # presumably RandomSplitter(1/6, seed=0) holds out 1/6 of the items for validation — confirm against fastai docs
    ds = DataBlock(
        blocks = (ImageBlock(PILImageBW), CategoryBlock),
        get_items = get_image_files,
        get_y = parent_label,
        splitter = RandomSplitter(1/6, seed=0)
    ).datasets(path)

    # Transpose the (image, label) pairs into one tuple of images and one of labels,
    # training items first, validation items after.
    xs, ys = zip(*ds.train, *ds.valid)
    # Flatten every image to a 1-D float32 vector, stack them into a 2-D array and divide by 255.
    xs = np.stack(L(map(lambda x: np.array(x, dtype=np.float32).reshape(-1), xs))) / 255.
    ys = np.array(ys, dtype=np.int64)

    # Undo the concatenation: the first len(ds.train) rows are the training set.
    x_train, x_valid = xs[:len(ds.train)], xs[len(ds.train):]
    y_train, y_valid = ys[:len(ds.train)], ys[len(ds.train):]

    save_pickle(pickle_path, [x_train, y_train, x_valid, y_valid])

    # Drop the intermediates; the names are rebound from the pickle just below.
    del ds, xs, ys, x_train, y_train, x_valid, y_valid

# Always read from the cache so both code paths yield the same objects, converted with `tensor`.
x_train, y_train, x_valid, y_valid = map(tensor, load_pickle(pickle_path))


In [2]:
# x_train is 2-D: n rows (training records) by m columns (features per record).
n, m = x_train.shape
# Largest label plus one; presumably the number of classes if labels run 0..max — verify.
# Note this stays a 0-dim tensor rather than a Python int.
c = y_train.max() + 1
# Width of the hidden layer used when creating w1/b1/w2.
nh = 50

In [3]:
def lin(x, w, b):
    """Affine layer: multiply the input by the weight matrix, then add the bias.

    Args:
        x: input batch, one record per row.
        w: weight matrix whose row count matches the columns of ``x``.
        b: bias, broadcast across the rows of the product.

    Returns:
        ``x @ w + b``.
    """
    projected = x @ w
    return projected + b

In [4]:
def relu(x):
    """Rectified linear unit: replace every negative entry of ``x`` with zero."""
    return x.clamp(min=0.)

In [5]:
# Layer 1 maps the m input features to nh hidden units; layer 2 maps those to a single output.
# Weights are standard-normal draws (w1 is drawn before w2), biases start at zero.
w1, b1 = torch.randn(m, nh), torch.zeros(nh)
w2, b2 = torch.randn(nh, 1), torch.zeros(1)

def model(xb):
    """Forward pass of the two-layer net: linear -> ReLU -> linear.

    Uses the module-level parameters ``w1``, ``b1``, ``w2`` and ``b2``.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

Define the gradients of the loss with respect to a linear layer's input, weights and bias, given the gradient of the loss with respect to that layer's output.

In [6]:
def lin_grad(inp, out, w, b):
    """Backpropagate through ``out = inp @ w + b``, storing results as ``.g`` attributes.

    Reads ``out.g`` (gradient of the loss with respect to the layer output) and
    writes ``inp.g``, ``w.g`` and ``b.g``. Nothing is returned.

    Args:
        inp: layer input, used here as a 2-D (records, in_features) tensor.
        out: layer output; must already carry ``out.g`` with one row per record.
        w: weight matrix of the layer, (in_features, out_features).
        b: bias of the layer; only receives its ``.g`` attribute.
    """
    # dL/d(inp) = dL/d(out) @ W^T: each input feature i collects the output
    # gradients weighted by row i of W.
    inp.g = out.g @ w.t()
    # dL/dW = inp^T @ dL/d(out): the sum over records of the outer product of each
    # input row with its output gradient. Writing it as a matmul avoids building
    # the (records, in_features, out_features) intermediate that a broadcasted
    # product followed by .sum(0) would allocate.
    w.g = inp.t() @ out.g
    # dL/db = dL/d(out), summed over records (the bias is shared by every record).
    b.g = out.g.sum(0)


In [7]:
def forward_and_backward(inp, targ):
    """Run the two-layer net forward, then backpropagate the MSE loss by hand.

    Uses the module-level parameters ``w1``, ``b1``, ``w2`` and ``b2``. As a side
    effect, ``lin_grad`` stores gradients in the ``.g`` attribute of ``inp`` and
    of each parameter; the intermediate activations get ``.g`` attributes too.

    Args:
        inp: input batch, one record per row.
        targ: one target value per record.

    Returns:
        The scalar mean-squared-error loss of the forward pass.
    """
    # forward pass
    l1 = lin(inp, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)
    # Keep the signed per-record errors: the backward pass needs them to decide
    # how much of the correction, and in which direction, each training example
    # receives. MSE is used for simplicity instead of a categorical loss, so a
    # prediction of 5 for the target 6 counts as better than a prediction of 2.
    diff = out[:, 0] - targ
    loss = diff.pow(2).mean()

    # backward pass
    # d/d(out_i) of mean(diff^2) is 2 * diff_i / N; [:, None] restores the column
    # dimension so the gradient has the same shape as `out`.
    out.g = 2. * diff[:, None] / inp.shape[0]
    lin_grad(l2, out, w2, b2)
    # ReLU passes the downstream gradient l2.g through where its input was
    # positive and blocks it everywhere else.
    l1.g = (l1 > 0).float() * l2.g
    lin_grad(inp, l1, w1, b1)
    return loss



In [8]:
# Run on the full training set; this populates the .g attribute of w1, b1, w2, b2 and x_train.
forward_and_backward(x_train, y_train)

We save the gradients as we will now be recomputing them using autograd for testing.


In [9]:
def get_grad(x):
    """Return an independent copy of the hand-computed gradient stored in ``x.g``."""
    stored = x.g
    return stored.clone()
# Tensors whose hand-computed gradients get verified; the order fixes the pairing used later.
chks = (w1, w2, b1, b2, x_train)
# Snapshot every .g as a clone so the saved values cannot alias the live attributes.
grads = tuple(get_grad(t) for t in chks)
w1g, w2g, b1g, b2g, ig = grads


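We also use autograd to check our results: each tensor is cloned with gradient tracking enabled so that PyTorch can compute the same gradients independently.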

In [10]:
def mkgrad(x):
    """Return a copy of ``x`` that has gradient tracking switched on."""
    duplicate = x.clone()
    return duplicate.requires_grad_(True)
# Autograd-tracked copies of the checked tensors, in the same order as chks.
ptgrads = tuple(mkgrad(t) for t in chks)
w12, w22, b12, b22, xt2 = ptgrads


In [11]:
def mse(output, targ):
    """Mean squared error between the first output column and the targets.

    Args:
        output: predictions with one row per record; only column 0 is used.
        targ: one target value per record.

    Returns:
        The mean of the squared differences, as a scalar tensor.
    """
    residual = output[:, 0] - targ
    return residual.pow(2).mean()


In [12]:
def forward(inp, targ):
    """Forward pass on the autograd-tracked parameter copies, returning the MSE loss.

    Uses the module-level tensors ``w12``, ``b12``, ``w22`` and ``b22``.
    """
    hidden = relu(lin(inp, w12, b12))
    pred = lin(hidden, w22, b22)
    return mse(pred, targ)


In [13]:
# Forward pass on the gradient-tracked copies, then let autograd fill in .grad on each of them.
loss = forward(xt2, y_train)
loss.backward()


In [14]:
# Pair each hand-derived gradient with the autograd-tracked copy of the same tensor
# (grads and ptgrads were both built from chks, so they line up in order) and
# compare it against autograd's .grad with a tolerance of eps=0.01.
for a,b in zip(grads, ptgrads):
    test_close(a, b.grad, eps=0.01)
