### Downloading and Exploring Data

In [33]:
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor

MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

In [34]:
# Fetch the MNIST archive through fastai's download helper and show where it landed.
# The trailing `; path` makes the cell display the returned local path.
path = datasets.download_data(MNIST_URL, ext='.gz'); path

PosixPath('/Users/jeff/.fastai/data/mnist.pkl.gz')

In [35]:
# Decompress the archive and unpickle it. The payload unpacks into three (x, y)
# pairs; only the first two (train, valid) are kept, the third is discarded.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, and
# MNIST_URL is plain http -- only load this archive from a source you trust.
# encoding='latin-1' is presumably required because the pickle was written by
# Python 2 -- TODO confirm.
with gzip.open(path, 'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

In [36]:
# Convert the four loaded objects to torch tensors, rebinding the same names.
x_train,y_train,x_valid,y_valid = map(tensor, (x_train,y_train,x_valid,y_valid))

### Predicting with Random Weights

In [37]:
def normalize(x, m, s):
    """Return `x` shifted by `m` and scaled by `s`, i.e. (x - m) / s."""
    centered = x - m
    return centered / s

In [38]:
# Statistics come from the training set only; the validation set is scaled with
# the *training* mean/std as well, so both splits go through the same transform.
# NOTE: this cell rebinds x_train/x_valid, so running it a second time without
# reloading the data normalises already-normalised values.
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [39]:
# n = rows in the training set, m = values per row,
# c = largest label + 1 (stays a 0-d tensor, as the printed output shows).
n,m = x_train.shape
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [40]:
# Width of the hidden layer: second dimension of w1 and first dimension of w2.
nh = 50

In [63]:
# Parameters of the two-layer net: w1 maps m inputs -> nh hidden units,
# w2 maps nh hidden units -> 1 output. Weights are standard-normal draws divided
# by sqrt(fan_in); biases start at zero.
# No seed is set, so every run draws different weights.
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [64]:
x_valid.mean(),x_valid.std()

(tensor(-0.0057), tensor(0.9924))

In [65]:
def lin(x, w, b):
    """Affine layer computed in NumPy: returns x @ w + b as an ndarray.

    All three arguments must expose `.numpy()` (e.g. CPU torch tensors); the
    result is a NumPy array, not a tensor.
    """
    x_np, w_np, b_np = x.numpy(), w.numpy(), b.numpy()
    return x_np @ w_np + b_np

In [66]:
def relu(x):
    """Element-wise ReLU: return a new tensor equal to `x` with negatives set to 0.

    `x` may be a torch tensor or anything `torch.as_tensor` accepts, such as the
    NumPy array returned by `lin`. The input is never modified.
    """
    # torch.tensor(x) copy-constructs and emits a UserWarning when x is already
    # a tensor; as_tensor passes tensors through unchanged and wraps arrays
    # without copying. clamp_min then allocates the fresh result either way.
    return torch.as_tensor(x).clamp_min(0.)

In [67]:
# First-layer activations on the validation set, kept so their mean and std
# can be inspected in the following cells.
t = relu(lin(x_valid, w1, b1))

In [68]:
t.mean()

tensor(0.3944)

In [69]:
t.std()

tensor(0.6040)

In [70]:
# Re-draw w1 with standard deviation sqrt(2/m) (He/Kaiming-style scaling, which
# compensates for ReLU zeroing negative pre-activations) and re-check the
# activation statistics. This rebinds the global w1 that `model` reads.
w1 = torch.randn(m,nh)*math.sqrt(2./m )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.6131), tensor(0.8657))

In [71]:
def model(xb):
    """Forward pass of the two-layer net: linear -> ReLU -> linear.

    Uses the module-level parameters w1, b1, w2, b2 and returns whatever the
    final `lin` call returns.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [73]:
predictions = model(x_valid)

In [80]:
predictions

array([[ 0.199652],
       [ 0.087199],
       [-0.260253],
       [-0.141052],
       ...,
       [ 0.706234],
       [-0.252871],
       [ 0.257694],
       [-0.230753]], dtype=float32)

* So we could imagine that we want the predictions to be numbers between 0 and 10.

### Gradients and Loss - a love story

In [92]:
import numpy as np
def mse(pred, targ):
    """Mean squared error between `pred` and `targ`.

    The trailing axis of `pred` is squeezed away before subtracting, so a
    column of predictions lines up with a flat vector of targets.
    """
    residual = pred.squeeze(-1) - targ
    squared = np.square(residual)
    return squared.mean()

In [93]:
# Cast the labels to float so they can be used as regression targets in mse.
# (c printed as an integer tensor earlier, so the labels are presumably an
# integer dtype before this cast -- TODO confirm.)
y_train,y_valid = y_train.float(),y_valid.float()

In [94]:
preds = model(x_train)

In [95]:
mse(preds, y_train.numpy())

28.519163

* Gradients

In [96]:
# inp -> X
def forward_and_backward(X, targ):
    """Run the two-layer net on `X`, then back-propagate the MSE gradient.

    Reads the module-level parameters w1, b1, w2, b2 and returns nothing; the
    gradient helpers called below do their work through side effects on the
    objects passed to them.
    """
    # ---- forward ----
    hidden_pre = X @ w1 + b1
    hidden_act = relu(hidden_pre)
    out = hidden_act @ w2 + b2
    # The loss value itself is never consumed by the backward pass.
    loss = mse(out, targ)

    # ---- backward: visit the layers in reverse order, each step building on
    # the gradient produced by the step before it ----
    mse_grad(out, targ)
    lin_grad(hidden_act, out, w2, b2)
    relu_grad(hidden_pre, hidden_act)
    lin_grad(X, hidden_pre, w1, b1)

In [None]:
def mse_grad(preds, targ):
    """Store the gradient of the MSE loss with respect to `preds` in `preds.g`.

    The gradient is 2 * (preds - targ) / batch_size, shaped as a column.
    """
    # grad of loss with respect to output of previous layer
    residual = preds.squeeze() - targ
    batch_size = preds.shape[0]
    preds.g = 2. * residual.unsqueeze(-1) / batch_size

In [None]:
def relu_grad(l1, out):
    """Back-propagate through ReLU, storing the result in `l1.g`.

    l1:  the pre-activation tensor that was passed to relu.
    out: the relu output; `out.g` must already hold the upstream gradient.
    """
    # grad of relu with respect to input activations: the upstream gradient
    # passes through where the input was positive and is zeroed elsewhere.
    # Uses the `l1` parameter; the body previously referred to an undefined
    # name `inp`, which raised NameError (or silently used a stray global).
    l1.g = (l1>0).float() * out.g

In [None]:
def lin_grad(inp, out, w, b):
    """Back-propagate through `out = inp @ w + b`.

    Expects `out.g` to hold the upstream gradient and stores the results as
    `inp.g`, `w.g` and `b.g`. `inp` is expected to be 2-D (batch, n_in).
    """
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    # grad with respect to the weights: inp.T @ out.g is the sum over the batch
    # of per-row outer products. Written as a matmul it avoids allocating the
    # (batch, n_in, n_out) intermediate of the broadcast-multiply-then-sum form.
    w.g = inp.t() @ out.g
    # grad with respect to the bias: upstream gradient summed over the batch
    b.g = out.g.sum(0)