We now demonstrate training on a batch.

In [1]:
from fastai.vision.all import *

# Where the flattened MNIST arrays are cached, and where the extracted
# training images live.
pickle_path = URLs.path('mnist_png')/'mnist_png.pkl'
path = untar_data(URLs.MNIST)/'training'

# Build the cache only once: decoding every PNG is slow, so the arrays are
# pickled and simply reloaded on later runs.
if not pickle_path.exists():
    pickle_path.parent.mkdir(parents=True, exist_ok=True)
    ds = DataBlock(
        blocks=(ImageBlock(PILImageBW), CategoryBlock),
        get_items=get_image_files,
        get_y=parent_label,
        splitter=RandomSplitter(1/6, seed=0),
    ).datasets(path)

    # Concatenate train and valid items, then separate images from labels.
    xs, ys = zip(*ds.train, *ds.valid)
    # Each image becomes a flat float32 vector scaled by 1/255.
    xs = np.stack([np.array(x, dtype=np.float32).reshape(-1) for x in xs]) / 255.
    ys = np.array(ys, dtype=np.int64)

    # The first len(ds.train) rows are the training items, the rest validation.
    n_train = len(ds.train)
    x_train, y_train = xs[:n_train], ys[:n_train]
    x_valid, y_valid = xs[n_train:], ys[n_train:]

    save_pickle(pickle_path, [x_train, y_train, x_valid, y_valid])

    # Drop the temporaries so only the reloaded tensors below remain in scope.
    del ds, xs, ys, n_train, x_train, y_train, x_valid, y_valid

x_train, y_train, x_valid, y_valid = (tensor(a) for a in load_pickle(pickle_path))


In [2]:
class Model(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of output scores produced per example.

    The layers are exposed as ``self.layers`` and can be iterated in order.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # A plain Python list would hide the layers from nn.Module, leaving
        # parameters(), state_dict() and .to(device) empty. ModuleList
        # registers each layer while remaining iterable and indexable.
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        )

    def forward(self, x):
        """Apply every layer in order and return the final activations."""
        # Defining forward() (instead of overriding __call__) keeps
        # nn.Module's own __call__ and its hook handling intact;
        # model(x) still works as before.
        for l in self.layers:
            x = l(x)
        return x


In [3]:
# Problem sizes, read from the training tensors.
n = x_train.shape[0]       # number of training examples
m = x_train.shape[1]       # features per example
c = 1 + y_train.max()      # largest label + 1; note this is a 0-d tensor, not an int

nh = 50                    # hidden units
bs = 50                    # batch size

model = Model(m, nh, c)

xb = x_train[:bs]          # a mini-batch from x
preds = model(xb)          # predictions


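The predictions are a batch of unnormalized class scores (logits), one row per example. They are not yet log-probabilities (note the positive entries below); `F.cross_entropy` applies the log-softmax internally.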

In [4]:
# Show the first row of predictions and the shape of the whole batch.
preds[0], preds.shape

(tensor([-0.0516,  0.0596,  0.1423, -0.0491,  0.0674, -0.1193,  0.1274,  0.0075,
         -0.0963,  0.0147], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [5]:
# Loss used throughout the rest of the notebook.
# NOTE(review): `F` is presumably torch.nn.functional, brought in by the star import — confirm.
loss_func = F.cross_entropy


In [6]:
# Targets for the first `bs` training examples (the same rows as `xb`).
yb = y_train[:bs]
# Bare name so the notebook displays the targets.
yb


tensor([3, 8, 7, 2, 8, 5, 6, 9, 2, 2, 5, 1, 8, 5, 6, 5, 6, 0, 3, 6, 8, 6, 2, 9,
        9, 1, 4, 5, 7, 3, 2, 0, 7, 5, 3, 2, 0, 2, 5, 6, 9, 7, 0, 3, 9, 4, 6, 1,
        2, 6])

In [7]:
# Loss of the current predictions against the targets of the first minibatch.
loss_func(preds, yb)


tensor(2.2986, grad_fn=<NllLossBackward0>)

The following finds the most likely class for each example: the argmax of the logits, which is also the argmax of the log-probabilities.

In [8]:
# Index of the largest score along dim 1, i.e. one index per row of `preds`.
preds.argmax(dim=1)


tensor([2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2])

We define the `accuracy` measure as the fraction of examples whose most likely predicted class matches the target.

In [9]:
def accuracy(out, yb):
    """Return the fraction of rows of `out` whose argmax along dim 1 equals `yb`.

    The result is a 0-d float tensor.
    """
    predicted = out.argmax(dim=1)
    hits = (predicted == yb).float()   # 1.0 where the prediction matches, else 0.0
    return hits.mean()


In [10]:
# Accuracy of the current predictions on the first minibatch.
accuracy(preds, yb)


tensor(0.1600)

In [12]:
def report(loss, preds, yb):
    """Print `loss` and the accuracy of `preds` against `yb`, each to two decimals."""
    line = f'{loss:.2f}, {accuracy(preds, yb):.2f}'
    print(line)


We demonstrate reporting the result metrics of one minibatch.

In [13]:
# Take the first minibatch of inputs and targets.
xb = x_train[:bs]
yb = y_train[:bs]

# Forward pass, then print the loss and accuracy for this batch.
preds = model(xb)
report(loss_func(preds, yb), preds, yb)


2.30, 0.16


Finally, we demonstrate learning over several epochs; an epoch is one full pass over the training data, processed one minibatch at a time.

Observe the backpropagation.

In [11]:
# Hyperparameters for the manual training loop.
epochs = 3  # number of full passes over the training data
lr = 0.5    # step size applied to each gradient


We perform the parameter update by hand for each minibatch in every epoch. We rely on `.weight` and `.bias` being the standard names of a linear layer's parameters, and on autograd storing each parameter's gradient in its `.grad` attribute. After `backward()`, we subtract the gradient multiplied by `lr` from each parameter and then zero out `.grad`: the stored gradients are now stale (they were computed with respect to weights that no longer exist), and because autograd accumulates into `.grad`, they would otherwise pollute the next batch's computation.

In [14]:
# Plain SGD written out by hand: forward, backward, step, reset gradients.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # The last slice is clipped to n so it never runs past the data.
        s = slice(i, min(i + bs, n))
        xb = x_train[s]
        yb = y_train[s]

        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()

        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for l in model.layers:
                # Only layers that carry parameters (the linear ones) are updated.
                if not hasattr(l, 'weight'):
                    continue
                l.weight.sub_(l.weight.grad * lr)
                l.bias.sub_(l.bias.grad * lr)
                # Clear the gradients so the next backward() starts from zero.
                l.weight.grad.zero_()
                l.bias.grad.zero_()

    # Report the loss and accuracy of the last minibatch of this epoch.
    report(loss, preds, yb)


0.22, 0.94
0.25, 0.96
0.25, 0.96
