We encapsulate the gradient update of the parameters in an optimizer class that handles it for us, paving the way for generalization to more advanced optimization (descent) algorithms.


In [1]:
from fastai.vision.all import *


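We first load the MNIST data: the images are flattened, scaled by 1/255 and cached in a pickle file, so the preprocessing only runs once.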

In [2]:
# Where the preprocessed arrays are cached, and where the raw training images live.
pickle_path = URLs.path('mnist_png')/'mnist_png.pkl'
path = untar_data(URLs.MNIST)/'training'

# Build the cache only once; later runs go straight to load_pickle below.
if not pickle_path.exists():
    pickle_path.parent.mkdir(parents=True, exist_ok=True)

    # Grayscale image inputs, labels taken from the parent folder name,
    # random train/validation split with a fixed seed.
    ds = DataBlock(
        blocks=(ImageBlock(PILImageBW), CategoryBlock),
        get_items=get_image_files,
        get_y=parent_label,
        splitter=RandomSplitter(1/6, seed=0),
    ).datasets(path)

    # Transpose the (image, label) pairs of both splits into two parallel tuples.
    xs, ys = zip(*ds.train, *ds.valid)
    # Flatten every image to a 1-D float32 vector and divide by 255.
    xs = np.stack([np.array(img, dtype=np.float32).reshape(-1) for img in xs]) / 255.
    ys = np.array(ys, dtype=np.int64)

    # Training items come first, so the split point is the training set size.
    n_train = len(ds.train)
    x_train, x_valid = xs[:n_train], xs[n_train:]
    y_train, y_valid = ys[:n_train], ys[n_train:]

    save_pickle(pickle_path, [x_train, y_train, x_valid, y_valid])

    # Drop the temporaries; the arrays are reloaded from the cache below.
    del ds, xs, ys, n_train, x_train, y_train, x_valid, y_valid

# Load the cached arrays and convert each one to a tensor.
x_train, y_train, x_valid, y_valid = [tensor(arr) for arr in load_pickle(pickle_path)]


In [3]:
class SequentialModel(nn.Module):
    """Feed-forward model that applies a list of layers one after another."""

    def __init__(self, layers):
        """Store `layers` in an `nn.ModuleList` so they are registered as submodules."""
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Pass `x` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out


In [4]:
# n training examples, each flattened to m input features.
n, m = x_train.shape
# Number of classes, derived from the largest label (a 0-dim tensor).
c = y_train.max() + 1
nh = 50      # hidden layer width

bs = 50      # batch size

lr = 0.5     # learning rate
epochs = 3   # how many epochs to train for


def layers():
    """Return a fresh list of layers for a one-hidden-layer classifier.

    A new list (with newly initialised weights) is built on every call so
    that several independent models can be created from the same recipe.
    The output width is taken from the class count `c` computed from the
    labels instead of being hard-coded.
    """
    # int(c): `c` is a 0-dim tensor, nn.Linear expects a plain integer size.
    return [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, int(c))]


model = SequentialModel(layers())


In [5]:
class Optimizer():
    """Minimal gradient-descent optimizer.

    Holds a list of parameters and updates each one in place with
    ``p -= p.grad * lr``. Parameters whose ``.grad`` is ``None`` (for
    example frozen parameters, or ones that did not take part in the last
    backward pass) are skipped instead of raising.
    """

    def __init__(self, params, lr=0.5):
        """Store the parameters to optimise and the learning rate `lr`."""
        # Materialise the iterable: `params` may be a one-shot iterator
        # (e.g. the result of `model.parameters()`), and it is traversed
        # on every `step` / `zero_grad` call.
        self.params, self.lr = list(params), lr

    def step(self):
        """Apply one gradient-descent update to every parameter that has a gradient."""
        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for p in self.params:
                if p.grad is None:
                    continue
                p -= p.grad * self.lr

    def zero_grad(self):
        """Reset existing gradients to zero in place, ready for the next backward pass."""
        with torch.no_grad():
            for p in self.params:
                if p.grad is not None:
                    p.grad.zero_()


In [6]:
# Loss used by the training loops: cross-entropy
# (F is presumably torch.nn.functional, brought in by the fastai star import).
loss_func = F.cross_entropy
def accuracy(out, yb):
    """Return the fraction of rows of `out` whose arg-max along dim 1 equals `yb`."""
    predicted = out.argmax(dim=1)
    hits = (predicted == yb).float()
    return hits.mean()
def report(loss, preds, yb):
    """Print the loss and the accuracy of `preds` against `yb`, both to two decimals."""
    loss_text = f'{loss:.2f}'
    acc_text = f'{accuracy(preds, yb):.2f}'
    print(loss_text + ', ' + acc_text)
def fit(model):
    """Train `model` in place with mini-batch gradient descent.

    Relies on the notebook-level names `x_train`, `y_train`, `n`, `bs`,
    `epochs`, `lr` and `loss_func`. After each epoch the loss and accuracy
    of the LAST mini-batch only (not of the whole epoch) are printed.
    """
    # Pass the configured learning rate explicitly; otherwise the
    # Optimizer default would be used and changing `lr` would have no effect.
    opt = Optimizer(model.parameters(), lr=lr)
    for _ in range(epochs):
        for i in range(0, n, bs):
            # Slicing clamps at the end of the tensor, so the final batch
            # may simply be shorter than `bs`.
            xb, yb = x_train[i:i + bs], y_train[i:i + bs]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            opt.step()
            # Gradients accumulate across backward() calls, so clear them
            # before the next batch.
            opt.zero_grad()
        report(loss, preds, yb)

# Train the hand-built model; prints one "loss, accuracy" line per epoch.
fit(model)



0.23, 0.92
0.22, 0.94
0.22, 0.98


We now show how to do the same thing with PyTorch's built-in optimizer, `torch.optim.SGD`.

In [7]:
def get_model():
    """Build a fresh `nn.Sequential` model and a `torch.optim.SGD` optimizer for it.

    Returns the pair `(model, optimizer)`; the optimizer uses the
    notebook-level learning rate `lr`.
    """
    net = nn.Sequential(*layers())
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    return net, optimizer


In [8]:
# Fresh, untrained model together with its PyTorch SGD optimizer.
model, opt = get_model()
# First mini-batch of the training set.
xb = x_train[:bs]
yb = y_train[:bs]
# Loss before any training, shown as the cell output.
loss_func(model(xb), yb)


tensor(2.3065, grad_fn=<NllLossBackward0>)

In [9]:
def fit(model, opt, loss_func):
    """Train `model` in place using the optimizer and loss supplied by the caller.

    Parameters:
        model: callable mapping a batch of inputs to predictions.
        opt: optimizer exposing `step()` and `zero_grad()` (for example a
            `torch.optim.SGD` instance) that already holds the model's
            parameters.
        loss_func: callable `(preds, targets) -> scalar loss` supporting
            `.backward()`.

    Relies on the notebook-level names `x_train`, `y_train`, `n`, `bs` and
    `epochs`. After each epoch the loss and accuracy of the LAST mini-batch
    only are printed.
    """
    # NOTE: the optimizer passed in is used as-is. It must not be replaced
    # here, otherwise the caller's optimizer would be silently ignored.
    for _ in range(epochs):
        for i in range(0, n, bs):
            # Slicing clamps at the end of the tensor, so the final batch
            # may simply be shorter than `bs`.
            xb, yb = x_train[i:i + bs], y_train[i:i + bs]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            opt.step()
            # Gradients accumulate across backward() calls, so clear them
            # before the next batch.
            opt.zero_grad()
        report(loss, preds, yb)

# Train with the model/optimizer pair from get_model(); prints one "loss, accuracy" line per epoch.
fit(model, opt, loss_func)

0.21, 0.94
0.18, 0.96
0.17, 0.96
