In [1]:
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor

In [5]:
from torch import nn

In [2]:

MNIST_URL='https://github.com/mnielsen/rmnist/blob/master/data/mnist.pkl.gz?raw=true'
def get_data():
    """Fetch the pickled MNIST archive and return its train/valid arrays as tensors.

    Returns:
        A lazy `map` yielding, in order, x_train, y_train, x_valid, y_valid
        converted with `torch.tensor`. It is a one-shot iterator, so unpack it
        immediately.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    # NOTE(review): pickle.load can execute arbitrary code from the file it reads;
    # this is only acceptable while MNIST_URL points at a trusted source.
    with gzip.open(archive, 'rb') as fh:
        train_split, valid_split, _ = pickle.load(fh, encoding='latin-1')
    # The third entry of the pickle is deliberately discarded above.
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    """Standardize `x`: subtract the mean `m`, then divide by the standard deviation `s`."""
    centered = x - m
    return centered / s

In [3]:
# Unpack the four tensors yielded by get_data(): training and validation inputs/labels.
x_train, y_train, x_valid, y_valid = get_data()

In [4]:
# Take the statistics from the *raw* training set once, before x_train is overwritten.
# Recomputing them after the first line would give roughly mean 0 / std 1 and leave
# the validation set effectively unnormalized.
train_mean, train_std = x_train.mean(), x_train.std()
x_train = normalize(x_train, train_mean, train_std)
# The validation data is scaled with the training statistics on purpose, so both
# sets go through exactly the same transform.
x_valid = normalize(x_valid, train_mean, train_std)

In [11]:
# n = number of training rows, m = number of columns (input features) per row.
n,m = x_train.shape
# Width of the hidden layer used when the model is built.
nh = 50

In [60]:
# Two-layer MLP: m inputs -> nh hidden units with ReLU -> 10 outputs.
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))

In [78]:
# Display the layer structure of the network.
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

In [61]:
# Forward pass over the whole training set; the model's outputs are kept as `pred`.
pred = model(x_train)

In [15]:
def log_softmax(x):
    """Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x))).

    Written straight from the definition, so it can overflow for large inputs.
    """
    exps = x.exp()
    return (exps / exps.sum(-1, keepdim=True)).log()

In [None]:
def log_softmax(x):
    """Log-softmax over the last dimension, computed directly from the definition.

    Same naive formulation as the earlier cell; it is not protected against overflow.
    """
    return torch.log(torch.exp(x) / torch.exp(x).sum(-1, keepdim=True))

In [16]:
# Log-probabilities for every training sample, using the log_softmax currently defined.
sm_pred = log_softmax(pred)

In [17]:
# Peek at the training labels.
y_train

tensor([5, 0, 4,  ..., 8, 4, 8])

In [22]:
# Integer-array indexing: for rows 0, 1, 2 pick columns 5, 0, 4 (the first three
# labels shown above), i.e. the log-probability assigned to each sample's target class.
sm_pred[[0,1,2],[5,0,4]]

tensor([-2.3227, -2.6613, -2.1355], grad_fn=<IndexBackward0>)

In [23]:
def nll(inp, target):
    """Negative log-likelihood: mean of -inp[i, target[i]] over all rows.

    `inp` holds per-class log-probabilities; `target` holds the class index per row.
    """
    rows = range(target.shape[0])
    picked = inp[rows, target]
    return -picked.mean()

In [24]:
# Loss of the untrained model using the hand-written log-softmax + NLL.
loss = nll(sm_pred, y_train)

In [25]:
# Display the loss value.
loss

tensor(2.3278, grad_fn=<NegBackward0>)

In [26]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Args:
        x: tensor of shape (..., C).

    Returns:
        Tensor of shape (...) - the last dimension is reduced away.
    """
    # Shift by the per-row maximum, not the global one: with a global max, a row
    # whose entries are all far below it underflows to exp(.) == 0 and yields -inf.
    row_max = x.max(-1, keepdim=True)[0]
    return row_max.squeeze(-1) + (x - row_max).exp().sum(-1).log()

In [27]:
def log_softmax(x):
    """Log-softmax over the last dimension, via the hand-written `logsumexp`.

    Uses the identity log(softmax(x)) = x - logsumexp(x).
    """
    # logsumexp drops the last dimension, so restore it as a size-1 axis; without
    # it, (N, C) - (N,) either fails to broadcast or silently lines up on the wrong axis.
    return x - logsumexp(x).unsqueeze(-1)

In [28]:
def log_softmax(x):
    """Log-softmax over the last dimension using PyTorch's built-in logsumexp."""
    log_norm = torch.logsumexp(x, dim=-1, keepdim=True)
    return x - log_norm

In [30]:
import torch.nn.functional as F

In [33]:
# Same loss built from PyTorch's own log_softmax and nll_loss.
loss = F.nll_loss(F.log_softmax(pred, dim=-1), y_train)

In [34]:
# F.cross_entropy takes the raw model outputs and the labels directly, combining
# the log-softmax and NLL steps in a single call.
loss = F.cross_entropy(pred, y_train)

In [40]:
def accuracy(pred, targ):
    """Fraction of rows whose highest-scoring class (argmax over the last dim) equals `targ`."""
    hits = pred.argmax(-1) == targ
    return hits.float().mean()

In [73]:
# Training hyperparameters read by the training loop.
lr = 0.5     # learning rate for the SGD update
# NOTE(review): 65 is an unusual mini-batch size - confirm it is intentional (64?).
bs = 65      # mini-batch size
epochs = 1   # number of passes over the training set

In [74]:
class Optimizer():
    """Minimal SGD optimizer: plain gradient descent with a fixed learning rate.

    Args:
        params: iterable of tensors to update. It is materialized into a list so
            that a generator such as `model.parameters()` can be walked repeatedly.
        lr: step size applied to every gradient.
    """
    def __init__(self, params, lr=0.5):
        self.params, self.lr = list(params), lr

    def step(self):
        """Apply `p -= lr * p.grad` in place to every parameter that has a gradient."""
        # no_grad keeps the in-place update out of the autograd graph.
        with torch.no_grad():
            for p in self.params:
                # A parameter that took no part in the last backward pass has
                # grad None; leave it untouched instead of crashing.
                if p.grad is not None: p -= p.grad*self.lr

    def zero_grad(self):
        """Reset accumulated gradients to zero, skipping parameters without one."""
        for p in self.params:
            if p.grad is not None: p.grad.zero_()

In [75]:
# Pass the learning rate from the hyperparameter cell explicitly, rather than
# relying on Optimizer's default happening to match it.
opt = Optimizer(model.parameters(), lr=lr)

In [80]:


def fit():
    """Train the global `model` on (x_train, y_train) with mini-batch SGD.

    Reads the notebook globals `epochs`, `n`, `bs`, `x_train`, `y_train`, `model`
    and `opt`, and returns nothing. `opt` must wrap the parameters of the current
    `model`: re-create it whenever `model` is rebuilt.
    """
    for epoch in range(epochs):
        # Walk the data in strides of `bs`; slicing past the end simply yields a
        # shorter final batch.
        for start_i in range(0, n, bs):
            end_i = start_i+bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            loss = F.cross_entropy(model(xb), yb)

            loss.backward()
            # Delegate the update to the optimizer instead of repeating its logic here.
            opt.step()
            # Gradients accumulate across backward() calls, so clear them per batch.
            opt.zero_grad()



In [83]:
fit()
# `xb`/`yb` inside fit() are locals and do not exist at notebook scope, so rebuild
# the final mini-batch of the training loop here before scoring it.
last_start = ((n-1)//bs)*bs
xb, yb = x_train[last_start:], y_train[last_start:]
F.cross_entropy(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.2053, grad_fn=<NllLossBackward0>), tensor(0.9333))