In [93]:
import torch
import math
import torch.nn.functional as F
from torchvision.datasets import MNIST

# data preprocessing

In [569]:
# Load the MNIST training split and the held-out test split (used as validation
# below), downloading them into the current directory when they are missing.
ds = MNIST('.',train = True, download= True)
val_ds = MNIST('.',train = False, download= True)

In [570]:
def normalize(x,m,s):
    """Standardise `x`: subtract the mean `m`, then divide by the spread `s`."""
    centered = x - m
    return centered / s

In [571]:
# Flatten every 28x28 image into one row of 784 float features. The row count is
# read from the data itself (as the validation line already did) rather than
# hard-coded. Pixels are deliberately NOT divided by 255 here; they are
# standardised with the training mean/std in a later cell instead.
x = ds.data.reshape(ds.data.shape[0],(28*28)).float()
y = ds.targets

val_x = val_ds.data.reshape(val_ds.data.shape[0],(28*28)).float()
val_y = val_ds.targets

In [572]:
# Scalar mean and standard deviation taken over every element of the training
# tensor; the last line displays both.
mean = x.mean()
std = x.std()
mean , std

(tensor(33.3184), tensor(78.5675))

In [573]:
# Standardise both splits with the statistics computed from the training tensor,
# so the validation data goes through exactly the same transform.
# NOTE: `x` and `val_x` are overwritten, so this cell is not idempotent:
# running it twice without re-creating them normalises the data twice.
x = normalize(x,mean,std) # it is better to normalize images with (x-mean) / std
val_x = normalize(val_x,mean,std)

In [574]:
x.mean(),x.std()

(tensor(1.8601e-08), tensor(1.))

In [535]:
# n: number of rows in x, m: number of columns (features per row).
n, m = x.shape
# c: largest label + 1 (a 0-dim tensor), i.e. the class count when labels start at 0.
c = y.max()+1

In [9]:
n,m,c

(60000, 784, tensor(10))

# weight initialization

In [536]:
# Sizes and hyper-parameters shared by the cells below.
n_in = 28 *28  # input features: one per pixel of a flattened 28x28 image
nh = 50  # hidden-layer width
n_out = 1  # width of the output layer of the from-scratch network
bs = 64  # batch size passed to the data loaders
lr = 0.01  # learning rate

In [537]:
# Kaiming / He initialisation for ReLU networks: a ReLU roughly halves the second
# moment of its input, so N(0, 1) weights are scaled by sqrt(2 / fan_in) to keep
# the activation statistics from collapsing layer after layer.
# fan_in is the number of inputs of *that* layer: n_in for the first layer and
# nh for the second (which previously reused the first layer's fan-in).
w1 = torch.randn(n_in,nh)* (2/n_in)**0.5
b1 = torch.zeros(nh)
w2 = torch.randn(nh,n_out)* (2/nh)**0.5
b2 = torch.zeros(n_out)

# Function based NN

In [12]:
def lin(x, w, b):
    """Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [13]:
def relu(x):
    """Clamp negative entries of `x` to zero and return the result as float32."""
    non_negative = x.clamp(min=0)
    return non_negative.float()

In [14]:
def mse(pred, y):
    """Mean squared error between `pred` (trailing unit axis dropped) and `y`."""
    residual = pred.squeeze(-1) - y
    return residual.pow(2).float().mean()

In [15]:
# Gradient of the mean-squared-error loss with respect to the predictions.
def mse_grad(pred, y):
    """Store d(mse)/d(pred) on `pred.g`, with a trailing unit axis added back."""
    residual = pred.squeeze() - y
    pred.g = (2. * residual).unsqueeze(-1) / y.shape[0]

In [16]:
def relu_grad(Z, A):
    """Back-propagate through ReLU: `Z.g` gets `A.g` where `Z > 0` and zero elsewhere."""
    positive_mask = (Z > 0).float()
    Z.g = positive_mask * A.g

In [17]:
def lin_grad(inp, out, w, b):
    """Back-propagate through `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores the gradients on
    `inp.g`, `w.g` and `b.g`. Assumes `inp` and `out.g` are 2-D with the batch
    on axis 0, as the previous broadcasting form also required.
    """
    inp.g = out.g @ w.t()
    # Summing the per-sample outer products inp[i] (x) out.g[i] over the batch is
    # exactly inp.t() @ out.g. The matmul avoids materialising the intermediate
    # (batch, n_in, n_out) tensor, which is several GB for the full training set.
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)

In [18]:
def forward_backward(x, y, w1, b1, w2, b2):
    """Run one forward and one backward pass of the two-layer network.

    Forward: linear -> relu -> linear, followed by the MSE loss against `y`.
    Backward: the `*_grad` helpers are called in reverse order of the forward
    pass; each one stores its result on the `.g` attribute of the tensors it
    is given (so `w1.g`, `b1.g`, `w2.g`, `b2.g` and `x.g` are set on return).

    Returns the loss computed in the forward pass (previously it was computed
    and then thrown away).
    """
    # forward pass
    l1 = lin(x, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)

    loss = mse(out, y)

    # backward pass, last layer first
    mse_grad(out ,y)
    lin_grad(l2, out, w2, b2)
    relu_grad(l1, l2)
    lin_grad(x, l1, w1, b1)
    return loss

In [19]:
forward_backward(x,y,w1,b1,w2,b2)

# Class based NN

In [20]:
class Module:
    """Minimal layer base class with manual back-propagation.

    Calling an instance runs `forward` and remembers both the arguments and the
    result; `backward()` then hands them to `bwd(out, *args)`. Subclasses
    implement `forward` and `bwd`.
    """
    def __call__(self, *args):
        # Keep inputs and output around: backward() needs them later.
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        # Accept any arguments so a missing override reports "not implemented"
        # instead of a confusing arity TypeError.
        raise NotImplementedError('not implemented')

    def bwd(self, *args):
        raise NotImplementedError('not implemented')

    def backward(self):
        self.bwd(self.out,*self.args)

In [21]:
class Relu(Module):
    """ReLU whose output is shifted down by 0.5; the shift is constant, so the gradient is the plain ReLU mask."""
    def forward(self,x):
        clipped = x.clamp(min=0.)
        return clipped - 0.5
    def bwd(self, out, x):
        # Gradient flows only through the entries that were positive on the way in.
        mask = (x > 0).float()
        x.g = mask * out.g
    

In [22]:
class Lin(Module):
    """Affine layer `x @ w + b` that stores its gradients on `.g` attributes."""
    def __init__(self,w,b):
        self.w, self.b = w, b

    def forward(self, x):
        projected = x @ self.w
        return projected + self.b

    def bwd(self,out ,x):
        upstream = out.g
        x.g = upstream @ self.w.T
        self.w.g = x.T @ upstream
        self.b.g = upstream.sum(0)

In [23]:
class Mse(Module):
    """Mean-squared-error loss with a hand-written gradient."""
    def forward(self,pred,y):
        err = pred.squeeze() - y
        return err.pow(2).mean()
    def bwd(self,out, pred , y):
        # d/d(pred) of mean(err^2) is 2 * err / batch; the trailing axis is added back.
        err = pred.squeeze() - y
        pred.g = (2 * err).unsqueeze(-1) / y.shape[0]
        

In [24]:
class Model:
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss, built from the hand-written modules."""
    def __init__(self,w1,b1,w2,b2):
        first, second = Lin(w1,b1), Lin(w2,b2)
        self.layers = [first, Relu(), second]
        self.loss = Mse()
    def __call__(self,x,y):
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, y)
    def backward(self):
        # Loss first, then the layers from last to first.
        for module in [self.loss, *reversed(self.layers)]:
            module.backward()

In [25]:
model = Model(w1,b1,w2,b2)

In [26]:
loss = model(x,y)

In [27]:
model.backward()

# Pytorch NN

In [28]:
import torch.nn as nn

In [29]:
def mse(output, targ):
    """Mean squared error between `output` (trailing unit axis dropped) and `targ`."""
    diff = output.squeeze(-1) - targ
    return (diff ** 2).mean()

In [30]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network that returns the MSE loss against `targ`."""
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (not a plain Python list) registers the layers as
        # sub-modules, so parameters(), .to(), state_dict() etc. can see them.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])
        self.loss = mse

    # Implement forward() rather than overriding __call__: nn.Module.__call__
    # dispatches here and also runs any registered hooks.
    def forward(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

In [31]:
model =Model(n_in,nh,n_out)

In [32]:
loss = model(x,y)

In [33]:
loss.backward()

In [34]:
loss

tensor(28.4902, grad_fn=<MeanBackward0>)

# day 9 How to train your model 

In [35]:
nn.modules.conv._ConvNd.reset_parameters??

In [36]:
# x= x.view(-1,1,28,28)

In [538]:
x.shape

torch.Size([60000, 784])

In [539]:
def stats(x):
    """Return the pair `(mean, std)` of `x`."""
    mu = x.mean()
    sigma = x.std()
    return mu, sigma

In [540]:
# Conv layer with 1 input channel, 32 output channels and a 5x5 kernel; the
# following cells inspect and re-initialise its weights.
l1 = nn.Conv2d(1, 32, 5)

In [541]:
stats(l1.weight)

(tensor(-0.0042, grad_fn=<MeanBackward0>),
 tensor(0.1126, grad_fn=<StdBackward0>))

In [542]:
def gain(a):
    """Return sqrt(2 / (1 + a^2)), the gain PyTorch's `calculate_gain` uses for leaky ReLU with negative slope `a`."""
    denominator = 1 + a**2
    return math.sqrt(2.0 / denominator)

In [543]:
gain(5**0.5)

0.5773502691896257

In [544]:
def kaiming2(x, a, use_fan_out=False):
    """Fill the weight tensor `x` in place with Kaiming-uniform values.

    The first two axes of `x` are read as (output features, input features);
    whatever remains in `x[0, 0]` counts as the receptive-field size. The fan
    is `inputs * field` by default, or `outputs * field` when `use_fan_out`
    is true. Values are drawn from U(-bound, bound) with
    `bound = sqrt(3) * gain(a) / sqrt(fan)`. Nothing is returned.
    """
    n_filters, n_inputs, *_ = x.shape
    field_size = x[0,0].shape.numel()
    channels = n_filters if use_fan_out else n_inputs
    fan = channels * field_size
    sigma = gain(a) / math.sqrt(fan)
    limit = math.sqrt(3.) * sigma
    x.data.uniform_(-limit, limit)

In [545]:
# Re-initialise the conv weights in place with the hand-written routine (a=0).
# NOTE(review): the line below reports the stats of the input data `x`, not of
# `l1.weight`; `stats(l1.weight)` was presumably intended - confirm.
kaiming2(l1.weight,a=0)
stats(x)

(tensor(1.8601e-08), tensor(1.))

# cross entropy loss

In [45]:
def log_softmax1(x):
    """Naive log-softmax over the last axis: log(exp(x) / sum(exp(x)))."""
    exps = x.exp()
    probs = exps / x.exp().sum(-1,keepdims= True)
    return probs.log()

In [46]:
def log_softmax2(x):
    """Log-softmax over the last axis written as x - log(sum(exp(x)))."""
    log_normaliser = x.exp().sum(-1, keepdims= True).log()
    return x - log_normaliser

In [47]:
log_softmax1(torch.arange(5)),log_softmax2(torch.arange(5)) # same  b/c: log(a/b) == log(a) - log(b) & ln(e^x)==x

(tensor([-4.4519, -3.4519, -2.4519, -1.4519, -0.4519]),
 tensor([-4.4519, -3.4519, -2.4519, -1.4519, -0.4519]))

In [48]:
def nll(perd , y):
    """Negative log-likelihood: minus the mean log-probability of the target class.

    perd: 2-D tensor of log-probabilities, one row per sample. The parameter
          name is a misspelling of "pred", kept so existing callers keep working.
    y:    1-D integer tensor holding the target class index of each row.
    """
    # Use the argument itself. The body used to read a global named `pred`,
    # which silently ignored whatever was passed in.
    picked = perd[range(y.shape[0]), y]
    return - picked.mean()

In [79]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last axis.

    The row maximum is subtracted before exponentiating so exp() cannot
    overflow, then added back outside the log. The last axis is reduced away,
    so an (n, c) input gives an (n,) result and a 1-D input gives a scalar.
    """
    # keepdim=True lets the maximum broadcast against x for any number of axes.
    m = x.max(-1, keepdim=True)[0]
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [80]:
def log_softmax(x):
    """Log-softmax over the last axis, more stable than the two versions above.

    `logsumexp` reduces the last axis away, so its result needs a trailing
    axis again before it can be subtracted row-wise. Without it an (n,) vector
    is broadcast against the *class* axis of an (n, c) input.
    """
    return x - logsumexp(x).unsqueeze(-1)

In [81]:
# Three linear layers (784 -> 256 -> 64 -> 10) with a ReLU after the first two.
# NOTE(review): this rebinds `m`, which earlier held the feature count taken
# from `x.shape`; cells that still use `m` as a number break if re-run after
# this one - consider a different name.
m = nn.Sequential(nn.Linear(28*28,256),nn.ReLU(),
                  nn.Linear(256,64),nn.ReLU(),
                  nn.Linear(64,10) )


In [84]:
out = m(x)

In [86]:
pred = log_softmax1(out)

In [87]:
pred.shape,y.shape

(torch.Size([60000, 10]), torch.Size([60000]))

In [88]:
nll(pred,y)

tensor(2.3286, grad_fn=<NegBackward0>)


# Training loop and dataloader

In [546]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network that returns the raw outputs of the last layer."""
    def __init__(self,n_in,nh,n_out):
        super().__init__()
        # ModuleList registers the layers, so model.parameters() yields their weights.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh),nn.ReLU(),nn.Linear(nh,n_out)])

    # Implement forward() rather than overriding __call__: nn.Module.__call__
    # dispatches here and also runs any registered forward/backward hooks.
    def forward(self,x):
        for l in self.layers: x = l(x)
        return x

In [547]:
def accuracy(pred, y):
    """Fraction of rows of `pred` whose arg-max along axis 1 equals the label in `y`."""
    predicted_class = pred.argmax(dim=1)
    hits = (predicted_class == y).float()
    return hits.mean()

In [548]:
class Dataset:
    """Pairs two parallel, indexable containers: inputs `x` and targets `y`."""
    def __init__(self,x,y):
        self.x = x
        self.y = y

    def __len__(self):
        # The length is taken from the targets.
        return len(self.y)

    def __getitem__(self,i):
        # Any index the containers accept works (int, slice, ...).
        item_x = self.x[i]
        item_y = self.y[i]
        return item_x, item_y

In [549]:
class DataLoader:
    """Yields consecutive slices of `ds`, each holding at most `bs` items."""
    def __init__(self, ds, bs):
        self.bs = bs
        self.ds = ds
    def __iter__(self):
        step = self.bs
        starts = range(0, len(self.ds), step)
        yield from (self.ds[s:s+step] for s in starts)

In [550]:
class Sampler:
    """Yields batches of dataset indices, optionally in random order.

    ds:      anything with a length; only `len(ds)` is used.
    bs:      number of indices per batch (the last batch may be shorter).
    shuffle: draw a fresh random permutation on every iteration when true,
             otherwise walk the indices in order.
    """
    def __init__(self,ds,bs,shuffle = True):
        self.m, self.bs, self.shuffle = len(ds),bs,shuffle
    def __iter__(self):
        idx = torch.randperm(self.m) if self.shuffle else torch.arange(self.m)
        # range(...) steps through every batch start; iterating the bare tuple
        # (0, m, bs) only ever visited those three offsets.
        for i in range(0, self.m, self.bs): yield idx[i:i+self.bs]

In [551]:
class Optimizer:
    """Plain SGD: `p -= lr * p.grad` for every parameter.

    params: iterable of tensors to update; it is materialised into a list so a
            generator such as `model.parameters()` can be walked repeatedly.
    lr:     learning rate.
    """
    def __init__(self,params,lr):
        self.params, self.lr = list(params), lr

    def step(self):
        # no_grad: the update itself must not be recorded by autograd.
        with torch.no_grad():
            for p in self.params:
                # Parameters that took no part in the last backward pass have
                # no gradient yet; skip them instead of crashing.
                if p.grad is None: continue
                p -= p.grad * self.lr

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None: p.grad.zero_()

In [552]:
# Batch `ds` 64 items at a time with the hand-written DataLoader defined above.
# NOTE(review): in file order `ds` is still the torchvision MNIST object here
# (the tensor-backed Dataset is only created further down); confirm that
# slicing it works as intended before iterating `dl`.
dl = DataLoader(ds,64)

In [553]:
def collate(b):
    """Turn a list of `(x, y)` samples into one `(stacked_xs, stacked_ys)` pair."""
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)
class Dataloader:
    """Builds batches by looking up the indices produced by a sampler.

    ds:         indexable dataset returning one sample per integer index.
    sampler:    iterable yielding one collection of indices per batch.
    collate_fn: combines the list of samples of one batch into the batch value.
    """
    def __init__(self, ds,sampler,collate_fn= collate):
        self.ds, self.sampler, self.collate_fn = ds, sampler, collate_fn
    def __iter__(self):
        # Read the dataset and sampler from this instance, not from globals
        # that merely happen to carry the same names.
        for s in self.sampler : yield self.collate_fn([self.ds[i] for i in s])

In [554]:
from torch.utils.data import SequentialSampler, RandomSampler, DataLoader

In [580]:
# Fresh 10-output model, updated by the hand-written SGD Optimizer with lr 0.1.
model = Model(n_in,nh,10)
opt = Optimizer(model.parameters(),0.1)
loss_fn = F.cross_entropy # F.cross_entropy(logits, y) == nll(log_softmax(logits), y)
# Wrap the normalised tensors in the minimal Dataset. This rebinds `ds` and
# `val_ds`, which until now referred to the torchvision MNIST objects.
ds = Dataset(x,y)
val_ds = Dataset(val_x,val_y)

In [591]:
def get_dls(train_ds,val_ds,bs,**kwargs):
    """Return `(train_dl, val_dl)`.

    The training loader shuffles and uses batch size `bs`; the validation
    loader keeps the dataset order and uses `bs * 2`. Extra keyword arguments
    are forwarded to both loaders.
    """
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, **kwargs)
    val_dl = DataLoader(val_ds, batch_size=bs*2, **kwargs)
    return train_dl, val_dl

class DataBunch:
    """Holds the training loader, the validation loader and an optional value `c`."""
    def __init__(self, dl, val_dl, c=None):
        self.dl = dl
        self.val_dl = val_dl
        self.c = c

    @property
    def train_ds(self):
        """Dataset behind the training loader."""
        loader = self.dl
        return loader.dataset

    @property
    def val_ds(self):
        """Dataset behind the validation loader."""
        loader = self.val_dl
        return loader.dataset

In [592]:
import torch.optim as optim

In [595]:
def get_model(data,lr=0.1,nh=50):
    """Build a Linear -> ReLU -> Linear classifier plus an SGD optimizer for it.

    The input width is the second dimension of `data.train_ds.x`, the output
    width is `data.c`, `nh` is the hidden width and `lr` the SGD learning rate.
    Returns `(model, optimizer)`.
    """
    n_features = data.train_ds.x.shape[1]
    layers = [nn.Linear(n_features, nh), nn.ReLU(), nn.Linear(nh, data.c)]
    net = nn.Sequential(*layers)
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net, sgd

class Learner:
    """Plain container bundling a model, its optimizer, the loss function and the data."""
    def __init__(self,model, opt, loss_fn, data):
        self.model = model
        self.opt = opt
        self.loss_fn = loss_fn
        self.data = data

In [598]:
# Bundle the two loaders (with c=10), build the model/optimizer pair from the
# bundle, and collect everything the training loop needs in a Learner.
data = DataBunch(*get_dls(ds,val_ds,bs),10)
model, opt = get_model(data)
learn = Learner(model,opt,loss_fn,data)

In [602]:
def fit(epochs,learn):
    """Train `learn.model` for `epochs` epochs, validating after each one.

    Each epoch does one pass over `learn.data.dl` (backward, optimizer step,
    gradient reset per batch) followed by one pass over `learn.data.val_dl`
    without gradients. The printed and returned numbers are the averages of
    the per-batch validation loss and accuracy.

    Returns the validation loss of the last epoch, or None when `epochs` is 0.
    """
    # Everything is read from `learn`; nothing relies on notebook globals such
    # as `model` or `val_dl`.
    val_dl = learn.data.val_dl
    nv = len(val_dl)
    val_loss = None
    for epoch in range(epochs):
        learn.model.train()
        for xb,yb in learn.data.dl:
            loss = learn.loss_fn(learn.model(xb),yb)
            loss.backward()
            learn.opt.step()
            learn.opt.zero_grad()

        learn.model.eval()
        with torch.no_grad():
            tot_loss, tot_acc = 0.,0.
            for val_xb,val_yb in val_dl:
                pred = learn.model(val_xb)
                tot_loss += learn.loss_fn(pred ,val_yb)
                tot_acc  += accuracy(pred,val_yb)
        val_loss = tot_loss/nv
        print(f'epoch {epoch}: loss {val_loss}, accuarcy {tot_acc/nv}')
    return val_loss

In [603]:
fit(5,learn)

epoch 0: loss 0.18881641328334808, accuarcy 0.9434335231781006
epoch 1: loss 0.1419752538204193, accuarcy 0.956289529800415
epoch 2: loss 0.11192645132541656, accuarcy 0.9662777185440063
epoch 3: loss 0.09910941123962402, accuarcy 0.969343364238739
epoch 4: loss 0.11994745582342148, accuarcy 0.9634097814559937


tensor(0.1199)