# Mini-batch Training

In [1]:
from exp.nb_01 import *


# get_mnist_data is not defined in this notebook; presumably it is provided by
# the star import from exp.nb_01 above -- TODO confirm.
# The next cell shows x as [60000, 784] and y as [60000].
x, y = get_mnist_data()

In [2]:
x.shape, y.shape

(torch.Size([60000, 784]), torch.Size([60000]))

In [3]:
def normalize(x, m, s):
    """Shift ``x`` by ``m`` and scale it by ``s``: ``(x - m) / s``."""
    centered = x - m
    return centered / s


# The first 50,000 rows form the training split; the remainder is held out.
x_train, y_train = x[:50000], y[:50000]
x_test, y_test = x[50000:], y[50000:]

In [4]:
# Mean and standard deviation are taken from the training split only and are
# computed before x_train is rebound below.
m, s = x_train.mean(), x_train.std()

# Both splits are standardized with the *training* statistics, so the held-out
# data goes through exactly the same transformation as the training data.
x_train = normalize(x_train, m, s)
x_test = normalize(x_test, m, s)  # normalizing the same way as train part

In [5]:
x_train.mean(), x_train.std()

(tensor(2.1126e-08), tensor(1.))

## Model

In [6]:
from torch import nn


class Model(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear.

    The sub-layers are exposed as ``lin1``, ``relu`` and ``lin2``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lin1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the output of ``lin2`` applied to the activated hidden layer."""
        hidden = self.relu(self.lin1(x))
        return self.lin2(hidden)

In [7]:
model = Model(784, 128, 10)

In [21]:
%timeit -n 10 _=model(x_test)

24.3 ms ± 1.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Loss Function


We will use cross-entropy (CE) as the loss function.


First, given logits we need to compute softmax

$$ p_i = \frac{e^{l_i}}{\sum\limits_{j=1}^{n} e^{l_j}} $$

$$ softmax(l_1, \ldots, l_n) = (p_1, \ldots, p_n) $$

Cross-entropy loss: given the one-hot encoding of the correct class $ y = (0, \ldots, 1, \ldots, 0) $, where the $1$ is in the $j$-th position,

$$ -\sum\limits_{i=1}^{n} y_i \log p_i = - \log p_j $$

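Total loss over $l$ samples: $ - \frac{1}{l} \sum\limits_{i=1}^{l} \log p^{(i)} $, where $p^{(i)}$ is the predicted probability of the correct class for the $i$-th sample.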

In [27]:
model(x_test[:1]), y_test[0]

(tensor([[ 0.1693, -0.2185, -0.2553,  0.3165, -0.2184, -0.1091,  0.0361, -0.1630,
           0.1680,  0.0583]], grad_fn=<AddmmBackward>), tensor(3))

In [8]:
def nll(p, y):
    """Negative mean of the entries of ``p`` selected by ``y``.

    For every row ``i`` the element ``p[i, y[i]]`` is picked; the result is
    minus the float mean of the picked elements.
    """
    rows = range(y.shape[0])
    picked = p[rows, y]
    return -(picked.float().mean())

### Tricks

For numerical stability, we don't want to get very high values of $e^{l_i}$; therefore we can do the following:

$$ p_i = \frac{e^{l_i - x}}{\sum_j e^{l_j - x}}, ~\text{where}~ x = \max \{ l_i \}$$

Hence the log probability is equal to $ \log p_i = l_i - x - \log\left( \sum_j e^{l_j - x} \right) $

In [87]:
l = model(x_test)
l.shape

torch.Size([10000, 10])

In [41]:
l.max(dim=1, keepdim=True).values.shape

torch.Size([10000, 1])

In [9]:
def log_softmax(l):
    """Numerically stable log-softmax over ``dim=1``.

    The row-wise maximum is subtracted before exponentiating so that
    ``exp`` never sees large positive values.

    Args:
        l: 2-D tensor of logits, one row per sample.

    Returns:
        A new tensor of the same shape holding log-probabilities.
        ``l`` itself is left untouched: an in-place ``-=`` would silently
        overwrite the caller's logits and fails on a leaf tensor that
        requires grad.
    """
    row_max = l.max(dim=1, keepdim=True).values
    shifted = l - row_max
    log_norm = shifted.exp().sum(dim=1, keepdim=True).log()
    return shifted - log_norm

In [89]:
import torch.nn.functional as F


# Hand the custom implementation its own copy of the logits so that the
# reference value is computed from the same, unmodified input; otherwise an
# in-place implementation would change `l` before F.log_softmax sees it and
# the comparison would prove nothing.
near(log_softmax(l.clone()), F.log_softmax(l, dim=1))

True

In [10]:
def cross_entropy_loss(l, target):
    """Cross-entropy of logits ``l`` against class indices ``target``.

    Composed as ``nll`` applied to ``log_softmax(l)``.
    """
    log_probs = log_softmax(l)
    return nll(log_probs, target)

In [95]:
# Give the custom loss a copy of the logits so the PyTorch reference is
# evaluated on the same, unmodified input even if the custom path works
# in place.
near(cross_entropy_loss(l.clone(), y_test), torch.nn.CrossEntropyLoss()(l, y_test))

True

### Accuracy

Let's define the accuracy metric.

In [11]:
def accuracy(l, target):
    """Fraction of rows of ``l`` whose top-1 index along ``dim=1`` equals ``target``."""
    top_indices = l.topk(k=1, dim=1).indices
    # target is lifted to a column so it lines up with the (rows, 1) indices.
    hits = top_indices == target.unsqueeze(-1)
    return hits.float().mean()

In [119]:
accuracy(l, y_test)

tensor(0.1034)

## Training Loop

First, let's write the training loop by hand and run one epoch of mini-batch SGD over the training set.

In [123]:
model = Model(784, 128, 10)

criterion = cross_entropy_loss

# Loss of the untrained model on the held-out split, as a baseline.
print(criterion(model(x_test), y_test))

lr = 0.1
batch_size = 64

n_epochs = 1

for e in range(n_epochs):
    total_loss = 0
    # Count the batches that are actually processed: the final slice can be
    # shorter than batch_size, so x_train.shape[0] // batch_size undercounts.
    n_batches = 0

    for i in range(0, x_train.shape[0], batch_size):

        x = x_train[i: i + batch_size]
        y = y_train[i: i + batch_size]

        output = model(x)
        loss = criterion(output, y)
        total_loss += loss.item()
        n_batches += 1

        loss.backward()

        # Manual SGD step followed by clearing the gradients, because
        # backward() accumulates into .grad.
        with torch.no_grad():
            model.lin1.weight -= model.lin1.weight.grad * lr
            model.lin1.bias -= model.lin1.bias.grad * lr

            model.lin2.weight -= model.lin2.weight.grad * lr
            model.lin2.bias -= model.lin2.bias.grad * lr

            model.lin1.weight.grad.zero_()
            model.lin1.bias.grad.zero_()

            model.lin2.weight.grad.zero_()
            model.lin2.bias.grad.zero_()

        # Only the accuracy of the most recent batch is kept.
        acc = accuracy(output, y)

    # Mean of the per-batch losses over the batches seen in this epoch.
    print(total_loss / max(n_batches, 1))

tensor(2.3389, grad_fn=<NegBackward>)
0.2816978511036816


In [124]:
acc

tensor(0.9375)

## Refactoring
### Parameters

The first thing we want to do is to make
```Python
model.lin1.weight -= model.lin1.weight.grad * lr
model.lin1.bias -= model.lin1.bias.grad * lr

model.lin2.weight -= model.lin2.weight.grad * lr
model.lin2.bias -= model.lin2.bias.grad * lr

model.lin1.weight.grad.zero_()
model.lin1.bias.grad.zero_()

model.lin2.weight.grad.zero_()
model.lin2.bias.grad.zero_()
```

more compact by introducing `model.parameters()`.

In [14]:
class Model():
    """Hand-rolled stand-in for ``nn.Module`` that tracks its own layers.

    Every attribute whose name does not start with an underscore is recorded
    in ``self._modules`` so that ``parameters()`` can walk the layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # Must exist before the first public attribute is assigned, because
        # __setattr__ writes into it.
        self._modules = {}

        self.lin1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(hidden_dim, output_dim)

    def __setattr__(self, k, v):
        """Set the attribute and, for public names, register it as a layer."""
        is_private = k.startswith('_')
        if not is_private:
            self._modules[k] = v

        super().__setattr__(k, v)

    def __repr__(self):
        """Show the registered layers as a plain dict string."""
        return str(self._modules)

    def __call__(self, x):
        """Run ``x`` through lin1 -> relu -> lin2."""
        hidden = self.relu(self.lin1(x))
        return self.lin2(hidden)

    def parameters(self):
        """Yield the parameters of each registered layer, in registration order."""
        for layer in self._modules.values():
            yield from layer.parameters()

In [152]:
model = Model(784, 128, 10)

In [153]:
model

{'lin1': Linear(in_features=784, out_features=128, bias=True), 'relu': ReLU(), 'lin2': Linear(in_features=128, out_features=10, bias=True)}

In [155]:
model = Model(784, 128, 10)

criterion = cross_entropy_loss

# Loss of the untrained model on the held-out split, as a baseline.
print(criterion(model(x_test), y_test))

lr = 0.1
batch_size = 64

n_epochs = 1

for e in range(n_epochs):
    total_loss = 0
    # Count the batches that are actually processed: the final slice can be
    # shorter than batch_size, so x_train.shape[0] // batch_size undercounts.
    n_batches = 0

    for i in range(0, x_train.shape[0], batch_size):

        x = x_train[i: i + batch_size]
        y = y_train[i: i + batch_size]

        output = model(x)
        loss = criterion(output, y)
        total_loss += loss.item()
        n_batches += 1

        loss.backward()

        # One SGD step per parameter, then clear its gradient, because
        # backward() accumulates into .grad.
        with torch.no_grad():
            for param in model.parameters():
                param -= param.grad * lr
                param.grad.zero_()

        # Only the accuracy of the most recent batch is kept.
        acc = accuracy(output, y)

    # Mean of the per-batch losses over the batches seen in this epoch.
    print(total_loss / max(n_batches, 1))

tensor(2.2882, grad_fn=<NegBackward>)
0.28319782536672417


### Optimizer

Let's replace the code

```Python
with torch.no_grad():
    for param in model.parameters():
        param -= param.grad * lr
        param.grad.zero_()
```

with something more compact

In [12]:
class SGD():
    """Plain stochastic gradient descent: ``p <- p - lr * p.grad``.

    Args:
        parameters: iterable of tensors to optimize. It is materialized into
            a list so that a generator can be passed and reused on every step.
        lr: learning rate.
    """

    def __init__(self, parameters, lr=0.1):
        self.parameters = list(parameters)
        self.lr = lr

    def zero_grad(self):
        """Reset the gradients of all parameters in place.

        Parameters that have no gradient yet (``grad is None``, e.g. before
        the first ``backward()`` or when unused in the forward pass) are
        skipped instead of raising.
        """
        with torch.no_grad():
            for p in self.parameters:
                if p.grad is not None:
                    p.grad.zero_()

    def step(self):
        """Apply one in-place update to every parameter that has a gradient."""
        with torch.no_grad():
            for p in self.parameters:
                if p.grad is not None:
                    p -= p.grad * self.lr

In [16]:
def fit(model, criterion, optim, batch_size = 64, n_epochs = 1):
    """Train ``model`` on the notebook-level ``x_train`` / ``y_train``.

    After every epoch the mean per-batch loss and the mean per-batch accuracy
    are printed.

    Args:
        model: callable mapping a batch of inputs to outputs.
        criterion: callable ``(output, target) -> loss``; the loss must
            support ``.item()`` and ``.backward()``.
        optim: object exposing ``step()`` and ``zero_grad()``.
        batch_size: number of consecutive samples per mini-batch.
        n_epochs: number of passes over the training data.
    """
    for e in range(n_epochs):
        total_loss = 0
        total_acc = 0
        # Count processed batches directly: the last slice may be shorter than
        # batch_size, so a floor division of the sample count undercounts
        # (and is zero when there are fewer samples than batch_size).
        n_batches = 0

        for i in range(0, x_train.shape[0], batch_size):

            x = x_train[i: i + batch_size]
            y = y_train[i: i + batch_size]

            output = model(x)
            loss = criterion(output, y)
            total_loss += loss.item()

            loss.backward()

            optim.step()
            optim.zero_grad()

            acc = accuracy(output, y)
            total_acc += acc
            n_batches += 1

        denom = max(n_batches, 1)
        print(total_loss / denom)
        print(total_acc / denom)

In [17]:
# Re-create the model so this run does not continue from previously trained
# weights.
model = Model(784, 128, 10)

criterion = cross_entropy_loss
optim = SGD(model.parameters(), lr=0.1)

# Loss on the held-out split before any update, for comparison with the
# numbers printed by fit().
print(criterion(model(x_test), y_test))

fit(model, criterion, optim)

tensor(2.3469, grad_fn=<NegBackward>)
0.28267738204473264
tensor(0.9187)


### nn.ModuleList and nn.Sequential


If, in the previous `Model` class, we replace
```Python
self.lin1 = nn.Linear(input_dim, hidden_dim)
self.relu = nn.ReLU()
self.lin2 = nn.Linear(hidden_dim, output_dim)
```

with 

```Python
self.layers = [nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10)]
```

it won't work, since a Python list has no `parameters` attribute: it is not an instance of `Module`.


To solve this problem, let's build simple versions of `nn.ModuleList` and `nn.Sequential`.

In [172]:
nn.Module.add_module?

In [18]:
class ModuleList(nn.Module):
    """Minimal container that registers a sequence of layers as sub-modules.

    Layers are registered under the names ``layer_0``, ``layer_1``, ... so
    that ``nn.Module.parameters()`` can find them.
    """

    def __init__(self, layers):
        super().__init__()

        for i, layer in enumerate(layers):
            self.add_module(f'layer_{i}', layer)

    def __iter__(self):
        """Iterate over the registered layers in registration order."""
        registered = self._modules.values()
        return iter(registered)

            
class Sequential():
    """Feed an input through a fixed chain of layers, one after another."""

    def __init__(self, *layers):
        # Wrapped in a ModuleList so the layers' parameters can be reached.
        self.layers = ModuleList(layers)

    def __call__(self, x):
        """Return the result of applying every layer in order to ``x``."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

    def parameters(self):
        """Delegate to the parameters of the wrapped ModuleList."""
        return self.layers.parameters()

In [19]:
# Same 784 -> 128 -> 10 architecture as before, now expressed through the
# Sequential helper instead of a dedicated Model class.
model = Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))

criterion = cross_entropy_loss
optim = SGD(model.parameters(), lr=0.1)

# Loss on the held-out split before any update, for comparison with the
# numbers printed by fit().
print(criterion(model(x_test), y_test))

fit(model, criterion, optim)

tensor(2.3540, grad_fn=<NegBackward>)
0.27868394673862773
tensor(0.9202)


### Dataset and DataLoader

First, let's try to turn this

```Python
x = x_train[i: i + batch_size]
y = y_train[i: i + batch_size]
```

into something like this
```Python
x, y = train_ds[i: i + batch_size]
```

In [25]:
class Dataset:
    """Pairs inputs with targets and indexes both with the same key."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        """Number of items, taken from the inputs ``x``."""
        return len(self.x)

    def __getitem__(self, i):
        """Return ``(x[i], y[i])``; ``i`` may be anything ``x`` and ``y`` accept."""
        inputs = self.x[i]
        targets = self.y[i]
        return inputs, targets

In [26]:
# Wrap the training tensors so that inputs and targets can be sliced together.
train_ds = Dataset(x_train, y_train)
# Sanity check: the dataset reports as many items as x_train has rows.
assert len(train_ds) == x_train.shape[0]

In [28]:
def fit(train_ds, model, criterion, optim, batch_size = 64, n_epochs = 1):
    """Train ``model`` on ``train_ds`` with consecutive mini-batches.

    After every epoch the mean per-batch loss and the mean per-batch accuracy
    are printed.

    Args:
        train_ds: object supporting ``len()`` and slice indexing that returns
            an ``(x, y)`` pair.
        model: callable mapping a batch of inputs to outputs.
        criterion: callable ``(output, target) -> loss``; the loss must
            support ``.item()`` and ``.backward()``.
        optim: object exposing ``step()`` and ``zero_grad()``.
        batch_size: number of consecutive samples per mini-batch.
        n_epochs: number of passes over ``train_ds``.
    """
    for e in range(n_epochs):
        total_loss = 0
        total_acc = 0
        # Averages are taken over the batches of *this* dataset; counting them
        # directly also covers a shorter final batch and avoids depending on
        # any notebook-level tensor.
        n_batches = 0

        for i in range(0, len(train_ds), batch_size):

            x, y = train_ds[i: i + batch_size]

            output = model(x)
            loss = criterion(output, y)
            total_loss += loss.item()

            loss.backward()

            optim.step()
            optim.zero_grad()

            acc = accuracy(output, y)
            total_acc += acc
            n_batches += 1

        denom = max(n_batches, 1)
        print(total_loss / denom)
        print(total_acc / denom)

In [29]:
# Fresh model and optimizer so this run starts from untrained weights.
model = Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))

criterion = cross_entropy_loss
optim = SGD(model.parameters(), lr=0.1)

# fit() now receives the data through the Dataset wrapper as its first argument.
fit(train_ds, model, criterion, optim)

0.28107381748958526
tensor(0.9184)


Now is a good time to simplify

```Python
for i in range(0, len(train_ds), batch_size):
    x, y = train_ds[i: i + batch_size]
```

and to do
```Python
for x, y in train_dataloader:
    ...
```

In [35]:
class DataLoader:
    """Iterate over a dataset in consecutive, fixed-size mini-batches.

    Args:
        ds: object supporting ``len()`` and slice indexing that returns an
            ``(x, y)`` pair.
        batch_size: number of items per batch; the final batch may be smaller.
    """

    def __init__(self, ds, batch_size):
        self.ds = ds
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches ``__iter__`` yields, counting a trailing partial batch."""
        # Ceiling division: flooring would drop the last, shorter batch from
        # the count even though it is still yielded.
        return (len(self.ds) + self.batch_size - 1) // self.batch_size

    def __iter__(self):
        """Yield ``(x, y)`` slices of the dataset in order."""
        for start in range(0, len(self.ds), self.batch_size):
            yield self.ds[start: start + self.batch_size]

In [36]:
train_dataloader = DataLoader(train_ds, 64)

In [32]:
# Pull the first batch to check that iteration works.
# Note: this rebinds the notebook-level names x and y to a single batch.
x,y = next(iter(train_dataloader))

In [37]:
def fit(train_dataloader, model, criterion, optim, n_epochs = 1):
    """Train ``model`` on the batches produced by ``train_dataloader``.

    After every epoch the mean per-batch loss and the mean per-batch accuracy
    are printed.

    Args:
        train_dataloader: iterable yielding ``(x, y)`` batches; it is iterated
            once per epoch.
        model: callable mapping a batch of inputs to outputs.
        criterion: callable ``(output, target) -> loss``; the loss must
            support ``.item()`` and ``.backward()``.
        optim: object exposing ``step()`` and ``zero_grad()``.
        n_epochs: number of passes over ``train_dataloader``.
    """
    for e in range(n_epochs):
        total_loss = 0
        total_acc = 0
        # Count the batches actually iterated instead of trusting len(): the
        # reported length can differ from the number of yielded batches (for
        # example when a partial last batch is yielded), can be zero, or may
        # not be defined at all for a plain iterable.
        n_batches = 0

        for x, y in train_dataloader:
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optim.step()
            optim.zero_grad()

            total_loss += loss.item()

            acc = accuracy(output, y)
            total_acc += acc
            n_batches += 1

        denom = max(n_batches, 1)
        print(total_loss / denom)
        print(total_acc / denom)

In [38]:
# Fresh model and optimizer so this run starts from untrained weights.
model = Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))

criterion = cross_entropy_loss
optim = SGD(model.parameters(), lr=0.1)

# Batching is now handled by the DataLoader, so fit() no longer takes a batch size.
fit(train_dataloader, model, criterion, optim)

0.28311672429917867
tensor(0.9192)
