In [None]:
import torch
from torch.utils import data
from datautils import FizBuzDataSet, print_out

### Data Utils
All of that data shuffling, indexing through the data, and meddling with the batch size is gone: the dataset and loader utilities handle it.

In [None]:
# Instantiate the FizBuz dataset class imported from the project-local `datautils` module.
dataset = FizBuzDataSet()

In [None]:
# Number of samples the dataset reports through `len()`.
len(dataset)

In [None]:
# Shape of element 0 of sample 1. The training loop unpacks each batch as
# `(x, y)` and feeds `x` to the network, so element 0 is the network input.
dataset[1][0].shape

In [None]:
# Shape of element 1 of sample 1. The training loop passes this element to the
# loss function as the target.
dataset[1][1].shape

In [None]:
# Run the dataset's decoder on the input of sample 3.
# NOTE(review): presumably this maps the encoded input back to the original
# number — confirm against `datautils`.
dataset.decoder(dataset[3][0])

In [None]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of
# `bsize` samples, loaded by four worker processes.
bsize = 64
loader = data.DataLoader(
    dataset,
    batch_size=bsize,
    shuffle=True,
    num_workers=4,
)

In [None]:
# Iterate once over the loader and show, for every batch, the shape of its two
# elements and the number of elements in the batch container.
# The loop variable is deliberately NOT called `data`: that name is the
# `torch.utils.data` module imported at the top of the notebook, and rebinding
# it would make `data.DataLoader(...)` fail if the loader cell were run again.
for batch in loader:
    print(batch[0].shape, batch[1].shape, len(batch))

## Network Designing

In [None]:
from model import FizBuzNet, JITFizBuzNet

- torch.nn.Module
- torch.nn
- forward function

In [None]:
# Build the network class imported from the project-local `model` module and
# print its string representation.
net = FizBuzNet()
print(net)

In [None]:
# Print the shape of every parameter tensor the network exposes.
for tensor in net.parameters():
    print(tensor.shape)

In [None]:
# Total size of the parameters in bytes: element count times bytes per
# element, summed over every parameter tensor.
sum(p.numel() * p.element_size() for p in net.parameters())

## Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

### Hyperparameters, loss, optimizer

In [None]:
# Class labels, indexed by the position of the largest network output
# (the training loop looks the prediction up as `outli[index]`).
outli = ["fizbuz", "buz", "fiz", "number"]

# Optimisation settings.
epochs = 500
lr = 0.01
# NOTE(review): `batches` is not referenced in the visible cells; the loader
# is built with `bsize` instead — confirm whether it is still needed.
batches = 64

# Fresh model, loss function and optimizer over the model's parameters.
net = FizBuzNet()
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=lr)

### Let's Loop

In [None]:
# Train for `epochs` passes over the loader. Every 50th epoch, report the
# prediction for the first sample of the last batch together with that
# batch's loss.
for epoch in range(epochs):
    for x, y in loader:
        optimizer.zero_grad()   # reset gradients left over from the previous step
        hyp = net(x)            # forward pass
        loss = loss_fn(hyp, y)
        loss.backward()         # compute gradients of the loss
        optimizer.step()        # update the parameters
    if epoch % 50 == 0:
        # Use separate names for the report values so that `x` and `y` keep
        # holding the last batch tensors; later cells pass `x` to the network.
        sample = dataset.decoder(x[0])
        pred_idx = hyp[0].max(0)[1].item()  # index of the largest output
        pred = outli[pred_idx]
        print_out(epoch, sample, pred, loss.item())

### Tasks
- Use PDB to see the effect of `optimizer.zero_grad()`
- Check how the values change after `optimizer.step()`
- Use PDB to see how the dynamic graph eases debugging

In [None]:
import pdb

In [None]:
# Debugging variant of the training loop: execution stops in pdb before every
# optimisation step (`n` steps one line, `c` continues to the next batch,
# `q` quits).
# While stopped, check these values before and after `zero_grad()`,
# `backward()` and `step()`:
#     net.hidden.weight[0, 0]
#     net.hidden.weight.grad[0, 0]
for epoch in range(epochs):
    for x, y in loader:
        pdb.set_trace()         # intentional breakpoint for the exercise
        optimizer.zero_grad()   # reset gradients left over from the previous step
        hyp = net(x)            # forward pass
        loss = loss_fn(hyp, y)
        loss.backward()         # compute gradients of the loss
        optimizer.step()        # update the parameters
    if epoch % 50 == 0:
        # Use separate names for the report values so that `x` and `y` keep
        # holding the last batch tensors; later cells pass `x` to the network.
        sample = dataset.decoder(x[0])
        pred_idx = hyp[0].max(0)[1].item()  # index of the largest output
        pred = outli[pred_idx]
        print_out(epoch, sample, pred, loss.item())

### Tracer

In [None]:
# Trace the network on the batch `x` left over from the training loop and
# print the recorded graph.
# `torch.jit.get_trace_graph` is not available in recent PyTorch releases, so
# fall back to the public `torch.jit.trace` API when it is missing. Both paths
# leave the graph in `trace` and the network output in `out`.
if hasattr(torch.jit, "get_trace_graph"):
    trace, out = torch.jit.get_trace_graph(net, x)
else:
    traced_net = torch.jit.trace(net, x)
    trace, out = traced_net.graph, traced_net(x)
print(trace)

### Profiling & JIT

While you can use any profiler to find bottlenecks, the torch profiler gives you clearer, network-level profiling statistics.

In [None]:
# Start from a freshly constructed network and pull one batch from the loader
# to use as profiling input.
net = FizBuzNet()
x, y = next(iter(loader))
print(x.shape, y.shape)

In [None]:
# Run one forward pass under the autograd profiler and print the collected report.
with torch.autograd.profiler.profile() as prof:
    net(x)
print(prof)

In [None]:
# Repeat the measurement with the JIT variant of the network on a newly
# drawn batch.
net = JITFizBuzNet()

x, y = next(iter(loader))
with torch.autograd.profiler.profile() as prof:
    net(x)
print(prof)

In [None]:
# Profile the current network once more on another batch.
# NOTE(review): presumably this compares a second call against the first
# one above — confirm intent.
x, y = next(iter(loader))
with torch.autograd.profiler.profile() as prof:
    net(x)
print(prof)