### Import Packages

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision

from IPython.display import display, clear_output
import pandas as pd
import time
import json

from torch.utils.tensorboard import SummaryWriter
from collections import OrderedDict
from collections import namedtuple
from itertools import product

### Load data

In [2]:
# FashionMNIST training split, downloaded to ./data on first use; each image
# is converted to a tensor by the transform pipeline.
train_set = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

### Build the CNN Model

In [3]:
class Network(nn.Module):
    """Small CNN: two conv/ReLU/max-pool stages followed by three linear layers.

    The first linear layer takes 12 * 4 * 4 features, which is what the two
    5x5 convolutions and 2x2 poolings produce for a (N, 1, 28, 28) input.
    ``forward`` returns the raw 10-way output scores; no softmax is applied.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 12, kernel_size=5)

        # Fully connected head.
        self.fc1 = nn.Linear(12 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 60)
        self.out = nn.Linear(60, 10)

    def forward(self, t):
        """Map a batch of images ``t`` to a (batch, 10) tensor of scores."""
        # Two conv -> ReLU -> 2x2 max-pool stages.
        t = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)
        t = F.max_pool2d(F.relu(self.conv2(t)), kernel_size=2, stride=2)

        # Flatten each sample's feature maps before the linear layers.
        t = t.reshape(-1, 12 * 4 * 4)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))

        # Output layer: scores are returned without softmax.
        return self.out(t)

### Run Builder

In [4]:
class RunBuilder():
    """Bookkeeping for a hyper-parameter sweep.

    For each run (one parameter combination) and each epoch within it, this
    class accumulates loss and correct-prediction counts, writes scalars and
    histograms to a TensorBoard ``SummaryWriter``, and appends one result row
    per epoch to ``run_data``. The rows are displayed as a DataFrame after
    every epoch and can be written to CSV/JSON with ``save``.

    Expected call order per run:
    ``begin_run`` -> (``begin_epoch`` -> ``track_*`` ... -> ``end_epoch``)* -> ``end_run``.
    """

    def __init__(self):
        # Per-epoch accumulators, reset by begin_epoch().
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # Per-run state, set by begin_run(); run_data persists across runs.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        self.network = None
        self.loader = None
        self.tb = None

    def begin_run(self, run, network, loader):
        """Start a new run: record its inputs and open a TensorBoard writer.

        Args:
            run: namedtuple of hyper-parameters (as produced by ``get_runs``).
                Its optional ``device`` field decides where the sample batch
                for ``add_graph`` is placed (defaults to ``'cpu'``).
            network: the model being trained in this run.
            loader: the data loader that will be iterated during this run.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1
        self.network = network
        self.loader = loader

        self.tb = SummaryWriter(comment=f'-{run}')

        # Log one sample batch as an image grid and use it to trace the graph.
        images, _ = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        self.tb.add_graph(
            self.network,
            images.to(getattr(run, 'device', 'cpu')),
        )

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start timing a new epoch and reset the per-epoch accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Finalize the epoch: log metrics, store a result row, refresh the table."""
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Both metrics are normalized by the full dataset size.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)
        self._log_parameter_histograms()

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        # Append the run's hyper-parameters as extra columns.
        results.update(self.run_params._asdict())
        self.run_data.append(results)

        # Replace the previously displayed table with the updated one.
        clear_output(wait=True)
        display(pd.DataFrame(self.run_data))

    def _log_parameter_histograms(self):
        """Write a histogram of every parameter (and its gradient, if any)."""
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # A parameter that did not take part in backward() has grad None;
            # skip it instead of handing None to add_histogram.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

    def track_loss(self, loss, batch_size=None):
        """Add one batch's loss to the epoch total, weighted by batch size.

        Args:
            loss: scalar tensor holding the batch loss.
            batch_size: number of samples in this batch. Defaults to
                ``self.loader.batch_size``; pass the actual size when a batch
                can be smaller than that (e.g. the last batch of an epoch).
        """
        if batch_size is None:
            batch_size = self.loader.batch_size
        self.epoch_loss += loss.item() * batch_size

    def track_num_correct(self, preds, labels):
        """Add the number of correct predictions in this batch to the epoch total."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        """Count rows of ``preds`` whose argmax over dim 1 equals the label."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Write all collected rows to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame(self.run_data).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

    @staticmethod
    def get_runs(params):
        """Expand a mapping of name -> list of values into all combinations.

        Returns:
            A list of ``Run`` namedtuples, one per element of the Cartesian
            product of the value lists, with fields named after the keys.
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*values) for values in product(*params.values())]

In [5]:
# Hyper-parameter grid: every combination of these values becomes one run.
params = OrderedDict(
    lr=[.01],
    batch_size=[100, 1000, 10000],
    num_workers=[0, 1],
    # Sweep over CUDA only when a GPU is present, so the notebook still runs
    # end-to-end on a CPU-only machine.
    device=['cuda', 'cpu'] if torch.cuda.is_available() else ['cpu'],
)

### Training

In [6]:
NUM_EPOCHS = 5  # passes over the training set per hyper-parameter combination

# Train a fresh network for every parameter combination and record the
# per-epoch metrics through the RunBuilder instance.
m = RunBuilder()
for run in RunBuilder.get_runs(params):
    device = torch.device(run.device)
    network = Network().to(device)
    loader = torch.utils.data.DataLoader(
        train_set, batch_size=run.batch_size, num_workers=run.num_workers
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    try:
        for epoch in range(NUM_EPOCHS):
            m.begin_epoch()
            for images, labels in loader:
                images = images.to(device)
                labels = labels.to(device)

                preds = network(images)
                loss = F.cross_entropy(preds, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                m.track_loss(loss)
                m.track_num_correct(preds, labels)
            m.end_epoch()
    finally:
        # Close the run's TensorBoard writer even if training raised.
        m.end_run()
m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device
0,1,1,0.656049,0.747567,9.414533,13.443259,0.01,100,0,cuda
1,1,2,0.436714,0.838367,8.62362,22.167106,0.01,100,0,cuda
2,1,3,0.385845,0.858483,8.692266,30.954079,0.01,100,0,cuda
3,1,4,0.364754,0.86575,8.583719,39.619519,0.01,100,0,cuda
4,1,5,0.350999,0.870467,8.500589,48.2008,0.01,100,0,cuda
5,2,1,0.572383,0.7846,12.478429,12.653,0.01,100,0,cpu
6,2,2,0.384201,0.856117,13.694035,26.448288,0.01,100,0,cpu
7,2,3,0.355377,0.8674,12.569609,39.114826,0.01,100,0,cpu
8,2,4,0.343128,0.8717,13.156806,52.385329,0.01,100,0,cpu
9,2,5,0.329644,0.877117,17.275258,69.775318,0.01,100,0,cpu


In [7]:
# All recorded epochs, fastest first.
pd.DataFrame(m.run_data).sort_values(by='epoch duration')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device
51,11,2,1.386024,0.509917,5.590647,17.034662,0.01,10000,1,cuda
31,7,2,0.51504,0.802667,5.601338,13.256019,0.01,1000,1,cuda
50,11,1,2.131919,0.221583,5.68262,11.354253,0.01,10000,1,cuda
33,7,4,0.382343,0.858617,5.712777,24.900415,0.01,1000,1,cuda
52,11,3,0.956195,0.642333,5.718515,22.841458,0.01,10000,1,cuda
32,7,3,0.42754,0.841667,5.743714,19.095881,0.01,1000,1,cuda
30,7,1,0.955606,0.636483,5.975696,7.557769,0.01,1000,1,cuda
53,11,4,0.795016,0.691667,6.016508,28.950819,0.01,10000,1,cuda
34,7,5,0.347851,0.8706,6.097705,31.089571,0.01,1000,1,cuda
54,11,5,0.692239,0.724683,6.259688,35.311236,0.01,10000,1,cuda
