<a href="https://colab.research.google.com/github/jsherrah-aiml/aiml-tests/blob/master/AIML_machine_learning_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIML machine learning test 

In this exercise you will improve the test set accuracy of a deep neural network classifier on the [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) data set.  You may use any means at your disposal, but:
* you may NOT train on the test data
* the model must be trained in your code; you cannot simply download a model and use it for inference

The desired outputs of your answer are:
* Runnable code in a colab notebook
* Test set accuracy before your changes (as per the code below), in %
* Test set accuracy after your changes, in %
* A brief description of your approach

Please use the code below as a starting point.  It loads the CIFAR-100 data and trains a ResNet-18 CNN on it.

Email results to Jamie Sherrah at jamie.sherrah@adelaide.edu.au.  Also email me for questions.

# How good is good enough?

You should aim for an accuracy of at least 75%, but the higher, the better.  [State-of-the-art accuracy is 91%](https://paperswithcode.com/sota/image-classification-on-cifar-100).  The exercise should take 1-3 hours.


# Enable GPU

You can enable GPU for this notebook via:

*Edit -> Notebook Settings -> Hardware Accelerator -> GPU*


# Contact

For questions contact Jamie Sherrah, jamie.sherrah@adelaide.edu.au

June 2019

In [0]:
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets, transforms

In [0]:
class BaseDataLoader(DataLoader):
    """
    Base class for all data loaders. Provides functionality to split into
    train/validation data.

    When ``validation_split`` is non-zero the dataset indices are permuted
    with a fixed seed and partitioned: this loader serves the training part
    and :meth:`split_validation` returns a loader for the held-out part.

    :param dataset: Dataset to load from; must support ``len()``.
    :param batch_size: Number of samples per batch.
    :param shuffle: Shuffle flag forwarded to ``DataLoader``. It is forced to
        ``False`` when a validation split is requested, because ``shuffle``
        is mutually exclusive with an explicit sampler.
    :param validation_split: Fraction of the samples held out for validation;
        ``0.0`` disables the split.
    :param num_workers: Number of worker processes used for data loading.
    :param collate_fn: Function that merges a list of samples into a batch.
    """
    def __init__(self, dataset, batch_size, shuffle, validation_split, num_workers,
                 collate_fn=default_collate):
        self.validation_split = validation_split
        self.shuffle = shuffle

        self.batch_idx = 0
        self.n_samples = len(dataset)

        # Must run before init_kwargs is built: it may reset self.shuffle and
        # shrink self.n_samples to the size of the training subset.
        self.sampler, self.valid_sampler = self._split_sampler(self.validation_split)

        # Kept so split_validation() can build a loader with identical settings.
        self.init_kwargs = {
            'dataset': dataset,
            'batch_size': batch_size,
            'shuffle': self.shuffle,
            'collate_fn': collate_fn,
            'num_workers': num_workers
        }
        super().__init__(sampler=self.sampler, **self.init_kwargs)

    def _split_sampler(self, split):
        """
        Build the samplers for the training and validation subsets.

        :param split: Fraction of samples to reserve for validation.
        :return: ``(train_sampler, valid_sampler)``, or ``(None, None)`` when
            ``split`` is ``0.0``.
        """
        if split == 0.0:
            return None, None

        # Use a private generator with a fixed seed so the partition is
        # reproducible. Seeding the global NumPy RNG here instead would
        # silently discard whatever seed the caller configured beforehand.
        rng = np.random.RandomState(0)
        idx_full = np.arange(self.n_samples)
        rng.shuffle(idx_full)

        len_valid = int(self.n_samples * split)

        valid_idx = idx_full[:len_valid]
        train_idx = idx_full[len_valid:]

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)

        # turn off shuffle option which is mutually exclusive with sampler
        self.shuffle = False
        self.n_samples = len(train_idx)

        return train_sampler, valid_sampler

    def split_validation(self):
        """
        Return a ``DataLoader`` over the validation subset.

        :return: The validation loader, or ``None`` when no split was requested.
        """
        if self.valid_sampler is None:
            return None
        return DataLoader(sampler=self.valid_sampler, **self.init_kwargs)
          

class Cifar10DataLoader(BaseDataLoader):
    """
    Data loader for the CIFAR-10 data set, built on BaseDataLoader.

    The torchvision ``CIFAR10`` data set is created with ``download=True``
    and a ``ToTensor`` transform; no other transform is applied.

    :param data_dir: Directory handed to torchvision as the data set root.
    :param batch_size: Number of samples per batch.
    :param shuffle: Shuffle flag forwarded to BaseDataLoader.
    :param validation_split: Fraction of samples held out for validation.
    :param num_workers: Number of worker processes used for data loading.
    :param training: Forwarded as the data set's ``train`` flag.
    """
    def __init__(self, data_dir, batch_size, shuffle, validation_split, num_workers, training=True):
        self.data_dir = data_dir
        to_tensor = transforms.Compose([transforms.ToTensor()])
        self.dataset = datasets.CIFAR10(
            root=self.data_dir,
            train=training,
            transform=to_tensor,
            download=True,
        )
        super().__init__(
            dataset=self.dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            validation_split=validation_split,
            num_workers=num_workers,
        )


class Cifar100DataLoader(BaseDataLoader):
    """
    Data loader for the CIFAR-100 data set, built on BaseDataLoader.

    The torchvision ``CIFAR100`` data set is created with ``download=True``
    and a ``ToTensor`` transform; no other transform is applied.

    :param data_dir: Directory handed to torchvision as the data set root.
    :param batch_size: Number of samples per batch.
    :param shuffle: Shuffle flag forwarded to BaseDataLoader.
    :param validation_split: Fraction of samples held out for validation.
    :param num_workers: Number of worker processes used for data loading.
    :param training: Forwarded as the data set's ``train`` flag.
    """
    def __init__(self, data_dir, batch_size, shuffle, validation_split, num_workers, training=True):
        self.data_dir = data_dir
        to_tensor = transforms.Compose([transforms.ToTensor()])
        self.dataset = datasets.CIFAR100(
            root=self.data_dir,
            train=training,
            transform=to_tensor,
            download=True,
        )
        super().__init__(
            dataset=self.dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            validation_split=validation_split,
            num_workers=num_workers,
        )


In [0]:
class ResNet18(nn.Module):
    """
    ResNet-18 encoder followed by a linear classification head.

    The torchvision ResNet-18 is built with ``pretrained=False``, so no
    pretrained weights are loaded. The encoder keeps its own final ``fc``
    layer; the classifier head takes that layer's outputs as its input
    features and maps them to ``num_classes`` logits.

    :param num_classes: Number of output classes of the classifier head.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

        self.encoder = models.resnet18(pretrained=False)
        # The head is sized from the output width of the encoder's fc layer.
        head_in_features = self.encoder.fc.out_features
        self.classifier = nn.Sequential(nn.Linear(head_in_features, num_classes))

        # Kaiming-normal initialisation for the weight of each head layer.
        for layer in self.classifier:
            nn.init.kaiming_normal_(layer.weight)

    def forward(self, x):
        """
        Run the encoder, flatten per sample, and apply the classifier head.

        :param x: Input batch passed to the encoder.
        :return: Output of the classifier head.
        """
        encoded = self.encoder(x)
        flat = encoded.view(encoded.size(0), -1)
        return self.classifier(flat)

In [0]:
class Trainer:
    """
    This class is responsible for performing a full training session.

    :param model: Module to train; called as ``model(data)``.
    :param loss: Callable ``loss(output, target)`` returning a tensor that
        supports ``backward()`` and ``item()``.
    :param metrics: List of callables ``metric(output, target)``. Each
        callable's ``__name__`` is used to label its validation value.
    :param optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
    :param epochs: Number of the last epoch to run (epochs start at 1).
    :param device: Device every batch is moved to before the forward pass.
    :param data_loader: Training loader; must expose ``batch_size`` and
        ``n_samples`` and support ``len()`` and iteration over
        ``(data, target)`` pairs.
    :param valid_data_loader: Optional validation loader; validation runs
        after every epoch when it is not ``None``.
    :param lr_scheduler: Optional scheduler, stepped once per epoch.
    """

    def __init__(self, model, loss, metrics, optimizer, epochs, device,
                 data_loader, valid_data_loader=None, lr_scheduler=None):

        self.model = model
        self.device = device
        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.epochs = epochs
        self.start_epoch = 1

        self.data_loader = data_loader
        self.valid_data_loader = valid_data_loader
        self.do_validation = self.valid_data_loader is not None
        self.lr_scheduler = lr_scheduler
        # Progress is printed every int(sqrt(batch_size)) batches.
        self.log_step = int(np.sqrt(data_loader.batch_size))

    def train(self):
        """
        Full training logic
        """
        print('Starting training...')
        for epoch in range(self.start_epoch, self.epochs + 1):
            result = self._train_epoch(epoch)

            # save logged informations into log dict
            log = {'epoch': epoch}
            for key, value in result.items():
                if key == 'val_metrics':
                    # One named entry per validation metric.
                    log.update({
                        'val_' + mtr.__name__: value[i] for i, mtr in enumerate(self.metrics)})
                else:
                    log[key] = value

            # print logged informations to the screen
            for key, value in log.items():
                print(f'{str(key):15s}: {value}')

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current training epoch.
        :return: A log that contains all information you want to save.
        """
        self.model.train()

        total_loss = 0
        total_metrics = np.zeros(len(self.metrics))
        for batch_idx, (data, target) in enumerate(self.data_loader):
            data, target = data.to(self.device), target.to(self.device)

            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss(output, target)
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()
            # Metrics are only reported, never back-propagated, so there is
            # no need to track gradients while computing them.
            with torch.no_grad():
                total_metrics += self._eval_metrics(output, target)

            if batch_idx % self.log_step == 0:
                self._log_batch(epoch, batch_idx, self.data_loader.batch_size,
                                self.data_loader.n_samples, len(self.data_loader), loss.item())

        n_batches = len(self.data_loader)
        log = {
            'loss': self._mean(total_loss, n_batches),
            'metrics': self._mean(total_metrics, n_batches).tolist()
        }

        if self.do_validation:
            val_log = self._valid_epoch(epoch)
            log = {**log, **val_log}

        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return log

    @staticmethod
    def _mean(total, count):
        """
        Average an accumulated total over ``count`` batches.

        An empty loader (for example a validation split that rounds down to
        zero samples) yields NaN instead of raising ZeroDivisionError, so a
        finished training epoch is not lost to a reporting error.

        :param total: Accumulated scalar or NumPy array.
        :param count: Number of batches that contributed to ``total``.
        :return: ``total / count``, or NaN of the same shape when ``count`` is 0.
        """
        if count == 0:
            return total * float('nan')
        return total / count

    def _log_batch(self, epoch, batch_idx, batch_size, n_samples, len_data, loss):
        """
        Print a one-line progress message for the current batch.
        """
        n_complete = batch_idx * batch_size
        percent = 100.0 * batch_idx / len_data
        msg = f'Train Epoch: {epoch} [{n_complete}/{n_samples} ({percent:.0f}%)] Loss: {loss:.6f}'
        print(msg)

    def _eval_metrics(self, output, target):
        """
        Evaluate every configured metric on one batch.

        :return: NumPy array with one value per metric, in ``self.metrics`` order.
        """
        acc_metrics = np.zeros(len(self.metrics))
        for i, metric in enumerate(self.metrics):
            acc_metrics[i] += metric(output, target)
        return acc_metrics

    def _valid_epoch(self, epoch):
        """
        Validate after training an epoch

        :return: A log that contains information about validation
        """
        self.model.eval()
        total_val_loss = 0
        total_val_metrics = np.zeros(len(self.metrics))
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(self.valid_data_loader):
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                loss = self.loss(output, target)
                total_val_loss += loss.item()
                total_val_metrics += self._eval_metrics(output, target)

        n_batches = len(self.valid_data_loader)
        return {
            'val_loss': self._mean(total_val_loss, n_batches),
            'val_metrics': self._mean(total_val_metrics, n_batches).tolist()
        }

In [0]:
class Runner:
    """
    Top level class to initialise everything and begin training.
    """

    def train(self, config):
        """
        Build the data loaders, model, optimizer and trainer, then train.

        :param config: Dict providing ``'seed'``, ``'n_gpu'``,
            ``'data_loader'`` (keyword arguments for Cifar100DataLoader) and
            ``'training'`` with an ``'epochs'`` entry.
        """
        self._seed_everything(config['seed'])

        print('Getting data_loader instance')
        data_loader = Cifar100DataLoader(**config['data_loader'])
        valid_data_loader = data_loader.split_validation()

        print('Building model architecture')
        model = ResNet18(num_classes=100)
        model, device = self._prepare_device(model, config['n_gpu'])

        print('Getting loss and metric function handles')
        loss = nn.CrossEntropyLoss()
        metrics = [
          # funcs go here
        ]

        print('Building optimizer and lr scheduler')
        trainable_params = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = torch.optim.SGD(trainable_params, lr=0.001)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=2)

        print('Initialising trainer')
        trainer = Trainer(model, loss, metrics, optimizer,
                        epochs=config['training']['epochs'],
                        device=device,
                        data_loader=data_loader,
                        valid_data_loader=valid_data_loader,
                        lr_scheduler=lr_scheduler)

        trainer.train()
        print('Finished!')

    def _prepare_device(self, model, n_gpu_use):
        """
        Move the model to the selected device, wrapping it in DataParallel
        when more than one GPU id is selected.

        :param model: Module to place on the device.
        :param n_gpu_use: Number of GPUs requested.
        :return: ``(model, device)``.
        """
        device, device_ids = self._get_device(n_gpu_use)
        model = model.to(device)
        if len(device_ids) > 1:
            model = torch.nn.DataParallel(model, device_ids=device_ids)
        return model, device

    def _get_device(self, n_gpu_use):
        """
        Select the device to train on, clamping the request to what exists.

        :param n_gpu_use: Number of GPUs requested.
        :return: ``(device, list_ids)`` where ``device`` is ``cuda:0`` or
            ``cpu`` and ``list_ids`` are the GPU indices to use.
        """
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            # The trailing space matters: the two literals are concatenated.
            print("Warning: There's no GPU available on this machine, "
                  "training will be performed on CPU.")
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            print(f"Warning: The number of GPU's configured to use is {n_gpu_use}, "
                  f"but only {n_gpu} are available on this machine.")
            n_gpu_use = n_gpu
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        list_ids = list(range(n_gpu_use))
        print(f'Using device: {device}, {list_ids}')
        return device, list_ids

    def _seed_everything(self, seed):
        """
        Seed the Python, NumPy and PyTorch random number generators.

        :param seed: Integer seed applied to every generator.
        """
        print(f'Using random seed: {seed}')
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)


In [0]:
# Run configuration consumed by Runner.train().
config = {
    'name': 'CIFAR_Demo',
    'n_gpu': 1,                 # use GPU if available
    'seed': 1234,               # random seed to use for everything
    
    'data_loader': {
      'batch_size': 32,        # training batch size
      'data_dir': 'data',       # directory to download dataset to
      'num_workers': 2,         # data loading parallelisation
      # Must be a real boolean: the string 'False' is non-empty and therefore
      # truthy, which would switch shuffling ON when validation_split is 0.
      'shuffle': False,         # shuffle training samples
      'validation_split': 0.1   # split of data to use for validation
    },
    
    'training': {
      'epochs': 1              # N training epochs
    }
}

In [0]:
# Start a training run using the module-level configuration dict.
Runner().train(config=config)