# Exercise 2 - Custom training loop

## Objective

In this exercise, you have to implement your first training and validation loops from scratch to train
the logistic model you implemented. To do so, you will also have to create an optimizer.

## 1. Introduction

In [None]:
### Importing required modules

In [None]:
import logging
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory

In [None]:
### Setting environment variables

In [None]:
# Set to True when running in a Google Colab instance
ENV_COLAB = False

In [None]:
# Root directory: '/content/' when ENV_COLAB is set, empty (relative paths) otherwise
DIR_BASE = '/content/' if ENV_COLAB else ''

In [None]:
# Input data is read from <DIR_BASE>/data/
DIR_SRC = os.path.join(DIR_BASE, 'data/')
# Output files are written to <DIR_BASE>/out/
DIR_OUT = os.path.join(DIR_BASE, 'out/')

## 2. Programming Task

In [None]:
### From J. Moran's `2022-08-27-Logistic-Regression.ipynb`

In [None]:
def softmax(logits, stable=False):
    """Returns the softmax probability distribution.

    The exponentials are normalised along the last axis, so a 1xN input
    yields a single distribution and an NxC batch yields one distribution
    per row.

    :param logits: a tf.Tensor of logits, either 1xN for one sample
        or NxC for a batch [n_samples x n_classes].
    :param stable: optional, flag indicating whether or not to
        subtract the per-row maximum from the logits before taking
        the exponential. The shift cancels out in the normalisation
        but keeps `exp` from overflowing on large logits.
    :returns: soft_logits, a tf.Tensor with the same shape as `logits`
        holding real values in range (0,1) that sum up to 1.0 along
        the last axis.
    """

    assert isinstance(logits, tf.Tensor)
    if stable:
        # keepdims=True keeps the reduced axis so the subtraction broadcasts row-wise
        row_max = tf.reduce_max(logits, axis=-1, keepdims=True)
        logits = tf.subtract(logits, row_max)
    soft_logits = tf.math.exp(logits)
    # Normalise every row by its own sum rather than by the sum of the whole tensor
    soft_logits /= tf.math.reduce_sum(soft_logits, axis=-1, keepdims=True)
    return soft_logits

In [None]:
def cross_entropy(scaled_logits, one_hot, use_numpy=True):
    """Returns the cross-entropy loss.

    :param scaled_logits: an NxC tf.Tensor of scaled softmax
        distribution values, [n_samples x n_classes].
    :param one_hot: an NxC tf.Tensor of one-hot encoded
        ground truth labels, [n_samples x n_classes].
    :param use_numpy: optional, if True picks each sample's true-class
        probability by index, i.e. the equivalent of the NumPy expression
        `scaled_logits[range(n_samples), class_labels]`; uses boolean
        masking with the one-hot labels if False.
    :returns: loss, a scalar tf.Tensor with the cross-entropy loss
        averaged over the samples.
    """

    assert isinstance(scaled_logits, tf.Tensor)
    assert isinstance(one_hot, tf.Tensor)
    # For each sample, pick the probability value from the distribution
    # that corresponds to the true class label
    if use_numpy:
        class_labels = tf.math.argmax(one_hot, axis=1)
        # Row-wise lookup preds[i] = scaled_logits[i, class_labels[i]].
        # tf.gather with batch_dims=1 is used because this notebook never
        # binds the `tnp` alias that the NumPy-style indexing relied on.
        preds = tf.gather(scaled_logits, class_labels, axis=1, batch_dims=1)
    else:
        # tf.one_hot yields floats by default; the mask is cast to bool explicitly
        preds = tf.boolean_mask(scaled_logits, tf.cast(one_hot, tf.bool))
    # Taking the negative log-likelihood
    log_likelihood = -tf.math.log(preds)
    # Normalising by the sample size; tf.shape also works when the static
    # batch dimension is unknown
    n_samples = tf.cast(tf.shape(one_hot)[0], log_likelihood.dtype)
    loss = tf.math.reduce_sum(log_likelihood) / n_samples
    return loss

In [None]:
def accuracy(y_hat, y):
    """Returns the fraction of samples whose top-scoring class is correct.

    :param y_hat: tf.Tensor, NxC tensor of model
        predictions [n_samples x n_classes].
    :param y: tf.Tensor, N-dimensional tensor of
        ground truth class labels (not one-hot encoded).
    :returns: a scalar tf.Tensor holding the number of correct
        predictions divided by the number of rows in `y_hat`.
    """

    assert isinstance(y, tf.Tensor) and isinstance(y_hat, tf.Tensor)
    # The predicted label is the column with the highest score,
    # cast so that it can be compared against `y`
    predicted = tf.math.argmax(y_hat, axis=1)
    predicted = tf.cast(predicted, dtype=y.dtype)
    # 1 where the prediction matches the ground truth, 0 elsewhere
    hits = tf.cast(tf.math.equal(predicted, y), dtype=tf.int32)
    n_hits = tf.math.count_nonzero(hits)
    # Divide the hit count by the batch size
    return n_hits / y_hat.shape[0]

In [None]:
### From Udacity's `training.py`

In [None]:
def get_module_logger(mod_name):
    """Returns a console logger for the given module name.

    :param mod_name: name handed to `logging.getLogger`.
    :returns: logger, the `logging.Logger` set to DEBUG level with a
        single timestamped stream handler attached.
    """

    logger = logging.getLogger(mod_name)
    # `logging.getLogger` returns the same object for the same name, so the
    # handler is attached only once; otherwise calling this again (e.g. when
    # re-executing a notebook cell) would print every message repeatedly.
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger

In [None]:
def sgd(params, grad, lr, bs):
    """
    stochastic gradient descent implementation

    Updates every parameter in place with `param -= lr * grad / bs`.
    args:
    - params [list[tensor]]: model params, each exposing `assign_sub`
      (e.g. tf.Variable)
    - grad [list[tensor]]: param gradient such that params[0].shape == grad[0].shape
    - lr [float]: learning rate
    - bs [int]: batch_size
    returns:
    - None, the params are modified in place
    """
    for param, param_grad in zip(params, grad):
        # assign_sub mutates the variable itself, so the update is visible
        # to every holder of the same variable object
        param.assign_sub(lr * param_grad / bs)

In [None]:
def training_loop(train_dataset, model, loss, optimizer):
    """
    training loop (exercise template: the per-batch step is still a placeholder)
    args:
    - train_dataset: iterable that yields (X, Y) pairs, presumably
      (inputs, labels) batches -- confirm against the dataset used
    - model [func]: model function
    - loss [func]: loss function
    - optimizer [func]: optimizer func
    returns:
    - mean_loss [tensor]: mean training loss
    - mean_acc [tensor]: mean training accuracy
    """
    
    # Per-batch metrics, meant to be filled in by the step inside the loop
    accuracies = []
    losses = []
    for X, Y in train_dataset:
        # The tape is opened so that the forward pass can be recorded for
        # differentiation once the step below is written
        with tf.GradientTape() as tape:
            # IMPLEMENT THIS FUNCTION
            pass
    # NOTE(review): nothing is appended to `accuracies` / `losses` yet, so both
    # lists are still empty here. Also, tf.concat presumably requires entries of
    # rank >= 1 -- confirm what the accuracy function returns per batch.
    mean_acc = tf.math.reduce_mean(tf.concat(accuracies, axis=0))
    mean_loss = tf.math.reduce_mean(losses)
    return mean_loss, mean_acc

In [None]:
def validation_loop(val_dataset, model):
    """
    validation loop (exercise template: the body is still a placeholder)
    args:
    - val_dataset: validation data to evaluate the model on
    - model [func]: model function
    returns:
    - mean_acc [tensor]: mean validation accuracy
    """
    # IMPLEMENT THIS FUNCTION
    # NOTE(review): `mean_acc` is not assigned anywhere in this function yet,
    # so the return below only works once the loop has been implemented.
    return mean_acc

In [None]:
### From Udacity's `dataset.py`

In [None]:
def get_datasets(imdir, image_size=(32, 32), batch_size=256,
                 validation_split=0.1, seed=123):
    """Builds the training and validation datasets from an image directory.

    :param imdir: path of the directory passed to
        `image_dataset_from_directory`.
    :param image_size: optional, (height, width) the images are resized to.
    :param batch_size: optional, number of images per batch.
    :param validation_split: optional, fraction of the data reserved
        for the validation subset.
    :param seed: optional, random seed shared by both subsets.
    :returns: (train_dataset, val_dataset), the 'training' and
        'validation' subsets of the same split.
    """

    # Both calls must use identical split settings (fraction and seed),
    # otherwise the two subsets would not come from the same partition
    shared_kwargs = dict(image_size=image_size,
                         batch_size=batch_size,
                         validation_split=validation_split,
                         seed=seed)
    train_dataset = image_dataset_from_directory(imdir,
                                                 subset='training',
                                                 **shared_kwargs)
    val_dataset = image_dataset_from_directory(imdir,
                                               subset='validation',
                                               **shared_kwargs)

    return train_dataset, val_dataset

### Evaluating stochastic gradient descent model

#### Considerations for our input data

*Notes here on German Traffic Sign Recognition Benchmark dataset*

In [None]:
### Defining the model parameters

In [None]:
imdir = os.path.join(DIR_SRC, 'GTSRB')
# TODO: replace the `None` placeholders with the chosen hyperparameters
# before running the training cells
epochs = None
batch_size = None
lr = None
# Every key is bound to the variable of the same name defined above
args = {'imdir': imdir, 'epochs': epochs, 'batch_size': batch_size, 'lr': lr}

In [None]:
### Getting the console logger
logger = get_module_logger(__name__)
# Double quotes outside so the single-quoted dict keys are valid inside the
# f-string on every Python 3 version; the epoch count comes from args['epochs']
logger.info(f"Training for {args['epochs']} epochs using {args['imdir']} data")

In [None]:
### Getting the training and validation data

In [None]:
# Load the 'training' and 'validation' subsets of the images under args['imdir']
train_dataset, val_dataset = get_datasets(args['imdir'])

In [None]:
# Flattened input size: 32 x 32 pixels (the image_size used by get_datasets)
# times 3 -- presumably the colour channels; confirm against the dataset
num_inputs = 32 * 32 * 3
# Number of output classes -- presumably the GTSRB sign classes; confirm
num_outputs = 43

In [None]:
### Initialising the model variables (weights and bias vectors)

In [None]:
# Weights start as small Gaussian noise (stddev 0.01), biases start at zero
W = tf.Variable(tf.random.normal((num_inputs, num_outputs), mean=0, stddev=0.01))
b = tf.Variable(tf.zeros(shape=num_outputs))

In [None]:
### Run training and validation loop

In [None]:
# One epoch = one pass of the training loop followed by one pass of the
# validation loop
for epoch in range(epochs):
    logger.info(f'Epoch {epoch}')
    ### Perform stochastic gradient descent over training data
    # NOTE(review): `model` is not defined in the cells shown here; it must be
    # bound to the logistic model function before this cell runs.
    # `cross_entropy` is the loss function defined in this notebook.
    loss, acc = training_loop(train_dataset, model, cross_entropy, sgd)
    logger.info(f'Mean training loss: {loss}, mean training accuracy: {acc}')
    ### Compute validation set accuracy
    val_acc = validation_loop(val_dataset, model)
    logger.info(f'Mean validation accuracy {val_acc}')

## Details

A training loop goes through each element of the training dataset and uses it to update the model's weights.
A validation loop goes through each element of the validation dataset and uses it to calculate
the metrics (eg, accuracy). We call **epoch** an iteration of one training loop and one validation loop.

The input to your model should be normalized. You can do this by dividing them by 255: `X /= 255`.

You can run `python training.py` to train your first machine learning model!

You will need to specify the `--imdir`, e.g. `--imdir GTSRB/Final_Training/Images/`, using the provided GTSRB dataset.

## Tips

You don't need `tf.GradientTape` for the validation loop as you will not be computing gradients or updating the weights.

The `assign_sub` Variable method will be useful to perform the weights update in the sgd optimizer.

Use the `tf.one_hot` function to get the one-hot vector from the ground truth label.