# Deep neural network model training

Adapted from the MNIST example in Lasagne [http://github.com/Lasagne/Lasagne].

You will need to download the facial expressions dataset from the facial expressions Kaggle competition [https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge]. Extract the tarball in the same directory as this notebook; ensure that the CSV file can be found on the path ```fer2013/fer2013.csv```, where the ```fer2013``` directory is in the same directory as this notebook.

In [None]:
%matplotlib inline

In [None]:
#!/usr/bin/env python

from __future__ import print_function

import os


# Index of the GPU to train on. The device selection is written into the
# THEANO_FLAGS environment variable ahead of the `import theano` further down.
GPU = 0
gpu_flag = 'device=gpu{0}'.format(GPU)
theano_flags = os.environ.get('THEANO_FLAGS')
# Keep any flags that are already exported and append the device flag to them
os.environ['THEANO_FLAGS'] = (
    gpu_flag if theano_flags is None
    else ','.join([theano_flags, gpu_flag])
)

import gzip
import itertools
import pickle
import os
import sys
import pandas
import numpy as np
import lasagne
import theano
import theano.tensor as T
from matplotlib import pyplot as plt
import time
import joblib
import tempfile
import shutil

from deep_neural_network import dnn_architecture

from urllib import urlretrieve

In [None]:
# CSV file holding the dataset that we train on
DATASET_PATH = 'fer2013/fer2013.csv'

# Directory and file that the trained model parameters are written to;
# the directory is created if it does not exist yet
MODEL_DIR = 'dnn_model'
MODEL_PATH = os.path.join(MODEL_DIR, 'dnn')
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)



In [None]:
# Upper bound on training epochs; going beyond 250 doesn't really improve things
NUM_EPOCHS = 250
# Number of examples trained in parallel; reducing this may alleviate memory problems
BATCH_SIZE = 256
# The validation and test sets each hold 6 mini-batches' worth of examples
N_VALID = N_TEST = 6 * BATCH_SIZE



In [None]:
def _standardise_image(pixel_string):
    """Parse one space-separated pixel string into a standardised 48x48 array."""
    # The pixels come as uint8 (0-255) values separated by spaces; convert
    # them to floats in the range 0-1
    px = np.array(pixel_string.split(), dtype=np.float64) / 255.0
    # Standardising (zero mean, unit variance) each image seems to work well
    scale = np.std(px)
    if scale < 1.0e-3:
        # (Almost) constant image: only centre it rather than divide by ~0
        scale = 1.0
    return ((px - np.mean(px)) / scale).reshape((48, 48))


def load_data(seed=12345):
    """Get data with labels, split into training, validation and test set.

    Reads the CSV file at ``DATASET_PATH``. The ``pixels`` column holds each
    48x48 greyscale image as a string of space-separated values and the
    ``emotion`` column holds the class label. Every image is standardised
    individually. The rows are shuffled with a ``numpy.random.RandomState``
    seeded with ``seed``; the first ``N_VALID`` shuffled rows become the
    validation set, the next ``N_TEST`` the test set and the rest the
    training set.

    Parameters
    ----------
    seed : int
        Seed for the random number generator that shuffles the rows.

    Returns
    -------
    dict
        ``X_train``/``X_valid``/``X_test``: image arrays of shape
        ``(n, 1, 48, 48)`` in theano's ``floatX`` dtype;
        ``y_train``/``y_valid``/``y_test``: ``int32`` label vectors;
        ``num_examples_train``/``num_examples_valid``/``num_examples_test``:
        number of examples in each split;
        ``input_dim``: shape of a single example, ``(1, 48, 48)``;
        ``output_dim``: number of classes, fixed at 7.
    """
    rng = np.random.RandomState(seed)

    df = pandas.read_csv(DATASET_PATH)
    # Work on the raw column values so that rows are addressed by position
    # instead of doing an index-label lookup for every row
    pixel_strings = df['pixels'].values
    N = len(pixel_strings)

    # Each input image is 48x48 greyscale. The array is allocated in the
    # dtype the network consumes, which avoids staging the whole dataset in
    # float64 and casting it afterwards; each image is still standardised in
    # float64 before it is stored.
    X = np.zeros((N, 1, 48, 48), dtype=theano.config.floatX)
    for i, pixel_string in enumerate(pixel_strings):
        X[i, 0, :, :] = _standardise_image(pixel_string)
    Y = df['emotion'].values.astype(np.int32)

    indices = np.arange(N)
    rng.shuffle(indices)

    valid_indices = indices[:N_VALID]
    test_indices = indices[N_VALID:N_VALID + N_TEST]
    train_indices = indices[N_VALID + N_TEST:]

    # Fancy indexing copies, so each split owns its data
    X_train, y_train = X[train_indices], Y[train_indices]
    X_valid, y_valid = X[valid_indices], Y[valid_indices]
    X_test, y_test = X[test_indices], Y[test_indices]

    return dict(
        # floatX() is kept as a guard on the returned dtype
        X_train=lasagne.utils.floatX(X_train),
        y_train=y_train,
        X_valid=lasagne.utils.floatX(X_valid),
        y_valid=y_valid,
        X_test=lasagne.utils.floatX(X_test),
        y_test=y_test,
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        input_dim=tuple(int(d) for d in X_train.shape[1:]),
        output_dim=7,
    )

In [None]:
def create_iter_functions(dataset, output_layer,
                          X_tensor_type=T.tensor4,
                          batch_size=BATCH_SIZE):
    """Compile the theano functions used for training and for evaluation.

    Parameters
    ----------
    dataset : dict
        Not used by this function; kept so existing callers keep working.
    output_layer : lasagne layer
        Final layer of the network; the loss is the mean categorical
        cross-entropy of its output against the target labels.
    X_tensor_type : callable
        Theano tensor constructor for the input batch (4D by default).
    batch_size : int
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        ``train``: function ``(X, y) -> loss`` that also applies one
        Adadelta update step;
        ``eval``: function ``(X, y) -> [loss, accuracy]`` that leaves the
        parameters untouched.
    """
    X_batch = X_tensor_type('x')
    y_batch = T.ivector('y')

    # Training uses the default forward pass; evaluation requests the
    # deterministic one (in Lasagne this disables e.g. dropout)
    out_train = lasagne.layers.get_output(output_layer, X_batch)
    out_eval = lasagne.layers.get_output(output_layer, X_batch, deterministic=True)

    loss_train = lasagne.objectives.categorical_crossentropy(out_train, y_batch).mean()
    loss_eval = lasagne.objectives.categorical_crossentropy(out_eval, y_batch).mean()

    # Accuracy is computed from the same deterministic output as the loss
    pred = T.argmax(out_eval, axis=1)
    accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

    # Only parameters tagged as trainable are handed to the optimiser, so
    # that non-trainable state (e.g. running statistics of normalisation
    # layers) does not receive gradient updates
    trainable_params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = lasagne.updates.adadelta(loss_train, trainable_params)

    iter_train = theano.function(
        [X_batch, y_batch], loss_train,
        updates=updates,
    )

    iter_eval = theano.function(
        [X_batch, y_batch], [loss_eval, accuracy],
    )

    return dict(
        train=iter_train,
        eval=iter_eval,
    )

In [None]:
def _mean_eval_metrics(theano_eval, X, y, num_examples, batch_size):
    """Evaluate `X`/`y` in shuffled mini-batches; return (mean loss, mean accuracy)."""
    order = np.arange(num_examples).astype(np.int32)
    np.random.shuffle(order)
    losses = []
    accuracies = []
    # Only full mini-batches are evaluated; a trailing partial batch is skipped
    for b in range(num_examples // batch_size):
        batch_indices = order[b * batch_size:(b + 1) * batch_size]
        loss, accuracy = theano_eval(X[batch_indices], y[batch_indices])
        losses.append(loss)
        accuracies.append(accuracy)
    return np.mean(losses), np.mean(accuracies)


def train(iter_funcs, dataset, batch_size=BATCH_SIZE):
    """Train the model with `dataset` with mini-batch training. Each
       mini-batch has `batch_size` examples.

    This is a generator that never finishes on its own: it yields one
    summary dict per epoch and the caller decides when to stop iterating.

    Every epoch the training examples are reshuffled and trained on in full
    mini-batches (a trailing partial batch is skipped), after which the
    validation set is evaluated. The test set is evaluated only in epochs
    where the validation accuracy exceeds the best seen so far; in other
    epochs the previous test results are reported again. If a split holds
    fewer than `batch_size` examples, no batch is run for it and its mean
    is NaN.

    Parameters
    ----------
    iter_funcs : dict
        ``train``: callable ``(X, y) -> loss``;
        ``eval``: callable ``(X, y) -> (loss, accuracy)``.
    dataset : dict
        Must provide ``X_*``, ``y_*`` and ``num_examples_*`` for the
        ``train``, ``valid`` and ``test`` splits.
    batch_size : int
        Number of examples per mini-batch.

    Yields
    ------
    dict
        ``number``, ``train_loss``, ``valid_loss``, ``valid_accuracy``,
        ``test_loss``, ``test_accuracy``, ``best_epoch``,
        ``best_valid_loss``, ``best_valid_accuracy`` and ``improved``
        (True when this epoch set a new best validation accuracy).
    """
    theano_train = iter_funcs['train']
    theano_eval = iter_funcs['eval']

    X_train = dataset['X_train']
    y_train = dataset['y_train']
    X_valid = dataset['X_valid']
    y_valid = dataset['y_valid']
    X_test = dataset['X_test']
    y_test = dataset['y_test']

    num_train = dataset['num_examples_train']
    num_valid = dataset['num_examples_valid']
    num_test = dataset['num_examples_test']
    num_batches_train = num_train // batch_size

    best_epoch = 0
    best_valid_loss = np.inf
    best_valid_acc = 0.0
    test_loss = np.inf
    test_acc = 0.0

    for epoch in itertools.count(1):
        # One pass over the training set in a fresh random order
        train_indices = np.arange(num_train).astype(np.int32)
        np.random.shuffle(train_indices)
        batch_train_losses = []
        for b in range(num_batches_train):
            batch_indices = train_indices[b * batch_size:(b + 1) * batch_size]
            batch_train_losses.append(
                theano_train(X_train[batch_indices], y_train[batch_indices]))
        avg_train_loss = np.mean(batch_train_losses)

        avg_valid_loss, avg_valid_accuracy = _mean_eval_metrics(
            theano_eval, X_valid, y_valid, num_valid, batch_size)

        # Model selection is driven by validation accuracy; the validation
        # loss recorded as "best" is the loss of that same epoch
        improved = bool(avg_valid_accuracy > best_valid_acc)
        if improved:
            best_epoch = epoch
            best_valid_loss = avg_valid_loss
            best_valid_acc = avg_valid_accuracy
            test_loss, test_acc = _mean_eval_metrics(
                theano_eval, X_test, y_test, num_test, batch_size)

        yield {
            'number': epoch,
            'train_loss': avg_train_loss,
            'valid_loss': avg_valid_loss,
            'valid_accuracy': avg_valid_accuracy,
            'test_loss': test_loss,
            'test_accuracy': test_acc,
            'best_epoch': best_epoch,
            'best_valid_loss': best_valid_loss,
            'best_valid_accuracy': best_valid_acc,
            'improved': improved,
        }

In [None]:
# Driver: load the data, build and compile the network, then train for up to
# NUM_EPOCHS epochs, saving the parameters whenever validation accuracy improves.
print("Loading data...")
dataset = load_data(12345)

print("Building model and compiling functions...")
output_layer, input_layer = dnn_architecture.build_model(
    input_dim=dataset['input_dim'],
    output_dim=dataset['output_dim'],
    batch_size=BATCH_SIZE,
)
iter_funcs = create_iter_functions(dataset, output_layer)

print("Starting training...")
start_time = now = time.time()
# train() yields one summary per epoch and never stops by itself; the loop is
# left explicitly once NUM_EPOCHS epochs have been run
for epoch in train(iter_funcs, dataset):
    # Values in parentheses are the best epoch / best validation figures so far
    print("Epoch {0}/{1} ({2}) in {3:.3f}s: TRAIN: loss={4:.3f}, VAL: loss={5:.3f} ({6:.3f}), acc={7:.2f}% ({8:.2f}%), TEST: loss={9:.3f}, acc={10:.2f}%".format(
          epoch['number'], NUM_EPOCHS, epoch['best_epoch'], time.time() - now,
          epoch['train_loss'], epoch['valid_loss'], epoch['best_valid_loss'],
          epoch['valid_accuracy'] * 100, epoch['best_valid_accuracy'] * 100,
          epoch['test_loss'], epoch['test_accuracy'] * 100))
    if epoch['improved']:
        # Accuracy improved this epoch; save model to disk
        all_param_values = lasagne.layers.get_all_param_values(output_layer)
        model = {'param_values': all_param_values}
        joblib.dump(model, MODEL_PATH)
    # Restart the per-epoch timer after saving, so that the time spent
    # writing the model is not counted towards the next epoch
    now = time.time()

    if epoch['number'] >= NUM_EPOCHS:
        break
end_time = time.time()

# The last summary carries the figures of the best epoch seen during training
best_valid_loss = epoch['best_valid_loss']
best_valid_accuracy = epoch['best_valid_accuracy']
test_loss = epoch['test_loss']
test_accuracy = epoch['test_accuracy']

# The elapsed time covers the whole training run, not just the best epoch
print('Best epoch {0}, training took {1:.1f}s in total, valid loss={2:.3f}, valid acc={3:.2f}%, test loss={4:.3f}, test acc={5:.2f}%'.format(
        epoch['best_epoch'], end_time - start_time,
        best_valid_loss, best_valid_accuracy * 100,
        test_loss, test_accuracy * 100
    ))

        