# Analysis of Bayesian optimization process
*Note*: In the following, I made some changes to the code that I used for training the main model. For example, I changed the default parameters and reduced the total number of evaluations in the Bayesian optimization to save time when searching for optimal hyperparameters.

## load data

In [ ]:
import os
import cv2
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

In [ ]:
def dir_id(directory: str) -> int:
    """
    Translate the name of an expression directory into its integer class id
    :param directory: name of the subdirectory, i.e. the expression shown in its pictures
    :return: class id, i.e. the position of the expression in the ordered list below
    :raises KeyError: if the directory name is not one of the known expressions
    """
    # The position in this tuple is the class id
    expressions = ('anger', 'contempt', 'disgust', 'fear', 'happy', 'sadness', 'surprise')
    class_ids = {expression: class_id for class_id, expression in enumerate(expressions)}

    return class_ids[directory]


def preprocess_image(img_path: str, width: int, height: int) -> np.ndarray:
    """
    Read a picture as grayscale, resize it and prepend an axis of length 1
    :param img_path: path of the image file
    :param width: goal width of image
    :param height: goal height of image
    :return: the resized grayscale image with a leading axis of length 1
    :raises ValueError: if the file cannot be read or decoded as an image
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising; fail here with the
        # offending path rather than with an opaque error inside cv2.resize
        raise ValueError(f'could not read image: {img_path}')

    # cv2.resize expects the target size as (width, height)
    img = cv2.resize(img, (width, height))

    return img.reshape((1,) + img.shape)


def load_data(data: str, width: int, height: int) -> "tuple[list, list]":
    """
    Load all pictures below the root directory together with their labels.

    Assuming all pictures are structured in subdirectories named as the corresponding expression.
    Hidden entries (names starting with '.', e.g. '.DS_Store') are skipped on both directory levels,
    and so is everything inside an expression directory that is not a regular file, because such
    entries cannot be read as images.
    :param data: str containing the root directory of the data
    :param width: goal width of image
    :param height: goal height of image
    :return: tuple (evidence, labels) of two equally long lists; evidence holds the preprocessed
             images and labels the class id of the directory each image was found in
    :raises KeyError: if a subdirectory containing pictures is not named after a known expression
    """
    evidence = []
    labels = []

    for subdirectory in os.listdir(data):
        class_dir = os.path.join(data, subdirectory)
        if subdirectory.startswith('.') or not os.path.isdir(class_dir):
            continue

        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            # hidden files and nested directories are not pictures
            if img_name.startswith('.') or not os.path.isfile(img_path):
                continue

            evidence.append(preprocess_image(img_path, width, height))
            labels.append(dir_id(subdirectory))

    return evidence, labels

## Create CNN based on hyperparameters

In [ ]:
def create_model(width, height, categories,
                 num_convolutions, num_convolution_filters, pool_size, learning_rate,
                 num_dense_layers, num_dense_nodes, activation, dropout):
    """
    Builds and compiles a CNN classifier for single-channel images
    fixed parameters:
    width, height:              size of the input images, the input shape is (width, height, 1)
    categories:                 number of classes, i.e. units of the softmax output layer
    hyperparameters:
    num_convolutions:           number of convolutional layers (each followed by max pooling)
    num_convolution_filters:    number of convolutional filters per convolutional layer
    pool_size:                  pool size for pooling layers
    learning_rate:              Learning-rate for the optimizer
    num_dense_layers:           Number of dense layers
    num_dense_nodes:            Number of nodes in each dense layer
    activation:                 Activation function for the convolutional and hidden dense layers
    dropout:                    Dropout rate applied once, after the last hidden dense layer
    """
    layers = tf.keras.layers

    network = tf.keras.models.Sequential()
    network.add(layers.Input(shape=(width, height, 1)))

    # Feature extraction: 5x5 convolutions that keep the size, each followed by a pooling with stride 2
    for conv_index in range(num_convolutions):
        convolution = layers.Conv2D(filters=num_convolution_filters,
                                    kernel_size=5,
                                    strides=1,
                                    padding='same',
                                    activation=activation,
                                    name=f'layer_conv{conv_index}')
        pooling = layers.MaxPooling2D(pool_size=(pool_size, pool_size), strides=2)
        network.add(convolution)
        network.add(pooling)

    # Classification head: flatten, hidden dense layers, one dropout, softmax output
    network.add(layers.Flatten())

    for _ in range(num_dense_layers):
        network.add(layers.Dense(num_dense_nodes, activation=activation))

    network.add(layers.Dropout(rate=dropout))
    network.add(layers.Dense(units=categories, activation='softmax'))

    # Train with Adam on the categorical cross entropy and track the accuracy
    network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

    return network

## Bayesian optimization

In [ ]:
# Fixed settings that are not part of the hyperparameter search

# where the data is read from and where the best model found so far is written to
DATA = 'CK+48'
BEST_MODEL_PATH = 'current_best_model.keras'

# size all pictures are resized to and number of expression classes
IMG_WIDTH, IMG_HEIGHT = 48, 48
CATEGORIES = 7

# share of the data held out as test set and number of training epochs per evaluated model
TEST_SIZE = 0.3
EPOCHS = 10

Define the search space that the Bayesian optimization algorithm will explore.

In [ ]:
# Search space: one dimension per hyperparameter. The names have to equal the argument names of
# the evaluate function, and the order in `dimensions` has to equal the order of default_params.

# convolutional part of the network
dim_num_convolutions = Integer(0, 4, name='num_convolutions')
dim_num_convolution_filters = Integer(4, 64, name='num_convolution_filters')
dim_pool_size = Integer(2, 6, name='pool_size')

# optimizer (sampled on a logarithmic scale)
dim_learning_rate = Real(1e-6, 1e-2, prior='log-uniform', name='learning_rate')

# dense part of the network
dim_number_dense_layers = Integer(1, 5, name='number_dense_layers')
dim_number_dense_nodes = Integer(5, 1024, name='number_dense_nodes')
dim_activation = Categorical(['relu', 'sigmoid'], name='activation')
dim_dropout = Real(0, 0.99, prior='uniform', name='dropout')

dimensions = [dim_num_convolutions, dim_num_convolution_filters, dim_pool_size, dim_learning_rate,
              dim_number_dense_layers, dim_number_dense_nodes, dim_activation, dim_dropout]

Now we define some default parameters, which serve as the starting point of the Bayesian optimization.
`best_accuracy` keeps track of the best accuracy reached so far and thereby determines which model we store.
Then we load the data and split it into a training set and a test set.

In [ ]:
# starting point of the search, in the same order as the entries of `dimensions`
default_params = [1, 32, 2, 1e-5, 1, 512, 'relu', 0.5]

# best accuracy reached by any evaluated model so far
best_accuracy = 0.0

# load data
evidence, labels = load_data(DATA, IMG_WIDTH, IMG_HEIGHT)

# One-hot encode the labels. The width is fixed to CATEGORIES because to_categorical would otherwise
# derive it from the largest label present, which no longer matches the output layer of the model
# if the class with the highest id has no pictures.
labels = tf.keras.utils.to_categorical(labels, num_classes=CATEGORIES)
x_train, x_test, y_train, y_test = train_test_split(np.array(evidence), np.array(labels), test_size=TEST_SIZE)

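Now we define an evaluate function. It creates the model, augments the training data, trains the model, and evaluates it on the test set. It then returns one minus a weighted average of the final-epoch validation accuracy and the evaluated test accuracy, which is the value the optimizer minimizes; the weighting is meant to keep the Bayesian optimization from overfitting to a single evaluation set.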

In [ ]:
@use_named_args(dimensions)
def evaluate(num_convolutions, num_convolution_filters, pool_size, learning_rate, number_dense_layers,
             number_dense_nodes, activation, dropout) -> float:
    """
    Objective function of the bayesian optimization: trains one CNN with the given hyperparameters
    and returns 1 - accuracy, so that minimizing the return value maximizes the accuracy.
    The model is saved to BEST_MODEL_PATH whenever it beats the best accuracy seen so far.

    hyperparameters:
    num_convolutions:           number of convolutional layers
    num_convolution_filters:    number of convolutional filters per convolutional layer
    pool_size:                  pool size for pooling layers
    learning_rate:              Learning-rate for the optimizer
    number_dense_layers:        Number of dense layers
    number_dense_nodes:         Number of nodes in each dense layer
    activation:                 Activation function for the convolutional and hidden dense layers
    dropout:                    Dropout

    :return: 1 - accuracy, or 1.0 (the worst score) if the architecture cannot be built
    """

    # Print the hyperparameters.
    print('num_convolutions: ', num_convolutions)
    print('num_convolution_filters: ', num_convolution_filters)
    print('pool_size: ', pool_size)
    print('learning rate: {0:.1e}'.format(learning_rate))
    print('num_dense_layers:', number_dense_layers)
    print('num_dense_nodes:', number_dense_nodes)
    print('activation:', activation)
    print('dropout:', dropout)
    print()

    # Reject architectures whose pooling layers cannot be built. create_model pools with stride 2
    # and the default 'valid' padding, so a feature map of size s shrinks to (s - pool_size) // 2 + 1
    # and a pooling layer fails once s < pool_size. The convolutions use 'same' padding with
    # stride 1 and therefore do not change the size.
    feature_size = min(IMG_WIDTH, IMG_HEIGHT)
    for _ in range(num_convolutions):
        if feature_size < pool_size:
            return 1.0
        feature_size = (feature_size - pool_size) // 2 + 1

    # Create the neural network with these hyperparameters.
    model = create_model(width=IMG_WIDTH,
                         height=IMG_HEIGHT,
                         categories=CATEGORIES,
                         num_convolutions=num_convolutions,
                         num_convolution_filters=num_convolution_filters,
                         pool_size=pool_size,
                         learning_rate=learning_rate,
                         num_dense_layers=number_dense_layers,
                         num_dense_nodes=number_dense_nodes,
                         activation=activation,
                         dropout=dropout)

    # data augmentation (training data only; the test data is merely rescaled)
    train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rescale=1. / 255,
        brightness_range=[0.8, 1.2],
        rotation_range=10,
        width_shift_range=0.05,
        height_shift_range=0.05,
        fill_mode='nearest',
        horizontal_flip=True
    )

    test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

    global x_train
    global x_test

    # Reshape images for compatibility with the augmentation
    # (the module-level arrays are replaced; repeating the reshape on later calls is a no-op)
    x_train = x_train.reshape((-1, IMG_WIDTH, IMG_HEIGHT, 1))
    x_test = x_test.reshape((-1, IMG_WIDTH, IMG_HEIGHT, 1))

    train_generator = train_datagen.flow(
        x_train,
        y_train,
        batch_size=32,
    )

    test_generator = test_datagen.flow(
        x_test,
        y_test,
        batch_size=32,
    )

    # train model and extract accuracy
    trained = model.fit(
        train_generator,
        epochs=EPOCHS,
        validation_data=test_generator
    )

    # NOTE(review): both values below are measured on test_generator ('val_accuracy' is the
    # validation metric of the last epoch), so the weighted average combines two scores of the
    # same data. Confirm whether trained.history['accuracy'] (training accuracy) was intended.
    trained_accuracy = trained.history['val_accuracy'][-1]
    tested_accuracy = model.evaluate(test_generator)[1]

    # calculated weighted average of tested and trained accuracy to prevent overfitting of the bayesian
    # optimization on one of the test sets
    accuracy = (3*tested_accuracy + trained_accuracy) / 4

    # print the classification accuracy
    print()
    print("Accuracy: {0:.2%}".format(accuracy))
    print()

    # update the best accuracy
    global best_accuracy

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        model.save(BEST_MODEL_PATH)

    # Drop the model and reset the global Keras state, otherwise the state of every trained model
    # piles up over the many evaluations of one optimization run
    del model
    tf.keras.backend.clear_session()

    return 1 - accuracy

Now all that is left to do is to use scikit-optimize's `gp_minimize` function, which uses Gaussian processes to minimize the evaluate function, thus maximizing accuracy.

In [ ]:
def optimize():
    """
    Runs the bayesian optimization of the evaluate function over the search space
    :return: the result object of gp_minimize
    """
    search_settings = {
        'func': evaluate,           # objective to minimize
        'dimensions': dimensions,   # search space
        'acq_func': 'EI',           # acquisition function
        'n_calls': 40,              # total number of evaluations
        'x0': default_params,       # first point to evaluate
    }

    return gp_minimize(**search_settings)

In [ ]:
# Run the optimization and keep the result for the analysis below.
# Every evaluation trains a model, so this cell may take a long time.
res = optimize()

# Analysis

In [ ]:
# optimal hyperparameters found
# res.fun is the minimum of the objective, i.e. 1 - accuracy, so convert it back before reporting
print(f"{res.x} lead to an accuracy of {1 - res.fun}")

In [ ]:
from skopt.plots import plot_convergence, plot_objective, plot_evaluations

# Note: the following plots may take a while to render

# plots how the best objective value found so far converged towards the end result
plot_convergence(res)

# shows which (combinations of) hyperparameter values were evaluated, and when in the search process
plot_evaluations(res)

# shows partial dependencies of dimensions, i.e. the influence a certain hyperparameter has on the objective function
plot_objective(res)