# Time series classification - k-fold cross validation
<a href="https://colab.research.google.com/github/jarusgnuj/ioctm358/blob/master/notebooks/time_series_classification/3_TSC_kfold_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 Load Python modules

In [None]:
import time

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# General settings and variables
# Use seaborn's 'whitegrid' style for the plots drawn by this notebook.
sns.set_style('whitegrid')
# Two plot colours; not referenced by the cells visible here -- presumably one per model being compared (TODO confirm).
model_palette = ['rebeccapurple', 'mediumspringgreen']

# Class names and their plot colours. Presumably index i corresponds to label i
# once preprocess_data has remapped the labels to 0 and 1 -- TODO confirm the ordering.
class_names = ['cement', 'carpet']
class_colors = ['darkorange', 'steelblue']

# 2 Functions
Some functions, for convenience.

In [None]:
def load_data(
    filename,
    url_root=(
        'https://raw.githubusercontent.com/jarusgnuj/ai-ml-wksh/master/'
        'data/UCR_TSC_archive/SonyAIBORobotSurface1_IoC'
    ),
):
    ''' Load a tab-separated, header-less data file and return its contents as a NumPy array.

    filename -- name of the file to load, relative to url_root.
    url_root -- location the file is read from. Defaults to the workshop's GitHub repo;
                anything pandas.read_csv accepts works, e.g. a local directory holding
                a copy of the data for offline use.

    Returns the 2-D array robot_df.values, one row per line of the file.
    Errors raised by pandas.read_csv (missing file, network failure, ...) are not caught.
    '''
    url = url_root + '/' + filename
    # The files have no header row, so header=None stops the first sample being used as column names
    robot_df = pd.read_csv(url, sep='\t', header=None)
    print('Loaded from', url)
    robot_data = robot_df.values
    print('The shape of robot_data is', robot_data.shape)
    return robot_data


def preprocess_data(robot_data):
    ''' Separate the class labels (column 0) from the data samples (all remaining columns).

    The labels are shifted down by one, so classes 1 and 2 become 0 and 1, and cast to int.
    Returns the tuple (data_samples, labels).
    '''
    class_column, data_samples = robot_data[:, 0], robot_data[:, 1:]
    print('The shape of the data matrix is', data_samples.shape)
    print('The shape of the labels vector is', class_column.shape)

    # Labels 0 and 1 are more convenient to use with Keras than 1 and 2
    labels = (class_column - 1).astype(int)
    for message, class_id in (('Number of samples of class 0', 0),
                              ('Number of samples of class 1', 1)):
        print(message, (labels == class_id).sum())

    return data_samples, labels

# 3 Load the development dataset

In [None]:
# Name of the development dataset file, resolved by load_data against its repository URL
filename = 'SonyAIBORobotSurface1_IoC_DEV.txt'
# Raw array as read from the file: column 0 holds the class label, the other columns hold the sample values
robot_data = load_data(filename)
# data: the sample values; labels: the class labels remapped to 0 and 1
data, labels = preprocess_data(robot_data)

# 4 MLP
Create a function that builds our model.

In [None]:
# The size of the input vector: the number of columns in data, i.e. the number of values per sample.
# build_model reads this as a global when it creates the first layer.
input_dim = data.shape[1]
print('input_dim:', input_dim)

def build_model():
    ''' Build and compile a new MLP binary classifier with randomly initialised weights.

    The first layer takes its input size from the global input_dim. The model ends in a
    single sigmoid unit and is compiled with binary cross-entropy loss, the Adam optimiser
    (default settings) and accuracy as the reported metric.
    '''
    ### CHANGE PARAMETERS HERE ###
    # Change the number of nodes in each layer.
    # Add or remove layers.
    stack = [
        Dense(16, input_dim=input_dim, activation='relu', name='Layer1'),
        Dense(8, activation='relu', name='Layer2'),
        Dense(1, activation='sigmoid', name='OutputLayer'),
    ]
    ### END OF CHANGE PARAMETERS ###
    model = Sequential(stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return model
    
    
# Build one model up front; the cross-validation loop further down rebinds `model` to a new model for every fold.
model = build_model()

# 5 Repeated k-fold cross validation
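k-fold cross validation splits the dataset into k (approximately) equal subsets, called folds. A model is trained on k-1 of the folds and tested on the remaining fold; this is done k times, so that each fold is used for testing exactly once. In repeated k-fold cross validation the whole procedure is repeated m times, with a different random split each time, giving k × m trained models in total.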

![Example of splitting the dataset into 5 subsets; 5 folds. Training uses 4 of the folds and testing (validation) is done on the fifth fold. This is then repeated 5 times, using a different fold for testing each time](images/5fold_cross_validation.png "Data splitting for 5-fold cross validation")

In [None]:
# Preview how many samples a k-fold split would use for training and for testing.
# Take the size from the dataset that was actually loaded, rather than a hard-coded number,
# so the figures printed below stay correct if a different data file is used.
dataset_size = len(data)
k = 3 ### CHANGE PARAMETER HERE ###
subset_size = dataset_size / k
# k-1 folds are used for training; round because dataset_size need not be divisible by k
training_set_size = round((k-1)*subset_size)
test_set_size = dataset_size - training_set_size
print('With k =', k, 'we would train using', training_set_size, 'data samples and test using', test_set_size)

## 5.1 Exercise 3a: Set k and m
+ How many data samples do you want to use for training and for testing?
  + Set k accordingly.
+ How many times do you want to repeat the model training and testing?
  + Set m accordingly.


Look at the box plot below. How variable are your results? When you train your final model, you'll train it only once: are you confident that you'll get an accurate model?

In [None]:
### CHANGE PARAMETERS HERE ###
k = 3 
m = 5 
batch_size = 10
epochs = 30
### END OF CHANGE PARAMETERS ###

# Repeated stratified k-fold: k folds per repeat, m repeats, so k*m train/test splits in total.
# The fixed random_state makes the splits themselves reproducible from run to run.
kfold = RepeatedStratifiedKFold(n_splits=k, n_repeats=m, random_state=76)
count = 0
val_acc = []
start = time.time()
for train, test in kfold.split(data, labels):
    # train and test are arrays of row indices into data / labels
    data_train, labels_train = data[train], labels[train]
    data_test, labels_test = data[test], labels[test]
    # Build and train a new model, so that no weights carry over from the previous split
    model = build_model()
    fold_start = time.time()
    hist = model.fit(data_train, labels_train, batch_size=batch_size, epochs=epochs, validation_data=(data_test, labels_test), verbose=1)
    fold_end = time.time()
    log = pd.DataFrame(hist.history) 
    print('Training of iteration', count, 'complete in', round(fold_end-fold_start), 'seconds')
    # With metrics=['accuracy'] older Keras releases report the validation metric as 'val_acc',
    # newer ones (Keras 2.3+ / TensorFlow 2.x) as 'val_accuracy'. Use whichever is present
    # instead of failing with a KeyError on newer versions.
    acc_key = 'val_accuracy' if 'val_accuracy' in log.columns else 'val_acc'
    # Keep the validation accuracy reached after the final epoch of this split
    val_acc.append(log.iloc[-1][acc_key])
    count = count + 1

end = time.time()
# One row per split; the column keeps the name 'val_acc' because later cells index it by that name
val_acc = pd.DataFrame(val_acc, columns=['val_acc'])

In [None]:
# Show the final validation accuracy of every split, then the total run time
print(val_acc)
# Join k and '-fold' explicitly: passing them as separate print() arguments would output "3 -fold"
print(m, 'repeats of', str(k)+'-fold cross validation completed in', round(end-start), 'seconds')

### 5.1.1 Plot the k-fold cross validation results

In [None]:
# Box plot of the validation accuracies, with each individual result overlaid as a black point
ax = sns.boxplot(data=val_acc)
sns.swarmplot(data=val_acc, color='black', ax=ax)
ax.set_title('k-fold cross validation results')
ax.set_ylabel('Validation accuracy')

# Summary statistics over all splits
acc_mean = val_acc['val_acc'].mean()
acc_std = val_acc['val_acc'].std()
print('Validation accuracy mean:', acc_mean)
print('Sample standard deviation:', acc_std)

## End of exercise 3a

# 6 Exercise 3b: Model development
In the build_model function you can change the number of nodes in each layer. You can also add and remove layers to change the number of layers. These are some of the model's "hyperparameters". Change model hyperparameters and evaluate the model using the k-fold cross validation.
+ What hyperparameter settings give the highest mean validation accuracy with the lowest variability?
+ What hyperparameters do you want to use to train a model and then test it on the final test dataset?


Use the dropout layer described below if you wish.

## Competition part I
+ The best development model - highest mean validation accuracy. 
+ Tie-breaker - the lowest sample standard deviation.

## Competition part II 
This is in the next notebook.
+ Best performing model - the highest accuracy when tested on the final test dataset.

# 7 Optional: Dropout
Dropout can improve a model's generalisation. It can prevent a model from overfitting to the training data. Overfitting can result in high training accuracy but low validation accuracy. 
+ Does your model suffer from this?
+ What happens if you increase the number of nodes in the model? Does validation accuracy start to decrease, suggesting that overfitting is occurring?

Try adding dropout layers to your model. An example of such a model is given below.

In [None]:
def build_model_with_dropout(print_summary=False):
    ''' Return a compiled MLP with randomly initialised weights that uses dropout.

    A Dropout layer follows each of the two hidden Dense layers. The first layer takes its
    input size from the global input_dim. The model is compiled with binary cross-entropy
    loss, the Adam optimiser (default settings) and accuracy as the reported metric.

    print_summary -- when true, print the Keras layer summary of the model.
    '''
    ### CHANGE PARAMETERS HERE ###
    # Change the number of nodes and layers.
    # Change the proportion of nodes "dropped out", from 0 up to 1.
    model = Sequential([
        Dense(16, input_dim=input_dim, activation='relu', name='Layer1'),
        Dropout(0.1, name='Dropout1'),
        Dense(8, activation='relu', name='Layer2'),
        Dropout(0.1, name='Dropout2'),
        Dense(1, activation='sigmoid', name='OutputLayer')
    ])
    ### END OF CHANGE PARAMETERS ###
    optimizer = keras.optimizers.Adam()
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    if print_summary:
        # summary() prints the table itself and returns None, so wrapping it in print()
        # would add a stray "None" line to the output
        model.summary()
    return model


# Build the dropout model once, passing True for print_summary so that its layer summary is shown
model_with_dropout = build_model_with_dropout(True)