The notebook requires a preprocessed dataset. It can be generated either by executing the notebook
**data_preprocessing.ipynb** or by running the **data_preprocessing.py** Python script like so:
```
python data_preprocessing.py
```

Also, make sure that the **bnci_utils.py** file is in the same directory as this notebook, as it contains part of the
functionality. The resulting preprocessed data files should not be renamed unless you also change their names in the
notebook.

In [17]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.layers import Dropout, Dense, LSTM
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from keras import backend as K
import pandas as pd
from tensorflow.python.keras.models import Sequential
import bnci_utils as utils

The names of the output files can be changed - change the value of the **iteration_data_file_name** variable to
rename the output file with the data from each iteration, and the value of the **iteration_stats_file_name** variable
to rename the output file with the statistics from the entire simulation. Note that both names **need** to have the .xlsx extension, e.g.:

```
iteration_data_file_name = 'simulation_output.xlsx'
```

In [None]:
# All the datasets that can be run with this notebook, keyed by the subset they contain:
#   entire_dataset  - all samples
#   female_subjects - only samples from female subjects
#   male_subjects   - only samples from male subjects
datasets = {
    'entire_dataset': 'entire_dataset.npz',
    'female_subjects': 'dataset_female_gender.npz',
    'male_subjects': 'dataset_male_gender.npz'
}

# Here you can change the type of the dataset that will be used
# (must be one of the keys of `datasets`).
dataset_part = 'entire_dataset'

# The selected dataset file is read from the dataset_result folder
dataset_path = os.path.join('dataset_result', datasets[dataset_part])

data_output_folder = 'output'  # output folder for statistics from the simulation

# The output file names are derived from the selected dataset so that a run on
# one subset never overwrites the results of another. Both names need to keep
# the .xlsx extension.
iteration_data_file_name = f'lstm_{dataset_part}.xlsx'  # excel file with data from each iteration
iteration_stats_file_name = f'lstm_{dataset_part}_stats.xlsx'  # excel file with statistics from the simulation
                                                               # (i.e. max and average accuracy, max and average recall...)

In [19]:
# Load the features and labels of the selected dataset through the helper module
features, labels = utils.load_dataset(dataset_path)

# Show the shapes of both arrays as the cell output
'Features shape: {}, labels shape: {}'.format(features.shape, labels.shape)

'Features shape: (2976, 14, 36, 10), labels shape: (2976,)'

In [20]:
# Merge the trailing feature axes so that every sample becomes a 2-D sequence;
# the length of axis 1 is taken from the data instead of being hard-coded.
features = features.reshape((features.shape[0], features.shape[1], -1))

# One-hot encode the labels. The guard keeps the cell idempotent: re-running it
# must not encode labels that are already one-hot a second time (which would
# double the number of rows).
if labels.ndim == 1 or labels.shape[1] == 1:
    labels = OneHotEncoder().fit_transform(labels.reshape((-1, 1))).toarray()

f'features shape: {features.shape}, labels_shape: {labels.shape}'


'features shape: (2976, 14, 360), labels_shape: (2976, 2)'

In [21]:
# Seed both NumPy and TensorFlow with the same value to produce a consistent result
seed = 1
for set_seed in (np.random.seed, tf.random.set_seed):
    set_seed(seed)

In [22]:
# Function to create the LSTM model
def lstm_model(input_shape=(14, 360)):
    """Build the (uncompiled) stacked LSTM classifier.

    The network consists of two LSTM layers with 124 units each, a dense layer
    with 64 units and a 2-unit softmax output layer; a dropout layer follows
    every hidden layer.

    Args:
        input_shape: Shape of a single input sample, passed to the first LSTM
            layer. Defaults to (14, 360), the shape used in this notebook.

    Returns:
        The Sequential model. It is not compiled here.
    """
    model = Sequential([
        # return_sequences=True so that the second LSTM receives the whole sequence
        LSTM(124, input_shape=input_shape, activation=tf.nn.relu, return_sequences=True),
        Dropout(0.4),
        LSTM(124, activation=tf.nn.relu),
        Dropout(0.3),
        Dense(64, activation=tf.nn.relu),
        Dropout(0.2),
        Dense(2, activation=tf.nn.softmax, name='output_layer')
    ])

    return model

lstm_model().summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 14, 124)           240560    
_________________________________________________________________
dropout_12 (Dropout)         (None, 14, 124)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 124)               123504    
_________________________________________________________________
dropout_13 (Dropout)         (None, 124)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8000      
_________________________________________________________________
dropout_14 (Dropout)         (None, 64)                0         
_________________________________________________________________
output_layer (Dense)         (None, 2)                

In [23]:
def run_network(model, train, valid, test, iteration, epochs=30):
    """Compile and train the model, then compute its metrics on the test data.

    Args:
        model: Keras model to compile and fit.
        train: Indexable pair; element 0 holds the training inputs, element 1 the targets.
        valid: Indexable pair with the validation inputs and targets.
        test: Indexable pair with the test inputs and targets.
        iteration: Value interpolated into the label handed to utils.get_metrics_keras.
        epochs: Maximum number of training epochs.

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'confusion_matrix', filled in that order from the five values returned
        by utils.get_metrics_keras.
    """
    train_inputs, train_targets = train[0], train[1]
    valid_inputs, valid_targets = valid[0], valid[1]
    test_inputs, test_targets = test[0], test[1]

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )

    # Stop after 8 epochs without improvement and roll back to the best weights
    early_stopping = EarlyStopping(patience=8, verbose=1, restore_best_weights=True)

    # Fit on the training data while validating on the validation data
    model.fit(
        train_inputs,
        train_targets,
        epochs=epochs,
        validation_data=(valid_inputs, valid_targets),
        callbacks=[early_stopping],
    )

    # Evaluate the trained model on the test data
    metrics_label = f'{iteration}. LSTM '
    accuracy, precision, recall, f1, confusion_matrix = utils.get_metrics_keras(
        model, test_inputs, test_targets, metrics_label
    )

    return dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        confusion_matrix=confusion_matrix,
    )

In [24]:
results = []
num_splits = 10

# Hold out 25 % of the samples as a test set shared by all folds. The explicit
# random_state makes the split reproducible no matter how much of NumPy's global
# random state has been consumed by previously executed cells.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, shuffle=True, random_state=seed
)

# Run 10-fold CV in the same manner as in the case of the CNN
folds = KFold(n_splits=num_splits).split(x_train)
for iteration, (train_idx, val_idx) in enumerate(folds, start=1):
    x_train_curr, y_train_curr = x_train[train_idx], y_train[train_idx]  # get the current training data
    x_val, y_val = x_train[val_idx], y_train[val_idx]  # get the current validation data

    model = lstm_model()  # create a fresh LSTM model for this fold

    # Run the network and save the results
    result = run_network(model, (x_train_curr, y_train_curr), (x_val, y_val), (x_test, y_test), iteration)
    results.append(result)

    # Clear the Keras session and delete the model before the next fold
    K.clear_session()
    del model

Epoch 1/30

KeyboardInterrupt: 

In [None]:
# Create a pandas dataframe with the stats from each iteration.
# The iteration numbers are derived from the number of collected results rather
# than from num_splits, so the cell also works (instead of raising a length
# mismatch error) when the cross-validation loop was stopped early.
metric_names = ['accuracy', 'precision', 'recall', 'f1']
df = pd.DataFrame({
    'iterations': list(range(1, len(results) + 1)),
    **{name: [result[name] for result in results] for name in metric_names},
})

df

In [None]:
# Make sure the output folder exists before anything is written into it
os.makedirs(data_output_folder, exist_ok=True)

# Write the per-iteration dataframe to its Excel file
iteration_data_path = os.path.join(data_output_folder, iteration_data_file_name)
df.to_excel(iteration_data_path)

In [None]:
# Summarise the per-iteration metrics in a single-row dataframe
accuracy_col = df['accuracy']
precision_col = df['precision']
recall_col = df['recall']
f1_col = df['f1']

df_stats = pd.DataFrame([{
    'average_accuracy': accuracy_col.mean(),
    'max_accuracy': accuracy_col.max(),
    'accuracy_std': accuracy_col.std(),
    'average_precision': precision_col.mean(),
    'max_precision': precision_col.max(),
    'average_recall': recall_col.mean(),
    'max_recall': recall_col.max(),
    'average_f1': f1_col.mean(),
    'max_f1': f1_col.max(),
}])

df_stats

In [None]:
# Write the summary statistics to their Excel file inside the output folder
stats_path = os.path.join(data_output_folder, iteration_stats_file_name)
df_stats.to_excel(stats_path)

In [None]:
# Print confusion matrices from the collected cross-validation results
# NOTE(review): the LSTM results are passed through the helper's `ann` keyword -
# confirm against bnci_utils that this is the intended parameter.
utils.print_confusion_matrices(ann=results)