In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
from torchvision import transforms

from sklearn.model_selection import KFold

%matplotlib inline


In [2]:
# Set random seed
# Calling `set_seed(seed=seed)` sets the random seed to ensure reproducibility.

# For deep learning it is critical to set the random seed so that students have a
# baseline against which to compare their results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

import random
import torch

def set_seed(seed = None, seed_torch = True):
    """Seed Python's `random`, NumPy and (optionally) PyTorch.

    Args:
        seed: Integer seed. When None, a seed in [0, 2**32) is drawn from
            NumPy's global generator.
        seed_torch: When True, also seed PyTorch (CPU and CUDA) and put cuDNN
            in deterministic mode with benchmarking disabled.

    Prints the seed that was applied; returns nothing.
    """
    if seed is None:
        seed = np.random.choice(2 ** 32)
    # Normalise to a built-in int: NumPy integer scalars are not accepted by
    # `random.seed` on Python 3.11+.
    seed = int(seed)

    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        # `manual_seed_all` seeds every CUDA device, the current one included,
        # so no separate `torch.cuda.manual_seed` call is needed.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f'Random seed {seed} has been set.')

# Worker initialiser for the case where a `DataLoader` is used
def seed_worker(worker_id):
    """Seed NumPy and `random` from torch's initial seed.

    Passed as `worker_init_fn` to the DataLoaders; `worker_id` is accepted
    to satisfy that callback signature but is not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)  # fold into the 32-bit range
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(derived_seed)


In [3]:
# Experimental settings
# Identifiers used to build the data file names (S<subject>G<game>.csv)
Subject_names = ['01', '02', '03', '04']
Game_names = ['1', '2', '3', '4']

# Expected number of subjects / games
N_subject = 4
N_game = 4

# Report (without stopping) any mismatch between a count and its name list
if N_subject != len(Subject_names):
    print('Subject list error!')
if N_game != len(Game_names):
    print('Game list error!')


In [4]:
# Columns used by the model: Label_info + Label_electrode are the inputs,
# Label_prediction holds the targets.
# CHANGE these lists as necessary

# Candidates: ['subject', 'game', 'gender', 'age', 'disturbance', 'experience', 'memory']
Label_info = []

# Candidates: ['AF3', 'AF4', 'F3', 'F4', 'F7', 'F8', 'FC5', 'FC6', 'O1', 'O2', 'P7', 'P8', 'T7', 'T8']
Label_electrode = [
    'AF3', 'AF4', 'F3', 'F4', 'F7', 'F8', 'FC5',
    'FC6', 'O1', 'O2', 'P7', 'P8', 'T7', 'T8',
]

# Candidates: ['satisfied', 'boring', 'horrible', 'calm', 'funny', 'valence', 'arounsal']
Label_prediction = ['boring', 'horrible', 'calm', 'funny']
# Label_prediction = ['valence', 'arounsal']

# Full column list handed to the data reader
Label_names = [*Label_info, *Label_electrode, *Label_prediction]


In [5]:
# Model structural settings
N_inputtime = 2   # number of time steps in each input window
N_leaptime = 2    # row stride used when sampling windows
N_outputtime = 1  # number of time steps in the output (FIX to 1 for emotional state prediction!)

# Shapes derived from the selected labels
N_channel = len(Label_info) + len(Label_electrode)
N_emotion = len(Label_prediction)
N_input, N_output = (N_inputtime, N_channel), (N_outputtime, N_emotion)
print('Input shape:', N_input, ' Output shape:', N_output)

# Model training settings
k_folds = 5             # number of folds for K-fold cross-validation
batch_size_train = 64   # examples per minibatch during training
batch_size_test = 256   # examples per minibatch during validation & testing
test_ratio = 0.2        # share of the full dataset held out for testing

# Set random seed for reproducibility
# Open question: with this call in place, do we still need to set the random seed afterwards?
SEED = 2021
set_seed(seed = SEED)


Input shape: (2, 14)  Output shape: (1, 4)
Random seed 2021 has been set.


In [6]:
# Read summarised data from csv
def read_smry_data(subject, game, label, data_dir = 'Summarised_Data'):
    """Load one subject/game summary CSV, restricted to the requested columns.

    Args:
        subject: Subject identifier string used in the file name.
        game: Game identifier string used in the file name.
        label: Column names to keep, in the desired order. Names absent from
            the file come back as all-NaN columns.
        data_dir: Directory holding the `S<subject>G<game>.csv` files.

    Returns:
        DataFrame indexed by the file's `time` column, with columns `label`.
    """
    import os  # local import: this cell does not rely on a notebook-level `os`

    csv_path = os.path.join(data_dir, 'S' + subject + 'G' + game + '.csv')
    data = pd.read_csv(csv_path, index_col = ['time'])
    return pd.DataFrame(data, columns = label)


In [7]:
# Convert time series to supervised learning
def series_to_supervised(data, n_leap = 1, n_in = 1, n_out = 1, col_fix = None, col_in = None, col_out = None, dropnan = True):
    """Reframe a time series as a table of supervised-learning samples.

    Each output row holds the fixed columns at time t, the input columns for
    t-(n_in-1) ... t, and the target columns for t ... t+(n_out-1).

    Args:
        data: Source table; anything `pd.DataFrame(data, columns=...)` accepts.
        n_leap: Keep every `n_leap`-th row of the reframed table (must be >= 1).
            Sampling happens after NaN rows have been dropped.
        n_in: Number of time steps in the input window.
        n_out: Number of time steps in the output window.
        col_fix: Columns copied unshifted (non-sequence inputs). None means none.
        col_in: Columns expanded into the lagged input window. None means none.
        col_out: Columns expanded into the forecast window. None means none.
        dropnan: Drop every row containing NaN, including rows made incomplete
            by the shifting.

    Returns:
        (frame, names_input, names_output): the reframed DataFrame with a fresh
        0..n-1 index, plus the lists of input and output column names.

    Raises:
        ValueError: if `n_leap` < 1 or if no column ends up selected.
    """
    if n_leap < 1:
        # A zero step is invalid and a negative one would silently reverse time.
        raise ValueError('n_leap must be a positive integer, got %r' % (n_leap,))

    col_fix = [] if col_fix is None else col_fix
    col_in = [] if col_in is None else col_in
    col_out = [] if col_out is None else col_out

    frames, names_input, names_output = [], [], []

    # Non-sequence inputs from col_fix
    if len(col_fix):
        fixed = pd.DataFrame(data, columns = col_fix)
        frames.append(fixed)
        names_input += [str(name) for name in fixed.columns]

    # Input sequence (t - n_in + 1, ..., t - 1, t) from col_in
    if len(col_in):
        inputs = pd.DataFrame(data, columns = col_in)
        for lag in range(n_in - 1, -1, -1):
            frames.append(inputs.shift(lag))
            suffix = '(t-%d)' % lag if lag else '(t)'
            names_input += [f'{name}{suffix}' for name in inputs.columns]

    # Forecast sequence (t, t + 1, ..., t + n_out - 1) from col_out
    if len(col_out):
        targets = pd.DataFrame(data, columns = col_out)
        for lead in range(n_out):
            frames.append(targets.shift(-lead))
            suffix = '(t+%d)' % lead if lead else '(t)'
            names_output += [f'{name}{suffix}' for name in targets.columns]

    if not frames:
        raise ValueError('No columns selected: pass col_fix, col_in and/or col_out '
                         '(with n_in / n_out of at least 1).')

    # Merge dataframe
    merged = pd.concat(frames, axis = 1)
    merged.columns = names_input + names_output
    if dropnan:
        merged = merged.dropna()

    # Temporal leap sampling
    merged = merged.iloc[::n_leap]

    # Return dataframe and input/output labels
    return merged.reset_index(drop = True), names_input, names_output


In [8]:
# Convert time series to data for supervised learning
# TS_data[i] ends up holding one DataFrame with every game of subject i
TS_data = []

# Within-subject processing
# Need to change in the future for between-subject processing
for i_subject in range(N_subject):
    subject = Subject_names[i_subject]

    # Gather the per-game frames and merge them once per subject, instead of
    # re-copying the growing frame after every game
    game_frames = []
    for i_game in range(N_game):
        game = Game_names[i_game]

        # Read summarised data
        Read_data = read_smry_data(subject, game, Label_names)

        # Convert time series to supervised learning (done per game, so no
        # window spans two different games)
        Temp_data, X_label, Y_label = series_to_supervised(Read_data, n_leap = N_leaptime, n_in = N_inputtime, n_out = N_outputtime,
                                                           col_fix = Label_info, col_in = Label_electrode, col_out = Label_prediction)
        game_frames.append(Temp_data)

    TS_data.append(pd.concat(game_frames, ignore_index = True))

# Show independent/dependent variables for model prediction
print('Independent labels:', X_label)
print('Dependent labels:', Y_label)
# Only the last subject's size is reported
print('Subject dataset size:', TS_data[-1].shape)


Independent labels: ['AF3(t-1)', 'AF4(t-1)', 'F3(t-1)', 'F4(t-1)', 'F7(t-1)', 'F8(t-1)', 'FC5(t-1)', 'FC6(t-1)', 'O1(t-1)', 'O2(t-1)', 'P7(t-1)', 'P8(t-1)', 'T7(t-1)', 'T8(t-1)', 'AF3(t)', 'AF4(t)', 'F3(t)', 'F4(t)', 'F7(t)', 'F8(t)', 'FC5(t)', 'FC6(t)', 'O1(t)', 'O2(t)', 'P7(t)', 'P8(t)', 'T7(t)', 'T8(t)']
Dependent labels: ['boring(t)', 'horrible(t)', 'calm(t)', 'funny(t)']
Subject dataset size: (76504, 32)


In [9]:
# Transformation
# Not used yet at this point!!! (waiting for Q&A)
# NOTE(review): every step below is a torchvision image transform, whereas the
# samples built in this notebook are tabular rows — confirm the pipeline applies
# before wiring it into a dataset.
transform = transforms.Compose([
    transforms.Resize(size = 256),
    transforms.RandomCrop(size = 224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean = (0.5, 0.5, 0.5), std = (0.5, 0.5, 0.5)),
])


In [10]:
# MAIN

# One shared generator drives the train/test split, the samplers and the
# DataLoader worker seeding, so a run is reproducible from SEED
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

# Within-subject training & testing
for i_subject in range(N_subject):
    # Print current subject
    print('%d/%d Subject' % (i_subject + 1, N_subject))
    print('----------------------------')

    # Load the subject's samples and wrap them as (inputs, targets) tensors.
    # A DataFrame must not be handed to random_split / DataLoader directly:
    # integer indexing on a DataFrame is a column lookup, so fetching any
    # sample would raise KeyError.
    # NOTE(review): assumes every selected column is numeric and that float32
    # targets are wanted — adjust the target dtype if a classifier is trained.
    subject_frame = TS_data[i_subject]
    full_dataset = torch.utils.data.TensorDataset(
        torch.tensor(subject_frame[X_label].to_numpy(), dtype = torch.float32),
        torch.tensor(subject_frame[Y_label].to_numpy(), dtype = torch.float32))

    # Split into train/test datasets
    test_size = int(test_ratio * len(full_dataset))
    train_size = len(full_dataset) - test_size
    train_set_orig, test_set_orig = torch.utils.data.random_split(full_dataset, [train_size, test_size], generator = g_seed)

    # Test dataset loader
    test_loader = torch.utils.data.DataLoader(test_set_orig,
                                              batch_size = batch_size_test,
                                              num_workers = 2,
                                              worker_init_fn = seed_worker,
                                              generator = g_seed)

    # K-fold Cross Validator; it only needs the number of training samples,
    # and the indices it yields are positions within train_set_orig
    kfold = KFold(n_splits = k_folds, shuffle = True, random_state = SEED)
    for fold, (train_i, val_i) in enumerate(kfold.split(np.arange(train_size))):
        # Print current fold
        print('%d/%d Fold' % (fold + 1, k_folds))

        # Sample train/validation dataset from indices
        train_sampler = torch.utils.data.SubsetRandomSampler(train_i, generator = g_seed)
        val_sampler = torch.utils.data.SubsetRandomSampler(val_i, generator = g_seed)

        # Train/Validation dataset loader
        train_loader = torch.utils.data.DataLoader(train_set_orig,
                                                   sampler = train_sampler,
                                                   batch_size = batch_size_train,
                                                   num_workers = 2,
                                                   worker_init_fn = seed_worker,
                                                   generator = g_seed)
        val_loader = torch.utils.data.DataLoader(train_set_orig,
                                                 sampler = val_sampler,
                                                 batch_size = batch_size_test,
                                                 num_workers = 2,
                                                 worker_init_fn = seed_worker,
                                                 generator = g_seed)

        # Model training & evaluation
        ...

        # Fold output
        ...
        print('Train/Val/Test Dataset Length:', len(train_sampler), len(val_sampler), test_size)
        print('\n')
    

1/4 Subject
----------------------------
1/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


2/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


3/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


4/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


5/5 Fold
Train/Val/Test Dataset Length: 48964 12240 15300


2/4 Subject
----------------------------
1/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


2/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


3/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


4/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


5/5 Fold
Train/Val/Test Dataset Length: 48964 12240 15300


3/4 Subject
----------------------------
1/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


2/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


3/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


4/5 Fold
Train/Val/Test Dataset Length: 48963 12241 15300


5/5 Fold
Train/Val/Test Dataset Lengt