In [1]:
# Basic packages
import math
import numpy as np
import pandas as pd
from scipy.signal import spectrogram

# ML packages
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, Subset, SubsetRandomSampler, random_split
from sklearn.model_selection import KFold

# Setup package manually implemented
from setup import *


In [2]:
class CerealTimeKillersDataset(Dataset):
    """Map-style dataset pairing one spectrogram with its label row.

    Every row of the wrapped dataframe is one sample: the last column holds
    the spectrogram and all preceding columns hold the labels.

    Args:
        df: pandas DataFrame laid out as ``[label columns..., spectrogram]``.
        transform: optional callable applied to the spectrogram only
            (labels are never transformed).
    """

    def __init__(self, df, transform = None):
        self.ori_dataframe = df
        self.transform = transform

    def __len__(self):
        """Number of samples, i.e. rows of the wrapped dataframe."""
        return len(self.ori_dataframe)

    def __getitem__(self, idx):
        """Return ``{'spectrogram': ..., 'labels': ...}`` for position ``idx``.

        ``labels`` is an array with a leading singleton axis. It is float
        whenever the label values are numeric.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Local name deliberately differs from the imported scipy `spectrogram`
        spec = self.ori_dataframe.iloc[idx, -1]
        if self.transform:
            spec = self.transform(spec)

        labels = np.array([self.ori_dataframe.iloc[idx, :-1]])
        # A row taken from a frame that also stores arrays comes back with
        # object dtype, which the default DataLoader collate cannot batch.
        # Cast to float when possible; non-numeric labels keep the old
        # object array so existing callers are unaffected.
        try:
            labels = labels.astype(float)
        except (TypeError, ValueError):
            pass

        return {'spectrogram': spec, 'labels': labels}
    

In [3]:
def get_specgram(file_dir, labels, winlen = None, stride = 1, nperseg = 256, fs = 129):
    """Compute per-channel spectrograms from a CSV recording, optionally windowed.

    Args:
        file_dir: path of the CSV file holding one column per channel.
        labels: names of the channel columns to use, in output order.
        winlen: number of spectrogram time bins per window. ``None`` returns
            the whole recording as a single window.
        stride: step, in time bins, between the starts of consecutive windows.
        nperseg: segment length forwarded to ``scipy.signal.spectrogram``.
        fs: sampling frequency forwarded to ``scipy.signal.spectrogram``.

    Returns:
        ndarray of shape ``(windows, time, channels, frequency)``:
        ``(1, all_time_bins, channels, frequency)`` when ``winlen`` is None,
        otherwise ``(n_windows, winlen, channels, frequency)``. ``n_windows``
        is 0 when the recording has fewer than ``winlen`` time bins.

    Raises:
        ValueError: if ``winlen`` (when given) or ``stride`` is not positive.
    """
    # Keep only the requested channel columns, in the requested order
    df = pd.read_csv(file_dir, sep = ',')
    df = pd.DataFrame(df, columns = labels)
    d = np.array(df, dtype = float)

    # One spectrogram per channel -> (channels, frequency, time)
    full_spec = np.array([spectrogram(channel, nperseg = nperseg, fs = fs)[2]
                          for channel in d.T])
    # Put time first -> (time, channels, frequency)
    full_spec = np.moveaxis(full_spec, -1, 0)
    if winlen is None:
        return np.array([full_spec])

    if winlen < 1 or stride < 1:
        # A non-positive stride would never advance the window
        raise ValueError('winlen and stride must be positive integers')

    # Windows slide along the TIME axis (axis 0), so the bound must be the
    # number of time bins; the last start is the one whose window ends
    # exactly at the final bin.
    n_time = full_spec.shape[0]
    starts = range(0, n_time - winlen + 1, stride)
    if len(starts) == 0:
        return np.empty((0, winlen) + full_spec.shape[1:], dtype = full_spec.dtype)

    # (windows, winlen, channels, frequency)
    return np.array([full_spec[start : start + winlen] for start in starts])


In [4]:
def CerealTimeKillersDataLoader(dir_class, label_class, dataset_mix = True, 
                                winlen = None, stride = 1, nperseg = 256, fs = 129,
                                transform = None):
    """Build the spectrogram dataset and its (subject, game) index table.

    For every row of the label CSV the matching EEG recording is converted
    with ``get_specgram``.

    Args:
        dir_class: object exposing ``label`` (path of the label CSV) and
            ``game`` (root folder of the recordings).
        label_class: object exposing ``fixed`` (label columns copied to every
            sample; expected to be numeric) and ``electrode`` (channel
            columns read from each recording).
        dataset_mix: if True, every window of every recording becomes its own
            row. If False, each recording stays one row holding the full
            array returned by ``get_specgram``.
        winlen, stride, nperseg, fs: forwarded to ``get_specgram``.
        transform: forwarded to ``CerealTimeKillersDataset``.

    Returns:
        ``(dataset, index_df)`` where ``dataset`` wraps a frame with columns
        ``label_class.fixed + ['full_specgram_1']`` and ``index_df`` holds the
        integer ``subject`` and ``game`` of each dataset row, row-aligned.
    """
    specgram_name = 'full_specgram_1'
    fixed_cols = list(label_class.fixed)

    # Load label & EEG data
    labels_df = pd.read_csv(dir_class.label)

    # Rows are accumulated in plain lists and turned into frames once at the
    # end; this avoids re-concatenating inside the loop and writing cells
    # through chained indexing.
    label_rows, spec_cells, index_rows = [], [], []
    for idx in range(labels_df.shape[0]):
        subject = labels_df['subject'].iloc[idx]
        game = labels_df['game'].iloc[idx]

        # You can also just paste in the directory of the csv file - on Windows you may have to change the slash direction
        DirComb = f'{dir_class.game}/(S{str(subject).zfill(2)})/Preprocessed EEG Data/.csv format/S{str(subject).zfill(2)}G{str(game)}AllChannels.csv'

        # Get EEG spectrogram
        spec_EEG = get_specgram(DirComb, label_class.electrode, 
                                winlen = winlen, stride = stride, nperseg = nperseg, fs = fs)

        # Mixed mode: one sample per window; otherwise the recording is one sample
        samples = list(spec_EEG) if dataset_mix else [spec_EEG]
        row_labels = list(labels_df[fixed_cols].iloc[idx])
        for sample in samples:
            label_rows.append(row_labels)
            spec_cells.append(sample)
            index_rows.append([subject, game])

    # Label columns are float; the explicit reshape keeps the row count even
    # when no label column is selected.
    label_values = np.array(label_rows, dtype = float).reshape(len(label_rows), len(fixed_cols))
    spec_df = pd.DataFrame(label_values, columns = fixed_cols)

    # Fill an object array element by element so each cell holds one whole
    # ndarray (letting pandas/numpy infer the layout could stack them).
    spec_column = np.empty(len(spec_cells), dtype = object)
    for row, cell in enumerate(spec_cells):
        spec_column[row] = cell
    spec_df[specgram_name] = spec_column

    index_df = pd.DataFrame(index_rows, columns = ['subject', 'game']).astype(int)

    return CerealTimeKillersDataset(df = spec_df, transform = transform), index_df


In [5]:
def CerealTimeKillersDataSplitter(full_dataset, exp_index, 
                                  allocation_test = None, 
                                  test_ratio = 0.2, target_test = [], k_folds = 10, 
                                  batch_size_train = 64, batch_size_test = 256, 
                                  seed = 0, generator = None, num_workers = 2):
    """Split a dataset into a test loader and K-fold train/validation loaders.

    Args:
        full_dataset: dataset to split.
        exp_index: DataFrame with ``subject`` and ``game`` columns, row-aligned
            with ``full_dataset``. Its index labels are used directly as
            dataset positions (assumes a default 0..n-1 index - TODO confirm
            for callers that build their own index).
        allocation_test: ``None`` for a random split by ``test_ratio``, or
            ``'subject'`` / ``'game'`` to hold out the rows whose value in
            that column is listed in ``target_test``.
        test_ratio: fraction of samples used for testing in the random split.
        target_test: subject/game values held out for testing (read-only here).
        k_folds: number of cross-validation folds.
        batch_size_train: batch size of the training loaders.
        batch_size_test: batch size of the validation and test loaders.
        seed: random state of the K-fold shuffling.
        generator: optional ``torch.Generator`` used for the random split,
            the samplers and the loaders.
        num_workers: worker processes per ``DataLoader``.

    Returns:
        ``(data_loader, sizes)`` where ``data_loader`` is
        ``{'train': [loader per fold], 'val': [loader per fold], 'test': loader}``
        and ``sizes`` is ``(n_train, n_val, n_test)`` with the train/val
        counts taken from the last fold. Returns ``None`` (after printing a
        message) when ``allocation_test`` is not recognised.
    """
    # Split into train/val and test datasets
    if allocation_test is None:
        test_size = int(test_ratio * len(full_dataset))
        train_size = len(full_dataset) - test_size
        train_set_orig, test_set_orig = random_split(full_dataset, 
                                                     [train_size, test_size], 
                                                     generator = generator)
    elif allocation_test in ('subject', 'game'):
        held_out = exp_index[allocation_test].isin(target_test)
        train_set_orig = Subset(full_dataset, exp_index[~held_out].index.tolist())
        test_set_orig = Subset(full_dataset, exp_index[held_out].index.tolist())
    else:
        print("Allocate testing dataset based on one of the 'subject', 'game', or None.")
        return None

    # Test dataset loader. All randomness goes through the `generator`
    # argument rather than a notebook-level global, so the function also
    # works when imported elsewhere.
    test_loader = DataLoader(test_set_orig,
                             batch_size = batch_size_test,
                             num_workers = num_workers,
                             generator = generator)

    # K-fold Cross Validator
    train_loader, val_loader = [], []
    kfold = KFold(n_splits = k_folds, shuffle = True, random_state = seed)
    for train_i, val_i in kfold.split(train_set_orig):

        # Fold membership is expressed through samplers over the same
        # train/val subset, so every fold's loader shares one dataset object.
        train_sampler = SubsetRandomSampler(train_i, generator = generator)
        val_sampler = SubsetRandomSampler(val_i, generator = generator)

        # Train/Validation dataset loader
        train_loader.append(DataLoader(train_set_orig,
                                       sampler = train_sampler,
                                       batch_size = batch_size_train,
                                       num_workers = num_workers,
                                       generator = generator))
        val_loader.append(DataLoader(train_set_orig,
                                     sampler = val_sampler,
                                     batch_size = batch_size_test,
                                     num_workers = num_workers,
                                     generator = generator))

    data_loader = {'train': train_loader, 'val': val_loader, 'test': test_loader}
    return data_loader, (len(train_sampler), len(val_sampler), len(test_set_orig))


In [6]:
class CerealTimeKillersLabels:
    """Label selection used to assemble the model's columns."""
    # The full label list is built as: info + electrode + prediction.
    # EDIT the three lists below as needed.

    # Options: ['subject', 'game', 'gender', 'age', 'disturbance', 'experience', 'memory']
    info = list()

    # Options: ['AF3', 'AF4', 'F3', 'F4', 'F7', 'F8', 'FC5', 'FC6', 'O1', 'O2', 'P7', 'P8', 'T7', 'T8']
    electrode = [
        'AF3', 'AF4', 'F3', 'F4', 'F7', 'F8', 'FC5',
        'FC6', 'O1', 'O2', 'P7', 'P8', 'T7', 'T8',
    ]

    # Options: ['satisfied', 'boring', 'horrible', 'calm', 'funny', 'valence', 'arounsal']
    prediction = ['boring', 'horrible', 'calm', 'funny']
    # prediction = ['valence', 'arounsal']

    # Columns that are not electrode channels
    fixed = [*info, *prediction]

    # Every label handed to the model, in order
    label = [*info, *electrode, *prediction]

    
class CerealTimeKillersDir:
    """Locations of the label file and the recordings folder."""
    # Common prefix prepended to both paths below
    base = ''
    # CSV file with the per-recording labels
    label = base + 'GameLabels.csv'
    # Root folder of the EEG recordings
    game = base + 'GAMEEMO'
    

In [7]:
# ---------------- General settings ----------------
# Allow samples from different subjects and games to be mixed in one dataset
Is_between_subject = True  # Default is True
# Criterion used to allocate the testing dataset (only when Is_between_subject = True)
Allocation_test = None  # One of [None, 'subject', 'game']; default is None
# Proportion of data used for testing when Allocation_test is None
test_ratio = 0.2
# Games/subjects (ints) held out as the testing dataset
Target_test = [25, 26, 27]

# ---------------- Model structural settings ----------------
N_inputtime = None  # Time window for input sampling (None uses all timepoints)
N_stridetime = 1    # Temporal leap for input sampling
N_perseg = 256      # N per seg of spectrogram
N_framerate = 128   # Framerate of spectrogram

# ---------------- Model training settings ----------------
batch_size_train = 16  # Examples per minibatch during training
batch_size_test = 32   # Examples per minibatch during validation/testing
k_folds = 10           # Number of K-folds for training vs validation

# ---------------- Data transformation ----------------
data_transform = transforms.Compose([transforms.ToTensor()]) if Is_between_subject else None

# ---------------- Reproducibility ----------------
SEED = 2021
set_seed(seed = SEED)
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

# Set device
# DEVICE = set_device()
# print('Current device:', DEVICE)


Random seed 2021 has been set.


<torch._C.Generator at 0x7fd245fe33b0>

In [8]:
# Build the full dataset together with its (subject, game) index table
FullDataset, ExpIndex = CerealTimeKillersDataLoader(
    CerealTimeKillersDir,
    CerealTimeKillersLabels,
    dataset_mix = Is_between_subject,
    winlen = N_inputtime,
    stride = N_stridetime,
    nperseg = N_perseg,
    fs = N_framerate,
    transform = data_transform,
)

# Split it into the test loader and the K-fold train/validation loaders
SplittedDataset, SplittedDataLength = CerealTimeKillersDataSplitter(
    FullDataset,
    exp_index = ExpIndex,
    allocation_test = Allocation_test,
    test_ratio = test_ratio,
    target_test = Target_test,
    k_folds = k_folds,
    batch_size_train = batch_size_train,
    batch_size_test = batch_size_test,
    seed = SEED,
    generator = g_seed,
)


In [9]:
# Unpack the loaders produced by the splitter
TrainDataLoader = SplittedDataset['train']
ValDataLoader = SplittedDataset['val']
TestDataLoader = SplittedDataset['test']

print('Train/Val/Test Dataset length:', SplittedDataLength)
for fold in range(k_folds):
    idx = 0
    # `.dataset` is the dataset the fold's loader was built on, not the
    # fold's sampled subset, so `idx` addresses that underlying dataset.
    fold_dataset = TrainDataLoader[fold].dataset
    print('\n%d/%d Fold' % (fold + 1, k_folds))
    print('----------------------------')
    print('Input data size:', fold_dataset[idx]['spectrogram'].size())
    print('Output data size:', fold_dataset[idx]['labels'].shape)
    print('Output label example:', fold_dataset[idx]['labels'])


Train/Val/Test Dataset length: (79, 8, 21)

1/10 Fold
----------------------------
Input data size: torch.Size([129, 170, 14])
Output data size: (1, 4)
Output label example: [[7.0 1.0 8.0 3.0]]

2/10 Fold
----------------------------
Input data size: torch.Size([129, 170, 14])
Output data size: (1, 4)
Output label example: [[7.0 1.0 8.0 3.0]]

3/10 Fold
----------------------------
Input data size: torch.Size([129, 170, 14])
Output data size: (1, 4)
Output label example: [[7.0 1.0 8.0 3.0]]

4/10 Fold
----------------------------
Input data size: torch.Size([129, 170, 14])
Output data size: (1, 4)
Output label example: [[7.0 1.0 8.0 3.0]]

5/10 Fold
----------------------------
Input data size: torch.Size([129, 170, 14])
Output data size: (1, 4)
Output label example: [[7.0 1.0 8.0 3.0]]

6/10 Fold
----------------------------
Input data size: torch.Size([129, 170, 14])
Output data size: (1, 4)
Output label example: [[7.0 1.0 8.0 3.0]]

7/10 Fold
----------------------------
Input data 