## Generate dataset and split in a subject-independent way

In [1]:
###########################################################
# Fix the random seeds before anything else runs, so that repeated runs give the same results.
# random_seed is also passed explicitly to shuffle() and RandomUnderSampler further down in this file.
import numpy as np
random_seed = 37
np.random.seed(random_seed)
# NOTE(review): a top-level `tensorflow.set_random_seed` looks like the TF 1.x API -- confirm against
# the installed TensorFlow version before upgrading.
from tensorflow import set_random_seed
set_random_seed(random_seed)
###########################################################

import os
import glob
import time
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle
from imblearn.under_sampling import RandomUnderSampler


def split_dataset(val_size=0.2, test_size=0.15,
                  input_annotated_features_dir='/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe',
                  dataset_split_filename='/home/ICT2000/jondras/datasets/vra1/dataset_split'):
    '''
    Split dataset (csv files of recordings) into train/val/test partitions.

    The split is done on whole csv files, so the frames of one file never end up in two partitions.
    The file list is sorted and then shuffled with the module-level random_seed, which makes the split
    repeatable. The number of val/test files is truncated to an integer (int(n * size)); the train
    partition receives all remaining files, so the realised ratio can differ from the requested one.

    Args:
        val_size: fraction of the files assigned to the validation partition.
        test_size: fraction of the files assigned to the test partition.
        input_annotated_features_dir: directory that is searched for '*.csv' files.
        dataset_split_filename: path handed to np.savez for the split; the archive stores the
            file names under the keys 'train', 'val' and 'test'.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to more than 1.
        FileNotFoundError: if the input directory contains no csv files.
    '''

    if val_size < 0 or test_size < 0 or val_size + test_size > 1:
        raise ValueError(f'val_size and test_size must be non-negative and sum to at most 1, '
                         f'got val_size={val_size}, test_size={test_size}')

    # Sort first so that the shuffle below does not depend on the order in which glob lists the files
    input_filenames = sorted(glob.glob(os.path.join(input_annotated_features_dir, '*.csv')))
    if not input_filenames:
        # Without this check the ratio printout below would fail with a ZeroDivisionError
        raise FileNotFoundError(f'No csv files found in {input_annotated_features_dir!r}')

    # sklearn's shuffle returns a shuffled copy (it does not work in place), hence the re-assignment
    input_filenames = shuffle(input_filenames, random_state=random_seed)

    # Split
    n = len(input_filenames)
    n_val = int(n * val_size)
    n_test = int(n * test_size)
    n_train = n - n_val - n_test
    train_filenames = input_filenames[:n_train]
    val_filenames = input_filenames[n_train:n_train + n_val]
    test_filenames = input_filenames[n_train + n_val:]

    print(f'train:val:test = {n_train}:{n_val}:{n_test} = {n_train/n}:{n_val/n}:{n_test/n}\n')
    print(f'Train: \n\t{train_filenames}\n')
    print(f'Val: \n\t{val_filenames}\n')
    print(f'Test: \n\t{test_filenames}\n')

    # Save the dataset split
    np.savez(dataset_split_filename, train=train_filenames, val=val_filenames, test=test_filenames)
        
    
def _segment_recording(features, past_window_size, future_window_size, mask_value):
    '''
    Cut one recording into one flattened window per frame.

    features is a 2D array (#frames x #features). It is pre-padded with (past_window_size - 1) rows and
    post-padded with future_window_size rows of mask_value, so that every original frame gets a window
    of past_window_size + future_window_size rows that ends future_window_size rows after that frame.
    Returns a list with one 1D array (flattened window) per frame, in frame order.
    '''
    seq_len = past_window_size + future_window_size
    padded = np.pad(features, ((past_window_size - 1, future_window_size), (0, 0)),
                    mode='constant', constant_values=mask_value)
    n_windows = len(padded) - seq_len + 1
    assert n_windows == len(features), 'Padding failed!'
    # Windows are flattened because the random under-sampler works on 2D input;
    # the caller restores the 3D shape afterwards
    return [padded[start:start + seq_len].flatten() for start in range(n_windows)]


def segment_dataset(selected_features, past_window_size, future_window_size,
                    dataset_split_filename='/home/ICT2000/jondras/datasets/vra1/dataset_split',
                    output_dir='/home/ICT2000/jondras/datasets/vra1/segmented_datasets'):
    '''
    For each dataset partition (train/val/test):
        Segment dataset into same-length (PAST_WINDOW_SIZE + FUTURE_WINDOW_SIZE) sequences
            - if needed, sequences are pre-padded with (PAST_WINDOW_SIZE - 1) mask_value-s
            and post-padded with (FUTURE_WINDOW_SIZE) mask_value-s
        Undersample majority class (0 / not a nod)
        Save the data

    Args:
        selected_features: names of the csv columns used as features (their order is kept).
        past_window_size: number of past frames per sequence; it includes the current frame
            for which the annotation is considered, so it must be at least 1.
        future_window_size: number of future frames per sequence (0 or more).
        dataset_split_filename: path (without the '.npz' extension) of the archive that holds
            the csv file names under the keys 'train', 'val' and 'test'.
        output_dir: directory for the resulting archive; it is created if it does not exist.

    The archive is named dataset_rus_{past}pws_{future}fws_{#features}f and contains X_/y_ arrays for
    train, val and test (X is #sequences x #frames x #features), plus selected_features,
    PAST_WINDOW_SIZE, FUTURE_WINDOW_SIZE and MASK_VALUE. The label of each sequence is the 'nod'
    column value of its current frame.

    Raises:
        ValueError: if selected_features is empty, past_window_size < 1 or future_window_size < 0.
    '''

    # Fail fast on bad arguments, before any file is read
    if len(selected_features) == 0:
        raise ValueError('selected_features must contain at least one feature name')
    if past_window_size < 1 or future_window_size < 0:
        raise ValueError(f'Need past_window_size >= 1 and future_window_size >= 0, '
                         f'got {past_window_size} and {future_window_size}')

    # Past window includes the current frame for which the annotation is considered
    PAST_WINDOW_SIZE = past_window_size
    FUTURE_WINDOW_SIZE = future_window_size
    seq_len = PAST_WINDOW_SIZE + FUTURE_WINDOW_SIZE
    n_features = len(selected_features)

    # Mask value (if all features for a given sample timestep are equal to MASK_VALUE,
    # then the sample timestep will be masked (skipped))
    # Cannot use MASK_VALUE=0.0, as it corresponds to no movement (derivatives are zero)
    # Cannot use MASK_VALUE=np.inf, as RandomUnderSampler cannot handle NaNs and Inf values
    MASK_VALUE = 7777777.7777777

    print(f'Selected features: \n\t{selected_features}')

    dataset_output_filename = os.path.join(
        output_dir, f'dataset_rus_{PAST_WINDOW_SIZE}pws_{FUTURE_WINDOW_SIZE}fws_{n_features}f')
    os.makedirs(output_dir, exist_ok=True)

    X = defaultdict(list)
    y = defaultdict(list)
    start_time = time.time()

    # Read the file lists up front and close the archive again right away
    partitions = ('train', 'val', 'test')
    with np.load(dataset_split_filename + '.npz') as dataset_split:
        partition_files = {partition: dataset_split[partition] for partition in partitions}

    # For each dataset partition (train/val/test)
    for dataset_partition in partitions:

        print(f'\t{dataset_partition.upper()}')

        # Segment dataset into same-length sequences
        for annotated_features_file in partition_files[dataset_partition]:

            df = pd.read_csv(annotated_features_file)

            # np.pad keeps the dtype of its input, so cast to float: otherwise an integer-typed
            # column would silently truncate the fractional MASK_VALUE
            features = np.asarray(df[selected_features].values, dtype=float)

            # Features come from past and future frames
            X[dataset_partition].extend(
                _segment_recording(features, PAST_WINDOW_SIZE, FUTURE_WINDOW_SIZE, MASK_VALUE))
            # Label comes from the current frame (one label per frame, in frame order)
            y[dataset_partition].extend(df['nod'].values)

        # Undersample majority class (0 / not a nod)
        print(f'\t\tBefore random under-sampling, number of examples per class: {np.unique(y[dataset_partition], return_counts=True)}')
        rus = RandomUnderSampler(random_state=random_seed)
        X_resampled, y_resampled = rus.fit_resample(X[dataset_partition], y[dataset_partition])
        print(f'\t\tAfter random under-sampling, number of examples per class: {np.unique(y_resampled, return_counts=True)}')
        # Restore to 3D array (#sequences x #frames x #features)
        X[dataset_partition] = np.reshape(X_resampled, (X_resampled.shape[0], seq_len, n_features))
        y[dataset_partition] = np.array(y_resampled)
        print(f'\t\tX shape: {X[dataset_partition].shape}, y shape: {y[dataset_partition].shape}\n')

    # Save the dataset
    np.savez(dataset_output_filename,
             X_train=X['train'], X_val=X['val'], X_test=X['test'],
             y_train=y['train'], y_val=y['val'], y_test=y['test'],
             selected_features=selected_features,
             PAST_WINDOW_SIZE=PAST_WINDOW_SIZE, FUTURE_WINDOW_SIZE=FUTURE_WINDOW_SIZE, MASK_VALUE=MASK_VALUE)

    n = len(y['train']) + len(y['val']) + len(y['test'])
    print(f"\ttrain:val:test = {len(y['train'])}:{len(y['val'])}:{len(y['test'])} = {len(y['train'])/n}:{len(y['val'])/n}:{len(y['test'])/n}")
    print(f'\tTime taken: {time.time() - start_time} s\n')
    print('====================================================================================================')

In [2]:
# Split dataset into train/val/test with the defaults of split_dataset()
# (val_size=0.2, test_size=0.15, i.e. nominally 0.65/0.20/0.15 of the recordings).
# The val/test file counts are truncated to integers, so the realised ratio can differ
# (see the printed train:val:test line).
# DONE
split_dataset()

train:val:test = 30:9:6 = 0.6666666666666666:0.2:0.13333333333333333

Train: 
	['/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES144.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES148.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES101.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES161.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES142.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES39.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES98.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES109.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES95.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES102.csv', '/home/ICT2000/jondras/datasets/vra1/listener_annotat

In [3]:
# Generate datasets at 3 scales (past windows of 16, 32 and 64 frames, no future frames),
# once for each of the two feature sets below.
#
# Feature names are csv column names and are used verbatim (note the space after the prefix).
# Candidates that are currently switched off in both sets:
#     ' pose_Tx', ' pose_Ty', ' pose_Tz', ' pose_Rx', ' pose_Ry', ' pose_Rz',
#     ' p_rx', ' p_ry', ' p_rz', 'diff_ p_rx', 'diff_ p_ry', 'diff_ p_rz'
# TODO: add landmarks!

# Set 1: the 'diff_' pose columns only (6 features)
selected_features = [
    'diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz',
    'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz',
]

# Set 2: the 'diff_' and 'diff2_' pose columns (12 features)
selected_features_2 = [
    'diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz',
    'diff2_ pose_Tx', 'diff2_ pose_Ty', 'diff2_ pose_Tz',
    'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz',
    'diff2_ pose_Rx', 'diff2_ pose_Ry', 'diff2_ pose_Rz',
]

for sf in (selected_features, selected_features_2):
    # (past_window_size, future_window_size) pairs
    for pws, fws in ((16, 0), (32, 0), (64, 0)):
        segment_dataset(selected_features=sf, past_window_size=pws, future_window_size=fws)


Selected features: 
	['diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz', 'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz']
	TRAIN
		Before random under-sampling, number of examples per class: (array([0, 1]), array([110292,  17082]))
		After random under-sampling, number of examples per class: (array([0, 1]), array([17082, 17082]))
		X shape: (34164, 16, 6), y shape: (34164,)

	VAL
		Before random under-sampling, number of examples per class: (array([0, 1]), array([34494,  3750]))
		After random under-sampling, number of examples per class: (array([0, 1]), array([3750, 3750]))
		X shape: (7500, 16, 6), y shape: (7500,)

	TEST
		Before random under-sampling, number of examples per class: (array([0, 1]), array([23191,  2572]))
		After random under-sampling, number of examples per class: (array([0, 1]), array([2572, 2572]))
		X shape: (5144, 16, 6), y shape: (5144,)

	train:val:test = 34164:7500:5144 = 0.7298752350025637:0.1602290206802256:0.10989574431721073
	Time taken: 39.1502966880798

In [4]:
# Inspect dataset
# Sanity check: reload one of the saved archives (16 past frames, 0 future frames, 6 features)
# and print its feature names together with the shapes of the train and test arrays.
import numpy as np

PAST_WINDOW_SIZE = 16
FUTURE_WINDOW_SIZE = 0
N_FEATURES = 6
# Same naming pattern as the output file written by segment_dataset(), plus the '.npz' extension
dataset_filename = f'/home/ICT2000/jondras/datasets/vra1/segmented_datasets/dataset_rus_{PAST_WINDOW_SIZE}pws_{FUTURE_WINDOW_SIZE}fws_{N_FEATURES}f.npz'
data = np.load(dataset_filename)
print(data['selected_features'])
# X is (#sequences, #frames, #features), y has one label per sequence
print(data['X_train'].shape, data['y_train'].shape)
print(data['X_test'].shape, data['y_test'].shape)

['diff_ pose_Tx' 'diff_ pose_Ty' 'diff_ pose_Tz' 'diff_ pose_Rx'
 'diff_ pose_Ry' 'diff_ pose_Rz']
(34164, 16, 6) (34164,)
(5144, 16, 6) (5144,)
