In [None]:
#######################################################################################################################
# Project: Deep Virtual Rapport Agent (data preprocessing)
#
#     Jan Ondras (jo951030@gmail.com)
#     Institute for Creative Technologies, University of Southern California
#     April-October 2019
#
#######################################################################################################################
# Generate segmented/sequenced dataset from nvb dataset
#
#     For head gestures nod, shake, and tilt.
#
#     Run this notebook after annotate_features.ipynb has been run.
#     It also splits the dataset into train/val/(test) partitions.
#
#     Input features: dvra_datasets/nvb/annotated_features
#     Output dataset: dvra_datasets/nvb/segmented_datasets
#
#     The generated dataset was used for the development of the Head Gesture Detector.
#######################################################################################################################

In [1]:
###########################################################
# Reproducibility: seed NumPy's global RNG at the top of the notebook. The train/val split
# in generate_dataset() draws the validation start index with np.random.randint, so the
# split depends on this seed and on the order in which generate_dataset() is called.
import numpy as np
random_seed = 37
np.random.seed(random_seed)
# NOTE(review): importing set_random_seed from the top-level tensorflow package looks like
# the TensorFlow 1.x API -- confirm the TF version of the target environment. TensorFlow is
# not otherwise used in the visible part of this notebook.
from tensorflow import set_random_seed
set_random_seed(random_seed)
###########################################################

import os
import glob
import time
import warnings
import pandas as pd
from collections import defaultdict


# Mask value used to pre-pad feature windows in generate_dataset() and stored in the output
# file under the key 'MASK_VALUE'. If all features for a given sample timestep are equal to
# MASK_VALUE, then the sample timestep will be masked (skipped).
# Cannot use MASK_VALUE=0.0, as it corresponds to no movement (derivatives are zero).
# Cannot use MASK_VALUE=np.inf, as RandomUnderSampler cannot handle NaN and Inf values.
MASK_VALUE = 7777777.7777777
    
    
def generate_dataset(selected_features, window_size, val_size, head_gesture,
                     input_dir=None, output_dir=None, mask_value=None):
    '''
    Build and save a segmented (windowed) train/val/test dataset for one head gesture.

    Every recording (one csv file in input_dir) is split into a contiguous validation
    part, starting at a random index, and the remaining train part(s) on its left and
    right. A test partition containing all data of every recording is prepared as well
    (for cross-dataset testing).

    Each partition is segmented into same-length (window_size) sequences: one window per
    original timestep, ending at that timestep. Feature windows are pre-padded with
    mask_value and label windows with 0 (not a nod/shake/tilt). Train and val parts are
    padded independently, so no window spans the train/val boundary.

    For other datasets need to modify to include all available annotations (for nod, shake, tilt).

    One output file is saved: <output_dir>/nvb_<head_gesture>_<window_size>ws_<n_features>f.npz
    with the keys X_train, Y_train, X_val, Y_val, X_test, Y_test (3D arrays),
    train_len, val_len, test_len (number of segments per recording),
    selected_features, WINDOW_SIZE and MASK_VALUE.

    Parameters
    ----------
    selected_features : sequence of str
        Names of the feature columns to take from every csv file.
    window_size : int
        Length of every segment (>= 1).
    val_size : float
        Fraction (between 0 and 1) of every recording used for validation.
    head_gesture : str
        Name of the annotation (label) column, e.g. 'nod', 'shake' or 'tilt'.
    input_dir : str, optional
        Directory with the annotated feature csv files. Defaults to the original
        hard-coded nvb location.
    output_dir : str, optional
        Directory the .npz file is written to (created if missing). Defaults to the
        original hard-coded nvb location.
    mask_value : float, optional
        Padding value for the features. Defaults to the module-level MASK_VALUE.

    Raises
    ------
    ValueError
        If window_size < 1 or val_size is outside [0, 1].
    FileNotFoundError
        If input_dir contains no csv files.
    '''

    # Fall back to the locations/values the notebook originally hard-coded
    if input_dir is None:
        input_dir = '/home/ICT2000/jondras/dvra_datasets/nvb/annotated_features'
    if output_dir is None:
        output_dir = '/home/ICT2000/jondras/dvra_datasets/nvb/segmented_datasets/'
    if mask_value is None:
        mask_value = MASK_VALUE

    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')
    if not 0 <= val_size <= 1:
        raise ValueError(f'val_size must be between 0 and 1, got {val_size}')

    # Copy into a list so that tuples (or other sequences) of column names work as well
    feature_columns = list(selected_features)
    n_features = len(feature_columns)
    dataset_type = f'{window_size}ws_{n_features}f'

    # exist_ok avoids the check-then-create race of os.path.exists() + os.makedirs()
    os.makedirs(output_dir, exist_ok=True)

    print(f'Head gesture: {head_gesture}')
    print(f'Window size: {window_size}')
    print(f'Selected features: \n\t{selected_features}')


    def get_segments(df):
        '''
        Generate segments (X (features) and Y (labels)) from the dataframe.
        The last column of df holds the labels, all other columns are features.

        Returns 2 lists (each of length len(df)): 2D feature windows and 1D label windows.
        '''

        values = df.values

        # Pre-pad all features with (window_size - 1) mask_value-s
        padded_features = np.pad(values[:, :-1], ((window_size - 1, 0), (0, 0)),
                                 mode='constant', constant_values=(mask_value, mask_value))
        # Labels are padded with 0 mask value (indicating no gesture)
        padded_labels   = np.pad(values[:, -1],  (window_size - 1, 0),
                                 mode='constant', constant_values=(0, 0))

        assert padded_features.shape[1] == n_features
        assert padded_labels.shape[0] == padded_features.shape[0]
        assert len(padded_features) - window_size + 1 == len(df), 'Padding failed!'

        # Slide window of length window_size over the padded features/labels
        X = [padded_features[i:i + window_size] for i in range(len(df))]
        Y = [padded_labels[i:i + window_size] for i in range(len(df))]

        return X, Y


    # Find the annotated feature files (sorted => deterministic order of the RNG draws)
    input_filenames = np.array(sorted(glob.glob(os.path.join(input_dir, '*.csv'))))
    n_subjects = len(input_filenames)
    if n_subjects == 0:
        # Fail early instead of saving an empty dataset and dividing by zero below
        raise FileNotFoundError(f'No csv files found in {input_dir}')
    print(f'\t {n_subjects} subjects/sessions')

    # Segment into train+val set
    start_time = time.time()
    segments = defaultdict(list)
    for annotated_features_file in input_filenames:
        # Take only selected features and annotation columns (label column is the last one).
        df = pd.read_csv(annotated_features_file)[feature_columns + [head_gesture]]

        # Split recording into train and val partitions
        # Validation part starts at a random start index and has the length int(val_size * len(df))
        # start_idx (included), end_idx (excluded)
        val_len = int(val_size * len(df))
        start_idx = np.random.randint(0, len(df) - val_len + 1)
        if val_len < window_size:
            warnings.warn(f'Validation size {val_len} is less than {window_size}!')

        # Get segments from the val partition
        X_val, Y_val = get_segments(df=df.iloc[start_idx:start_idx + val_len])
        # Get segments from the LHS of the val partition
        X_train_1, Y_train_1 = get_segments(df=df.iloc[0:start_idx])
        # Get segments from the RHS of the val partition
        X_train_2, Y_train_2 = get_segments(df=df.iloc[start_idx + val_len:len(df)])

        assert len(X_val) == val_len
        assert len(X_train_1) + len(X_train_2) == len(df) - val_len

        segments['X_train'].extend(X_train_1)
        segments['X_train'].extend(X_train_2)
        segments['X_val'].extend(X_val)

        segments['Y_train'].extend(Y_train_1)
        segments['Y_train'].extend(Y_train_2)
        segments['Y_val'].extend(Y_val)

        # Record lengths of each batch of segments (needed for correct evaluation)
        segments['train_len'].append((len(X_train_1), len(X_train_2)))
        segments['val_len'].append(len(X_val))

        # Get all segments for cross-dataset testing
        X_test, Y_test = get_segments(df=df)
        assert len(X_test) == len(df)
        segments['X_test'].extend(X_test)
        segments['Y_test'].extend(Y_test)
        segments['test_len'].append(len(X_test))

    # Convert lists to numpy arrays; X and Y are made 3D (as needed for training).
    # The explicit reshape keeps the 3D shape even when a partition has no segments.
    for key in list(segments):
        segments[key] = np.array(segments[key])
        if key[0] == 'X':
            segments[key] = segments[key].reshape(-1, window_size, n_features)
        elif key[0] == 'Y':
            segments[key] = segments[key].reshape(-1, window_size, 1)
        print(key, segments[key].shape)

    # Save train/val/test segmented data together with the settings used to create them
    segments['selected_features'] = feature_columns
    segments['WINDOW_SIZE'] = window_size
    segments['MASK_VALUE'] = mask_value
    np.savez(os.path.join(output_dir, f'nvb_{head_gesture}_{dataset_type}'), **segments)

    n_train = len(segments['X_train'])
    n_val = len(segments['X_val'])
    n_examples = n_train + n_val
    if n_examples > 0:
        print(f"\t\t train:val = {n_train}:{n_val} = {n_train/n_examples}:{n_val/n_examples}")
    else:
        print(f"\t\t train:val = {n_train}:{n_val}")
    for partition in ['train', 'val', 'test']:
        # Class counts are taken over the label of the last timestep of every segment
        print(f'\t\t Number of {partition} examples per class: \t{np.unique(segments[f"Y_{partition}"][:, -1], return_counts=True)}')
    print(f'\t\t Total time taken: {time.time() - start_time} s')
    print('====================================================================================================')


In [2]:
# Feature set 1: the 'diff_' columns of head pose translation (T*) and rotation (R*).
# NOTE(review): presumably first-order differences computed in annotate_features.ipynb -- confirm.
selected_features_1 = [
    'diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz',
    'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz',
]

# Feature set 2: feature set 1 plus the matching 'diff2_' columns, ordered as
# translation diff, translation diff2, rotation diff, rotation diff2.
selected_features_2 = (
    selected_features_1[:3]
    + ['diff2_ pose_Tx', 'diff2_ pose_Ty', 'diff2_ pose_Tz']
    + selected_features_1[3:]
    + ['diff2_ pose_Rx', 'diff2_ pose_Ry', 'diff2_ pose_Rz']
)

# Generate one dataset per (feature set, window size, head gesture) combination.
for sf in (selected_features_1, selected_features_2):
    for ws in (32,):
        for hg in ('nod', 'shake', 'tilt'):
            generate_dataset(
                selected_features=sf,
                window_size=ws,
                val_size=0.15,
                head_gesture=hg,
            )

Head gesture: nod
Window size: 32
Selected features: 
	['diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz', 'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz']
	 38 subjects/sessions
X_train (193832, 32, 6)
X_val (34182, 32, 6)
Y_train (193832, 32, 1)
Y_val (34182, 32, 1)
train_len (38, 2)
val_len (38,)
X_test (228014, 32, 6)
Y_test (228014, 32, 1)
test_len (38,)
		 train:val = 193832:34182 = 0.8500881524818652:0.14991184751813486
		 Number of train examples per class: 	(array([0., 1.]), array([181300,  12532]))
		 Number of val examples per class: 	(array([0., 1.]), array([31484,  2698]))
		 Number of test examples per class: 	(array([0., 1.]), array([212784,  15230]))
		 Total time taken: 51.197590827941895 s
Head gesture: shake
Window size: 32
Selected features: 
	['diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz', 'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz']
	 38 subjects/sessions
X_train (193832, 32, 6)
X_val (34182, 32, 6)
Y_train (193832, 32, 1)
Y_val (34182, 32, 1)
train_le