In [1]:
#######################################################################################################################
# Project: Deep Virtual Rapport Agent (data preprocessing)
#
#     Jan Ondras (jo951030@gmail.com)
#     Institute for Creative Technologies, University of Southern California
#     April-October 2019
#
#######################################################################################################################
# Generate segmented/sequenced dataset from vra1, sewa, hatice2010, and nvb datasets
#
#     For all head gestures (nod, shake, tilt)
#
#     The annotate_features.ipynb scripts of these datasets need to be run first! 
#
#     Single subject-independent split into train-val-test
#
#     Optionally perform augmentations. Sample outputs:
#         4comb_nod_32ws_12f.npz (without augmentations)
#         4comb_nod_32ws_12f_A1.npz (augmentation 1 applied)
#
#     Assumes that the features from all datasets are already resampled to same frequency and annotated.
#
#     Note: the 1st and 2nd order differences will have to be recalculated if augmentation is applied
#
#     Input features: dvra_datasets/vra1/listener_annotated_features
#                     dvra_datasets/sewa/annotated_features
#                     dvra_datasets/hatice2010/annotated_features
#                     dvra_datasets/nvb/annotated_features
#
#     Output dataset: dvra_datasets/4comb/segmented_datasets/
#
#                     (print outputs from this script were also saved in 
#                      deep-virtual-rapport-agent/notes/results/log_generate_4comb_dataset.docx)
#
#     The generated dataset was used for the development of the Head Gesture Detector.
#     In future, the ccdb dataset can also be included in the training data, extending the 4comb dataset to 5comb.
#######################################################################################################################

In [1]:
###########################################################
import numpy as np
random_seed = 37
np.random.seed(random_seed)
from tensorflow import set_random_seed
set_random_seed(random_seed)
###########################################################

import os
import glob
import time
import warnings
import pandas as pd
from collections import defaultdict
from sklearn.utils import shuffle


# Mask value (if all features for a given sample timestep are equal to MASK_VALUE, 
# then the sample timestep will be masked (skipped))
# Cannot use MASK_VALUE=0.0, as it corresponds to no movement (derivatives are zero)
# Cannot use MASK_VALUE=np.inf, as RandomUnderSampler cannot handle NaNs and Inf values
MASK_VALUE = 7777777.7777777


def generate_4comb_dataset(selected_features, window_size, val_size, test_size, head_gesture, 
                           augment, augment_params={}):
    
#     dataset_names = ['vra1', 'sewa', 'hatice2010', 'nvb']
    
    dataset_type = f'{window_size}ws_{len(selected_features)}f'
    dataset_output_filename_prefix = f'/home/ICT2000/jondras/dvra_datasets/4comb/segmented_datasets/4comb_{head_gesture}_{dataset_type}'
        
    # Nod offsets (needed for vra1 dataset only)
    nods_offsets_filename = f'/home/ICT2000/jondras/datasets/vra1/offsetListenerNods.txt'
    nods_offsets = np.loadtxt(nods_offsets_filename)
    
    print(f'Head gesture: {head_gesture}')
    print(f'Window size: {window_size}')
    print(f'Val size: {val_size}\t Test size: {test_size}')
    print(f'Selected features: \n\t{selected_features}')
    if augment:
        print(f'Augmentation params: \n\t{augment_params}')
        dataset_output_filename_prefix += f'_A{augment_params["augment_id"]}'
    print()
        
    start_time = time.time()
    # Segments (and chunk lengths) of all 4 datasets
    segments = defaultdict(list)
        
    def get_croppped_dataframe(annotated_features_file):
        '''
        Needed for vra1 dataset only
        Ignore the beginning of the recording prior to the beep & take only selected features and annotations.
        '''
        
        # Get subject ID and its offset
        sid = annotated_features_file.split('/')[-1].split('.')[0][3:]
        offset = nods_offsets[int(sid) - 1]
#         print(sid, offset)
        
        df = pd.read_csv(annotated_features_file)
        df = df[df[' timestamp'] >= offset][selected_features + [head_gesture]]
        
        return df
    
    
    def get_segments(df):
        '''
        Generate segments (X (features) and Y (labels)) from the dataframe. 
        
        Returns 2 lists of 2D arrays.
        '''
        
        X = []
        Y = []

        # Pre-pad all features and labels with (window_size - 1) MASK_VALUE-s 
        padded_features = np.pad(df.values[:, :-1], ((window_size - 1, 0), (0, 0)), 
                                 mode='constant', constant_values=(MASK_VALUE, MASK_VALUE))
        # Labels are padded with 0 mask value (indicating not a nod)
        padded_labels   = np.pad(df.values[:, -1],  (window_size - 1, 0), 
                                 mode='constant', constant_values=(0, 0))
        
        assert padded_features.shape[1] == len(selected_features)
        assert padded_labels.shape[0] == padded_features.shape[0]
        assert len(padded_features) - window_size + 1 == len(df), 'Padding failed!'

        # Slide window of length window_size over the padded features/labels
        for i in range(len(df)):       
            X.append( padded_features[i:i + window_size] )
            Y.append( padded_labels[i:i + window_size] )

        return X, Y
    
    def augment_df(df, augment_params):
        # Perform augmentatations of the given dataframe (contains only selected features and annotation)
        # Only on training dataframes!
        
        # Third approach: pre-defined augmentations
        # Horizontal flip (negation) of 1st and 2nd differences of rotation features
        # Specified by binary encodings, e.g., '101' means to flip the Rx and Rz feature types
        augmented_dfs = []
        for bin_augment_encoding in augment_params['horizontal_flip_bin_encodings']:
            assert len(bin_augment_encoding) == len(augment_params['horizontal_flip_feature_types'])
            augmented_df = df.copy()
            for i, flip_feature in enumerate(bin_augment_encoding):
                if flip_feature == '1':
                    feature_name = augment_params['horizontal_flip_feature_types'][i]
                    augmented_df['diff_' + feature_name] = -augmented_df['diff_' + feature_name]
                    augmented_df['diff2_' + feature_name] = -augmented_df['diff2_' + feature_name]
            augmented_dfs.append(augmented_df)
                                                                                                
#         # Second approach: random augmentations
#         # Random horizontal flip (negation) of 1st and 2nd differences of features given by augment_params['horizontal_flip_feature_types'] independently
#         # Augmentations consist of the original df and other augment_params['n_augment'] dfs
#         # Each augmentation is encoded as a decimal/binary number
#         # E.g. for 6 feature types: 100001 means to flip the first and last feature type
#         augmented_dfs = []
#         # Add original df, corresponds to decimal encoding 0
#         augmented_dfs.append(df.copy())
#         n_feature_types_to_augment = len(augment_params['horizontal_flip_feature_types'])
#         # Shift by 1 since the original df (encoded as 0) is already included
#         dec_augment_encodings = np.random.choice((2 ** n_feature_types_to_augment) - 1, 
#                                                  augment_params['n_augment'], replace=False) + 1
#         #print(dec_augment_encodings)
#         for dec_augment_encoding in dec_augment_encodings:
#             augmented_df = df.copy()
#             # Convert decimal to binary encoding and perform augmentations
#             bin_augment_encoding = f'{dec_augment_encoding:0{n_feature_types_to_augment}b}'
#             print(f'\t\tbin_augment_encoding: {bin_augment_encoding}')
#             for i, flip_feature in enumerate(bin_augment_encoding):
#                 #print(i, flip_feature)
#                 if flip_feature == '1':
#                     feature_name = augment_params['horizontal_flip_feature_types'][i]
#                     augmented_df['diff_' + feature_name] = -augmented_df['diff_' + feature_name]
#                     augmented_df['diff2_' + feature_name] = -augmented_df['diff2_' + feature_name]
#             augmented_dfs.append(augmented_df)
        
#         # First approach: may generate duplicate augmentations
#         augmented_dfs = []
#         for _ in range(augment_params['n_augment']):
#             augmented_df = df.copy()
            
#             # Random horizontal flip of each time-series independently
#             for feature_name in augment_params['horizontal_flip_features']:
#                 if np.random.uniform() < augment_params['horizontal_flip_probab']:
#                     augmented_df[feature_name] = -augmented_df[feature_name]
                    
#             # Stretch / squeeze
#             # ...
#             # Recalculate differences
#             # Fix annotations
                    
#             augmented_dfs.append(augmented_df)
        
        return augmented_dfs
    
    ######################################################################################################
    # vra1 dataset
    #
    #     each recording is different subject (in total 45 subjects)

    if head_gesture == 'nod':
        
        # Load the annotated feature files
        input_annotated_features_dir = '/home/ICT2000/jondras/dvra_datasets/vra1/listener_annotated_features_perframe'
        input_filenames = np.array(sorted(glob.glob(input_annotated_features_dir + '/*.csv')))

        n_subjects = len(input_filenames)
        n_test_subjects = int(n_subjects * test_size)
        n_val_subjects = int(n_subjects * val_size)
        n_train_subjects = n_subjects - n_val_subjects - n_test_subjects
        # print(f'\t {n_subjects} subjects/sessions')
        print(f'vra1 dataset: \n\t {n_train_subjects} train subjects \n\t {n_val_subjects} val subjects \n\t {n_test_subjects} test subjects')

        # Shuffle and split filenames
        input_filenames = shuffle(input_filenames, random_state=random_seed)

        # Split dataset recordings into train-val-test in a subject-independent manner
        # data_split maps dataset_partition (train/val/test) to list of recordings (csv filenames)
        data_split = dict()
        data_split['train'] = input_filenames[:n_train_subjects]
        data_split['val']   = input_filenames[n_train_subjects:n_train_subjects + n_val_subjects]
        data_split['test']  = input_filenames[n_train_subjects + n_val_subjects:]
    #     print(data_split)

        # Get training, validation and testing segments
        for dataset_partition in ['train', 'val', 'test']:
            for annotated_features_file in data_split[dataset_partition]:
                df = get_croppped_dataframe(annotated_features_file)

                # Apply random augmentations to training dataframes
                if augment and dataset_partition == 'train':
                    dfs = augment_df(df, augment_params)
                else:
                    dfs = [df]

                for df_i in dfs:
                    X, Y = get_segments(df=df_i)
                    assert len(X) == len(df_i)
                    segments[f'X_{dataset_partition}'].extend(X)
                    segments[f'Y_{dataset_partition}'].extend(Y)
                    segments[f'{dataset_partition}_len'].append(len(X))

        for key in segments.keys(): print(key, len(segments[key]))
        print()
        
    ######################################################################################################
    # sewa dataset
    #
    #     several recordings per subject (in total 275 subjects)
    
    if (head_gesture == 'nod') or (head_gesture == 'shake'):

        # Load the annotated feature files 
        input_annotated_features_dir = '/home/ICT2000/jondras/dvra_datasets/sewa/annotated_features'
        # Map subject ID to a list of its recordings
        sid_to_filenames = defaultdict(list)
        for annotated_features_file in sorted(glob.glob(input_annotated_features_dir + '/*.csv')):
            sid = int(annotated_features_file.split('/')[-1].split('_')[3][1:])
            sid_to_filenames[sid].append(annotated_features_file)
        sids = list(sid_to_filenames.keys())
    #     print(sid_to_filenames)

        n_subjects = len(sids)
        n_test_subjects = int(n_subjects * test_size)
        n_val_subjects = int(n_subjects * val_size)
        n_train_subjects = n_subjects - n_val_subjects - n_test_subjects
        # print(f'\t {n_subjects} subjects/sessions')
        print(f'sewa dataset: \n\t {n_train_subjects} train subjects \n\t {n_val_subjects} val subjects \n\t {n_test_subjects} test subjects')

        # Shuffle and split filenames by subject
        sids = shuffle(sids, random_state=random_seed)
    #     print(sids)
    #     print(len(sids))

        # Split dataset recordings into train-val-test in a subject-independent manner
        # data_split maps dataset_partition (train/val/test) to list of recordings (csv filenames)
        data_split = dict()
        data_split['train'] = [filename for sid in sids[:n_train_subjects] for filename in sid_to_filenames[sid]]
        data_split['val'] = [filename for sid in sids[n_train_subjects:n_train_subjects + n_val_subjects] for filename in sid_to_filenames[sid]]
        data_split['test'] = [filename for sid in sids[n_train_subjects + n_val_subjects:] for filename in sid_to_filenames[sid]]
    #     print(data_split)

        # Get training, validation and testing segments
        for dataset_partition in ['train', 'val', 'test']:
            for annotated_features_file in data_split[dataset_partition]:
                # Take only selected features and annotation columns
                df = pd.read_csv(annotated_features_file)[selected_features + [head_gesture]]

                # Apply random augmentations to training dataframes
                if augment and dataset_partition == 'train':
                    dfs = augment_df(df, augment_params)
                else:
                    dfs = [df]

                for df_i in dfs:
                    X, Y = get_segments(df=df_i)
                    assert len(X) == len(df_i)
                    segments[f'X_{dataset_partition}'].extend(X)
                    segments[f'Y_{dataset_partition}'].extend(Y)
                    segments[f'{dataset_partition}_len'].append(len(X))

        for key in segments.keys(): print(key, len(segments[key]))
        print()
        
    ######################################################################################################
    # hatice2010 dataset
    #
    #     several recordings per subject (in total 8 (nod and shake) + 7 (other) subjects)
    
    if (head_gesture == 'nod') or (head_gesture == 'shake'):

        subject_groups = ['nodshake', 'other']
        sids = {
            'nodshake': list(range(1, 9)), 
            'other':    list(range(9, 16))
        }
    #     print(sids)
        # Load the annotated feature files
        input_annotated_features_dir = '/home/ICT2000/jondras/dvra_datasets/hatice2010/annotated_features'
        # Map subject ID to a list of its recordings
        sid_to_filenames = defaultdict(list)
        for annotated_features_file in sorted(glob.glob(input_annotated_features_dir + '/*.csv')):
            sid = int(annotated_features_file.split('/')[-1].split('_')[0])
            sid_to_filenames[sid].append(annotated_features_file)
    #     print(sid_to_filenames)

        # Split dataset recordings into train-val-test in a subject-independent manner
        # data_split maps dataset_partition (train/val/test) to list of recordings (csv filenames)
        data_split = defaultdict(list)

        # Split each group of subjects separately
        for subject_group in subject_groups:

            n_subjects = len(sids[subject_group])
            n_test_subjects = int(n_subjects * test_size)
            n_val_subjects = int(n_subjects * val_size)
            n_train_subjects = n_subjects - n_val_subjects - n_test_subjects
            print(f'hatice2010 dataset [{subject_group}]: \n\t {n_train_subjects} train subjects \n\t {n_val_subjects} val subjects \n\t {n_test_subjects} test subjects')

            # Shuffle and split filenames by subject
            sids[subject_group] = shuffle(sids[subject_group], random_state=random_seed)
    #         print(sids[subject_group])

            data_split['train'].extend([filename for sid in sids[subject_group][:n_train_subjects] for filename in sid_to_filenames[sid]])
            data_split['val'].extend([filename for sid in sids[subject_group][n_train_subjects:n_train_subjects + n_val_subjects] for filename in sid_to_filenames[sid]])
            data_split['test'].extend([filename for sid in sids[subject_group][n_train_subjects + n_val_subjects:] for filename in sid_to_filenames[sid]])
    #         print(data_split)

            # Get training, validation and testing segments
            for dataset_partition in ['train', 'val', 'test']:
                for annotated_features_file in data_split[dataset_partition]:
                    # Take only selected features and annotation columns
                    df = pd.read_csv(annotated_features_file)[selected_features + [head_gesture]]

                    # Apply random augmentations to training dataframes
                    if augment and dataset_partition == 'train':
                        dfs = augment_df(df, augment_params)
                    else:
                        dfs = [df]

                    for df_i in dfs:
                        X, Y = get_segments(df=df_i)
                        assert len(X) == len(df_i)
                        segments[f'X_{dataset_partition}'].extend(X)
                        segments[f'Y_{dataset_partition}'].extend(Y)
                        segments[f'{dataset_partition}_len'].append(len(X))

        for key in segments.keys(): print(key, len(segments[key]))
        print()

    ######################################################################################################
    # nvb dataset
    #
    #     each recording is different subject (in total 38 subjects)
    
    if (head_gesture == 'nod') or (head_gesture == 'shake') or (head_gesture == 'tilt'):

        # Load the annotated feature files
        input_annotated_features_dir = '/home/ICT2000/jondras/dvra_datasets/nvb/annotated_features'
        input_filenames = np.array(sorted(glob.glob(input_annotated_features_dir + '/*.csv')))

        n_subjects = len(input_filenames)
        n_test_subjects = int(n_subjects * test_size)
        n_val_subjects = int(n_subjects * val_size)
        n_train_subjects = n_subjects - n_val_subjects - n_test_subjects
        # print(f'\t {n_subjects} subjects/sessions')
        print(f'nvb dataset: \n\t {n_train_subjects} train subjects \n\t {n_val_subjects} val subjects \n\t {n_test_subjects} test subjects')

        # Shuffle and split filenames
        input_filenames = shuffle(input_filenames, random_state=random_seed)

        # Split dataset recordings into train-val-test in a subject-independent manner
        # data_split maps dataset_partition (train/val/test) to list of recordings (csv filenames)
        data_split = dict()
        data_split['train'] = input_filenames[:n_train_subjects]
        data_split['val']   = input_filenames[n_train_subjects:n_train_subjects + n_val_subjects]
        data_split['test']  = input_filenames[n_train_subjects + n_val_subjects:]
    #     print(data_split)

        # Get training, validation and testing segments
        for dataset_partition in ['train', 'val', 'test']:
            for annotated_features_file in data_split[dataset_partition]:
                # Take only selected features and annotation columns
                df = pd.read_csv(annotated_features_file)[selected_features + [head_gesture]]

                # Apply random augmentations to training dataframes
                if augment and dataset_partition == 'train':
                    dfs = augment_df(df, augment_params)
                else:
                    dfs = [df]

                for df_i in dfs:
                    X, Y = get_segments(df=df_i)
                    assert len(X) == len(df_i)
                    segments[f'X_{dataset_partition}'].extend(X)
                    segments[f'Y_{dataset_partition}'].extend(Y)
                    segments[f'{dataset_partition}_len'].append(len(X))

        for key in segments.keys(): print(key, len(segments[key]))
        print()
        
    ######################################################################################################

    # Convert lists to numpy arrays and reshape Y to be 3D (as needed for training)
    for key in segments.keys():
        segments[key] = np.array(segments[key])
        if key[0] == 'Y':
            segments[key] = np.expand_dims(segments[key], axis=-1)               
        print(key, segments[key].shape)

    # Save train/val/test segmented data for this fold
    segments['selected_features'] = selected_features
    segments['WINDOW_SIZE'] = window_size
    segments['MASK_VALUE'] = MASK_VALUE
    segments['val_size'] = val_size
    segments['test_size'] = test_size
    segments['head_gesture'] = head_gesture
    segments['augment'] = augment
    segments['augment_params'] = augment_params
    np.savez(dataset_output_filename_prefix, **segments)

    n_examples = len(segments['X_train']) + len(segments['X_val']) + len(segments['X_test'])
    print(f"\t\t train:val:test = {len(segments['X_train'])}:{len(segments['X_val'])}:{len(segments['X_test'])} = {len(segments['X_train'])/n_examples}:{len(segments['X_val'])/n_examples}:{len(segments['X_test'])/n_examples}")
    for partition in ['train', 'val', 'test']:
        print(f'\t\t Number of {partition} examples per class: \t{np.unique(segments[f"Y_{partition}"][:, -1], return_counts=True)}')        
    print(f'\t\t Total time taken: {time.time() - start_time} s')
    print('====================================================================================================')


In [2]:
#######################################
# Without augmentations
#######################################

# selected_features_1 = [
#     'diff_ pose_Tx', 
#     'diff_ pose_Ty', 
#     'diff_ pose_Tz',

#     'diff_ pose_Rx', 
#     'diff_ pose_Ry', 
#     'diff_ pose_Rz',
# ]
selected_features_2 = [
    'diff_ pose_Tx', 
    'diff_ pose_Ty', 
    'diff_ pose_Tz',
    
    'diff2_ pose_Tx', 
    'diff2_ pose_Ty', 
    'diff2_ pose_Tz',

    'diff_ pose_Rx', 
    'diff_ pose_Ry', 
    'diff_ pose_Rz',
    
    'diff2_ pose_Rx', 
    'diff2_ pose_Ry', 
    'diff2_ pose_Rz',
]

for sf in [selected_features_2]:
    for ws in [32]:
        for hg in ['nod', 'shake', 'tilt']:
            generate_4comb_dataset(selected_features=sf, window_size=ws, val_size=0.15, test_size=0.15, 
                                   head_gesture=hg, augment=False)

Head gesture: nod
Window size: 32
Val size: 0.15	 Test size: 0.15
Selected features: 
	['diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz', 'diff2_ pose_Tx', 'diff2_ pose_Ty', 'diff2_ pose_Tz', 'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz', 'diff2_ pose_Rx', 'diff2_ pose_Ry', 'diff2_ pose_Rz']

vra1 dataset: 
	 33 train subjects 
	 6 val subjects 
	 6 test subjects
X_train 134112
Y_train 134112
train_len 33
X_val 25857
Y_val 25857
val_len 6
X_test 24784
Y_test 24784
test_len 6

sewa dataset: 
	 193 train subjects 
	 41 val subjects 
	 41 test subjects
X_train 346441
Y_train 346441
train_len 413
X_val 63623
Y_val 63623
val_len 81
X_test 74601
Y_test 74601
test_len 89

hatice2010 dataset [nodshake]: 
	 6 train subjects 
	 1 val subjects 
	 1 test subjects
hatice2010 dataset [other]: 
	 5 train subjects 
	 1 val subjects 
	 1 test subjects
X_train 365648
Y_train 365648
train_len 864
X_val 64549
Y_val 64549
val_len 109
X_test 77327
Y_test 77327
test_len 151

nvb dataset: 
	 28 train sub

In [2]:
#######################################
# Generate augmented dataset
#######################################

# selected_features_1 = [
#     'diff_ pose_Tx', 
#     'diff_ pose_Ty', 
#     'diff_ pose_Tz',

#     'diff_ pose_Rx', 
#     'diff_ pose_Ry', 
#     'diff_ pose_Rz',
# ]
selected_features_2 = [
    'diff_ pose_Tx', 
    'diff_ pose_Ty', 
    'diff_ pose_Tz',
    
    'diff2_ pose_Tx', 
    'diff2_ pose_Ty', 
    'diff2_ pose_Tz',

    'diff_ pose_Rx', 
    'diff_ pose_Ry', 
    'diff_ pose_Rz',
    
    'diff2_ pose_Rx', 
    'diff2_ pose_Ry', 
    'diff2_ pose_Rz',
]

# Augmentation parameters
augment_params = {
    # Third approach
    'augment_id': 1,
    'horizontal_flip_feature_types': [
        ' pose_Rx', 
        ' pose_Ry', 
        ' pose_Rz',
    ],
    'horizontal_flip_bin_encodings': [
        '000',
        '010',
        '100',
        '110',
        '001',
        '011',
        '101', 
        '111'
    ]
    # Second approach
#     'augment_id': 1,
#     'n_augment': 6, 
#     'horizontal_flip_feature_types': [
#         ' pose_Tx', 
#         ' pose_Ty', 
#         ' pose_Tz',
#         ' pose_Rx', 
#         ' pose_Ry', 
#         ' pose_Rz',
#     ]
    # First approach
#     'horizontal_flip_features': selected_features_2, 
#     'horizontal_flip_probab': 0.5
}

for sf in [selected_features_2]:
    for ws in [32]:
        for hg in ['nod', 'shake', 'tilt']:
            generate_4comb_dataset(selected_features=sf, window_size=ws, val_size=0.15, test_size=0.15, 
                                   head_gesture=hg, augment=True, augment_params=augment_params)

Head gesture: nod
Window size: 32
Val size: 0.15	 Test size: 0.15
Selected features: 
	['diff_ pose_Tx', 'diff_ pose_Ty', 'diff_ pose_Tz', 'diff2_ pose_Tx', 'diff2_ pose_Ty', 'diff2_ pose_Tz', 'diff_ pose_Rx', 'diff_ pose_Ry', 'diff_ pose_Rz', 'diff2_ pose_Rx', 'diff2_ pose_Ry', 'diff2_ pose_Rz']
Augmentation params: 
	{'augment_id': 1, 'horizontal_flip_feature_types': [' pose_Rx', ' pose_Ry', ' pose_Rz'], 'horizontal_flip_bin_encodings': ['000', '010', '100', '110', '001', '011', '101', '111']}

vra1 dataset: 
	 33 train subjects 
	 6 val subjects 
	 6 test subjects
X_train 1072896
Y_train 1072896
train_len 264
X_val 25857
Y_val 25857
val_len 6
X_test 24784
Y_test 24784
test_len 6

sewa dataset: 
	 193 train subjects 
	 41 val subjects 
	 41 test subjects
X_train 2771528
Y_train 2771528
train_len 3304
X_val 63623
Y_val 63623
val_len 81
X_test 74601
Y_test 74601
test_len 89

hatice2010 dataset [nodshake]: 
	 6 train subjects 
	 1 val subjects 
	 1 test subjects
hatice2010 dataset [othe