In [1]:
###########################################################
# Reproducibility setup: fix the global NumPy and TensorFlow seeds.
# NOTE(review): this sits ahead of the remaining imports, presumably so the
# seeds are in place before any other library touches its RNG — confirm.
import numpy as np
random_seed = 37  # also read by split_dataset() as the KFold random_state
np.random.seed(random_seed)
# NOTE(review): a top-level `set_random_seed` looks like the TensorFlow 1.x
# API — confirm against the installed TensorFlow version.
from tensorflow import set_random_seed
set_random_seed(random_seed)
###########################################################

import os
import glob
import time
from collections import defaultdict
from sklearn.model_selection import KFold


def split_dataset(n_folds, input_annotated_features_dir=None, dataset_split_filename=None):
    '''
    Split dataset (csv files of recordings) into train+val/test partitions for each fold.

    Every ``*.csv`` file found in the input directory is treated as one subject.
    The sorted file list is partitioned with a shuffled ``KFold`` (seeded with the
    module-level ``random_seed``) and the result is written to a single ``.npz``
    archive holding, for each fold ``k``, the arrays ``'{k}_trainval'`` and
    ``'{k}_test'`` of file paths. The archive is then reloaded and checked.

    Args:
        n_folds: Number of folds; must satisfy ``2 <= n_folds <= number of csv files``.
            Setting it to the number of files gives leave-one-subject-out.
        input_annotated_features_dir: Directory with the per-subject csv files.
            Defaults to the original hard-coded dataset location.
        dataset_split_filename: Output path for ``np.savez`` (``.npz`` is appended
            by NumPy when missing). Defaults to the original hard-coded location,
            which embeds ``n_folds`` in the file name.

    Raises:
        ValueError: If ``n_folds`` is outside the valid range for the files found.
        AssertionError: If the reloaded split is not a valid k-fold partition.
    '''

    if input_annotated_features_dir is None:
        input_annotated_features_dir = '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe'
    if dataset_split_filename is None:
        dataset_split_filename = f'/home/ICT2000/jondras/datasets/vra1/subject_independent/dataset_split_{n_folds}fold'
    # Sorting makes the split independent of the order glob happens to return.
    input_filenames = np.array(sorted(glob.glob(input_annotated_features_dir + '/*.csv')))

    n_subjects = len(input_filenames)
    # Fail early with a clear message (also avoids a ZeroDivisionError for n_folds == 0).
    if n_folds < 2 or n_folds > n_subjects:
        raise ValueError(
            f'n_folds must be between 2 and the number of csv files ({n_subjects}), got {n_folds}'
        )

    # Nominal fold sizes. When n_subjects is not divisible by n_folds, KFold
    # gives the first (n_subjects % n_folds) folds one extra test subject.
    n_test_subjects = n_subjects // n_folds
    n_trainval_subjects = n_subjects - n_test_subjects
    print(f'{n_folds}-fold dataset: \n\t {n_trainval_subjects} train+val subjects \n\t {n_test_subjects} test subjects')

    folds = defaultdict(list)
    # KFold shuffles the indexes (not the filenames array itself) before splitting.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for k, (train_indexes, test_indexes) in enumerate(kf.split(input_filenames)):
        folds[f'{k}_trainval'] = input_filenames[train_indexes]
        folds[f'{k}_test']     = input_filenames[test_indexes]

    print(f'{folds}')

    # Save the dataset split
    np.savez(dataset_split_filename, **folds)

    # Verify the archive that was actually written. The checks below do not
    # assume equal-sized folds, so they also hold for uneven splits.
    saved_path = dataset_split_filename
    if not saved_path.endswith('.npz'):
        saved_path += '.npz'  # np.savez appends the extension when it is missing
    all_subjects = set(input_filenames)
    tested_subjects = []
    n_trainval_total = 0
    with np.load(saved_path) as d:
        for k in range(n_folds):
            trainval_k = d[f'{k}_trainval']
            test_k = d[f'{k}_test']
            # Within a fold: no leakage, and nothing dropped.
            assert set(trainval_k).isdisjoint(test_k), f'fold {k}: train+val/test overlap'
            assert set(trainval_k) | set(test_k) == all_subjects, f'fold {k}: subjects missing'
            n_trainval_total += len(trainval_k)
            tested_subjects.extend(test_k)
    # Across folds: every subject is tested exactly once and trained on in all other folds.
    assert n_trainval_total == (n_folds - 1) * n_subjects, n_trainval_total
    assert len(tested_subjects) == n_subjects, len(tested_subjects)
    assert set(tested_subjects) == all_subjects, len(set(tested_subjects))
    

In [2]:
# 9-fold subject-independent split (the captured output reports 40 train+val / 5 test subjects per fold)
split_dataset(n_folds=9)

9-fold dataset: 
	 40 train+val subjects 
	 5 test subjects
defaultdict(<class 'list'>, {'0_trainval': array(['/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES102.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES104.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES106.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES109.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES110.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES112.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES114.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES130.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES138.csv',
       '/home/ICT2000/jondras/datasets/vra1/listene

AssertionError: 45

In [2]:
# Leave-one-subject-out (LOSO): n_folds equals the number of recordings,
# so each fold holds out exactly one subject for testing.
split_dataset(n_folds=45)

45-fold dataset: 
	 44 train+val subjects 
	 1 test subjects
defaultdict(<class 'list'>, {'0_trainval': array(['/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES101.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES102.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES104.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES106.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES109.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES110.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES112.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES114.csv',
       '/home/ICT2000/jondras/datasets/vra1/listener_annotated_features_perframe/SES130.csv',
       '/home/ICT2000/jondras/datasets/vra1/listen