In [1]:
#####################################################################################
# Audio-driven upper-body motion synthesis on a humanoid robot
# Computer Science Tripos Part III Project
# Jan Ondras (jo356@cam.ac.uk), Trinity College, University of Cambridge
# 2017/18
#####################################################################################
###############################################################################################################
# Split dataset into train/val/test partitions (single split)
# Save the train/val/test masks
# 1.) For each subject's VID separately: for subject-dependent models
# 2.) Overall: for subject-independent models; data from one subject are present only in one of the 3 partitions
# REDONE after subject 19 eliminated
###############################################################################################################
##################
# 1.) DONE
##################

import numpy as np
import time
import glob

######################################################

# Fractions of each VID's examples assigned to the validation and test partitions;
# whatever remains forms the training partition.
val_frac = 0.15  # ok for SEGMENT_LEN=300, for SEGMENT_LEN=250 can have 0.10
test_frac = 0.15

# Sub-folder of ./../Dataset/ that holds the training examples
TE_folder = 'TrainingExamples_16kHz'

######################################################

np.random.seed(37) # for reproducibility

# Unique VIDs and the VID of every individual example
unique_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_unique_srt_VIDs.npz')['unique_srt_VIDs'] # sorted VIDs
all_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_VIDs.npz')['VIDs']

# Subject IDs are the first 5 characters of the VIDs. Take the unique prefixes
# instead of every second VID: the result is identical when each subject has
# exactly two VIDs, and stays correct (one sorted entry per subject) otherwise.
unique_srt_SIDs = np.unique([x[:5] for x in unique_srt_VIDs]) # ['PID02', 'PID05', ..

for v, VID in enumerate(unique_srt_VIDs):
    
    print VID
    mask = np.argwhere(all_srt_VIDs == VID)[:,0]

    #######################
    # Dataset split
    N_ex = len(mask)
    N_val_ex =   int(N_ex*val_frac)
    N_test_ex =  int(N_ex*test_frac)
    N_train_ex = N_ex - N_val_ex - N_test_ex

    print "\tDataset split (train/val/test): ", N_train_ex, "/", N_val_ex, "/", N_test_ex, " = ", N_ex, N_train_ex + N_val_ex + N_test_ex
    print "\tDataset split (train/val/test): ", 100.*(1.-val_frac-test_frac), "/", 100.*val_frac, "/", 100.*test_frac, "%"

    train_mask = mask[ : N_train_ex]
    val_mask   = mask[N_train_ex : N_train_ex + N_val_ex]
    test_mask  = mask[N_train_ex + N_val_ex : ]
    
    # Save
    np.savez('./../Dataset/'+TE_folder+'/Dataset_split/split_masks_' + VID + '.npz', 
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

# for s, SID in enumerate(unique_srt_SIDs):
    
#     print SID
#     mask1 = np.argwhere(all_srt_VIDs == SID + 'Task2')[:,0]
#     mask2 = np.argwhere(all_srt_VIDs == SID + 'Task3')[:,0]
#     mask = np.concatenate( (mask1, mask2) )
#     #print mask.shape, mask1.shape, mask2.shape

#     #######################
#     # Dataset split
#     N_ex = len(mask)
#     N_val_ex =   int(N_ex*val_frac)
#     N_test_ex =  int(N_ex*test_frac)
#     N_train_ex = N_ex - N_val_ex - N_test_ex

#     print "\tDataset split (train/val/test): ", N_train_ex, "/", N_val_ex, "/", N_test_ex, " = ", N_ex, N_train_ex + N_val_ex + N_test_ex
#     print "\tDataset split (train/val/test): ", 100.*(1.-val_frac-test_frac), "/", 100.*val_frac, "/", 100.*test_frac, "%"

#     # Randomise the dataset split
#     permI = np.random.permutation(N_ex)

#     train_mask = permI[ : N_train_ex]
#     val_mask   = permI[N_train_ex : N_train_ex + N_val_ex]
#     test_mask  = permI[N_train_ex + N_val_ex : ]
    
#     # Save
#     np.savez('./../Dataset/'+TE_folder+'/Dataset_split/split_masks_' + SID + '.npz', 
#             train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

PID02Task2
	Dataset split (train/val/test):  5341 / 1144 / 1144  =  7629 7629
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID02Task3
	Dataset split (train/val/test):  4487 / 961 / 961  =  6409 6409
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID05Task2
	Dataset split (train/val/test):  5335 / 1142 / 1142  =  7619 7619
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID05Task3
	Dataset split (train/val/test):  5479 / 1174 / 1174  =  7827 7827
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID06Task2
	Dataset split (train/val/test):  4127 / 883 / 883  =  5893 5893
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID06Task3
	Dataset split (train/val/test):  4321 / 925 / 925  =  6171 6171
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID08Task2
	Dataset split (train/val/test):  4585 / 982 / 982  =  6549 6549
	Dataset split (train/val/test):  70.0 / 15.0 / 15.0 %
PID08Task3
	Dataset split (train/val/test):  4365 / 935 / 935  =  6235 6235
	

In [4]:
##################
# 2.) DONE
##################

import numpy as np
import time
import glob

######################################################

# Number of subjects held out for the test and validation partitions
N_test_SIDs = 2
N_val_SIDs = 2

# Sub-folder of ./../Dataset/ that holds the training examples
TE_folder = 'TrainingExamples_16kHz'

######################################################

np.random.seed(37) # for reproducibility

# Unique VIDs and the VID of every individual example
unique_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_unique_srt_VIDs.npz')['unique_srt_VIDs'] # sorted VIDs
all_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_VIDs.npz')['VIDs']

# Subject IDs are the first 5 characters of the VIDs. Take the unique prefixes
# instead of every second VID: the result is identical when each subject has
# exactly two VIDs, and stays correct (one sorted entry per subject) otherwise.
unique_srt_SIDs = np.unique([x[:5] for x in unique_srt_VIDs]) # ['PID02', 'PID05', ..

# Total number of subjects, taken from the data rather than hard-coded (was 19),
# so the split cannot silently drop subjects or index out of range when the
# set of subjects in the dataset changes.
N_SIDs = len(unique_srt_SIDs)

########################################################################
# For the given subject IDs (SIDs, e.g. PID02), get the corresponding VIDs of both tasks
def get_subjects_VIDs(SIDs):
    """Return the VIDs of both tasks for every subject in SIDs.

    Each subject ID is suffixed with 'Task2' and 'Task3'. The result is a flat
    list ordered subject by subject, the Task2 VID preceding the Task3 VID,
    e.g. ['PID02'] -> ['PID02Task2', 'PID02Task3'].
    """
    return [SID + task for SID in SIDs for task in ('Task2', 'Task3')]

# For the given subjects, get the corresponding indices into the feature set and also their counts (per VID)
def get_subjects_indicies(SIDs):
    """Collect the example indices of the given subjects, plus per-VID counts.

    For every subject in SIDs, and for each of its two VIDs (SID + 'Task2',
    then SID + 'Task3'), the positions in the module-level all_srt_VIDs that
    equal that VID are gathered.

    Returns a pair (indicies, indicies_cnts):
      indicies      -- flat list of all matching positions, subject by subject,
                       Task2 before Task3
      indicies_cnts -- number of matching positions per VID, in the same order
                       (two entries per subject)
    """
    found_positions, per_VID_counts = [], []
    for SID in SIDs:
        for task in ('Task2', 'Task3'):
            hits = np.nonzero(all_srt_VIDs == SID + task)[0]
            found_positions.extend(hits)
            per_VID_counts.append(len(hits))
    return found_positions, per_VID_counts
# print frameCnts[26] + frameCnts[27] + frameCnts[0] + frameCnts[1] # checks
# print len(get_subjects_indicies(['PID20', 'PID02']))
########################################################################

N_train_SIDs = N_SIDs - N_test_SIDs - N_val_SIDs
print "Dataset split in terms of subjects (train/val/test): ", 100.*N_train_SIDs/N_SIDs, "/", 100.*N_val_SIDs/N_SIDs, "/", 100.*N_test_SIDs/N_SIDs, "%"
print 

# Randomise the dataset split
permI = np.random.permutation(N_SIDs)
print permI
print "Train SIDs",      unique_srt_SIDs[permI[:N_train_SIDs]]
print "Valid SIDs",      unique_srt_SIDs[permI[N_train_SIDs:N_train_SIDs+N_val_SIDs]]
print "Testi SIDs",      unique_srt_SIDs[permI[N_train_SIDs+N_val_SIDs:]]

train_SIDs_mask = permI[:N_train_SIDs] 
val_SIDs_mask =   permI[N_train_SIDs:N_train_SIDs+N_val_SIDs]
test_SIDs_mask =  permI[N_train_SIDs+N_val_SIDs:]

print "Train SIDs mask", train_SIDs_mask
print "Valid SIDs mask", val_SIDs_mask
print "Testi SIDs mask", test_SIDs_mask

train_VIDs = get_subjects_VIDs(unique_srt_SIDs[train_SIDs_mask])
val_VIDs   = get_subjects_VIDs(unique_srt_SIDs[val_SIDs_mask])
test_VIDs  = get_subjects_VIDs(unique_srt_SIDs[test_SIDs_mask])

print "Train VIDs", train_VIDs
print "Valid VIDs", val_VIDs
print "Testi VIDs", test_VIDs
print 
    
train_mask, train_VIDs_ind_cnts = get_subjects_indicies(unique_srt_SIDs[train_SIDs_mask])
val_mask, val_VIDs_ind_cnts     = get_subjects_indicies(unique_srt_SIDs[val_SIDs_mask])
test_mask, test_VIDs_ind_cnts   = get_subjects_indicies(unique_srt_SIDs[test_SIDs_mask])
# print test_VIDs_ind_cnts

print "Train/val/test set sizes: ", len(train_mask), "/", len(val_mask), "/", len(test_mask), " = ", len(train_mask) + len(val_mask) + len(test_mask)
print "Dataset split in terms of #examples (train/val/test): ", 100.*len(train_mask)/len(all_srt_VIDs), "/", 100.*len(val_mask)/len(all_srt_VIDs), "/", 100.*len(test_mask)/len(all_srt_VIDs), "%"
print 

# print len(train_mask), len(val_mask), len(test_mask)                         # checks
# print len(all_srt_VIDs),  len(train_mask) + len(val_mask) + len(test_mask)

# Save
np.savez('./../Dataset/'+TE_folder+'/Dataset_split/split_masks_all.npz', 
        train_mask=train_mask, val_mask=val_mask, test_mask=test_mask, 
        train_VIDs=train_VIDs, val_VIDs=val_VIDs, test_VIDs=test_VIDs, 
        train_VIDs_ind_cnts=train_VIDs_ind_cnts, val_VIDs_ind_cnts=val_VIDs_ind_cnts, test_VIDs_ind_cnts=test_VIDs_ind_cnts)

Dataset split in terms of subjects (train/val/test):  78.9473684211 / 10.5263157895 / 10.5263157895 %

[ 1  9  2  4 14  8  0  7 13 18  3 10  6  5 16 17 12 11 15]
Train SIDs ['PID05' 'PID16' 'PID06' 'PID09' 'PID22' 'PID15' 'PID02' 'PID13' 'PID21'
 'PID26' 'PID08' 'PID17' 'PID11' 'PID10' 'PID24']
Valid SIDs ['PID25' 'PID20']
Testi SIDs ['PID18' 'PID23']
Train SIDs mask [ 1  9  2  4 14  8  0  7 13 18  3 10  6  5 16]
Valid SIDs mask [17 12]
Testi SIDs mask [11 15]
Train VIDs ['PID05Task2', 'PID05Task3', 'PID16Task2', 'PID16Task3', 'PID06Task2', 'PID06Task3', 'PID09Task2', 'PID09Task3', 'PID22Task2', 'PID22Task3', 'PID15Task2', 'PID15Task3', 'PID02Task2', 'PID02Task3', 'PID13Task2', 'PID13Task3', 'PID21Task2', 'PID21Task3', 'PID26Task2', 'PID26Task3', 'PID08Task2', 'PID08Task3', 'PID17Task2', 'PID17Task3', 'PID11Task2', 'PID11Task3', 'PID10Task2', 'PID10Task3', 'PID24Task2', 'PID24Task3']
Valid VIDs ['PID25Task2', 'PID25Task3', 'PID20Task2', 'PID20Task3']
Testi VIDs ['PID18Task2', 'PID18Tas