## Config-1 Spec:
- 4 disjoint train/test splits.
- Training and Test data have NO overlapping pieces.
- For each split: 80% training 20% testing. (216 pieces and 54 pieces)
- Each training split sets aside 26 tracks as a validation set.



## Config-2 Spec:
- Only MUS pieces.
- Test data from ‘ENSTDkAm’ and ‘ENSTDkCl’
- Training data comes from everything else.



## Imports

In [8]:
import os
import shutil
import numpy as np

## Constants

In [2]:
# Location of the original MAPS dataset and of the two derived layouts.
MAPS_DIR = '../datasets/maps/'
MAPS_CONFIG_1_DIR = '../datasets/maps_config1/'
MAPS_CONFIG_2_DIR = '../datasets/maps_config2/'


def _config_1_fold_dirs(fold_name):
    """Return the (fold, train, test) directory paths of one Config-1 fold."""
    fold_dir = os.path.join(MAPS_CONFIG_1_DIR, fold_name)
    return (fold_dir,
            os.path.join(fold_dir, 'train'),
            os.path.join(fold_dir, 'test'))


# Config-1: four folds, each with its own train/ and test/ sub-directory.
(CONFIG_1_FOLD_1,
 CONFIG_1_FOLD_1_TRAIN,
 CONFIG_1_FOLD_1_TEST) = _config_1_fold_dirs('fold_1')
(CONFIG_1_FOLD_2,
 CONFIG_1_FOLD_2_TRAIN,
 CONFIG_1_FOLD_2_TEST) = _config_1_fold_dirs('fold_2')
(CONFIG_1_FOLD_3,
 CONFIG_1_FOLD_3_TRAIN,
 CONFIG_1_FOLD_3_TEST) = _config_1_fold_dirs('fold_3')
(CONFIG_1_FOLD_4,
 CONFIG_1_FOLD_4_TRAIN,
 CONFIG_1_FOLD_4_TEST) = _config_1_fold_dirs('fold_4')

# Config-2: a single train/test pair.
CONFIG_2_TRAIN_DIR = os.path.join(MAPS_CONFIG_2_DIR, 'train')
CONFIG_2_TEST_DIR = os.path.join(MAPS_CONFIG_2_DIR, 'test')

#### Make Dirs:

In [3]:
# Create the whole output tree for both configurations.
# os.makedirs also creates any missing parent directory, so a fresh checkout
# does not fail the way a bare os.mkdir would; the isdir guard keeps the cell
# safe to re-run.
for output_dir in (
        MAPS_CONFIG_1_DIR,
        MAPS_CONFIG_2_DIR,
        CONFIG_1_FOLD_1, CONFIG_1_FOLD_1_TRAIN, CONFIG_1_FOLD_1_TEST,
        CONFIG_1_FOLD_2, CONFIG_1_FOLD_2_TRAIN, CONFIG_1_FOLD_2_TEST,
        CONFIG_1_FOLD_3, CONFIG_1_FOLD_3_TRAIN, CONFIG_1_FOLD_3_TEST,
        CONFIG_1_FOLD_4, CONFIG_1_FOLD_4_TRAIN, CONFIG_1_FOLD_4_TEST,
        CONFIG_2_TRAIN_DIR,
        CONFIG_2_TEST_DIR,
):
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)


## Find Unique Pieces

In [4]:
def is_mus(file_name):
    """Tell whether a track name carries the 'MUS' category tag.

    The tag is the text that follows the first underscore of ``file_name``,
    cut at the next underscore and then at the first dash.
    """
    after_prefix = file_name.partition('_')[2]
    second_field = after_prefix.partition('_')[0]
    category = second_field.partition('-')[0]
    return category == 'MUS'

In [5]:
# Scan every sub-directory of MAPS_DIR for MUS recordings, collecting the
# track names, the piece names derived from them, and the total .wav size.
unique_pieces = {}
mus_pieces = {}
total_size = 0
for subdir_name in os.listdir(MAPS_DIR):
    subdir_path = os.path.join(MAPS_DIR, subdir_name)
    if not os.path.isdir(subdir_path):
        continue
    for dir_parent, dir_name, file_names in os.walk(subdir_path):
        for name in file_names:
            if not name.endswith('.wav'):
                continue
            track_name = name.split('.wav')[0]
            if not is_mus(track_name):
                continue
            mus_pieces[track_name] = True
            # Piece name: text after the first dash, up to the last underscore.
            after_dash = track_name.partition('-')[2]
            unique_piece = after_dash.rpartition('_')[0]
            unique_pieces[unique_piece] = True
            wav_path = os.path.join(dir_parent, name)
            total_size += os.path.getsize(wav_path)

In [6]:
# Report the total size of the MUS .wav files in whole (decimal) MB and GB.
# Floor division keeps the integer results on both Python 2 and Python 3, and
# the parenthesised print form is valid on both.
mbytes = total_size // 1000000
gbytes = mbytes // 1000
print("MB:" + str(mbytes))
print("GB:" + str(gbytes))

MB:11496
GB:11


In [7]:
# Number of distinct MUS track names found.
len(mus_pieces)

270

In [8]:
# Number of distinct piece names found.
len(unique_pieces)

160

## Partition Config-1

- Get all unique pieces. 
- Separate into 4 disjoint sets.
- Bin files into respective fold train test folders.
- Within each set, 80% train 20% test
- Check size of each set.
- Check size of each train and test in each set.

In [9]:
# Split the unique piece names into 4 disjoint folds.
# sorted() accepts both the dict built by the scan and the list left behind by
# a previous run of this cell, so the cell can be re-run; together with the
# fixed seed it also makes the fold assignment reproducible.
FOLD_SEED = 0
unique_pieces = sorted(unique_pieces)
np.random.RandomState(FOLD_SEED).shuffle(unique_pieces)
l = np.array_split(unique_pieces, 4)
fold_1, fold_2, fold_3, fold_4 = l[0], l[1], l[2], l[3]
# Filled by the binning cell with the .wav file names of each fold.
fold_1_wavs, fold_2_wavs, fold_3_wavs, fold_4_wavs = [], [], [], []

#### Bin .wav paths into folds:

In [10]:
# Map every piece name to the list collecting the .wav names of its fold.
# setdefault keeps the first fold that claims a piece, mirroring an
# if/elif chain over fold_1..fold_4.
fold_bins = {}
for fold_pieces, fold_wavs in ((fold_1, fold_1_wavs),
                               (fold_2, fold_2_wavs),
                               (fold_3, fold_3_wavs),
                               (fold_4, fold_4_wavs)):
    # Empty the bin in place so re-running this cell does not append duplicates.
    del fold_wavs[:]
    for piece in fold_pieces:
        fold_bins.setdefault(piece, fold_wavs)

# Walk the dataset and drop each MUS .wav file name into its fold's bin.
for subdir_name in os.listdir(MAPS_DIR):
    subdir_path = os.path.join(MAPS_DIR, subdir_name)
    if not os.path.isdir(subdir_path):
        continue
    for dir_parent, dir_name, file_names in os.walk(subdir_path):
        for name in file_names:
            if not name.endswith('.wav'):
                continue
            track_name = name.split('.wav')[0]
            if not is_mus(track_name):
                continue
            unique_piece = track_name.partition('-')[2].rpartition('_')[0]
            fold_wavs = fold_bins.get(unique_piece)
            if fold_wavs is not None:
                fold_wavs.append(name)

In [11]:
# Number of .wav files binned into each fold (print form valid on Python 2 and 3).
print(len(fold_1_wavs))
print(len(fold_2_wavs))
print(len(fold_3_wavs))
print(len(fold_4_wavs))

65
64
72
69


In [12]:
def _split_train_test(fold, train_fraction=.8):
    """Return (train, test): the leading ``train_fraction`` of ``fold`` and the rest."""
    cut = int(train_fraction * len(fold))
    return fold[:cut], fold[cut:]


# Kept for backward compatibility: the cut index of the first fold.
train_index = int(.8 * len(fold_1))

# Each fold gets its own cut index, so the 80/20 ratio also holds when the
# folds do not all have the same number of pieces.
fold_1_train, fold_1_test = _split_train_test(fold_1)
fold_2_train, fold_2_test = _split_train_test(fold_2)
fold_3_train, fold_3_test = _split_train_test(fold_3)
fold_4_train, fold_4_test = _split_train_test(fold_4)

#### Copy .wav into respective fold train/test:

In [None]:
# (piece names, destination directory) pairs, listed in the order in which
# they are tried; the first pair containing the piece receives the files.
fold_destinations = (
    (fold_1_train, CONFIG_1_FOLD_1_TRAIN),
    (fold_1_test, CONFIG_1_FOLD_1_TEST),
    (fold_2_train, CONFIG_1_FOLD_2_TRAIN),
    (fold_2_test, CONFIG_1_FOLD_2_TEST),
    (fold_3_train, CONFIG_1_FOLD_3_TRAIN),
    (fold_3_test, CONFIG_1_FOLD_3_TEST),
    (fold_4_train, CONFIG_1_FOLD_4_TRAIN),
    (fold_4_test, CONFIG_1_FOLD_4_TEST),
)

# Copy the .wav/.mid pair of every MUS track into its fold's train or test folder.
for subdir_name in os.listdir(MAPS_DIR):
    subdir_path = os.path.join(MAPS_DIR, subdir_name)
    if not os.path.isdir(subdir_path):
        continue
    for dir_parent, dir_name, file_names in os.walk(subdir_path):
        for name in file_names:
            if not name.endswith('.wav'):
                continue
            track_name = name.split('.wav')[0]
            if not is_mus(track_name):
                continue
            wav = os.path.join(dir_parent, name)
            mid = os.path.join(dir_parent, track_name + '.mid')
            unique_piece = track_name.partition('-')[2].rpartition('_')[0]
            for pieces, destination in fold_destinations:
                if unique_piece in pieces:
                    shutil.copy2(wav, destination)
                    shutil.copy2(mid, destination)
                    break

## Partition Config-2

In [None]:
# Sub-directories of MAPS_DIR that the (currently disabled) copy step below
# routes to the Config-2 test set; all other sub-directories go to train.
accoustic_folders = ['ENSTDkAm', 'ENSTDkCl']

# Walk every sub-directory of MAPS_DIR and build the .wav/.mid paths of each
# MUS track.
for subdir_name in os.listdir(MAPS_DIR):
    subdir_path = os.path.join(MAPS_DIR, subdir_name)
    if not os.path.isdir(subdir_path):
        continue
    for dir_parent, dir_name, file_names in os.walk(subdir_path):
        for name in file_names:
            if name.endswith('.wav'):
                track_name = name.split('.wav')[0]
                
                # Partition music pieces only.
                if is_mus(track_name):
                    wav = os.path.join(dir_parent, name)
                    mid = os.path.join(dir_parent, track_name + '.mid')

                    # NOTE(review): the copy step is commented out, so as
                    # written this cell only computes paths and copies nothing.
                    # Presumably it was disabled after the files had been
                    # copied once -- confirm before re-enabling.
                    # Intended routing: if subdir_name is a test dir, copy the
                    # pair to the Config-2 test dir, otherwise to the train dir.
#                     if subdir_name in accoustic_folders:
#                         shutil.copy2(wav, CONFIG_2_TEST_DIR)
#                         shutil.copy2(mid, CONFIG_2_TEST_DIR)
#                     else:
#                         shutil.copy2(wav, CONFIG_2_TRAIN_DIR)
#                         shutil.copy2(mid, CONFIG_2_TRAIN_DIR)
                        

In [15]:
# Collect the names of all .wav files directly inside old_train.
OLD_TRAIN = '../datasets/maps_config2/old_train/'
MAPS_CONFIG2 = '../datasets/maps_config2/'

wavs = [entry for entry in os.listdir(OLD_TRAIN) if entry.endswith('.wav')]

In [16]:
# Partition into 4 sets.
# os.listdir returns names in arbitrary order, so sort first and shuffle with
# a fixed seed: the same files then land in the same partition on every run.
PARTITION_SEED = 0
wavs.sort()
np.random.RandomState(PARTITION_SEED).shuffle(wavs)
partitions = np.array_split(wavs, 4)

In [17]:
# Copy respective files into their own fold folders
def copy_wavs_to_folder(wavs, folder_name):
    """Copy each .wav in ``wavs``, plus its .mid, from OLD_TRAIN into a fold folder.

    Args:
        wavs: iterable of .wav file names located directly in OLD_TRAIN.
        folder_name: name of the destination folder under MAPS_CONFIG2; it is
            created when it does not exist yet.

    Raises:
        IOError/OSError: if a .wav or its matching .mid is missing from
            OLD_TRAIN (raised by shutil.copy2).
    """
    dest = os.path.join(MAPS_CONFIG2, folder_name)
    # shutil.copy2 treats a non-existent destination as a *file* name, so
    # without the directory every copy would overwrite one file called
    # <folder_name> instead of filling a folder.
    if not os.path.isdir(dest):
        os.makedirs(dest)

    for wav in wavs:
        # The .mid shares the track name: the part before '.wav'.
        mid = wav.split('.wav')[0] + '.mid'
        shutil.copy2(os.path.join(OLD_TRAIN, wav), dest)
        shutil.copy2(os.path.join(OLD_TRAIN, mid), dest)

In [18]:
# Partition k (0-based) goes to folder 'fold_<k+1>'.
for fold_number in (1, 2, 3, 4):
    copy_wavs_to_folder(partitions[fold_number - 1], 'fold_' + str(fold_number))