# Organize all .npz files into one directory

In [1]:
import os
import numpy as np
# import matplotlib.pyplot as plt
from pypianoroll import Multitrack, Track

In [2]:
# Input: dataset root; it is walked recursively for .npz files further below.
original_dir = './lpd_5_cleansed'
# Output: directory that receives the cut segments as .npy files.
target_dir = './lpd_5_cleansed-piano_non_empty400bool'

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(target_dir, exist_ok=True)

In [3]:
# One row per track, in the track order the segmentation code expects
# ('drums', 'piano', 'guitar', 'bass', 'strings'):
# (track name, program number, drum flag).
_track_table = [
    ('drums', 118, True),
    ('piano', 0, False),
    ('guitar', 25, False),
    ('bass', 33, False),
    ('strings', 49, False),
]

# Unzip the table into the three parallel lists used to build Track objects.
track_names, program_nums, is_drums = (
    list(column) for column in zip(*_track_table)
)

# Settings passed to Multitrack when piano rolls are loaded or rebuilt.
beat_resolution = 24
tempo = 80.0

In [4]:
# Segmentation settings.
cut_len = 400  # segment length in time steps; min length in this dataset is 600
cut_pitch_range = 84  # number of pitches kept per saved segment
num_tracks = 5  # size of the track axis of each segment
num_songs = 21425  # not referenced by the cells shown here

## Save each piano-roll segment as a .npy file

The output of the cell below has been cleared: it prints every file name, so the output is too long to keep.

In [None]:
# Cut every song into cut_len-step segments and save each segment whose piano
# track is non-empty as <count>.npy (boolean array) in target_dir.
count = 0  # number of segments saved so far; doubles as the output file name
pitch_start = 20  # lowest pitch index kept; the window spans cut_pitch_range pitches

for root, dirs, files in os.walk(original_dir):
    # Visit directories and files in sorted order so the numbering of the
    # saved segments does not depend on the file system's listing order.
    dirs.sort()
    for name in sorted(files):
        # Match the real extension only; a bare 'npz' suffix would also
        # accept names such as 'foonpz'.
        if not name.endswith('.npz'):
            continue

        npz_file = os.path.join(root, name)
        print(npz_file)

        mt = Multitrack(npz_file, beat_resolution=beat_resolution)

        # To verify that the track order is: 'drums', 'piano', 'guitar', 'bass', 'strings'
        # track_list = mt.tracks
        # for track in track_list:
        #     print(track.name)

        mt.binarize()
        pr = mt.get_stacked_pianorolls()  # shape=(num_time_step, 128, num_track)

        # Starting from time step 0, cut cut_len-time-step segments successively.
        # Stepping by cut_len never produces an empty trailing segment, even
        # when the song length is an exact multiple of cut_len.
        for start in range(0, pr.shape[0], cut_len):
            segment_pr = pr[start:start + cut_len, :, :]

            # Pad zero to the last segment when it is shorter than cut_len.
            # The buffer reuses pr's dtype instead of NumPy's float64 default.
            if segment_pr.shape[0] < cut_len:
                padded = np.zeros((cut_len,) + pr.shape[1:], dtype=pr.dtype)
                padded[:segment_pr.shape[0]] = segment_pr
                segment_pr = padded

            # Skip the segment whose piano track is empty
            if not segment_pr[..., 1].any():  # track 1 is piano
                continue
            count += 1

            # Optional: save the piano-roll figure of each segment.
            # Needs the matplotlib import to be enabled and `pianoroll_dir`
            # to be defined (it is not defined in the cells shown here).
            # tracks = list()
            # for j in range(segment_pr.shape[2]):
            #     track = Track(segment_pr[..., j], program_nums[j], is_drums[j], track_names[j])
            #     tracks.append(track)
            # cut_mt = Multitrack(tracks=tracks, tempo=tempo, beat_resolution=beat_resolution)
            # cut_mt.plot(filepath=os.path.join(pianoroll_dir, str(count)) + '.png')
            # plt.close('all')

            # Save the segment as .npy file, keeping only the configured pitch window
            segment_data = segment_pr[:, pitch_start:pitch_start + cut_pitch_range, :]
            np.save(os.path.join(target_dir, str(count) + '.npy'), segment_data.astype(np.bool_))

## Split into three sub-dirs for train, develop and test

In [16]:
import shutil
from random import shuffle

In [17]:
from random import Random

# Fixed seed for the train/develop/test shuffle; change it to draw a different split.
split_seed = 0

# os.listdir returns entries in arbitrary order, so sort before shuffling:
# together with the fixed seed this makes the resulting split reproducible.
sample_names = sorted(f for f in os.listdir(target_dir) if f.endswith('.npy'))
Random(split_seed).shuffle(sample_names)

In [19]:
# Split all files for training, development, test respectively 8:1:1
split_idx1 = int(len(sample_names) * 0.8)
split_idx2 = int(len(sample_names) * 0.9)

train_samples = sample_names[:split_idx1]
dev_samples = sample_names[split_idx1:split_idx2]
test_samples = sample_names[split_idx2:]

subset_dirs = ['train', 'develop', 'test']

# Create each sub-directory, report its sample count, then move its files
# out of target_dir into it. One loop replaces three copy-pasted move loops.
for sub, samples in zip(subset_dirs, (train_samples, dev_samples, test_samples)):
    sub_dirname = os.path.join(target_dir, sub)
    # exist_ok=True keeps the cell re-runnable without a separate existence check.
    os.makedirs(sub_dirname, exist_ok=True)
    print(len(samples))
    for f in samples:
        shutil.move(os.path.join(target_dir, f), os.path.join(sub_dirname, f))

163658
20457
20458
