In [None]:
'''
This is the data processing script for POP909: A Pop-song Dataset for Music Arrangement Generation
============
It lets you quickly convert the POP909 MIDI files into Google Magenta's music representation,
    as used by [Music Transformer](https://magenta.tensorflow.org/music-transformer)
           and [Performance RNN](https://magenta.tensorflow.org/performance-rnn).

'''


import pickle
import os
import sys
import utils

import pretty_midi as pyd
import numpy as np



In [None]:
from processor import MidiEventProcessor

In [None]:
# Running count of event tokens produced by every preprocess_midi call so far.
total = 0
def preprocess_midi(path):
    """Encode all notes of one MIDI file as a single event sequence.

    The notes of every instrument in the file are pooled into one list,
    their start and end times are rounded to two decimals, and the list is
    sorted by start time before being handed to ``MidiEventProcessor.encode``.

    Args:
        path: Location of the MIDI file, passed to ``pretty_midi.PrettyMIDI``.

    Returns:
        The sequence returned by ``MidiEventProcessor.encode``. Its length is
        also added to the module-level ``total`` counter.
    """
    global total
    midi = pyd.PrettyMIDI(path)

    # No melody/accompaniment split is made here: every track's notes end up
    # in the same pool, so the encoder receives one flat note list.
    notes = [note for track in midi.instruments for note in track.notes]
    for note in notes:
        note.start = round(note.start, 2)
        note.end = round(note.end, 2)
    notes.sort(key=lambda note: note.start)

    repr_seq = MidiEventProcessor().encode(notes)
    total += len(repr_seq)
    return repr_seq

def preprocess_pop909_single_midi(midi_root, save_dir):
    """Encode every MIDI file found directly inside one folder.

    Each directory entry whose name ends in "mid" is passed to
    ``preprocess_midi``. The resulting sequences are collected into one
    array and written to ``pop909-event-token.npy`` in the current working
    directory, overwriting any file of that name.

    Args:
        midi_root: Folder that holds the MIDI files. A trailing path
            separator is optional.
        save_dir: Currently unused; the output location is fixed. Kept so
            existing callers keep working.

    Returns:
        None. If a KeyboardInterrupt or EOFError is raised while encoding a
        file, processing stops and nothing is saved.
    """
    sequences = []
    for path in os.listdir(midi_root):
        print(' ', end='[{}]'.format(path), flush=True)
        # os.path.join works whether or not midi_root ends with a separator.
        filename = os.path.join(midi_root, path)

        if not filename.endswith("mid"):
            continue
        try:
            data = preprocess_midi(filename)
        except KeyboardInterrupt:
            print(' Abort')
            return
        except EOFError:
            print('EOF Error')
            return
        sequences.append(data)

    try:
        save_py = np.array(sequences)
    except ValueError:
        # Sequences of different lengths cannot form a rectangular array;
        # recent NumPy versions raise here instead of silently building an
        # object array, so request the object dtype explicitly.
        save_py = np.array(sequences, dtype=object)
    print(save_py.size)
    np.save("pop909-event-token.npy", save_py)
            
def preprocess_pop909(midi_root, save_dir):
    """Encode the main MIDI file of every numbered entry in the dataset root.

    Only entries of ``midi_root`` whose names parse as integers are
    processed. For each such entry ``<name>``, the file
    ``<midi_root>/<name>/<name>.mid`` is passed to ``preprocess_midi``. All
    resulting sequences are stored in one object array and written to
    ``pop909-event-token.npy`` in the current working directory, overwriting
    any file of that name.

    Args:
        midi_root: Dataset root folder. A trailing path separator is
            optional.
        save_dir: Currently unused; the output location is fixed. Kept so
            existing callers keep working.

    Returns:
        None. If a KeyboardInterrupt or EOFError is raised while encoding a
        file, processing stops and nothing is saved.
    """
    sequences = []
    for path in os.listdir(midi_root):
        # Skip anything whose name is not a number (index files, etc.).
        try:
            int(path)
        except ValueError:
            continue

        # os.path.join works whether or not midi_root ends with a separator.
        filename = os.path.join(midi_root, path, path + ".mid")
        print(filename)
        try:
            data = preprocess_midi(filename)
        except KeyboardInterrupt:
            print(' Abort')
            return
        except EOFError:
            print('EOF Error')
            return
        sequences.append(data)

    # dtype=object lets sequences of different lengths share one array.
    save_py = np.array(sequences, dtype=object)
    print(save_py.size)
    np.save("pop909-event-token.npy", save_py)
             
# Replace the folder below with your POP909 data folder.
# Open question: should each MIDI file get its own .npy file, or should the entire dataset be put into one file?
# The basic structure of the original approach suggests one file, but this is not settled yet.
# It might make more sense to process one file at a time, since that avoids the problem with ragged arrays.

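# Uncomment one of the calls below depending on the approach: preprocess_pop909_single_midi for one file
# (a new output name must be chosen for each file so that earlier output is not overwritten),
# or preprocess_pop909 (the original approach) for all of them in one file.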
# preprocess_pop909_single_midi("../POP909/001/","midi_data/")
# preprocess_pop909("../POP909/", "midi_data/")