# Process MIDI for GloVe
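Loads a collection of MIDI files from disk, filters out all non-monophonic tracks, and saves a sequence of notes from the remaining tracks to `data/notes.txt`. `notes.txt` can then be used by GloVe to create MIDI note embeddings.

**Note:** as written, the cells below do not do all of this: `filter_monophonic` is defined but never called, and the results are pickled to `data/audio_embedding.p` and `data/unread_list.p` rather than written to `data/notes.txt`.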

## Imports

In [18]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import pretty_midi

## Utils

In [22]:
def parse_midi(path):
    """Load the MIDI file at ``path`` as a ``pretty_midi.PrettyMIDI`` object.

    Parsing is best-effort so that one unreadable file does not abort a
    batch run: any ordinary exception raised while parsing is swallowed.

    Args:
        path: Filesystem path of the MIDI file; it is opened in binary mode.

    Returns:
        The parsed ``PrettyMIDI`` object with ``remove_invalid_notes()``
        applied, or ``None`` if ``PrettyMIDI`` could not parse the file.
        If parsing succeeds but ``remove_invalid_notes()`` raises, the
        parsed object is still returned.

    Raises:
        OSError: If ``path`` cannot be opened (this is not caught here).
    """
    midi = None
    with open(path, 'rb') as f:
        try:
            midi = pretty_midi.PrettyMIDI(f)
            midi.remove_invalid_notes()
        except Exception:
            # Deliberately broad: malformed MIDI data can fail in many
            # different ways. ``Exception`` (rather than a bare ``except``)
            # keeps Ctrl-C and interpreter exit from being swallowed.
            pass
    return midi

def get_percent_monophonic(pm_instrument_roll):
    """Return the share of sounding time steps that contain exactly one note.

    The roll is transposed and the entries greater than zero are counted
    per row of the transposed array, i.e. per column of the input.
    Columns with a count of zero are ignored; of the rest, the fraction
    with a count of exactly one is returned.

    Args:
        pm_instrument_roll: 2-D array; presumably a piano roll with one
            row per pitch and one column per time step, as produced by
            ``Instrument.get_piano_roll()`` -- confirm against callers.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned both when nothing
        sounds at all and when every sounding step holds several notes.
    """
    active_per_step = np.count_nonzero(pm_instrument_roll.T > 0, axis=1)
    sounding_steps = np.count_nonzero(active_per_step)
    solo_steps = np.count_nonzero(active_per_step == 1)

    # No single-note step: covers the empty roll and the fully polyphonic
    # roll, and guarantees a non-zero divisor below.
    if solo_steps == 0:
        return 0.0
    return float(solo_steps) / float(sounding_steps)
    
def filter_monophonic(pm_instruments, percent_monophonic=0.99):
    """Keep the instruments whose piano roll is monophonic enough.

    Args:
        pm_instruments: Iterable of objects exposing ``get_piano_roll()``.
        percent_monophonic: Minimum value of ``get_percent_monophonic``
            (inclusive) an instrument needs in order to be kept.

    Returns:
        A new list with the qualifying instruments, in their input order.
    """
    kept = []
    for instrument in pm_instruments:
        ratio = get_percent_monophonic(instrument.get_piano_roll())
        if ratio >= percent_monophonic:
            kept.append(instrument)
    return kept
def sort_by_start(note):
    """Sort key: the ``start`` attribute of ``note`` converted to ``float``."""
    start = note.start
    return float(start)
def get_note_string(midi):
    """Return the pitches of all notes in a MIDI file, ordered by start time.

    Args:
        midi: Path of the MIDI file to read; it is handed to ``parse_midi``.

    Returns:
        A list with one string per note (``str(note.pitch)``), gathered
        from every instrument in the file and sorted by note start time.
        The list is empty when the file contains no notes. ``None`` is
        returned when ``parse_midi`` could not parse the file.
    """
    # Use the argument itself rather than a global path variable, so the
    # function works no matter what the caller names its loop variable.
    parsed = parse_midi(midi)
    if parsed is None:
        return None

    # Gather notes across *all* instruments; starting from an empty list
    # also keeps files without any instrument from raising.
    # NOTE(review): filter_monophonic is not applied here -- confirm
    # whether polyphonic instruments should be dropped before this step.
    notes = [
        note
        for instrument in parsed.instruments
        for note in instrument.notes
    ]
    notes.sort(key=sort_by_start)
    return [str(note.pitch) for note in notes]

Edit `midi_dir` to point to the directory of MIDI files you would like to learn your note embeddings from.

In [23]:
# Directory whose entries are listed and parsed as MIDI files by the run cell.
midi_dir = 'data/audio'

Run it!

In [4]:
# Extract the pitch sequence of each file found directly inside `midi_dir`.
files = [os.path.join(midi_dir, f) for f in os.listdir(midi_dir)]
max_files = 5     # cap on how many files are processed; None processes all
unread = []       # paths for which get_note_string returned None
midi_vector = []  # one list of pitch strings per successfully read file
start_time = time.time()
for f in files[:max_files]:
    notes = get_note_string(f)
    if notes is None:
        # Record the failure instead of storing a None placeholder, so the
        # saved results contain only real note sequences.
        unread.append(f)
    else:
        midi_vector.append(notes)
print('Finished in {} seconds'.format(time.time() - start_time))

Finished in 0.5497093200683594 seconds


In [7]:
# Persist the extracted note sequences and the `unread` list as pickles.
# `pickle` is referenced by its module name: the alias `p` used previously
# is not defined anywhere in this notebook.
with open('data/audio_embedding.p', 'wb') as f:
    pickle.dump(midi_vector, f)
with open('data/unread_list.p', 'wb') as f:
    pickle.dump(unread, f)
