In [54]:
import json
import pretty_midi as pm
import pandas as pd
import numpy as np
from pathlib import Path

In [55]:
# MAESTRO metadata dataframe, read from the JSON index file (from Google)
maestro_df = pd.read_json('maestro-v2.0.0.json')

# Virtuoso metadata dataframe (produced beforehand by parsing the folders of the dataset)
# NOTE(review): unpickling can execute arbitrary code; only load a pickle file from a trusted source
virtuoso_df = pd.read_pickle("performance_dataframe.pkl")

In [56]:
# Preview the first rows of the MAESTRO metadata
maestro_df.head()

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508


In [57]:
# Preview the first rows of the Virtuoso metadata
virtuoso_df.head()

Unnamed: 0,author,folder,midi2midi_alignment_path,performed_midi_path,performer,score2midi_alignment,score_midi_path,score_xml_path,title
0,Bach,Bach/Fugue/bwv_846,Bach/Fugue/bwv_846/Shi05_infer_corresp.txt,Bach/Fugue/bwv_846/Shi05.mid,Shi05,Bach/Fugue/bwv_846/Shi05_infer_match.txt,Bach/Fugue/bwv_846/midi_cleaned.mid,Bach/Fugue/bwv_846/musicxml_cleaned.musicxml,Fugue_bwv_846
1,Bach,Bach/Fugue/bwv_848,Bach/Fugue/bwv_848/Denisova06_infer_corresp.txt,Bach/Fugue/bwv_848/Denisova06.mid,Denisova06,Bach/Fugue/bwv_848/Denisova06_infer_match.txt,Bach/Fugue/bwv_848/midi_cleaned.mid,Bach/Fugue/bwv_848/musicxml_cleaned.musicxml,Fugue_bwv_848
2,Bach,Bach/Fugue/bwv_848,Bach/Fugue/bwv_848/Lee01_infer_corresp.txt,Bach/Fugue/bwv_848/Lee01.mid,Lee01,Bach/Fugue/bwv_848/Lee01_infer_match.txt,Bach/Fugue/bwv_848/midi_cleaned.mid,Bach/Fugue/bwv_848/musicxml_cleaned.musicxml,Fugue_bwv_848
3,Bach,Bach/Fugue/bwv_848,Bach/Fugue/bwv_848/LeeSH01_infer_corresp.txt,Bach/Fugue/bwv_848/LeeSH01.mid,LeeSH01,Bach/Fugue/bwv_848/LeeSH01_infer_match.txt,Bach/Fugue/bwv_848/midi_cleaned.mid,Bach/Fugue/bwv_848/musicxml_cleaned.musicxml,Fugue_bwv_848
4,Bach,Bach/Fugue/bwv_848,Bach/Fugue/bwv_848/Lin04_infer_corresp.txt,Bach/Fugue/bwv_848/Lin04.mid,Lin04,Bach/Fugue/bwv_848/Lin04_infer_match.txt,Bach/Fugue/bwv_848/midi_cleaned.mid,Bach/Fugue/bwv_848/musicxml_cleaned.musicxml,Fugue_bwv_848


## Some scripts for the automatic alignment

In [None]:
# Base folders that the relative MIDI paths stored in the two dataframes are joined to
MAESTRO_BPATH = Path("./maestro-v2.0.0/")  # the MAESTRO dataset must be downloaded first from https://magenta.tensorflow.org/datasets/maestro
VIRTUOSO_BPATH = Path("./")

def get_midi_notes(midi, start=0):
    """
    Collect all the notes of a MIDI object, ordered by onset and then by pitch.

    PARAMETERS:
    - midi: object exposing `instruments`, each of them exposing `notes` with the
      attributes `start`, `end` and `pitch` (e.g. a pretty_midi.PrettyMIDI object)
    - start: notes whose onset is smaller than this value are discarded

    RETURNS:
    A float array of shape (n_notes, 3) whose columns are (start, end, pitch).
    The shape is (0, 3) when no note is selected, so that the columns can always be sliced.
    """
    # extract the notes looping through the instruments
    notes_all = [
        [note.start, note.end, note.pitch]
        for instrument in midi.instruments
        for note in instrument.notes
        if note.start >= start
    ]
    # order the notes according to onset and pitch
    notes_all.sort(key=lambda note: (note[0], note[2]))
    # the reshape keeps the array 2-D even when the list of notes is empty
    return np.array(notes_all, dtype=float).reshape(-1, 3)

def normalize_midi_notes(notes):
    """
    Remove the initial silence: shift all the times so that the first note starts at 0.

    PARAMETERS:
    - notes: array with one row per note and the columns (start, end, pitch);
      the first row is used as the first note

    RETURNS:
    The same array object, whose start and end columns have been shifted IN PLACE.
    An array without notes is returned unchanged.
    """
    # nothing to shift (and no first note to read) when there are no notes
    if len(notes) == 0:
        return notes
    # get the onset of the first midi note
    first = notes[0][0]
    # subtract it from all the onsets and offsets (the pitch column is left untouched)
    notes[:, :2] -= first
    return notes

def path2normmidinotes(base_path, midi_path, start=0):
    """
    Load a MIDI file and return its normalized notes (used to fill the dataframes).

    PARAMETERS:
    - base_path: folder that `midi_path` is joined to
    - midi_path: path of the MIDI file inside `base_path`
    - start: forwarded to `get_midi_notes`

    RETURNS:
    The output of `normalize_midi_notes` applied to the notes of the file.
    """
    full_path = Path(base_path, midi_path)
    # PrettyMIDI is given the path as a plain string
    performance = pm.PrettyMIDI(str(full_path))
    raw_notes = get_midi_notes(performance, start=start)
    return normalize_midi_notes(raw_notes)

# The normalized notes are stored in the dataframes instead of being computed on the fly, to speed up the later steps:
# the alignment is a search problem of complexity len(virtuoso)*len(maestro) and importing a MIDI file takes some time,
# so it is better to do it just once for every piece.
# On the downside, this takes a huge amount of memory, so a better solution may be worth looking for.
virtuoso_df["norm_notes"]= virtuoso_df.apply(lambda row:  path2normmidinotes(VIRTUOSO_BPATH,row["performed_midi_path"]),axis=1)
maestro_df["norm_notes"]= maestro_df.apply(lambda row:  path2normmidinotes(MAESTRO_BPATH,row["midi_filename"]),axis=1)

In [None]:
def notes_are_similar(notes1, notes2, number_to_match=None, tolerance=0.03, verbose=False):
    """
    Test whether two lists of notes are similar. A note is a triple (start, end, pitch).

    PARAMETERS:
    - notes1, notes2: 2-D arrays with one row per note and the columns (start, end, pitch)
    - number_to_match: the number of notes (at the beginning) that must match to consider
      the two lists similar; if None, the two lists must have the same length and all
      the notes are compared
    - tolerance: absolute tolerance to consider two times equal, expressed in the same unit
      as the note times (pretty_midi expresses them in seconds, so the default is 30 ms)
    - verbose: if True, print the reason why the two lists are not considered similar

    RETURNS:
    True if the compared onsets and offsets agree within the tolerance and the compared
    pitches are identical, False otherwise (including when a list is empty or malformed).
    """
    notes1 = np.asarray(notes1)
    notes2 = np.asarray(notes2)
    # a list without notes may collapse to a 1-D array, whose columns cannot be sliced
    for notes in (notes1, notes2):
        if notes.ndim != 2 or notes.shape[1] < 3:
            if verbose:
                print("Empty or malformed note list:", notes1.shape, "vs", notes2.shape)
            return False

    # test the length if number_to_match is not specified
    if number_to_match is None and len(notes1) != len(notes2):
        if verbose:
            print("Not equal lenght:", len(notes1), "vs", len(notes2))
        return False

    # slicing with None keeps all the notes
    head1 = notes1[:number_to_match]
    head2 = notes2[:number_to_match]

    # test onsets, then offsets; a different number of compared notes also raises AssertionError
    for column, label in ((0, "Onset"), (1, "Offset")):
        try:
            np.testing.assert_allclose(head1[:, column], head2[:, column], atol=tolerance)
        except AssertionError as e:
            if verbose:
                print(label + " problem:", e)
            return False

    # test pitches: they must be exactly equal, element by element
    pitches1 = head1[:, 2]
    pitches2 = head2[:, 2]
    if np.array_equal(pitches1, pitches2):
        return True
    if verbose:
        print("Pitch problem occurring at indices:")
        print([(i, p1, p2) for i, (p1, p2) in enumerate(zip(pitches1, pitches2)) if p1 != p2])
    return False

def find_similar_midi(df1, df2, number_to_match):
    """
    Given two dataframes, find for each piece of the first one the corresponding piece of the second one (if it exists).
    There is no control for injectivity or uniqueness of the matching for now, except a printed
    warning for multiple matches (that we would very much like to avoid).

    PARAMETERS:
    - df1: dataframe with the columns "norm_notes" and "performed_midi_path"
    - df2: dataframe with the columns "norm_notes" and "midi_filename"
    - number_to_match: number of initial notes that must match, forwarded to `notes_are_similar`

    RETURNS:
    None. The result is written in the new column df1["correspondence"], which contains
    None (no match), the "midi_filename" of the matching row of df2 (one match) or the
    list of all the matching "midi_filename" values (multiple matches).
    """
    # one entry of matching information for each row of df1
    correspondence = []
    for _, row1 in df1.iterrows():
        # compare row1 with each row of df2 and keep the file names of the similar ones
        matching_files = [
            midi_filename
            for midi_filename, notes2 in zip(df2["midi_filename"], df2["norm_notes"])
            if notes_are_similar(row1["norm_notes"], notes2, number_to_match)
        ]
        # save the matching information
        if not matching_files:
            correspondence.append(None)
        elif len(matching_files) == 1:
            correspondence.append(matching_files[0])
        else:  # multiple matching
            # the stored path already contains the ".mid" extension
            print("WARNING: Multiple matching found for", row1["performed_midi_path"])
            correspondence.append(matching_files)
    df1["correspondence"] = correspondence

In [None]:
# Search a MAESTRO file for every Virtuoso performance; the result is written in virtuoso_df["correspondence"]
find_similar_midi(virtuoso_df,maestro_df,10)