In [6]:
import os
import scipy
import librosa
import librosa.display
import pretty_midi
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from music21 import converter, midi
from IPython.display import Audio

In [7]:
path = Path('/mnt/sdb/ibukey/asap-dataset/Bach/Fugue/bwv_848')
performance_wav1 = str(path/'Denisova06M.wav')
performance_wav2 = str(path/'Lee01M.wav')
musicxml_score = str(path/'xml_score.musicxml')

In [8]:
with open("file_list.txt", "w") as file:
    file.write(performance_wav1.replace('/mnt/sdb/ibukey/', '/opt/')+'\n')
    file.write(performance_wav2.replace('/mnt/sdb/ibukey/', '/opt/')+'\n')

In [4]:
# Audio to MIDI transcription
os.makedirs('midi', exist_ok=True)
os.makedirs('onf_score', exist_ok=True)

In [84]:
# ! docker build -t onf .

In [83]:
# ! docker run \
# 	-v /home/ibukey/music_translation_eval/midi:/opt/midi \
# 	-v /mnt/sdb/ibukey/asap-dataset:/opt/asap-dataset \
#     -v /mnt/sdb/ibukey/music_translation_eval/onf_score:/opt/onf_score \
#     -t onf

In [9]:
# midi alignment of two audio recordings

In [10]:
transcribed_midi1 = 'midi/asap-dataset/Bach/Fugue/bwv_848/Denisova06M.wav.midi'
transcribed_midi2 = 'midi/asap-dataset/Bach/Fugue/bwv_848/Lee01M.wav.midi'

In [11]:
frames1 = np.load('wavs/Denisova06M_preds.npy', allow_pickle=True)[0]['frame_predictions'][0].T
frames2 = np.load('/mnt/sdb/ibukey/asap-dataset/Bach/Fugue/bwv_848/Lee01M_preds.npy', allow_pickle=True)[0]['frame_predictions'][0].T

In [12]:
# aligning frames from o&f for time alignment
D, wp = librosa.sequence.dtw(frames1, frames2)

## Eval

In [13]:
from mir_eval.transcription import precision_recall_f1_overlap, onset_precision_recall_f1, offset_precision_recall_f1

In [14]:
midi_data = pretty_midi.PrettyMIDI(transcribed_midi1)
ref_intervals, ref_pitches = [], []
for instrument in midi_data.instruments:
    for note in instrument.notes:
        ref_intervals.append((note.start, note.end))
        ref_pitches.append(note.pitch)
ref_intervals = np.array(ref_intervals)
ref_pitches = np.array(ref_pitches)

In [15]:
seen = set()
new_wp = np.array([(a, b) for a, b in wp[::-1] if b not in seen and not seen.add(b)])

In [16]:
interp_func = scipy.interpolate.interp1d(new_wp[:, 1], new_wp[:, 0], kind='linear', fill_value="extrapolate")

In [17]:
midi_data = pretty_midi.PrettyMIDI(transcribed_midi2)
est_intervals, est_pitches = [], []
for instrument in midi_data.instruments:
    for note in instrument.notes:
        start = interp_func(note.start * frame_rate).item() / frame_rate
        end = interp_func(note.end * frame_rate).item() / frame_rate
        if start == end: # if interpolation causes start and end to be the same due to short duration
            end += 1e-9
        est_intervals.append((start, end))
        est_pitches.append(note.pitch)
est_intervals = np.array(est_intervals)
est_pitches = np.array(est_pitches)

In [18]:
precision, recall, f_measure, avg_overlap_ratio = precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, est_pitches)

In [19]:
print(f"precision: {round(precision, 2)} \nrecall: {round(recall, 2)} \nf_measure: {round(f_measure, 2)} \navg_overlap_ratio: {round(avg_overlap_ratio, 2)}")


precision: 0.78 
recall: 0.78 
f_measure: 0.78 
avg_overlap_ratio: 0.86


In [20]:
p, r, f = onset_precision_recall_f1(ref_intervals, est_intervals)
p, r, f

(0.9475920679886686, 0.9469214437367304, 0.9472566371681417)

In [21]:
p, r, f = offset_precision_recall_f1(ref_intervals, est_intervals)
p, r, f

(0.8555240793201133, 0.8549186128803963, 0.8552212389380531)