In [1]:
import os
import librosa
import librosa.display
import pretty_midi
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from music21 import converter, midi
from IPython.display import Audio

In [2]:
path = Path('/mnt/sdb/ibukey/asap-dataset/Bach/Fugue/bwv_848')
performance_wav1 = str(path/'Denisova06M.wav')
performance_wav2 = str(path/'Lee01M.wav')
musicxml_score = str(path/'xml_score.musicxml')

In [3]:
with open("file_list.txt", "w") as file:
    file.write(performance_wav1.replace('/mnt/sdb/ibukey/', '/opt/')+'\n')
    file.write(performance_wav2.replace('/mnt/sdb/ibukey/', '/opt/')+'\n')

In [4]:
# Audio to MIDI transcription
os.makedirs('midi', exist_ok=True)

In [19]:
! docker build -t onf .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[1A[0G[?25l[+] Building 0.1s (2/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 1.01kB                                     0.0s
[0m[34m => [internal] load metadata for docker.io/library/python:3.7-slim-bullse  0.1s
[1A[1A[1A[1A[0G[?25l[+] Building 0.2s (12/14)                                        docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 1.01kB                                     0.0s
[0m[34m => [internal] load metadata for docker.io/library/python:3.7-slim-bullse  0.1s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0

In [20]:
! docker run \
	-v /home/ibukey/icml_eval/midi:/opt/midi \
	-v /mnt/sdb/ibukey/asap-dataset:/opt/asap-dataset \
    -t onf

Processing: /opt/asap-dataset/Bach/Fugue/bwv_848/Denisova06M.wav
2025-01-26 18:13:53.593737: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-01-26 18:13:53.593785: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
/bin/sh: 1: sox: not found
SoX could not be found!

    If you do not have SoX, proc

In [4]:
# MusicXML to MIDI
def musicxml_to_midi(musicxml_file):
    score = converter.parse(musicxml_file)
    midi_stream = score.write('midi')
    print(f"MIDI file saved as {midi_stream}")
    return midi_stream

In [5]:
# Align MIDI using DTW
def extract_notes(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []

    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append((note.start, note.pitch))
    
    notes.sort(key=lambda x: x[0])
    return np.array(notes)

def align_midi_with_dtw(midi_file_1, midi_file_2):
    notes_1 = extract_notes(midi_file_1)
    notes_2 = extract_notes(midi_file_2)
    
    time_1 = notes_1[:, 0]
    pitch_1 = notes_1[:, 1]
    
    time_2 = notes_2[:, 0]
    pitch_2 = notes_2[:, 1]
    
    # Convert time-pitch sequences into 2D feature matrices
    # Feature vector: [time, pitch]
    features_1 = np.vstack((time_1, pitch_1))
    features_2 = np.vstack((time_2, pitch_2))

    # Perform DTW using librosa
    D, wp = librosa.sequence.dtw(X=features_1, Y=features_2)
    
    return D, wp

In [6]:
def plot_alignment(path, midi_file_1, midi_file_2):
    # Extract notes from both MIDI files
    notes_1 = extract_notes(midi_file_1)
    notes_2 = extract_notes(midi_file_2)
    
    # Prepare data for plotting
    time_1 = notes_1[:, 0]
    pitch_1 = notes_1[:, 1]
    time_2 = notes_2[:, 0]
    pitch_2 = notes_2[:, 1]
    
    # Align the notes based on the warping path
    aligned_time_1 = [time_1[i] for i, j in path]
    aligned_pitch_1 = [pitch_1[i] for i, j in path]
    aligned_time_2 = [time_2[j] for i, j in path]
    aligned_pitch_2 = [pitch_2[j] for i, j in path]

    # Plot the notes
    plt.figure(figsize=(10, 6))
    plt.scatter(time_1, pitch_1, color='blue', label='MIDI 1')
    plt.scatter(time_2, pitch_2, color='red', label='MIDI 2')
    plt.plot(aligned_time_1, aligned_pitch_1, color='blue', linestyle='--', label='Aligned MIDI 1')
    plt.plot(aligned_time_2, aligned_pitch_2, color='red', linestyle='--', label='Aligned MIDI 2')
    
    plt.xlabel('Time (s)')
    plt.ylabel('Pitch (MIDI Number)')
    plt.legend()
    plt.show()

In [7]:
# midi alignment of two audio recordings

In [8]:
transcribed_midi1 = 'midi/asap-dataset/Bach/Fugue/bwv_848/Denisova06M.wav.midi'
transcribed_midi2 = 'midi/asap-dataset/Bach/Fugue/bwv_848/Lee01M.wav.midi'

In [9]:
D, wp = align_midi_with_dtw(transcribed_midi1, transcribed_midi2)

In [10]:
# midi alignment of synthesized musicxml and audio

In [11]:
# xml_to_midi_file = str(musicxml_to_midi(musicxml_score))

In [12]:
# D, wp = align_midi_with_dtw(xml_to_midi_file, transcribed_midi1)

## Eval

In [13]:
from mir_eval.transcription import precision_recall_f1_overlap, onset_precision_recall_f1, offset_precision_recall_f1

In [14]:
midi_data = pretty_midi.PrettyMIDI(transcribed_midi1)
ref_intervals, ref_pitches = [], []
for instrument in midi_data.instruments:
    for note in instrument.notes:
        ref_intervals.append((note.start, note.end))
        ref_pitches.append(note.pitch)
ref_intervals = np.array(ref_intervals)
ref_pitches = np.array(ref_pitches)

In [15]:
midi_data = pretty_midi.PrettyMIDI(transcribed_midi2)
est_intervals, est_pitches = [], []
for instrument in midi_data.instruments:
    for note in instrument.notes:
        est_intervals.append((note.start, note.end))
        est_pitches.append(note.pitch)
est_intervals = np.array(est_intervals)
est_pitches = np.array(est_pitches)

In [16]:
ref_intervals = ref_intervals[wp[::-1].T[0]]
ref_pitches = ref_pitches[wp[::-1].T[0]]
est_intervals = est_intervals[wp[::-1].T[1]]
est_pitches = est_pitches[wp[::-1].T[1]]

In [17]:
precision, recall, f_measure, avg_overlap_ratio = precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, est_pitches)

In [18]:
print(f"precision: {round(precision, 2)} \nrecall: {round(recall, 2)} \nf_measure: {round(f_measure, 2)} \navg_overlap_ratio: {round(avg_overlap_ratio, 2)}")


precision: 0.04 
recall: 0.04 
f_measure: 0.04 
avg_overlap_ratio: 0.74


In [19]:
p, r, f = onset_precision_recall_f1(ref_intervals, est_intervals)
p, r, f

(0.4849665924276169, 0.4849665924276169, 0.4849665924276169)

In [20]:
p, r, f = offset_precision_recall_f1(ref_intervals, est_intervals)
p, r, f

(0.5044543429844098, 0.5044543429844098, 0.5044543429844098)