In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../..') # leave the notebooks folder into root as the main

import matplotlib.pyplot as plt
import numpy as np
from dtw import *
import scipy
import librosa

from app.core.audio.AudioData import AudioData
from app.core.audio.AudioPlayer import AudioPlayer
from app.core.midi.MidiData import MidiData
from app.core.midi.MidiPlayer import MidiPlayer
from app.core.midi.MidiSynth import MidiSynth

from app.algorithms.pitch.PYin import PYin
from app.algorithms.align.OnsetDf import UserOnsetDf, MidiOnsetDf
from app.config import AppConfig
from app.core.recording.PitchDf import PitchDf, PitchConfig
from app.algorithms.align.DTW import DTW
from app.algorithms.align.CQT import CQT

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [8]:
AUDIO_FILEPATH = '../../app/resources/audio/user_fugue2.mp3'
MIN_VIOLIN_FREQ = 196.0
SAMPLE_RATE = 44100

user_audio_data = AudioData()
user_audio_data.load_data(AUDIO_FILEPATH)
pitches, most_likely_pitches = PYin.pyin(user_audio_data.data)

Processing frame 4604/4604
Done!


In [5]:
# Create a synth with a soundfont
SOUNDFONT_FILEPATH = '../../app/resources/MuseScore_General.sf3'
midi_synth = MidiSynth(SOUNDFONT_FILEPATH)

# Load the midi file into a MidiData object
MIDI_FILEPATH = '../../app/resources/midi/fugue.mid'
midi_data = MidiData(MIDI_FILEPATH)

# Create MidiSynth/Player objects
midi_player = MidiPlayer(midi_synth)
midi_player.load_midi(midi_data)

# Also synthesize + convert MIDI to audio (for later CQT extraction)
midi_audio_data = AudioData()
midi_audio_data.load_midi_file(MIDI_FILEPATH, SOUNDFONT_FILEPATH)

# midi_player.play(start_time=0) # Play the MIDI

Loading MidiSynth...
Synth + soundfont loaded.


In [9]:
# View the pitch estimates in the app
import sys
sys.path.append('..')

from app.ui.plots.PitchPlot import RunPitchPlot
from app.core.midi.MidiData import MidiData
from PyQt6.QtWidgets import QApplication
from PyQt6.QtCore import QCoreApplication


if __name__ == '__main__':
    if not QCoreApplication.instance():
        app = QApplication(sys.argv)
    else:
        app = QCoreApplication.instance()

    # ALIGNED_MIDI_FILEPATH = 'aligned.mid'
    # aligned_midi_data = MidiData(ALIGNED_MIDI_FILEPATH)

    # pitchplot = RunPitchPlot(
    #     app, midi_data=midi_data, pitches=most_likely_pitches, onsets=user_onset2_df.onset_df, align_df=align_df2
    # )
    pitchplot = RunPitchPlot(
        app, midi_data=midi_data, pitches=most_likely_pitches
    )

Plotting pitches...
Done!


qt.pointer.dispatch: skipping QEventPoint(id=1 ts=0 pos=0,0 scn=729.825,338.795 gbl=729.825,338.795 Released ellipse=(1x1 ∡ 0) vel=0,0 press=-729.825,-338.795 last=-729.825,-338.795 Δ 729.825,338.795) : no target window
qt.pointer.dispatch: skipping QEventPoint(id=2 ts=0 pos=0,0 scn=776.267,307.836 gbl=776.267,307.836 Released ellipse=(1x1 ∡ 0) vel=0,0 press=-776.267,-307.836 last=-776.267,-307.836 Δ 776.267,307.836) : no target window


In [24]:
import pandas as pd

user_onset_df = UserOnsetDf(user_audio_data, most_likely_pitches)

def make_note_df(onset_df, pitch_list):
    """
    Add median pitch MIDI number and duration for each note in onset_df using pitch_list.
    
    Args:
        onset_df (pd.DataFrame): DataFrame with 'time' and 'onset' columns.
        pitch_list (list[Pitch]): List of Pitch objects with 'midi_num' and 'time'.
    
    Returns:
        pd.DataFrame: Updated onset_df with 'median_pitch' and 'duration' columns.
    """
    # Convert pitch_list to a DataFrame for easier manipulation
    pitch_df = pd.DataFrame([(p.time, p.midi_num) for p in pitch_list], columns=['time', 'midi_num'])

    # Initialize result columns
    onset_df['pitch'] = None
    onset_df['duration'] = None

    # Iterate through onset_df to calculate median pitch and duration
    for i in range(len(onset_df) - 1):
        start_time = onset_df.loc[i, 'time']
        end_time = None
        
        if i < len(onset_df) - 1:
            end_time = onset_df.loc[i + 1, 'time']
        else:
            # For the last note, use the maximum time in pitch_df
            end_time = pitch_df['time'].max()

        # Filter pitches within this time range
        note_pitches = pitch_df[(pitch_df['time'] >= start_time) & (pitch_df['time'] <= end_time)]

        # Calculate median pitch and duration
        if not note_pitches.empty:
            median_pitch = note_pitches['midi_num'].median()
            duration = end_time - start_time
        else:
            median_pitch = None
            duration = 0.0

        # Update onset_df for the current row
        onset_df.loc[i, 'pitch'] = median_pitch
        onset_df.loc[i, 'duration'] = duration

    # Handle the last row (no duration since there's no next time)
    onset_df.loc[len(onset_df) - 1, 'pitch'] = None
    onset_df.loc[len(onset_df) - 1, 'duration'] = 0.0

    # Ensure median_pitch and duration are properly cast
    onset_df['pitch'] = onset_df['pitch'].astype(float)
    onset_df['duration'] = onset_df['duration'].astype(float)

    return onset_df

note_df = make_note_df(user_onset_df.onset_df, most_likely_pitches)
note_df

Detecting onsets... Done!
Detecting pitch changes with rolling median window_size=30 and threshold=0.6... Done!


Unnamed: 0,time,pitch_diff,onset,pitch,duration
0,0.000000,True,True,62.147640,0.232200
1,0.232200,False,True,62.149591,0.063855
2,0.296054,True,False,50.111616,0.168345
3,0.464399,True,True,61.069563,0.116100
4,0.580499,True,False,73.074377,0.081270
...,...,...,...,...,...
77,12.585216,False,True,73.893175,0.119002
78,12.704218,True,False,72.270714,0.055147
79,12.759365,False,True,72.133982,0.150929
80,12.910295,False,True,72.114955,0.107393


In [25]:
midi_data.pitch_df

Unnamed: 0,note_idx,start,channel,pitch,velocity,duration,frequency
0,0,0.000000,0,62,100,0.185938,293.664768
1,1,0.187500,0,69,100,0.185938,440.000000
2,2,0.375001,0,73,100,0.185938,554.365262
3,3,0.562501,0,76,100,0.185938,659.255114
4,4,0.750002,0,77,100,0.185938,698.456463
...,...,...,...,...,...,...,...
60,60,11.250030,0,63,100,0.185938,311.126984
61,61,11.437530,0,74,100,0.185938,587.329536
62,62,11.625031,0,79,100,0.185938,783.990872
63,63,11.812532,0,72,100,0.185938,523.251131


In [26]:
import numpy as np

def row_distance(midi_row, user_row):
    """
    Calculate the custom cost for substituting one note for another.
    
    Parameters:
    midi_row: pandas.Series (row from midi_df)
    user_row: pandas.Series (row from user_df)
    
    Returns:
    float: The cost of substituting midi_row with user_row.
    """
    pitch_cost = abs(midi_row['pitch'] - user_row['pitch']) if not np.isnan(user_row['pitch']) else 10  # Handle NaNs
    duration_cost = abs(midi_row['duration'] - user_row['duration'])
    
    # Weighted sum
    return 0.7 * pitch_cost + 0.3 * duration_cost

def align_notes(midi_df, user_df):
    """
    Align notes from two dataframes using Levenshtein distance with a custom cost function.
    
    Parameters:
    midi_df: pandas.DataFrame (reference MIDI notes)
    user_df: pandas.DataFrame (user-performed notes)
    
    Returns:
    tuple: (distance, alignment) where alignment is a list of tuples showing matched notes.
    """
    m, n = len(midi_df), len(user_df)
    dp = np.zeros((m + 1, n + 1))

    # Initialize the DP table with insertion and deletion costs
    for i in range(1, m + 1):
        dp[i][0] = dp[i - 1][0] + 10  # Deletion cost (midi_df row deleted)
    for j in range(1, n + 1):
        dp[0][j] = dp[0][j - 1] + 10  # Insertion cost (user_df row inserted)

    # Fill the DP table
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            substitution_cost = row_distance(midi_df.iloc[i - 1], user_df.iloc[j - 1])
            dp[i][j] = min(
                dp[i - 1][j] + 10,  # Deletion
                dp[i][j - 1] + 10,  # Insertion
                dp[i - 1][j - 1] + substitution_cost  # Substitution
            )

    # Traceback to find the alignment
    alignment = []
    i, j = m, n
    while i > 0 and j > 0:
        if dp[i][j] == dp[i - 1][j - 1] + row_distance(midi_df.iloc[i - 1], user_df.iloc[j - 1]):
            alignment.append((midi_df.iloc[i - 1], user_df.iloc[j - 1]))  # Match
            i, j = i - 1, j - 1
        elif dp[i][j] == dp[i - 1][j] + 10:
            alignment.append((midi_df.iloc[i - 1], None))  # Deletion
            i -= 1
        else:
            alignment.append((None, user_df.iloc[j - 1]))  # Insertion
            j -= 1

    # Add remaining notes
    while i > 0:
        alignment.append((midi_df.iloc[i - 1], None))  # Deletion
        i -= 1
    while j > 0:
        alignment.append((None, user_df.iloc[j - 1]))  # Insertion
        j -= 1

    return dp[m][n], alignment[::-1]  # Reverse alignment for correct order


distance, alignment = align_notes(midi_data.pitch_df, note_df)

print(f"Total alignment cost: {distance}")
print("Alignment:")
for midi_note, user_note in alignment:
    print(f"MIDI Note: {midi_note}, User Note: {user_note}")


Total alignment cost: 191.17765252331253
Alignment:
MIDI Note: note_idx       0.000000
start          0.000000
channel        0.000000
pitch         62.000000
velocity     100.000000
duration       0.185938
frequency    293.664768
Name: 0, dtype: float64, User Note: time               0.0
pitch_diff        True
onset             True
pitch         62.14764
duration        0.2322
Name: 0, dtype: object
MIDI Note: note_idx       1.000000
start          0.187500
channel        0.000000
pitch         69.000000
velocity     100.000000
duration       0.185938
frequency    440.000000
Name: 1, dtype: float64, User Note: time             0.2322
pitch_diff        False
onset              True
pitch         62.149591
duration       0.063855
Name: 1, dtype: object
MIDI Note: None, User Note: time           0.296054
pitch_diff         True
onset             False
pitch         50.111616
duration       0.168345
Name: 2, dtype: object
MIDI Note: None, User Note: time           0.464399
pitch_diff    

In [31]:
def classify_notes(alignment, pitch_tolerance=1.0):
    """
    Classify user notes as insertions, deletions, or out-of-tune.

    Parameters:
    alignment: list of tuples (midi_row, user_row)
        Output of the alignment algorithm.
    pitch_tolerance: float
        Tolerance for pitch differences to classify as out-of-tune.

    Returns:
    dict: Dictionary with classifications.
    """
    insertions = []  # User notes with no corresponding MIDI note
    deletions = []   # MIDI notes with no corresponding user note
    out_of_tune = [] # User notes that are out of tune
    matched = []     # Correctly matched notes

    for midi_note, user_note in alignment:
        if midi_note is None:
            # User note is an insertion
            insertions.append(user_note)
        elif user_note is None:
            # MIDI note is a deletion
            deletions.append(midi_note)
        else:
            # Check if the note is out-of-tune
            pitch_diff = abs(midi_note['pitch'] - user_note['pitch'])
            if pitch_diff > pitch_tolerance:
                out_of_tune.append((midi_note, user_note))
            else:
                matched.append((midi_note, user_note))

    return {
        "insertions": insertions,
        "deletions": deletions,
        "out_of_tune": out_of_tune,
        "matched": matched
    }

# Classify notes
classifications = classify_notes(alignment, pitch_tolerance=1.0)

def format_note(note, prefix=""):
    """
    Format a note row as a readable string.
    
    Parameters:
    note: pandas.Series or None
        A row from the DataFrame or None.
    prefix: str
        Prefix to label the type of note.
    
    Returns:
    str: Formatted string representation of the note.
    """
    if note is None:
        return f"{prefix}None"
    # Dynamically check for time or start column
    time_field = 'time' if 'time' in note else 'start'

    return (f"{prefix}Pitch: {note['pitch']:.2f}, "
            f"Duration: {note['duration']:.3f}, "
            f"Time: {note[time_field]:.3f}")


def print_classifications(classifications):
    """
    Print the classifications in a readable format.
    
    Parameters:
    classifications: dict
        Output from the `classify_notes` function.
    """
    print("Insertions (User notes not in MIDI):")
    for note in classifications['insertions']:
        print(format_note(note, prefix="User Note -> "))
    print("\nDeletions (MIDI notes not played by User):")
    for note in classifications['deletions']:
        print(format_note(note, prefix="MIDI Note -> "))
    print("\nOut-of-Tune Notes:")
    for midi_note, user_note in classifications['out_of_tune']:
        print(f"MIDI Note: {format_note(midi_note, prefix='MIDI Note -> ')}")
        print(f"User Note: {format_note(user_note, prefix='User Note -> ')}")
        print()
    print("\nMatched Notes (Correctly aligned):")
    for midi_note, user_note in classifications['matched']:
        print(f"MIDI Note: {format_note(midi_note, prefix='MIDI Note -> ')}")
        print(f"User Note: {format_note(user_note, prefix='User Note -> ')}")
        print()

# Print results in a readable format
print(f"Total Alignment Cost: {distance}\n")
print_classifications(classifications)

Total Alignment Cost: 191.17765252331253

Insertions (User notes not in MIDI):
User Note -> Pitch: 50.11, Duration: 0.168, Time: 0.296
User Note -> Pitch: 61.07, Duration: 0.116, Time: 0.464
User Note -> Pitch: 64.10, Duration: 0.070, Time: 0.807
User Note -> Pitch: 74.85, Duration: 0.186, Time: 3.274
User Note -> Pitch: 72.62, Duration: 0.061, Time: 3.913
User Note -> Pitch: 74.23, Duration: 0.107, Time: 4.261
User Note -> Pitch: 72.21, Duration: 0.189, Time: 4.455
User Note -> Pitch: 44.21, Duration: 0.070, Time: 5.271
User Note -> Pitch: 73.44, Duration: 0.128, Time: 6.467
User Note -> Pitch: 50.08, Duration: 0.075, Time: 6.702
User Note -> Pitch: 46.01, Duration: 0.099, Time: 7.851
User Note -> Pitch: 64.36, Duration: 0.070, Time: 8.046
User Note -> Pitch: 66.39, Duration: 0.052, Time: 9.712
User Note -> Pitch: 66.14, Duration: 0.136, Time: 9.880
User Note -> Pitch: 46.13, Duration: 0.061, Time: 11.935
User Note -> Pitch: 72.27, Duration: 0.055, Time: 12.704
User Note -> Pitch: nan