In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../..')

from app.modules.core.audio.AudioData import AudioData
from app.modules.core.audio.AudioPlayer import AudioPlayer
from app.modules.core.midi.MidiData import MidiData
from app.modules.core.midi.MidiPlayer import MidiPlayer
from app.modules.core.midi.MidiSynth import MidiSynth

from app.modules.processing.pda.PYin import PYin
from app.modules.processing.dtw.OnsetDf import UserOnsetDf, MidiOnsetDf
from app.config import AppConfig
from app.modules.core.recording.PitchDf import PitchDf, PitchConfig
from app.modules.processing.dtw.DTW import DTW

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [4]:
# Input recording for pitch detection (relative path).
AUDIO_FILEPATH = '../../app/resources/audio/user_fugue2.mp3'
MIN_VIOLIN_FREQ = 196.0  # presumably Hz — TODO confirm
# NOTE(review): SAMPLE_RATE is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
SAMPLE_RATE = 44100

# Load the recording into an AudioData object, then run PYin.pyin on its
# `data` attribute.
audio_data = AudioData()
audio_data.load_data(AUDIO_FILEPATH)
# Two results come back: `pitches` is later passed to PitchDf and the pitch
# plot; `most_likely_pitches` is later passed to UserOnsetDf.
pitches, most_likely_pitches = PYin.pyin(audio_data.data)

Processing frame 4604/4604
Done!


In [5]:
# Resource locations (relative paths)
SOUNDFONT_FILEPATH = '../../app/resources/MuseScore_General.sf3'
MIDI_FILEPATH = '../../app/resources/midi/fugue.mid'

# Synth backed by the soundfont
midi_synth = MidiSynth(SOUNDFONT_FILEPATH)

# MIDI file wrapped in a MidiData object
midi_data = MidiData(MIDI_FILEPATH)

# Player driven by the synth, with the MIDI data loaded into it
midi_player = MidiPlayer(midi_synth)
midi_player.load_midi(midi_data)

# Uncomment to play the MIDI from the beginning:
# midi_player.play(start_time=0)

Loading MidiSynth...
Synth + soundfont loaded.


In [6]:
# Inspect the note table held by the MidiData object (columns: start, channel,
# pitch, velocity, duration, frequency).
midi_data.pitch_df

Unnamed: 0,start,channel,pitch,velocity,duration,frequency
0,0.000000,0,62,100,0.185938,293.664768
1,0.187500,0,69,100,0.185938,440.000000
2,0.375001,0,73,100,0.185938,554.365262
3,0.562501,0,76,100,0.185938,659.255114
4,0.750002,0,77,100,0.185938,698.456463
...,...,...,...,...,...,...
60,11.250030,0,63,100,0.185938,311.126984
61,11.437530,0,74,100,0.185938,587.329536
62,11.625031,0,79,100,0.185938,783.990872
63,11.812532,0,72,100,0.185938,523.251131


## Sanity check: Plot pitches

In [11]:
# View the pitch estimates in the app's pitch plot.
#
# Relies on names defined by earlier cells: `sys` (import cell), `midi_data`
# (MIDI setup cell) and `pitches` (pYIN cell). The MIDI file is deliberately
# not parsed again here, so `midi_data` stays the same object that
# `midi_player` was loaded with.
from PyQt6.QtCore import QCoreApplication
from PyQt6.QtWidgets import QApplication

# NOTE(review): '../..' is already appended to sys.path by the import cell, so
# this extra entry looks redundant — confirm before removing it. The
# membership check keeps re-runs of this cell from appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

from app.ui.plots.PitchPlot import RunPitchPlot

# Reuse the existing Qt application object when this cell is re-run in the
# same kernel; create one only if none exists yet.
app = QCoreApplication.instance() or QApplication(sys.argv)

pitchplot = RunPitchPlot(app, midi_data=midi_data, pitches=pitches)

Plotting pitches...
Done!


In [8]:
# Pitch-bin configuration (defines the resolution of the pitch bins).
# fmin reuses MIN_VIOLIN_FREQ (196.0, set in the audio-loading cell) rather
# than repeating the literal, so the lower bound is defined in one place.
pitch_config = PitchConfig(
    bins_per_semitone=10, tuning=440.0, fmin=MIN_VIOLIN_FREQ, fmax=5000
)

# Build the pitch dataframe from the recording, the bin configuration and the
# pYIN `pitches` result, then display it.
pitch_df = PitchDf(audio_data, pitch_config, pitches)
pitch_df.df

Unnamed: 0,time,frequency,midi_num,probability,volume,audio_idx
0,0.000000,2322.927393,97.804424,1.000000e-02,0.020647,0
1,0.000000,1170.391765,85.936993,0.000000e+00,0.020647,0
2,0.000000,772.024046,78.733707,0.000000e+00,0.020647,0
3,0.000000,586.306036,73.969805,0.000000e+00,0.020647,0
4,0.000000,469.504880,70.123640,0.000000e+00,0.020647,0
...,...,...,...,...,...,...
138733,13.360181,53.488948,32.517710,3.878101e-14,0.028113,589184
138734,13.360181,50.868580,31.648119,0.000000e+00,0.028113,589184
138735,13.360181,49.554469,31.195002,1.802509e-11,0.028113,589184
138736,13.360181,45.530465,29.728807,3.174487e-16,0.028113,589184


In [10]:
# Reduced view of the pitch dataframe with the same columns as pitch_df.df.
# NOTE(review): presumably keeps the highest-probability candidate per frame,
# judging by the method name — confirm in PitchDf.
pitch_df.best_prob_df()

Unnamed: 0,time,frequency,midi_num,probability,volume,audio_idx
0,0.000000,293.685203,62.001205,0.163487,0.020647,0
1,0.002902,294.031231,62.021591,0.360580,0.021011,128
2,0.005805,294.035593,62.021847,0.492799,0.021652,256
3,0.008707,294.230242,62.033304,0.617195,0.022511,384
4,0.011610,294.525761,62.050684,0.658262,0.023535,512
...,...,...,...,...,...,...
4599,13.348571,148.735341,50.222928,0.513916,0.027452,588672
4600,13.351474,148.730425,50.222355,0.407259,0.027768,588800
4601,13.354376,148.682657,50.216794,0.338989,0.027874,588928
4602,13.357279,148.659318,50.214076,0.511272,0.028044,589056


### Compute Onset Data
We compute `onset_times` with Essentia's 'complex' onset detection method, which works better for non-percussive onsets (e.g., music signals). It analyzes the spectrum of the audio and, as far as we can tell, searches for timbrally percussive spectra. It also catches note changes that stay on the same pitch.

We compute `note_df` from sufficiently large pitch changes, detected with a rolling median window. This is meant to catch note changes that may be less percussive (e.g., slurred notes).

We combine them into `onset_df` to capture as many types of note change as we can. Onsets that fall within `combine_threshold` seconds of each other are merged into the same row of the dataframe, so that they are treated as a single onset for warping purposes.

In [13]:
# Load the MIDI file into an AudioData object using the soundfont.
# NOTE(review): presumably this synthesizes the MIDI to audio so it can be
# analysed the same way as the recording — confirm in AudioData.load_midi_file.
midi_audio = AudioData()
midi_audio.load_midi_file(MIDI_FILEPATH, SOUNDFONT_FILEPATH)

# Onset tables for the two sides of the alignment: the MIDI side is built from
# the MIDI audio plus the MidiData object, the user side from the recording
# plus the `most_likely_pitches` result of pYIN.
midi_onset_df = MidiOnsetDf(midi_audio, midi_data)
user_onset_df = UserOnsetDf(audio_data, most_likely_pitches)

Detecting onsets... Done!
Detecting pitch changes with rolling median window_size=30 and threshold=0.6... Done!


In [14]:
# Inspect the MIDI-side onset table (columns: time, cqt_norm).
midi_onset_df.onset_df

Unnamed: 0,time,cqt_norm
0,0.000000,"[-0.09259383, -0.14248902, -0.05362097, -0.032..."
1,0.187500,"[-0.13928379, -0.11454515, -0.12279686, -0.140..."
2,0.375001,"[-0.099402875, -0.101734154, -0.108879134, -0...."
3,0.562501,"[-0.13577668, -0.20675175, -0.22074828, -0.140..."
4,0.750002,"[-0.13700058, -0.14490505, -0.16714706, -0.176..."
...,...,...
60,11.250030,"[-0.11341987, -0.11064533, -0.10846966, -0.097..."
61,11.437530,"[-0.067895986, -0.066869445, -0.061751507, -0...."
62,11.625031,"[-0.18129997, -0.1311408, -0.097627, -0.099674..."
63,11.812532,"[-0.13862708, -0.1431478, -0.1627941, -0.21417..."


In [15]:
# Inspect the user-side onset table (columns: time, pitch_diff, onset, cqt_norm).
# NOTE(review): the pitch_diff / onset flags presumably record which detector
# (pitch change vs. onset detection) produced each row — confirm in UserOnsetDf.
user_onset_df.onset_df

Unnamed: 0,time,pitch_diff,onset,cqt_norm
0,0.000000,True,True,"[-0.080838166, -0.0668917, -0.064405546, -0.05..."
1,0.232200,False,True,"[-0.081852324, -0.07638682, -0.07203554, -0.06..."
2,0.296054,True,False,"[-0.11152571, -0.10601042, -0.10273066, -0.098..."
3,0.464399,True,True,"[-0.22583568, -0.14463839, -0.12643395, -0.124..."
4,0.580499,True,False,"[-0.07312691, -0.07118491, -0.0653522, -0.0571..."
...,...,...,...,...
77,12.585216,False,True,"[-0.10679642, -0.10286422, -0.10027398, -0.095..."
78,12.704218,True,False,"[-0.12708835, -0.12495376, -0.12868106, -0.132..."
79,12.759365,False,True,"[-0.13078195, -0.11990827, -0.119574085, -0.11..."
80,12.910295,False,True,"[-0.090926185, -0.089150496, -0.08656771, -0.0..."


## Align!!!

In [20]:
# Compute the DTW alignment between the user onsets and the MIDI onsets
# (user side passed first).
alignment = DTW.align(user_onset_df, midi_onset_df)
# Turn the alignment into a dataframe pairing each MIDI onset time with the
# user onset time(s) matched to it.
align_df = DTW.align_df(alignment, user_onset_df, midi_onset_df)

DTW alignment computed.
Distance: 9.877340674070053
Mean alignment error: 8.626506024096386


In [21]:
# Inspect the alignment table (columns: midi_time, user_time); user_time holds
# a list, which can contain several user onset times for one MIDI onset.
align_df

Unnamed: 0,midi_time,user_time
0,0.000000,[0.0]
1,0.187500,"[0.2321995496749878, 0.2960544217687075, 0.464..."
2,0.375001,[0.6617687344551086]
3,0.562501,[0.8068934240362812]
4,0.750002,[0.8765532879818594]
...,...,...
60,11.250030,[12.155646324157715]
61,11.437530,[12.411066055297852]
62,11.625031,[12.58521556854248]
63,11.812532,[12.70421768707483]
