In [1]:
import os, json, glob
import librosa, mir_eval, guitarpro
import numpy as np

In [2]:
# Shared dataset root. The three folders below hold, per track name, the Guitar Pro
# file (.gp5), the JSON note annotation and the WAV audio that the evaluation cells
# join together by file stem.
_DATASET_ROOT = "/Volumes/MacOnly/UG_rewrite/all_time_top_by_hits"

SINGLE_TRACK_GTP_DIR = _DATASET_ROOT + "/clean_single_track_gtps"
SINGLE_TRACK_ANNO_DIR = _DATASET_ROOT + "/clean_single_track_annos"
SINGLE_TRACK_AUDIO_DIR = _DATASET_ROOT + "/clean_single_track_audio"

In [3]:
def poly_vs_mono_vs_silence(song):
    """Split the first track of a song into polyphonic, monophonic and silent time spans.

    Every beat in the first voice of each measure of ``song.tracks[0]`` is classified by
    the number of notes it carries (none -> silence, one -> monophonic, more -> polyphonic).
    Runs of consecutive beats of the same class are merged into a single
    ``[start, end]`` span expressed in seconds.

    Args:
        song (Song): A pyguitarpro Song object. ``song.tempo`` is the only tempo used for
            the tick-to-second conversion.

    Returns:
        list, list, list: Lists of ``[start, end]`` time stamps for the polyphonic, the
        monophonic and the silence segments -- in that order.
    """
    SILENCE, MONO, POLY = 0, 1, 2

    tempo = song.tempo
    spans = {POLY: [], MONO: [], SILENCE: []}
    last_kind = -1  # sentinel: no beat seen yet, so the first beat always opens a span

    first_voice_beats = [
        beat
        for measure in song.tracks[0].measures
        for beat in measure.voices[0].beats
    ]

    for beat in first_voice_beats:
        # Ticks -> seconds: 960 ticks make one quarter note and beat positions carry a
        # 960-tick offset (presumably pyguitarpro's start of the first measure -- confirm).
        start_sec = round(((beat.start - 960) / 960) / (tempo / 60), 4)
        length_sec = round((beat.duration.time / 960) / (tempo / 60), 4)
        end_sec = start_sec + length_sec

        # 0 notes -> silence, 1 note -> mono, 2 or more notes -> poly
        kind = min(len(beat.notes), POLY)

        if kind == last_kind:
            # same class as the previous beat: stretch the span that is currently open
            spans[kind][-1][1] = end_sec
        else:
            # class changed: open a new span in the matching list
            spans[kind].append([start_sec, end_sec])
        last_kind = kind

    return spans[POLY], spans[MONO], spans[SILENCE]


In [4]:
def f0_note_tracker(y, sr, frame_size, hop_size, pitch_diff_thres, note_dur_thres):
    """Note tracking based on F0 segmentation.

    The audio signal is passed to PYIN to estimate a frame-wise f0 curve, which is converted
    to (fractional) MIDI pitch. The curve is cut wherever the pitch difference between two
    adjacent frames exceeds `pitch_diff_thres` and wherever the signal switches between
    voiced and unvoiced. Unvoiced runs are dropped, and so are spurious notes that do not
    span more frames than `note_dur_thres` corresponds to.

    Args:
        y (array): audio signal
        sr (int): sampling rate of `y`
        frame_size (int): PYIN analysis frame length in samples
        hop_size (int): hop between successive frames in samples
        pitch_diff_thres (float): F0 cutting point threshold, in MIDI note numbers
            (i.e. semitones).
        note_dur_thres (int): Minimum note duration in milliseconds. Estimated notes
            shorter than this threshold are discarded.

    Returns:
        array: Estimated intervals ``[(onset, offset)]`` in seconds relative to the start of
        `y`, shape ``(n_notes, 2)``. When nothing is found the array has shape ``(0, 2)``.
    """
    no_notes = np.empty((0, 2))

    # if the segment is shorter than note_dur_thres, just discard it
    if len(y) < note_dur_thres / 1000 * sr:
        return no_notes

    f0, voiced_flag, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("G6"),
        sr=sr,
        frame_length=frame_size,
        hop_length=hop_size,
        center=True,
    )
    times = librosa.times_like(f0, sr=sr, hop_length=hop_size)
    notes = librosa.hz_to_midi(f0)
    voiced = np.asarray(voiced_flag, dtype=bool)

    # PYIN reports unvoiced frames as NaN. Any comparison against NaN is False, so a pitch
    # threshold alone never cuts at a voiced/unvoiced transition: notes separated by an
    # unvoiced gap would be merged and all-unvoiced runs would be reported as notes.
    # Voicing changes are therefore treated as additional cut points.
    with np.errstate(invalid="ignore"):
        pitch_jump = np.abs(np.diff(notes)) > pitch_diff_thres
    voicing_change = voiced[1:] != voiced[:-1]
    cut_points = np.flatnonzero(pitch_jump | voicing_change) + 1

    min_frames = librosa.time_to_frames(note_dur_thres / 1000, sr=sr, hop_length=hop_size)

    # After cutting at every voicing change each chunk is uniformly voiced or unvoiced,
    # so checking its first frame is enough to reject unvoiced chunks.
    est_intervals = [
        [chunk_times[0], chunk_times[-1]]
        for chunk_times, chunk_voiced in zip(np.split(times, cut_points), np.split(voiced, cut_points))
        if len(chunk_times) > min_frames and chunk_voiced[0]
    ]

    if not est_intervals:
        return no_notes
    return np.array(est_intervals)

In [None]:
# Grid search over the hyper-parameters of the F0-based note tracker.
#
# considering the time resolution, a frame size of 8192 is 186ms, the frame is way too long
# (alternative setting kept for reference: FRAME_SIZE = [2048, 4096])
FRAME_SIZE = [1024]
HOP_SIZE_RATIO = [0.25, 0.5, 0.75]
PDT = [0.1, 0.2, 0.4, 0.8]  # pitch_diff_thres candidates
NDT = [25, 50, 75]  # note_dur_thres candidates (milliseconds)

# glob returns paths in arbitrary order; sort so that the evaluated subset is reproducible.
audio_files = sorted(glob.glob(os.path.join(SINGLE_TRACK_AUDIO_DIR, "*.wav")))[:20]

# Audio, mono segments and annotations do not depend on the hyper-parameters,
# so load them once instead of once per parameter combination.
eval_tracks = []
for audio_file in audio_files:
    track_name = os.path.splitext(os.path.basename(audio_file))[0]
    gtp_file = os.path.join(SINGLE_TRACK_GTP_DIR, track_name + ".gp5")
    anno_file = os.path.join(SINGLE_TRACK_ANNO_DIR, track_name + ".json")

    mono_segments = poly_vs_mono_vs_silence(guitarpro.parse(gtp_file))[1]
    if not mono_segments:
        # no monophonic passage -> the tracker can never produce an estimate for this track
        continue

    y, sr = librosa.load(audio_file, sr=None)
    with open(anno_file) as anno:
        annotation = json.load(anno)
    eval_tracks.append((y, sr, mono_segments, annotation))

for frame_size in FRAME_SIZE:
    for hop_size_ratio in HOP_SIZE_RATIO:
        hop_size = int(frame_size * hop_size_ratio)
        for pdt in PDT:
            for ndt in NDT:
                print(f"FR: {frame_size}, HOP: {hop_size}, PDT: {pdt}, NDT: {ndt}")
                # bypass the mono detector, test the F0 note tracker on strictly mono segments
                matching_cnt = 0
                est_cnt = 0
                anno_cnt = 0

                for y, sr, mono_segments, annotation in eval_tracks:
                    all_est_intervals = []
                    for seg_start, seg_end in mono_segments:
                        mono_segment = y[int(seg_start * sr) : int(seg_end * sr)]
                        est_intervals = f0_note_tracker(mono_segment, sr, frame_size=frame_size, hop_size=hop_size, pitch_diff_thres=pdt, note_dur_thres=ndt)
                        if len(est_intervals) == 0:
                            continue
                        # convert to global time
                        all_est_intervals.append(est_intervals + seg_start)

                    # tracks without any estimate are left out of all three counters
                    if not all_est_intervals:
                        continue
                    all_est_intervals = np.concatenate(all_est_intervals, axis=0)

                    # NOTE(review): references come from the whole annotation file while the
                    # estimates only cover mono segments; if the annotation also contains notes
                    # of polyphonic passages they count as misses -- confirm this is intended.
                    # Only reference notes longer than one analysis frame are kept.
                    all_ref_intervals = np.array([
                        [note_event["time"]["start"], note_event["time"]["start"] + note_event["time"]["dur"]]
                        for note_event in annotation
                        if note_event["time"]["dur"] > frame_size / sr
                    ])

                    est_cnt += len(all_est_intervals)
                    anno_cnt += len(all_ref_intervals)

                    if len(all_ref_intervals) == 0:
                        # mir_eval indexes the interval columns and fails on an empty reference;
                        # without references every estimate is simply a false positive
                        continue

                    onset_matching = mir_eval.transcription.match_note_onsets(all_ref_intervals, all_est_intervals)
                    offset_matching = mir_eval.transcription.match_note_offsets(all_ref_intervals, all_est_intervals)
                    # a note is correct only if the same (ref, est) pair matches on onset AND offset
                    offset_pairs = set(map(tuple, offset_matching))
                    matching = [match for match in onset_matching if tuple(match) in offset_pairs]

                    matching_cnt += len(matching)

                # guard the ratios: a combination may yield no estimates or no matches at all
                precision = matching_cnt / est_cnt if est_cnt else 0.0
                recall = matching_cnt / anno_cnt if anno_cnt else 0.0
                f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

                print(matching_cnt, est_cnt, anno_cnt)
                print(f"precision: {precision}, recall: {recall}, f1: {f1}")

In [4]:
def onset_note_tracker(y, sr, frame_size, hop_size, note_dur_thres, backtrack=False):
    """Note tracking based on onset detection (note-event separation heuristics).

    Onsets are detected on the onset-strength envelope of `y`. Every span between two
    consecutive onsets is taken as one note; time 0 is added as an extra onset and the last
    note runs until the end of the signal. Notes that are not longer than `note_dur_thres`
    are discarded.

    Args:
        y (array): audio signal
        sr (int): sampling rate of `y`
        frame_size (int): FFT size (`n_fft`) used for the onset-strength envelope
        hop_size (int): hop between successive frames in samples
        note_dur_thres (int): Minimum note duration in milliseconds.
        backtrack (bool, optional): Forwarded to `librosa.onset.onset_detect`.
            Defaults to False.

    Returns:
        array: Estimated intervals ``[(onset, offset)]`` in seconds relative to the start of
        `y`, shape ``(n_notes, 2)``. When nothing is found the array has shape ``(0, 2)``.
    """
    # An empty segment would fail inside librosa, and a segment shorter than the minimum
    # note duration cannot contain a note that survives the duration filter below.
    if len(y) == 0 or len(y) < note_dur_thres / 1000 * sr:
        return np.empty((0, 2))

    # The default onset-strength envelope is used; an explicit mel-spectrogram
    # (power_to_db) envelope was an earlier alternative.
    onset_envelope = librosa.onset.onset_strength(y=y, sr=sr, n_fft=frame_size, hop_length=hop_size)
    onsets = librosa.onset.onset_detect(
        y=None,
        sr=sr,
        onset_envelope=onset_envelope,
        hop_length=hop_size,
        units='time',
        backtrack=backtrack,
    )

    # the onset detection tends to ignore the first note in the audio, so time 0 is added
    # to the onsets; the end of the audio closes the last note
    boundaries = np.concatenate(([0.0], onsets, [len(y) / sr]))
    est_intervals = np.column_stack((boundaries[:-1], boundaries[1:]))

    # drop spurious notes (this also removes the zero-length note created when an onset
    # was already detected at time 0)
    long_enough = (est_intervals[:, 1] - est_intervals[:, 0]) > note_dur_thres / 1000
    return est_intervals[long_enough]

In [None]:
# Grid search over the hyper-parameters of the onset-based note tracker.
#
# considering the time resolution, a frame size of 8192 is 186ms, the frame is way too long
FRAME_SIZE = [1024, 2048, 4096]
HOP_SIZE_RATIO = [0.25, 0.5, 0.75]
NDT = [25, 50, 75]  # note_dur_thres candidates (milliseconds)

# glob returns paths in arbitrary order; sort so that the evaluated subset is reproducible.
audio_files = sorted(glob.glob(os.path.join(SINGLE_TRACK_AUDIO_DIR, "*.wav")))[:40]

# Audio, mono segments and annotations do not depend on the hyper-parameters,
# so load them once instead of once per parameter combination.
eval_tracks = []
for audio_file in audio_files:
    track_name = os.path.splitext(os.path.basename(audio_file))[0]
    gtp_file = os.path.join(SINGLE_TRACK_GTP_DIR, track_name + ".gp5")
    anno_file = os.path.join(SINGLE_TRACK_ANNO_DIR, track_name + ".json")

    mono_segments = poly_vs_mono_vs_silence(guitarpro.parse(gtp_file))[1]
    if not mono_segments:
        # no monophonic passage -> the tracker can never produce an estimate for this track
        continue

    y, sr = librosa.load(audio_file, sr=None)
    with open(anno_file) as anno:
        annotation = json.load(anno)
    eval_tracks.append((y, sr, mono_segments, annotation))

for frame_size in FRAME_SIZE:
    for hop_size_ratio in HOP_SIZE_RATIO:
        hop_size = int(frame_size * hop_size_ratio)
        for ndt in NDT:
            print(f"FR: {frame_size}, HOP: {hop_size}, NDT: {ndt}")
            # bypass the mono detector, test onset note tracker on strictly mono segments
            matching_cnt = 0
            est_cnt = 0
            anno_cnt = 0

            for y, sr, mono_segments, annotation in eval_tracks:
                all_est_intervals = []
                for seg_start, seg_end in mono_segments:
                    mono_segment = y[int(seg_start * sr) : int(seg_end * sr)]
                    est_intervals = onset_note_tracker(mono_segment, sr, frame_size=frame_size, hop_size=hop_size, note_dur_thres=ndt, backtrack=True)
                    if len(est_intervals) == 0:
                        continue
                    # convert to global time
                    all_est_intervals.append(est_intervals + seg_start)

                # tracks without any estimate are left out of all three counters
                if not all_est_intervals:
                    continue
                all_est_intervals = np.concatenate(all_est_intervals, axis=0)

                # NOTE(review): references come from the whole annotation file while the
                # estimates only cover mono segments; if the annotation also contains notes
                # of polyphonic passages they count as misses -- confirm this is intended.
                # Only reference notes longer than one analysis frame are kept.
                all_ref_intervals = np.array([
                    [note_event["time"]["start"], note_event["time"]["start"] + note_event["time"]["dur"]]
                    for note_event in annotation
                    if note_event["time"]["dur"] > frame_size / sr
                ])

                est_cnt += len(all_est_intervals)
                anno_cnt += len(all_ref_intervals)

                if len(all_ref_intervals) == 0:
                    # mir_eval indexes the interval columns and fails on an empty reference;
                    # without references every estimate is simply a false positive
                    continue

                onset_matching = mir_eval.transcription.match_note_onsets(all_ref_intervals, all_est_intervals)
                offset_matching = mir_eval.transcription.match_note_offsets(all_ref_intervals, all_est_intervals)
                # a note is correct only if the same (ref, est) pair matches on onset AND offset
                offset_pairs = set(map(tuple, offset_matching))
                matching = [match for match in onset_matching if tuple(match) in offset_pairs]

                matching_cnt += len(matching)

            # guard the ratios: a combination may yield no estimates or no matches at all
            precision = matching_cnt / est_cnt if est_cnt else 0.0
            recall = matching_cnt / anno_cnt if anno_cnt else 0.0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

            print(matching_cnt, est_cnt, anno_cnt)
            print(f"precision: {precision}, recall: {recall}, f1: {f1}")