## About

*winterreise_rt* is a subset of the Schubert Winterreise Dataset (SWD) [1] for real-time lyrics alignment.
The Schubert Winterreise Dataset (SWD) is a collection of resources for Schubert’s song cycle for voice and piano, ‘Winterreise’.
The song cycle Winterreise D911 (Op. 89) consists of 24 songs composed for solo voice with piano accompaniment.

## Download the latest version of the Schubert Winterreise Dataset (SWD)
First, we download the SWD and restructure its files from scratch to match the layout of the *winterreise_rt* dataset.

In [None]:
# Download the SWD v2.0 archive from Zenodo and save it as winterreise.zip.
!wget "https://zenodo.org/record/5139893/files/Schubert_Winterreise_Dataset_v2-0.zip?download=1" -O winterreise.zip
# Extract into ./winterreise. The archive is named without its .zip suffix here;
# presumably unzip resolves it to winterreise.zip -- TODO confirm.
!unzip winterreise -d winterreise
# Remove the downloaded archive after extraction.
!rm -r winterreise.zip

In [None]:
from pathlib import Path
from tqdm import tqdm
import music21
from music21 import converter
import numpy as np
import scipy
import librosa
import pandas as pd
import IPython.display as ipd
from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
from synctoolbox.feature.utils import estimate_tuning

from reconstruct import main as reconstruct_main

# Root folders of the downloaded SWD and of the derived winterreise_rt dataset.
SWD_PATH = Path("./winterreise")
WINTERREISE_RT_PATH = Path("./winterreise_rt")

# Sub-directories and filename prefix used to build dataset file paths.
AUDIO_DIR = "01_RawData/audio_wav/"
SCORE_DIR = "01_RawData/score_musicxml/"
LYRICS_DIR = "01_RawData/lyrics_txt/"
NOTE_ANN_DIR = "02_Annotations/ann_audio_note/"
AUDIO_KEY_ANN_DIR = "03_ExtraMaterial/"
FILENAME_PREFIX = "Schubert_D911-"

# singer_name: role
SINGERS = dict(HU33="ref", SC06="target")
# Zero-padded song identifiers: 01 ~ 24
SONG_IDS = [f"{song_no:02d}" for song_no in range(1, 25)]

# Audio and feature settings.
SAMPLE_RATE = 16000
FRAME_RATE = 25
HOP_LENGTH = SAMPLE_RATE // FRAME_RATE  # 16000 / 25 = 640 samples per frame
# Evaluation tolerances in milliseconds.
TOLERANCES = [200, 300, 500, 750, 1000]
N_FRAMES = 3000
# Settings handed to sync_via_mrmsdtw.
THRESHOLD_REC = 1_000_000
STEP_WEIGHTS = np.array([1.5, 1.5, 2.0])

In [None]:
# Reconstruct the winterreise_rt dataset from the downloaded SWD.
# The shared path constants are used instead of repeating the literal paths,
# so this cell stays in sync with the rest of the notebook.
reconstruct_main(SWD_PATH, WINTERREISE_RT_PATH)

### How to Extract Lyrics Annotations from the Symbolic Music Data

In [None]:
def _update_ann_info_from_note_obj(
    annots, note, offset, quarter_tempo, last_pitch, lyrics
):
    start_timestamp = offset * (60 / quarter_tempo)
    pitch = int(note.pitch.ps)
    pitch_class = note.pitch.pitchClass
    pitch_name = note.pitch.nameWithOctave
    lyric = (
        note.lyrics[0].text.strip("- ") if note.lyrics else ""
    )  # assume there's only 1 verse
    syllabic = (
        note.lyrics[0].syllabic if note.lyrics else "none"
    )  # assume there's only 1 verse
    if (
        lyric == ""
        and pitch == last_pitch
        and note.tie is not None
        and note.tie.type == "stop"
    ):  # omit tie stop note with no lyric (omitted in ref annotation)
        print(f"skip tie note! index: {len(annots)}")
        return annots, last_pitch, lyrics
    
    last_pitch = pitch
    annots.append(
        {
            "start": start_timestamp,
            "pitch": pitch,
            "pitchclass": pitch_class,
            "pitchname": pitch_name,
            "offset": offset,
            "instrument": "voice",
            "lyric": lyric,
            "syllabic": syllabic,
        }
    )
    lyrics += lyric + " " if syllabic in {"single", "end"} else lyric
    return annots, last_pitch, lyrics

def extract_annots_lyrics_from_score(xml_path):
    """Collect note annotations and the lyrics text from a score file.

    The score at ``xml_path`` is parsed with music21's ``converter`` and its
    first part is walked recursively. Chords, ``Voice`` streams and plain
    notes are each turned into ``(note, offset)`` pairs, which are recorded
    through ``_update_ann_info_from_note_obj``.

    Args:
        xml_path: path object of the score file (``as_posix()`` is called on it).

    Returns:
        Tuple ``(annots, lyrics)``: the annotation dicts sorted by their
        ``"start"`` value, and the accumulated lyrics string.
    """
    score = converter.parse(xml_path.as_posix())
    vocal_part = score.parts[0]

    # The tempo is read from the first metronome-mark boundary of the score.
    _begin, _end, tempo_mark = score.metronomeMarkBoundaries()[0]
    # normalize tempo to quarter note
    quarter_tempo = tempo_mark.number * tempo_mark.referent.quarterLength
    print(
        f"mm: {tempo_mark}, mm.number: {tempo_mark.number}, quarter_tempo: {quarter_tempo}"
    )

    annots, lyrics, last_pitch = [], "", 0

    for el in vocal_part.recurse():
        if isinstance(el, music21.chord.Chord):
            # Every note of a chord shares the chord's offset.
            chord_offset = el.activeSite.offset + el.offset
            pending = [(chord_note, chord_offset) for chord_note in el.notes]
        elif isinstance(el, music21.stream.Voice):
            measure_offset = el.activeSite.offset
            pending = [
                (member, member.offset + measure_offset)
                for member in el.elements
                if isinstance(member, music21.note.Note)
            ]
        elif isinstance(el, music21.note.Note):
            site = el.activeSite
            # Notes sitting in a Voice at offset 0 are handled by the Voice
            # branch above, so they are not recorded a second time here.
            if site.offset == 0 and isinstance(site, music21.stream.Voice):
                continue
            pending = [(el, site.offset + el.offset)]
        else:
            continue

        for note_obj, note_offset in pending:
            annots, last_pitch, lyrics = _update_ann_info_from_note_obj(
                annots, note_obj, note_offset, quarter_tempo, last_pitch, lyrics
            )

    annots.sort(key=lambda ann: ann["start"])
    return annots, lyrics

In [None]:
# Extract the note annotations and the lyrics from the score file of song 01.
annots, lyrics = extract_annots_lyrics_from_score(WINTERREISE_RT_PATH / SCORE_DIR / f"{FILENAME_PREFIX}01.xml")

In [None]:
# Show the extracted note annotations.
annots

In [None]:
# Show the extracted lyrics string.
lyrics

### Run Offline Lyrics Alignment and Evaluation between *score* and *ref* at the Voice Note Level

In [None]:
def compute_strict_alignment_path_mask(P):
    """Reduce a warping path to its strictly monotonic points.

    Despite the name, the filtered path itself is returned, not the mask.

    Args:
        P: array-like with one path point ``(n, m)`` per row; it is copied,
            so the input is left untouched.

    Returns:
        Array holding the first point, every point that increases in both
        coordinates relative to its predecessor and does not lie on the last
        row or column, and the last point.
    """
    path = np.array(P, copy=True)
    last_n, last_m = path[-1]
    col_n, col_m = path[:, 0], path[:, 1]

    # The first point is always kept (start boundary condition); later points
    # are kept only when both coordinates strictly increase.
    keep = np.ones(len(path), dtype=bool)
    keep[1:] = (col_n[1:] > col_n[:-1]) & (col_m[1:] > col_m[:-1])
    # Drop every point that lies on the last row or the last column ...
    keep &= (col_n != last_n) & (col_m != last_m)
    # ... except the final point itself (end boundary condition).
    keep[-1] = True

    return path[keep]

def make_path_strictly_monotonic(P: np.ndarray) -> np.ndarray:
    """Return a strictly monotonic version of a warping path.

    Args:
        P: warping path with one row per sequence, i.e. shape ``(2, N)``.

    Returns:
        The filtered path in the same orientation as the input, shape
        ``(2, M)``.
    """
    # compute_strict_alignment_path_mask works on one path point per row, so
    # the path is transposed going in. Its result must be transposed back:
    # transfer_note_positions reads wp[0] and wp[1] as the two full index
    # sequences, which only holds for the (2, M) layout.
    return compute_strict_alignment_path_mask(P.T).T

def transfer_note_positions(wp, note_ann_1, feature_rate=FRAME_RATE):
    """Map positions of version 1 onto version 2 along a warping path.

    Args:
        wp: warping path; ``wp[0]`` and ``wp[1]`` are divided by
            ``feature_rate`` and used as the x and y samples of the mapping.
        note_ann_1: positions of version 1 at which the mapping is evaluated.
        feature_rate: divisor applied to both rows of ``wp``.

    Returns:
        The linearly interpolated positions in version 2.
    """
    positions_1 = wp[0] / feature_rate
    positions_2 = wp[1] / feature_rate
    warp = scipy.interpolate.interp1d(positions_1, positions_2, kind="linear")
    return warp(note_ann_1)

def get_stats(
    wp,
    note_ann_filepath_1,
    note_ann_filepath_2,
    feature_rate=FRAME_RATE,
    tolerances=TOLERANCES,
):  # tolerances in milliseconds
    """Evaluate a warping path against two note annotation files.

    The ``"start"`` column of both CSV files is read, the positions of file 1
    are transferred to version 2 along the strictly monotonic warping path,
    and the result is compared with the positions of file 2.

    Args:
        wp: warping path; passed through ``make_path_strictly_monotonic``.
        note_ann_filepath_1: CSV file with a ``"start"`` column (version 1).
        note_ann_filepath_2: CSV file with a ``"start"`` column (version 2).
        feature_rate: forwarded to ``transfer_note_positions``.
        tolerances: thresholds in milliseconds.

    Returns:
        Tuple ``(mean, std, misalignments, errors, absolute_errors)``:
        mean and standard deviation of the absolute errors scaled by 1000,
        the fraction of absolute errors above each tolerance, the signed
        errors and the absolute errors.
    """
    strict_wp = make_path_strictly_monotonic(wp)

    starts_1, starts_2 = (
        pd.read_csv(filepath_or_buffer=csv_path, delimiter=",")["start"]
        for csv_path in (note_ann_filepath_1, note_ann_filepath_2)
    )

    starts_1_in_version_2 = transfer_note_positions(
        strict_wp, starts_1, feature_rate
    )

    errors_at_voice_notes = starts_2 - starts_1_in_version_2
    absolute_errors_at_voice_notes = np.abs(errors_at_voice_notes)

    # Share of notes whose absolute error exceeds each tolerance (ms -> s).
    misalignments = np.array(
        [
            np.mean(absolute_errors_at_voice_notes > tolerance_ms / 1000.0)
            for tolerance_ms in tolerances
        ]
    )

    mean_abs_error = np.mean(absolute_errors_at_voice_notes) * 1000.0
    std_abs_error = np.std(absolute_errors_at_voice_notes) * 1000.0

    return (
        mean_abs_error,
        std_abs_error,
        misalignments,
        errors_at_voice_notes,
        absolute_errors_at_voice_notes,
    )

def _get_DLNCO_features_from_audio(
    audio,
    tuning_offset,
    feature_sequence_length,
    Fs=SAMPLE_RATE,
    feature_rate=FRAME_RATE,
    verbose=False,
):
    """Compute DLNCO features for one audio signal.

    Pitch-onset features are extracted with ``audio_to_pitch_onset_features``
    and converted with ``pitch_onset_features_to_DLNCO``.

    Args:
        audio: audio signal, forwarded as ``f_audio``.
        tuning_offset: forwarded to the pitch-onset extraction.
        feature_sequence_length: forwarded to the DLNCO conversion.
        Fs: sampling rate forwarded to the pitch-onset extraction.
        feature_rate: forwarded to the DLNCO conversion.
        verbose: forwarded as ``verbose`` to the first step and as
            ``visualize`` to the second.

    Returns:
        The result of ``pitch_onset_features_to_DLNCO``.
    """
    pitch_onsets = audio_to_pitch_onset_features(
        f_audio=audio,
        Fs=Fs,
        tuning_offset=tuning_offset,
        verbose=verbose,
    )
    return pitch_onset_features_to_DLNCO(
        f_peaks=pitch_onsets,
        feature_rate=feature_rate,
        feature_sequence_length=feature_sequence_length,
        visualize=verbose,
    )


def run_offline_alignment(score_audio_path: Path, ref_audio_path: Path):
    """Align two recordings offline with chroma and DLNCO features.

    Both files are loaded at ``SAMPLE_RATE``; tuning, CENS chroma and DLNCO
    features are computed for each, and the two feature sets are synchronized
    with ``sync_via_mrmsdtw``.

    Args:
        score_audio_path: path of the first recording (version 1).
        ref_audio_path: path of the second recording (version 2).

    Returns:
        The warping path returned by ``sync_via_mrmsdtw``.
    """
    audio_paths = (score_audio_path, ref_audio_path)

    # read audio
    signals = [
        librosa.load(audio_path.as_posix(), sr=SAMPLE_RATE)[0]
        for audio_path in audio_paths
    ]

    # estimate tuning
    tuning_offsets = [
        estimate_tuning(signal, SAMPLE_RATE, N=HOP_LENGTH * 2) for signal in signals
    ]

    # generate chroma features from librosa
    chromas = [
        librosa.feature.chroma_cens(y=signal, sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
        for signal in signals
    ]

    # generate DLNCO features, matched in length to the chroma sequences
    dlncos = [
        _get_DLNCO_features_from_audio(
            audio=signal,
            tuning_offset=tuning,
            feature_sequence_length=chroma.shape[1],
        )
        for signal, tuning, chroma in zip(signals, tuning_offsets, chromas)
    ]

    chroma_1, chroma_2 = chromas
    dlnco_1, dlnco_2 = dlncos
    return sync_via_mrmsdtw(
        f_chroma1=chroma_1,
        f_onset1=dlnco_1,
        f_chroma2=chroma_2,
        f_onset2=dlnco_2,
        input_feature_rate=FRAME_RATE,
        step_weights=STEP_WEIGHTS,
        threshold_rec=THRESHOLD_REC,
        verbose=False,
    )

In [None]:
# offline evaluation (score vs. reference)
# Per-song statistics are accumulated here so that one summary table covering
# every song can be shown after the loop.
stats_dict = {}
for song_id in tqdm(SONG_IDS):
    score_audio_path = Path("score/audio_wav/") / f"audio_{FILENAME_PREFIX}{song_id}_score.wav"
    ref_audio_path = WINTERREISE_RT_PATH / AUDIO_DIR / f"{FILENAME_PREFIX}{song_id}_ref.wav"
    wp = run_offline_alignment(score_audio_path, ref_audio_path)
    note_ann_file_1 = Path("score/ann_audio_note/") / f"ann_{FILENAME_PREFIX}{song_id}_score.csv"
    note_ann_file_2 = WINTERREISE_RT_PATH / NOTE_ANN_DIR / f"ann_{FILENAME_PREFIX}{song_id}_ref.csv"

    # get_stats reads the annotation files itself and returns the signed
    # errors BEFORE the absolute errors, so the names are unpacked in that order.
    (
        offline_mean,
        offline_std,
        offline_misalignments,
        _offline_signed_err,
        offline_abs_err,
    ) = get_stats(
        wp=wp,
        note_ann_filepath_1=note_ann_file_1.as_posix(),
        note_ann_filepath_2=note_ann_file_2.as_posix(),
        feature_rate=FRAME_RATE,
        tolerances=TOLERANCES,
    )
    stats_dict[song_id] = {
        "chroma_dlnco": {
            "mean": offline_mean,
            "std": offline_std,
            "misalignments": offline_misalignments,
            "absolute_errors": offline_abs_err,
        }
    }

# Build one table: a row per song, a column per tolerance, values in percent.
rows = pd.MultiIndex.from_product([list(stats_dict)], names=["Song ID"])
columns = pd.MultiIndex.from_product(
    [["Chroma & DLNCO"], TOLERANCES], names=["Feature Type", "$\u03C4$ (ms)"]
)
data = np.zeros((len(stats_dict), len(TOLERANCES)))
for row_idx, song_stats in enumerate(stats_dict.values()):
    data[row_idx, :] = song_stats["chroma_dlnco"]["misalignments"] * 100

df = pd.DataFrame(data, index=rows, columns=columns)
with pd.option_context("display.float_format", "{:0.2f}".format):
    ipd.display(df)

## References

[1] C. Weiß, F. Zalkow, V. Arifi-Müller, M. Müller, H. V. Koops, A. Volk, and H. Grohganz, “Schubert Winterreise dataset: A multimodal scenario for music analysis,” ACM Journal on Computing and Cultural Heritage (JOCCH), vol. 14, no. 2, pp. 25:1–18, 2021.