# Importing packages

In [None]:
import numpy as np
import mirdata
from pesto import load_model
import torch
import mir_eval
import pesto

# Benchmarking on MDB-stem-synth


In [None]:
# Create the mirdata loader object for the "mdb_stem_synth" dataset.
dataset = mirdata.initialize("mdb_stem_synth")
# dataset.download()  # uncomment and run this line once if the dataset is not already downloaded

1.72GB [10:02, 3.06MB/s]                               
72.0kB [00:00, 160kB/s]                             


In [6]:
import numpy as np
import torch
import mir_eval
import pesto
from pesto import load_model

# ---- dataset / track ----
track_id = "AClassicEducation_NightOwl_STEM_01"
track = dataset.track(track_id)  # assumes `dataset` already exists (MDB-stem-synth loader)
audio, sr = track.audio  # (waveform array, sampling rate)

# ---- mono + torch tensor (PESTO guideline) ----
# Downmix only if the loader returns a 2-D array. The channel axis is taken to
# be the shorter one, so both the librosa layout (channels, samples) and the
# (samples, channels) layout are averaged over channels, never over time.
# NOTE(review): this assumes there are fewer channels than samples.
audio_mono = (
    audio.mean(axis=int(np.argmin(audio.shape))) if audio.ndim > 1 else audio
)
x = torch.from_numpy(audio_mono).float()  # (num_samples,)

In [None]:

# ---- inference settings ----
step_size_ms = 20.0  # step size handed to PESTO, in milliseconds
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ---- model (load once) ----
# The sampling rate of the loaded audio is passed so the model matches the data.
pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr)
pesto_model = pesto_model.to(device)
pesto_model.eval()

# ---- inference ----
# Gradients are not needed for evaluation, so they are disabled here.
with torch.no_grad():
    f0, conf, amp = pesto_model(x.to(device), convert_to_freq=True, return_activations=False)

In [17]:
# ---- predictions -> NumPy ----
# mir_eval expects unvoiced frames to be 0 Hz (not NaN), so NaNs are replaced by 0.0.
f0_pred = np.nan_to_num(f0.detach().cpu().numpy().squeeze(), nan=0.0)

# One timestamp per predicted frame, spaced by the step size (converted from ms to s).
times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)

# ---- reference annotation ----
ref_freqs = track.f0.frequencies
ref_times = track.f0.times

# ---- metrics ----
scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

print(f"--- Results for Track: {track_id} ---")
print(f"Raw Pitch Accuracy (RPA): {scores['Raw Pitch Accuracy']:.4f}")
print(f"Raw Chroma Accuracy (RCA): {scores['Raw Chroma Accuracy']:.4f}")

--- Results for Track: AClassicEducation_NightOwl_STEM_01 ---
Raw Pitch Accuracy (RPA): 0.8691
Raw Chroma Accuracy (RCA): 0.9100


In [27]:
!pip install soundfile




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


# Benchmarking on Orchset

In [32]:
# orchset
import mirdata
import numpy as np
import torch
import mir_eval
from pesto import load_model

# --- dataset / track: first track id listed by the Orchset loader ---
# Note: this rebinds `dataset`, replacing the loader created earlier in the notebook.
dataset = mirdata.initialize("orchset")
track_id = dataset.track_ids[0]
track = dataset.track(track_id)

# --- audio (Orchset-specific): the track exposes a dedicated mono attribute ---
audio, sr = track.audio_mono
x = torch.from_numpy(audio).float()

# --- PESTO model ---
step_size_ms = 20.0  # passed to load_model as step_size
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr)
model = model.to(device).eval()

# --- inference (gradients disabled) ---
with torch.no_grad():
    f0, conf, amp = model(
        x.to(device),
        convert_to_freq=True,
        return_activations=False,
    )

# --- predictions as NumPy, NaN replaced by 0.0; one timestamp per frame (ms -> s) ---
f0_pred = np.nan_to_num(f0.detach().cpu().numpy().squeeze(), nan=0.0)
times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)

# --- reference (Orchset-specific): melody annotation ---
ref_freqs = track.melody.frequencies
ref_times = track.melody.times

# --- metrics ---
# NOTE(review): in the recorded run RPA is far below RCA (0.0368 vs 0.6477);
# presumably most estimates land in the wrong octave -- confirm before drawing
# conclusions from these numbers.
scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)
print(f"--- orchset / {track_id} ---")
print(f"RPA: {scores['Raw Pitch Accuracy']:.4f}")
print(f"RCA: {scores['Raw Chroma Accuracy']:.4f}")


--- orchset / Beethoven-S3-I-ex1 ---
RPA: 0.0368
RCA: 0.6477


In [35]:
# Inspect the shape of the reference frequency array. `ref_freqs` holds whatever
# the most recently executed evaluation cell assigned (the Orchset cell, in notebook order).
ref_freqs.shape

(1247,)