In [1]:
!pip install deeplake
!pip install deeplake[audio]
!pip install PyAV

Collecting deeplake
  Downloading deeplake-3.8.9.tar.gz (580 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/580.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/580.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m580.3/580.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from deeplake)
  Downloading boto3-1.33.4-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting pathos (from deeplake)
  Downloading pathos-0.3.1-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m

In [2]:
# Filename: f0_estimation.py
import deeplake
import torch
import librosa
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler

In [3]:
def estimate_f0_accuracy(dataloader, sr=16000, tolerance=0.5, label_offset=0):
    """Score librosa's pYIN pitch tracker against per-clip pitch labels.

    Each clip's voiced-frame F0 estimates are averaged (in Hz), converted to
    a fractional MIDI note number, and compared with the clip's label.

    Args:
        dataloader: Iterable of batches. Each batch is indexed with the keys
            'audio' (a sequence of clips exposing ``.numpy()``) and 'pitch'
            (a sequence of numeric labels aligned with the clips).
        sr: Sample rate of the clips in Hz. Passed to ``librosa.pyin``, which
            would otherwise assume its own default rate and scale every
            estimate incorrectly for 16 kHz audio.
        tolerance: Maximum absolute difference, in MIDI note numbers
            (semitones), for an estimate to count as correct.
        label_offset: Value added to every label to turn it into a MIDI note
            number. Leave at 0 for MIDI labels; use e.g. 21 when the labels
            were stored as ``midi - 21``.

    Returns:
        Fraction of clips whose estimate is within ``tolerance`` of the
        label, as a float. Returns 0.0 when the dataloader yields no clips.
    """
    # The search range does not change per clip, so compute it once.
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C7')

    total_samples = 0
    correct_estimations = 0

    # Batch size should be 16 here (other notebooks use 64).
    for batch in dataloader:
        audio_data = batch['audio']
        f0_labels = batch['pitch']

        for i, audio_sample in enumerate(audio_data):
            # librosa expects a 1-D array.
            audio_sample_np = audio_sample.numpy().squeeze()
            f0, voiced_flag, _ = librosa.pyin(
                audio_sample_np, fmin=fmin, fmax=fmax, sr=sr)

            total_samples += 1

            # A clip with no voiced frame has no pitch estimate at all; count
            # it as a miss instead of comparing a made-up 0 against the label.
            if not np.any(voiced_flag):
                continue

            # Consolidate the frame-wise track into one value, then move to
            # the MIDI scale so estimate and label share the same unit.
            estimated_midi = librosa.hz_to_midi(np.nanmean(f0[voiced_flag]))
            target_midi = float(f0_labels[i]) + label_offset

            if np.abs(estimated_midi - target_midi) < tolerance:
                correct_estimations += 1

    # Guard the division: an empty dataloader has no accuracy to report.
    if total_samples == 0:
        return 0.0
    return correct_estimations / total_samples

In [4]:
# General Purpose Data Loader
def get_data_loader(ds, batch_size=16):
  """Build a shuffled DataLoader over the keyboard notes in ``ds``.

  Every sample whose ``instrument_family`` text label is 'keyboard' is kept.
  Its waveform is squeezed and cut to the first 3 * 16000 samples, and its
  pitch label is shifted down by 21.

  Args:
    ds: Dataset exposing 'audios', 'pitch' and ``instrument_family`` tensors
      in the way they are accessed below.
    batch_size: Number of samples per batch.

  Returns:
    A ``torch.utils.data.DataLoader`` whose samples are dicts with the keys
    'audio' (the trimmed waveform) and 'pitch' (the shifted label), so the
    labels travel together with the audio instead of being discarded.
  """
  sample_rate = 16000                  # rate assumed when trimming -- TODO confirm against ds
  max_samples = int(3 * sample_rate)   # keep the first 3 seconds at that rate
  pitch_shift = 21                     # stored label is original pitch minus 21

  filtered_samples = []
  pitch_values = ds['pitch'].numpy().squeeze()

  for i, sample in enumerate(ds):
    instrument_family = sample.instrument_family.data()['text'][0]

    # Only keyboard notes are kept.
    if instrument_family != 'keyboard':
      continue

    # Convert to numpy array, remove unnecessary dimension, then trim.
    audio_sample = ds['audios'][i].numpy().squeeze()
    audio_sample = audio_sample[:max_samples]

    # ============= Pick from Augmentation Library ===================
    # Pass audio through augmentations if needed

    # Convert audio to CQT (Constant-Q Transform)
    # audio_sample = librosa.cqt(audio_sample, sr=16000)

    # Subtract on a Python int so the result cannot wrap around if the
    # stored pitch dtype happens to be unsigned.
    filtered_samples.append({
        'audio': audio_sample,
        'pitch': int(pitch_values[i]) - pitch_shift,
    })

  sampler = SubsetRandomSampler(range(len(filtered_samples)))
  loader = torch.utils.data.DataLoader(filtered_samples,
      batch_size=batch_size, num_workers=1, sampler=sampler)

  return loader

In [None]:
# Load the NSynth test split from the Activeloop hub. The dataset is only
# read in this notebook, so ask for read-only mode explicitly instead of
# relying on the fallback that applies when write permission is missing.
ds = deeplake.load("hub://activeloop/nsynth-test", read_only=True)
# Build the keyboard-only loader with the default batch size (16).
loader = get_data_loader(ds)

-

Opening dataset in read-only mode as you don't have write permissions.


-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/nsynth-test



\

hub://activeloop/nsynth-test loaded successfully.





In [None]:
# Peek at one batch to sanity-check shapes. A torch DataLoader has no
# .take(); pull the first batch from its iterator instead.
first_batch = next(iter(loader))
if isinstance(first_batch, dict):
  # Labelled batches are indexed by 'audio' and 'pitch'.
  print("Audio shape:", first_batch['audio'].shape)
  print("Pitch shape:", first_batch['pitch'].shape)
else:
  # Audio-only batches are a single tensor.
  print("Audio shape:", first_batch.shape)

# Evaluation call left disabled, as in the original cell.
#accuracy = estimate_f0_accuracy(loader)
#print(f"The estimated accuracy of pYIN is: {accuracy * 100:.2f}%")

In [None]:
import librosa
import json
import numpy as np

# Note metadata keyed by note name. Replace this literal with your own
# dictionary, or load the equivalent structure from a JSON file.
json_data = {
    "organ_electronic_088-056-025": dict(
        note=218033,
        sample_rate=16000,
        pitch=56,  # MIDI pitch number
        instrument_source=1,
        instrument_family_str="organ",
        instrument_str="organ_electronic_088",
        note_str="organ_electronic_088-056-025",
        qualities_str=[],
        instrument_source_str="electronic",
        velocity=25,
        instrument_family=6,
        instrument=845,
        qualities=[0] * 10,
    ),
}

def estimate_pitch_accuracy(audio_file_path, json_info):
    """Frame-level pYIN accuracy for one audio file against its known pitch.

    Args:
        audio_file_path: Path of the audio file handed to ``librosa.load``.
        json_info: Mapping whose 'pitch' entry is the reference MIDI note.

    Returns:
        The share of voiced frames whose estimated MIDI note lies within
        0.5 of the reference, or 0 when no frame received an estimate.
    """
    # sr=None keeps the file's own sample rate instead of resampling.
    waveform, sample_rate = librosa.load(audio_file_path, sr=None)

    # Track F0 with pYIN over the C1..C8 search range.
    lowest_hz = librosa.note_to_hz('C1')
    highest_hz = librosa.note_to_hz('C8')
    f0_track, _, _ = librosa.pyin(
        waveform, fmin=lowest_hz, fmax=highest_hz, sr=sample_rate)

    # Work on the MIDI scale so the estimate and the label are comparable.
    midi_track = librosa.hz_to_midi(f0_track)
    reference_midi = json_info['pitch']

    # Frames without an estimate are NaN; only the rest count as voiced.
    voiced_frame_count = np.sum(~np.isnan(midi_track))
    if not voiced_frame_count > 0:
        return 0

    # MIDI estimates are fractional, so match within a small tolerance.
    # NaN frames compare as False and therefore never count as hits.
    tolerance = 0.5
    hits = np.sum(np.abs(midi_track - reference_midi) <= tolerance)
    return hits / voiced_frame_count

# Example usage: score pYIN on a single note against its metadata entry.
# NOTE(review): the path below is a placeholder -- presumably it has to be
# pointed at a real .wav file before this cell can run.
audio_file_path = 'path/to/your/organ_electronic_088-056-025.wav'
# Metadata record whose 'pitch' field serves as the reference MIDI note.
actual_json_info = json_data['organ_electronic_088-056-025']
accuracy = estimate_pitch_accuracy(audio_file_path, actual_json_info)
# The accuracy is a fraction of voiced frames, printed with two decimals.
print(f"Pitch estimation accuracy: {accuracy:.2f}")
