In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [17, 6]
from IPython.display import Audio
from ipywidgets import HTML

In [None]:
import torch
from pathlib import Path

In [None]:
import torchcrepe

In [None]:
from zak.model.encoder import encode

In [None]:
# Location of the cached audio tensor; the three feature files sit next to it
# in the same directory and differ only in their file name.
audio_path = Path('/home/kureta/Music/violin/audio.pth')
pitch_path, loudness_path, periodicity_path = map(
    audio_path.with_name,
    ('pitch.pth', 'loudness.pth', 'periodicity.pth'),
)

In [None]:
# Read the four saved objects back in, in the order audio, pitch, loudness,
# periodicity. torch.load unpickles its input, so only point it at trusted files.
audio, pitch, loudness, periodicity = map(
    torch.load,
    (audio_path, pitch_path, loudness_path, periodicity_path),
)

In [None]:
# Run the project encoder on the first 48000 entries of the second axis only.
# The results rebind `pitch`, `loudness` and `periodicity`, replacing the
# values that were loaded from disk.
pitch, loudness, periodicity = encode(audio[:, 0:48000])

In [None]:
# Player for row 0 of `audio`, first 48000 entries, at a 48 kHz playback rate
# (i.e. one second of sound).
Audio(data=audio[0, 0:48000], rate=48000)

In [None]:
# Quick look at the first 100 pitch values of row 0.
plt.plot(pitch[0][:100])
plt.show()

In [None]:
# Window lengths for the excerpt: `start`/`end` are multiples of 48000 (the
# audio playback rate), `s`/`e` multiples of 100 (presumably feature frames per
# second -- TODO confirm against the encoder's hop size).
# `end` is reused by the audio player cell that follows.
start = 48000 * 0
end = start + 48000 * 10
s = 100 * 0
e = s + 100 * 10

# NOTE(review): `start` and `s` are 0 and the slices below count from the end
# (`-e:`), so the plot always covers the last `e` entries -- confirm that this
# is intended rather than `[s:e]`.
# The curves are rescaled by fixed constants so they share one axis; the legend
# spells out the transform applied to each.
plt.plot(periodicity[0, -e:], label='periodicity')
plt.plot((loudness[0, -e:] + 90.) / 90., label='(loudness + 90) / 90')
plt.plot((pitch[0, -e:] - 190.) / 2700., label='(pitch - 190) / 2700')
plt.xlabel('frame')
plt.legend()
plt.show()

In [None]:
# Player for the last `end` entries of every row of `audio` at 48 kHz.
# NOTE(review): unlike the other players in this notebook, this one keeps the
# first axis instead of selecting row 0 -- confirm that is intentional.
Audio(audio[:, -end:], rate=48000)

In [None]:
def __encode(audio: torch.Tensor,
             sample_rate: int = 48000,
             hop_size: int = 480,
             f_min: float = 190.0,
             f_max: float = 2800.0,
             model: str = 'full',
             batch_size: int = 2048,
             device: 'str | None' = None):
    """Extract per-frame pitch, loudness and periodicity from audio with torchcrepe.

    Args:
        audio: Waveform tensor handed unchanged to torchcrepe
            (presumably shaped (1, samples) -- TODO confirm against callers).
        sample_rate: Sampling rate of ``audio`` in Hz.
        hop_size: Hop between analysis frames, in samples.
        f_min: Lower bound of the pitch search range in Hz.
        f_max: Upper bound of the pitch search range in Hz.
            NOTE(review): an earlier comment in this function gave 2006 Hz as
            the upper limit, which the default of 2800 exceeds -- confirm how
            torchcrepe treats the excess.
        model: torchcrepe model capacity, "tiny" or "full".
        batch_size: Inference batch size; lower it if the device runs out of
            memory.
        device: Device string used for inference. ``None`` (the default)
            selects ``'cuda:0'`` when CUDA is available and ``'cpu'`` otherwise.

    Returns:
        Tuple ``(pitch, loudness, periodicity)``:
        pitch -- thresholded on periodicity, then mean-smoothed, with NaNs
        replaced by 0;
        loudness -- result of ``torchcrepe.loudness.a_weighted``;
        periodicity -- silence-thresholded and median-filtered.
    """
    # Previously hard-coded to 'cuda:0', which fails on machines without a GPU.
    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # Compute pitch and periodicity on the selected device
    pitch, periodicity = torchcrepe.predict(audio,
                                            sample_rate,
                                            hop_size,
                                            f_min,
                                            f_max,
                                            model,
                                            batch_size=batch_size,
                                            device=device,
                                            decoder=torchcrepe.decode.weighted_argmax,
                                            return_periodicity=True)

    # Filter silence (threshold of -90)
    periodicity = torchcrepe.threshold.Silence(-90.)(periodicity,
                                                     audio,
                                                     sample_rate,
                                                     hop_size)

    # 3-frame window: with the default hop of 480 samples at 48 kHz (10 ms per
    # frame) that spans 30 ms
    win_length = 3

    # Median filter noisy confidence value
    periodicity = torchcrepe.filter.median(periodicity, win_length)

    # Remove inharmonic regions (periodicity threshold of .21)
    pitch = torchcrepe.threshold.At(.21)(pitch, periodicity)

    # Smooth pitch to remove quantization artifacts
    pitch = torchcrepe.filter.mean(pitch, win_length)

    # Unvoiced regions were set to nan. Set them to 0 instead (in place)
    pitch.nan_to_num_()

    # Calculate loudness
    loudness = torchcrepe.loudness.a_weighted(audio, sample_rate, hop_size)

    return pitch, loudness, periodicity

In [None]:
# Re-run the local torchcrepe pipeline on the first 10 * 48000 entries of the
# second axis; the results rebind `pitch`, `loudness` and `periodicity`.
pitch, loudness, periodicity = __encode(audio=audio[:, :10 * 48000])

In [None]:
# Overlay the three feature tracks of row 0 on one axis. Loudness and pitch are
# rescaled by fixed constants (offsets 90 and 190 mirror the -90 silence
# threshold and the 190 Hz f_min default of __encode); the legend records the
# transform so the curves can be told apart.
plt.plot(periodicity[0], label='periodicity')
plt.plot((loudness[0] + 90.) / 90., label='(loudness + 90) / 90')
plt.plot((pitch[0] - 190.) / 2700., label='(pitch - 190) / 2700')
plt.xlabel('frame')
plt.legend()
plt.show()

In [None]:
# Player for row 0 of `audio`, first 10 * 48000 entries, at a 48 kHz playback
# rate (i.e. ten seconds of sound).
Audio(data=audio[0, :10 * 48000], rate=48000)