In [None]:
%matplotlib inline

import einops
import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
import torch
from IPython.display import Audio
from torch.nn import functional as F
from torchaudio.transforms import Resample

import myddsp.constants as C
import myddsp.preprocessors as pre

## Load example

In [None]:
# Load the violin test sample, requesting sr=48000 and keeping the file's
# channel layout (mono=False); the returned sample rate is discarded.
violin_path = "../data/test/samples/violin-1.wav"
y, _ = librosa.load(violin_path, sr=48000, mono=False)

In [None]:
# Wrap the NumPy audio in a tensor and prepend a size-1 leading dimension.
yy = torch.from_numpy(y).unsqueeze(0)

In [None]:
# Inline player for the loaded sample.
Audio(data=y, rate=48000)

## Explore loudness calculation methods

In [None]:
# Loudness extractor in eval mode; nothing is trained in this notebook, so
# gradient tracking is switched off on every parameter it exposes.
ld = pre.Loudness().eval()
for param in ld.parameters():
    param.requires_grad_(False)

In [None]:
with torch.inference_mode():
    # Run the loudness module, then flatten its output to a 1-D NumPy array.
    loudness = ld(yy)
    amps = loudness.flatten().numpy()

In [None]:
# Line plot of the flattened loudness output.
fig, ax = plt.subplots()
ax.plot(amps)

In [None]:
# Drop every loudness value below -70.
above_floor = amps >= -70
filtered = amps[above_floor]

In [None]:
# Line plot of the loudness values that survived the -70 threshold.
fig, ax = plt.subplots()
ax.plot(filtered)

## Explore pitch calculation methods

In [None]:
from torch import Tensor, nn

from myddsp.crepe import load_model
from myddsp.preprocessors import get_centered_frames

In [None]:
class F0(nn.Module):
    """Frame an audio batch, normalize each frame, and run a CREPE model on it.

    Args:
        capacity: Model capacity identifier forwarded to ``load_model``.
        eps: Lower bound applied to each frame's standard deviation before
            dividing, so constant frames do not produce NaN.
    """

    def __init__(self, capacity: str = "full", eps: float = 1e-8):
        super().__init__()
        self.model = load_model(capacity)
        self.eps = eps

    def forward(self, x: Tensor) -> Tensor:
        """Return the model output for every frame of ``x``.

        Args:
            x: Audio tensor that is averaged over dim 1 before framing.
                NOTE(review): presumably (batch, channels, samples), which
                would make this a mono downmix; confirm against callers.

        Returns:
            Whatever ``self.model`` produces for the normalized frames, with
            the batch and frame axes folded into the leading dimension.
        """
        x = x.mean(1)
        frames = get_centered_frames(x, C.CREPE_N_FFT, C.CREPE_HOP_LENGTH)

        # One row per frame: "(b f) n" folds the batch and frame axes together.
        batched = einops.rearrange(frames, "b n f -> (b f) n")
        zeroed = batched - batched.mean(dim=1, keepdim=True)

        # A constant frame (e.g. digital silence) has std == 0, and 0 / 0 would
        # feed NaN rows to the model. Flooring the std leaves those rows at zero.
        std = zeroed.std(dim=1, keepdim=True).clamp(min=self.eps)
        normalized = zeroed / std

        return self.model(normalized)

In [None]:
# "tiny"-capacity pitch model in eval mode, with gradient tracking disabled on
# all of its parameters (requires_grad_ returns the module itself).
crepe = F0("tiny").eval().requires_grad_(False)

In [None]:
# Resampler from the 48000 Hz rate used when loading to 16000 Hz.
rs = Resample(orig_freq=48000, new_freq=16000)

In [None]:
with torch.inference_mode():
    # Downsample first, then run the pitch model on the resampled audio.
    yy_resampled = rs(yy)
    acti = crepe(yy_resampled)

In [None]:
# Show only the rows whose loudness value is at least -70, transposed so the
# kept rows run along the horizontal axis.
# NOTE(review): assumes `acti` has exactly one row per entry of `amps`; confirm
# the loudness and pitch frame counts match.
loud_enough = amps >= -70
plt.matshow(acti[loud_enough, :].T, origin="lower")