In [None]:
import sys
from pathlib import Path
# Make the `torchcrepe` directory under the current working directory importable.
torchcrepe_dir = Path.cwd().joinpath('torchcrepe')
sys.path.append(str(torchcrepe_dir))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import librosa.display
import numpy as np
from IPython.display import Audio

In [None]:
import torch
from torchaudio.transforms import GriffinLim
from torchvision.transforms import functional as tvf
from torchcrepe import predict
from torchcrepe.decode import argmax, weighted_argmax, viterbi

In [None]:
# Audio file analysed by the cells below.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
PATH = "/home/kureta/Music/violin/Violin Samples/yee_arp_ricochet_130#4.wav"

In [None]:
def next_power_of_2(n):
    """Return the smallest power of two that is greater than or equal to ``n``.

    ``n`` is first rounded up to an integer with ``np.ceil``. Any value that
    rounds to 1 or less (including zero and negative numbers) yields 1.
    """
    n = int(np.ceil(n))
    # For n >= 1, (n - 1).bit_length() is the exponent of the smallest power
    # of two that is not below n; clamping to 1 makes non-positive inputs
    # return 1.
    return 1 << (max(n, 1) - 1).bit_length()

## DataModule Parameters

- sample_rate
- n_fft
- hop_length
- example_duration
- example_hop_length
- n_channels (1 or 2) or is_mono (bool)
- batch_size

In [None]:
# Settings for the full-rate audio (used for loading, the STFT and Griffin-Lim).
SAMPLE_RATE = 44_100
N_FFT = 2048
HOP_LENGTH = 512

# CREPE settings: the STFT hop is rescaled from SAMPLE_RATE to CREPE_SAMPLE_RATE
# and then rounded up to a power of two, i.e. the smallest power of 2 that is
# greater than or equal to `HOP_LENGTH * CREPE_SAMPLE_RATE / SAMPLE_RATE`.
CREPE_SAMPLE_RATE = 16_000
CREPE_HOP_LENGTH = next_power_of_2(HOP_LENGTH * CREPE_SAMPLE_RATE / SAMPLE_RATE)

In [None]:
# Load the file at SAMPLE_RATE, keeping the channels separate (mono=False).
y, sr = librosa.load(PATH, mono=False, sr=SAMPLE_RATE)
# A single-channel file comes back as a 1-D array; add a leading channel axis
# so the channel-indexed code further down (`s[:, 0, :]`, `s[0]`) works for
# mono and multi-channel files alike. This is a no-op for 2-D input.
y = np.atleast_2d(y)

In [None]:
# Play back the loaded audio as-is (no peak normalisation).
Audio(data=y, rate=SAMPLE_RATE, normalize=False)

In [None]:
# Magnitude of the STFT of `y`.
s = np.abs(librosa.stft(y, hop_length=HOP_LENGTH, n_fft=N_FFT))
# Remove the DC offset by zeroing index 0 of the second axis
# (assumes `s` is laid out as (channel, frequency bin, frame) — TODO confirm).
s[:, 0, :] = 0.0

In [None]:
# Work on the first entry of `s` only: its linear magnitudes (`sm`) and the
# same magnitudes converted to decibels (`sdb`).
sm = s[0]
sdb = librosa.amplitude_to_db(sm, amin=1e-6, top_db=96)

In [None]:
def _standardize(x, axis=0):
    """Scale ``x`` to zero mean and unit standard deviation along ``axis``.

    Slices whose standard deviation is exactly zero (a constant slice) are
    only centred, so they come out as zeros instead of the NaN/inf that a
    0/0 division would produce.
    """
    mean = np.mean(x, axis=axis, keepdims=True)
    std = np.std(x, axis=axis, keepdims=True)
    # Swap zero deviations for 1 so the division is always well defined;
    # `ones_like` keeps the dtype of `std` unchanged.
    safe_std = np.where(std > 0, std, np.ones_like(std))
    return (x - mean) / safe_std


# Standardise both representations with statistics taken over axis 0.
sdbn = _standardize(sdb)
smn = _standardize(sm)

In [None]:
# Display the standardised dB spectrogram.
librosa.display.specshow(data=sdbn)

In [None]:
# `y` was loaded at SAMPLE_RATE, but the pitch tracker is configured for
# CREPE_SAMPLE_RATE. Resample explicitly so the `sample_rate` handed to
# `predict` matches the samples it actually receives; labelling SAMPLE_RATE
# audio as CREPE_SAMPLE_RATE would scale every pitch estimate by
# CREPE_SAMPLE_RATE / SAMPLE_RATE.
# NOTE(review): assumes `predict` treats `sample_rate` as the rate of its
# input audio, as upstream torchcrepe does — confirm against the local checkout.
y_crepe = librosa.resample(y, orig_sr=SAMPLE_RATE, target_sr=CREPE_SAMPLE_RATE)
pitches, harmonicity = predict(
    torch.from_numpy(y_crepe).cuda(),
    sample_rate=CREPE_SAMPLE_RATE,
    hop_length=CREPE_HOP_LENGTH,
    device='cuda',
    return_harmonicity=True,
    decoder=argmax,
)
# Resize both tracks along their last axis so they have as many entries as
# `s` has along its last axis (`s.shape[2]`).
n_frames = s.shape[2]
pitches = tvf.resize(pitches.unsqueeze(1), [1, n_frames]).squeeze(1)
harmonicity = tvf.resize(harmonicity.unsqueeze(1), [1, n_frames]).squeeze(1)

In [None]:
# Plot the first row of `pitches`.
fig, ax = plt.subplots()
ax.plot(pitches[0].cpu())
plt.show()

In [None]:
# Griffin-Lim transform configured with the same n_fft / hop length as the
# STFT above and power=1.0, moved to the GPU.
gl = GriffinLim(
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    power=1.0,
    n_iter=128,
).to('cuda')

In [None]:
# Run Griffin-Lim on the spectrogram `s` (copied to the GPU first).
y_hat = gl(torch.from_numpy(s).to('cuda'))

In [None]:
# Griffin-Lim output is not guaranteed to stay within [-1, 1], and
# `Audio(..., normalize=False)` raises a ValueError for out-of-range samples.
# Clamping is a no-op for in-range audio and keeps playback from failing on
# small overshoots.
Audio(y_hat.clamp(-1.0, 1.0).cpu(), rate=SAMPLE_RATE, normalize=False)