In [29]:
import librosa
import torchcrepe
import torch
import numpy as np
import scipy.signal

Load & normalization

In [10]:
x, sr = librosa.load("../data/nsynth-test/audio/bass_electronic_018-022-100.wav", sr=16000, mono=True)
x = librosa.util.normalize(x)

F0

In [None]:
def get_f0(wave):
    wave_torch = torch.tensor(wave).unsqueeze(0)
    f0 = torchcrepe.predict(
        audio=wave_torch,
        sample_rate=16000,
        hop_length=320,       
        fmin=50.0,
        fmax=2000.0,
        model='full'
    )
    f0 = f0.squeeze().numpy()
    f0[f0 <= 0] = np.nan

    nans = np.isnan(f0)
    if np.any(nans):
        not_nan = np.logical_not(nans)
        indices = np.arange(len(f0))
        f0[nans] = np.interp(indices[nans], indices[not_nan], f0[not_nan])

    f0 = scipy.signal.medfilt(f0, kernel_size=5) 

    f0 = np.clip(f0, 50, 2000)

    return f0

In [31]:
f0 = get_f0(x)

In [32]:
f0

array([1965.5552, 1976.6909, 1976.6909, 1989.7675, 1989.7675, 1975.4344,
       1975.4344, 1984.3367, 1984.3367, 1984.3367, 1986.1812, 1986.3813,
       1990.0837, 1990.0837, 1990.0837, 1989.6531, 1984.7185, 1984.7185,
       1989.6531, 1989.8674, 1989.8674, 1995.4987, 1992.0245, 1989.8674,
       1988.0056, 1982.954 , 1982.1074, 1982.1074, 1981.3109, 1980.9167,
       1980.9167, 1981.3109, 1981.3109, 1981.9023, 1981.9023, 1981.9023,
       1977.0986, 1977.0986, 1975.6375, 1975.6375, 1975.6375, 1984.7441,
       1970.425 , 1984.7441, 1984.7441, 1983.1572, 1980.2622, 1983.1572,
       1983.1572, 1987.9702, 1987.9702, 1987.9702, 1988.5577, 1980.8396,
       1980.8396, 1975.759 , 1975.46  , 1975.46  , 1980.5638, 1980.5638,
       1980.5638, 1982.4762, 1980.5638, 1987.8928, 1982.8608, 1982.9796,
       1982.8608, 1982.9796, 1982.9796, 1982.9796, 1987.2633, 1987.2633,
       1987.2633, 1986.9264, 1987.2633, 1986.9264, 1986.9264, 1984.0613,
       1984.0613, 1979.8988, 1979.8988, 1987.7639, 

Loudness

In [33]:
def get_loudness(wave, frame_lenght= 640, hop=320):
    loudness = []
    for i in range(0, len(wave) - frame_lenght, hop):
        frame = x[i:i+frame_lenght]
        rms = np.sqrt(np.mean(frame**2))
        loudness.append(20 * np.log10(rms + 1e-6))
    loudness = np.array(loudness)
    loudness = np.clip(loudness, -60.0, 0.0)
    mean = np.mean(loudness)
    std = np.std(loudness) + 1e-6
    loudness = (loudness - mean) / std
    return loudness

In [34]:
loudness = get_loudness(x)

In [35]:
loudness

array([ 7.852125  ,  7.6934977 ,  5.406584  ,  3.9421122 ,  3.0156193 ,
        2.6523376 ,  2.3910012 ,  1.8315657 ,  0.7167899 , -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18783903,
       -0.18783903, -0.18783903, -0.18783903, -0.18783903, -0.18

OnSet

In [19]:
def get_onset(wave):
    onset_env = librosa.onset.onset_strength(y=x, sr=16000, hop_length=320)
    onset_env = onset_env / onset_env.max() 
    return onset_env

In [27]:
onset = get_onset(x)

In [28]:
onset

array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       9.9890006e-01, 4.1615631e-04, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.6593853e-01, 1.0000000e+00,
       9.5366520e-01, 5.3824997e-01, 3.4217015e-01, 1.2504491e-01,
       1.4568093e-02, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e

In [37]:
result = list(zip(f0, loudness, onset))

In [38]:
result

[(np.float32(1965.5552), np.float32(7.852125), np.float32(0.0)),
 (np.float32(1976.6909), np.float32(7.6934977), np.float32(0.0)),
 (np.float32(1976.6909), np.float32(5.406584), np.float32(0.0)),
 (np.float32(1989.7675), np.float32(3.9421122), np.float32(0.0)),
 (np.float32(1989.7675), np.float32(3.0156193), np.float32(0.99890006)),
 (np.float32(1975.4344), np.float32(2.6523376), np.float32(0.0004161563)),
 (np.float32(1975.4344), np.float32(2.3910012), np.float32(0.0)),
 (np.float32(1984.3367), np.float32(1.8315657), np.float32(0.0)),
 (np.float32(1984.3367), np.float32(0.7167899), np.float32(0.0)),
 (np.float32(1984.3367), np.float32(-0.18783903), np.float32(0.0)),
 (np.float32(1986.1812), np.float32(-0.18783903), np.float32(0.16593853)),
 (np.float32(1986.3813), np.float32(-0.18783903), np.float32(1.0)),
 (np.float32(1990.0837), np.float32(-0.18783903), np.float32(0.9536652)),
 (np.float32(1990.0837), np.float32(-0.18783903), np.float32(0.53824997)),
 (np.float32(1990.0837), np.floa