Commit

Added default gamma (#1045)
* added vqt implementation

* fixed gamma scaling in vqt

* Fixed calculation of maximum filter cutoff

* Added default gamma

* Added comments to define alpha and describe the special cases for gamma, with a citation to the original paper where the transform was introduced.

* Added better comment for explanation of the default gamma

* Mirrored CQT unit tests for VQT function

* Fixed order of parameters in vqt tests

* Changed values in tests for indicating VQT regression

* Better description of alpha and percentage consistency in tests

Co-authored-by: Brian McFee <brian.mcfee@nyu.edu>
cwitkowitz and bmcfee committed May 28, 2020
1 parent c1d0628 commit 5bcfcfd
Showing 3 changed files with 159 additions and 17 deletions.
22 changes: 20 additions & 2 deletions librosa/core/constantq.py
@@ -759,7 +759,20 @@ def vqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, gamma=None,
gamma : number > 0 [scalar]
Bandwidth offset for determining filter lengths.
`gamma=0` produces the constant-Q transform.
If `gamma=0`, produces the constant-Q transform.
If `gamma=None`, gamma is calculated such that each filter's bandwidth is a
constant fraction of its equivalent rectangular bandwidth (ERB). This is
accomplished by solving for the gamma that gives
B_k = alpha * f_k + gamma = C * ERB(f_k), where B_k is the bandwidth of
filter k with center frequency f_k, alpha is the inverse of what would
otherwise be a constant Q-factor, and C = alpha / 0.108 is the constant
fraction across all filters. Here, ERB(f_k) = 24.7 + 0.108 * f_k is the
best-fit curve derived from experimental data in [2]_.
.. [2] Glasberg, Brian R., and Brian C. J. Moore.
"Derivation of auditory filter shapes from notched-noise data."
Hearing Research 47.1-2 (1990): 103-138.
bins_per_octave : int > 0 [scalar]
Number of bins per octave
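
For intuition, the derivation above can be checked numerically. Below is a minimal sketch (not part of the diff; the C1-based frequency grid and bin count are illustrative assumptions) that computes the default gamma and verifies that every bandwidth B_k is the same fraction C of ERB(f_k):

import numpy as np

bins_per_octave = 12

# alpha is the relative difference between consecutive center frequencies:
# f_{k+1} = f_k * 2**(1 / bins_per_octave), so (f_{k+1} - f_k) / f_k = alpha
alpha = 2.0 ** (1.0 / bins_per_octave) - 1.0

# Solving alpha * f_k + gamma = C * ERB(f_k) with ERB(f_k) = 24.7 + 0.108 * f_k
# forces C = alpha / 0.108 and leaves gamma = 24.7 * alpha / 0.108
C = alpha / 0.108
gamma = 24.7 * alpha / 0.108

f_k = 32.70 * 2.0 ** (np.arange(84) / bins_per_octave)  # 84 bins up from ~C1
B_k = alpha * f_k + gamma
assert np.allclose(B_k, C * (24.7 + 0.108 * f_k))
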
@@ -855,13 +868,19 @@ def vqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, gamma=None,

len_orig = len(y)

# Relative difference in frequency between any two consecutive bands
alpha = (2.0**(1. / bins_per_octave) - 1)

if fmin is None:
# C1 by default
fmin = note_to_hz('C1')

if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

if gamma is None:
gamma = 24.7 * alpha / 0.108

# Apply tuning correction
fmin = fmin * 2.0**(tuning / bins_per_octave)

@@ -873,7 +892,6 @@ def vqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, gamma=None,
fmax_t = np.max(freqs)

# Determine required resampling quality
alpha = (2.0**(1. / bins_per_octave) - 1)
Q = float(filter_scale) / alpha
filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) + 0.5 * gamma
nyquist = sr / 2.0
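
To see how gamma feeds into the resampling decision above, here is an illustrative sketch (fmin, n_bins, and the ~1.5-bin Hann window bandwidth are assumptions for the example) reproducing the cutoff computation and the Nyquist check:

sr = 22050
bins_per_octave = 12
filter_scale = 1

alpha = 2.0 ** (1.0 / bins_per_octave) - 1.0
Q = float(filter_scale) / alpha
gamma = 24.7 * alpha / 0.108

# Highest center frequency for fmin ~ C1 (32.70 Hz) and n_bins = 84
fmax_t = 32.70 * 2.0 ** ((84 - 1) / bins_per_octave)

# The gamma offset widens every filter by a constant amount
filter_cutoff = fmax_t * (1 + 0.5 * 1.5 / Q) + 0.5 * gamma
print(filter_cutoff <= sr / 2.0)  # True: the top filter fits under Nyquist
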
28 changes: 13 additions & 15 deletions librosa/filters.py
@@ -62,7 +62,6 @@
'window_sumsquare',
'diagonal_filter']


# Dictionary of window function bandwidths

WINDOW_BANDWIDTHS = {'bart': 1.3334961334912805,
@@ -136,7 +135,7 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
norm : {None, 1, 'slaney', np.inf} [scalar]
If 1 or 'slaney', divide the triangular mel weights by the width of the mel band
(area normalization).
.. warning:: `norm=1` and `norm=np.inf` behavior will change in version 0.8.0.
Otherwise, leave all the triangles aiming for a peak value of 1.0
@@ -216,17 +215,16 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i+2] / fdiff[i+1]
upper = ramps[i + 2] / fdiff[i + 1]

# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))

if norm in (1, 'slaney'):
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]


# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
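
As a quick illustration of the Slaney-style area normalization above (a sketch, not part of the diff; the parameter values are arbitrary), the per-band scaling can be computed directly from the mel band edges:

import librosa

sr, n_mels = 22050, 40

# n_mels + 2 mel band edges spanning [fmin, fmax]
mel_f = librosa.mel_frequencies(n_mels + 2, fmin=0.0, fmax=sr / 2.0)

# Divide each triangle by half its base width in Hz, giving
# approximately constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
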
@@ -354,25 +352,25 @@ def chroma(sr, n_fft, n_chroma=12, tuning=0.0, ctroct=5.0,
# Project into range -n_chroma/2 .. n_chroma/2
# add on fixed offset of 10*n_chroma to ensure all values passed to
# rem are positive
D = np.remainder(D + n_chroma2 + 10*n_chroma, n_chroma) - n_chroma2
D = np.remainder(D + n_chroma2 + 10 * n_chroma, n_chroma) - n_chroma2

# Gaussian bumps - 2*D to make them narrower
wts = np.exp(-0.5 * (2*D / np.tile(binwidthbins, (n_chroma, 1)))**2)
wts = np.exp(-0.5 * (2 * D / np.tile(binwidthbins, (n_chroma, 1))) ** 2)

# normalize each column
wts = util.normalize(wts, norm=norm, axis=0)

# Maybe apply scaling for fft bins
if octwidth is not None:
wts *= np.tile(
np.exp(-0.5 * (((frqbins/n_chroma - ctroct)/octwidth)**2)),
np.exp(-0.5 * (((frqbins / n_chroma - ctroct) / octwidth) ** 2)),
(n_chroma, 1))

if base_c:
wts = np.roll(wts, -3, axis=0)

# remove aliasing columns, copy to ensure row-contiguity
return np.ascontiguousarray(wts[:, :int(1 + n_fft/2)], dtype=dtype)
return np.ascontiguousarray(wts[:, :int(1 + n_fft / 2)], dtype=dtype)
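
A minimal usage sketch for the filterbank built above (values illustrative; the output shape follows from the return statement):

import librosa

sr, n_fft = 22050, 2048
wts = librosa.filters.chroma(sr, n_fft)

# One row per pitch class, one column per non-negative FFT bin
assert wts.shape == (12, 1 + n_fft // 2)
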


def __float_window(window_spec):
@@ -529,7 +527,7 @@ def constant_q(sr, fmin=None, n_bins=84, bins_per_octave=12, window='hann',
filters = []
for ilen, freq in zip(lengths, freqs):
# Build the filter: note, length will be ceil(ilen)
sig = np.exp(np.arange(-ilen//2, ilen//2, dtype=float) * 1j * 2 * np.pi * freq / sr)
sig = np.exp(np.arange(-ilen // 2, ilen // 2, dtype=float) * 1j * 2 * np.pi * freq / sr)

# Apply the windowing function
sig = sig * __float_window(window)(len(sig))
@@ -542,7 +540,7 @@ def constant_q(sr, fmin=None, n_bins=84, bins_per_octave=12, window='hann',
# Pad and stack
max_len = max(lengths)
if pad_fft:
max_len = int(2.0**(np.ceil(np.log2(max_len))))
max_len = int(2.0 ** (np.ceil(np.log2(max_len))))
else:
max_len = int(np.ceil(max_len))
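
The loop above builds each basis element as a windowed complex exponential. Here is a standalone sketch of a single filter following the same recipe (frequency and length values are illustrative, and the final L1 normalization stands in for librosa's util.normalize):

import numpy as np
import scipy.signal

sr, freq, ilen = 22050, 440.0, 1024.9

# Complex exponential at the center frequency; realized length is ceil(ilen)
sig = np.exp(np.arange(-ilen // 2, ilen // 2, dtype=float)
             * 1j * 2 * np.pi * freq / sr)

# Apply the window, then normalize
sig = sig * scipy.signal.get_window('hann', len(sig))
sig = sig / np.sum(np.abs(sig))
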

@@ -606,7 +604,7 @@ def constant_q_lengths(sr, fmin, n_bins=84, bins_per_octave=12,

# Q should be capitalized here, so we suppress the name warning
# pylint: disable=invalid-name
alpha = 2.0**(1. / bins_per_octave) - 1.0
alpha = 2.0 ** (1. / bins_per_octave) - 1.0
Q = float(filter_scale) / alpha

# Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 2.0**(-1./bins_per_octave))
@@ -791,7 +789,7 @@ def window_bandwidth(window, n=1000):

if key not in WINDOW_BANDWIDTHS:
win = get_window(window, n)
WINDOW_BANDWIDTHS[key] = n * np.sum(win**2) / np.sum(np.abs(win))**2
WINDOW_BANDWIDTHS[key] = n * np.sum(win ** 2) / np.sum(np.abs(win)) ** 2

return WINDOW_BANDWIDTHS[key]
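
The cached value here is the window's equivalent noise bandwidth measured in FFT bins; a quick sketch checking the formula above:

import numpy as np
from scipy.signal import get_window

n = 1000
win = get_window('hann', n)

# n * sum(w**2) / (sum |w|)**2 gives the bandwidth in bins
print(n * np.sum(win ** 2) / np.sum(np.abs(win)) ** 2)  # ~1.5 for Hann
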

@@ -1177,7 +1175,7 @@ def window_sumsquare(window, n_frames, hop_length=512, win_length=None, n_fft=20

# Compute the squared window at the desired length
win_sq = get_window(window, win_length)
win_sq = util.normalize(win_sq, norm=norm)**2
win_sq = util.normalize(win_sq, norm=norm) ** 2
win_sq = util.pad_center(win_sq, n_fft)

# Fill the envelope
@@ -1236,7 +1234,7 @@ def diagonal_filter(window, n, slope=1.0, angle=None, zero_mean=False):

win = np.diag(get_window(window, n, fftbins=False))

if not np.isclose(angle, np.pi/4):
if not np.isclose(angle, np.pi / 4):
win = scipy.ndimage.rotate(win, 45 - angle * 180 / np.pi,
order=5, prefilter=False)

126 changes: 126 additions & 0 deletions tests/test_constantq.py
@@ -46,6 +46,29 @@ def __test_cqt_size(y, sr, hop_length, fmin, n_bins, bins_per_octave, tuning, fi
return cqt_output


def __test_vqt_size(y, sr, hop_length, fmin, n_bins, gamma, bins_per_octave,
tuning, filter_scale, norm, sparsity, res_type):

vqt_output = np.abs(
librosa.vqt(
y,
sr=sr,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins,
gamma=gamma,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
res_type=res_type))

assert vqt_output.shape[0] == n_bins

return vqt_output


def make_signal(sr, duration, fmin="C1", fmax="C8"):
""" Generates a linear sine sweep """

@@ -120,6 +143,41 @@ def test_cqt(y_cqt, sr_cqt, hop_length, fmin, n_bins, bins_per_octave, tuning, f
assert C.shape[0] == n_bins


@pytest.mark.parametrize("fmin", [None, librosa.note_to_hz("C2")])
@pytest.mark.parametrize("n_bins", [1, 12, 24, 76])
@pytest.mark.parametrize("gamma", [None, 0, 2.5])
@pytest.mark.parametrize("bins_per_octave", [12, 24])
@pytest.mark.parametrize("tuning", [None, 0, 0.25])
@pytest.mark.parametrize("filter_scale", [1, 2])
@pytest.mark.parametrize("norm", [1, 2])
@pytest.mark.parametrize("res_type", [None, "polyphase"])
@pytest.mark.parametrize("sparsity", [0.01])
@pytest.mark.parametrize("hop_length", [256, 512])
def test_vqt(y_cqt, sr_cqt, hop_length, fmin, n_bins, gamma,
bins_per_octave, tuning, filter_scale, norm, res_type, sparsity):

C = librosa.vqt(
y=y_cqt,
sr=sr_cqt,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins,
gamma=gamma,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
res_type=res_type,
)

# type is complex
assert np.iscomplexobj(C)

# number of bins is correct
assert C.shape[0] == n_bins


@pytest.fixture(scope="module")
def y_hybrid():
return make_signal(11025, 5.0, None)
@@ -210,6 +268,30 @@ def test_cqt_position(y, sr, note_min):
assert np.nanmax(Cscale) < 5e-2, Cscale


@pytest.mark.parametrize("note_min", [12, 18, 24, 30, 36])
@pytest.mark.parametrize("sr", [22050])
@pytest.mark.parametrize("y", [np.sin(2 * np.pi * librosa.midi_to_hz(60) * np.arange(2 * 22050) / 22050.0)])
def test_vqt_position(y, sr, note_min):

C = np.abs(librosa.vqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min)))**2

# Average over time
Cbar = np.median(C, axis=1)

# Find the peak
idx = np.argmax(Cbar)

assert idx == 60 - note_min

# Make sure that the max outside the peak is sufficiently small
Cscale = Cbar / Cbar[idx]
Cscale[idx] = np.nan
assert np.nanmax(Cscale) < 7.5e-1, Cscale

Cscale[idx-1:idx+2] = np.nan
assert np.nanmax(Cscale) < 2.5e-1, Cscale
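
The expected peak index in this test is just bin arithmetic: with the default 12 bins per octave and fmin placed at MIDI note note_min, the C4 test tone (MIDI 60) should land on bin 60 - note_min. A worked check (illustrative, not part of the diff):

import numpy as np
import librosa

note_min, bins_per_octave = 24, 12

# Distance from fmin to C4, measured in bins
idx = bins_per_octave * np.log2(librosa.midi_to_hz(60) / librosa.midi_to_hz(note_min))
print(int(round(idx)))  # 36 == 60 - note_min
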


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_cqt_fail_short_early():

@@ -218,13 +300,28 @@ def test_cqt_fail_short_early():
librosa.cqt(y, sr=44100, n_bins=36)


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_vqt_fail_short_early():

# sampling rate is sufficiently above the top octave to trigger early downsampling
y = np.zeros(16)
librosa.vqt(y, sr=44100, n_bins=36)


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_cqt_fail_short_late():

y = np.zeros(16)
librosa.cqt(y, sr=22050)


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_vqt_fail_short_late():

y = np.zeros(16)
librosa.vqt(y, sr=22050)


@pytest.fixture(scope="module", params=[11025, 16384, 22050, 32000, 44100])
def sr_impulse(request):
return request.param
@@ -257,6 +354,17 @@ def test_cqt_impulse(y_impulse, sr_impulse, hop_impulse):
assert np.max(continuity) < 5e-4, continuity


def test_vqt_impulse(y_impulse, sr_impulse, hop_impulse):

C = np.abs(librosa.vqt(y=y_impulse, sr=sr_impulse, hop_length=hop_impulse))

response = np.mean(C ** 2, axis=1)

continuity = np.abs(np.diff(response))

# Test that integrated energy is approximately constant
assert np.max(continuity) < 5e-4, continuity

def test_hybrid_cqt_impulse(y_impulse, sr_impulse, hop_impulse):
# Test to resolve issue #341
# Updated in #417 to use integrated energy instead of pointwise max
@@ -298,6 +406,24 @@ def test_cqt_white_noise(y_white, sr_white, fmin, n_bins, scale):
assert np.allclose(np.std(C, axis=1), 0.5, atol=5e-1), np.std(C, axis=1)


@pytest.mark.parametrize("scale", [False, True])
@pytest.mark.parametrize("fmin", list(librosa.note_to_hz(["C1", "C2"])))
@pytest.mark.parametrize("n_bins", [24, 36])
@pytest.mark.parametrize("gamma", [2.5])
def test_vqt_white_noise(y_white, sr_white, fmin, n_bins, gamma, scale):

C = np.abs(librosa.vqt(y=y_white, sr=sr_white, fmin=fmin, n_bins=n_bins, gamma=gamma, scale=scale))

if not scale:
lengths = librosa.filters.constant_q_lengths(sr_white, fmin, n_bins=n_bins, gamma=gamma)
C /= np.sqrt(lengths[:, np.newaxis])

# Only compare statistics across the time dimension
# we want ~ constant mean and variance across frequencies
assert np.allclose(np.mean(C, axis=1), 1.0, atol=2.5e-1), np.mean(C, axis=1)
assert np.allclose(np.std(C, axis=1), 0.5, atol=5e-1), np.std(C, axis=1)


@pytest.mark.parametrize("scale", [False, True])
@pytest.mark.parametrize("fmin", list(librosa.note_to_hz(["C1", "C2"])))
@pytest.mark.parametrize("n_bins", [72, 84])
