Commit

Added default gamma (#1045)
* added vqt implementation

* fixed gamma scaling in vqt

* Fixed calculation of maximum filter cutoff

* Added default gamma

* Added comments to define alpha and describe the special cases for gamma, with a citation to the original paper where the transform was introduced.

* Added better comment for explanation of the default gamma

* Mirrored CQT unit tests for VQT function

* Fixed order of parameters in vqt tests

* Changed values in tests for indicating VQT regression

* Better description of alpha and percentage consistency in tests

Co-authored-by: Brian McFee <brian.mcfee@nyu.edu>
cwitkowitz and bmcfee committed May 28, 2020
1 parent c1d0628 commit 5bcfcfd
Showing 3 changed files with 159 additions and 17 deletions.
22 changes: 20 additions & 2 deletions librosa/core/constantq.py
@@ -759,7 +759,20 @@ def vqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, gamma=None,
gamma : number > 0 [scalar]
Bandwidth offset for determining filter lengths.
`gamma=0` produces the constant-Q transform.
If `gamma=0`, produces the constant-Q transform.
If `gamma=None`, gamma is calculated such that each filter's bandwidth is a
constant fraction of its equivalent rectangular bandwidth (ERB). This is
accomplished by solving for the gamma that gives
B_k = alpha * f_k + gamma = C * ERB(f_k), where B_k is the bandwidth of
filter k with center frequency f_k, alpha is the inverse of what would
otherwise be a constant Q-factor, and C = alpha / 0.108 is the constant
fraction across all filters. Here, ERB(f_k) = 24.7 + 0.108 * f_k is the
best-fit curve derived from experimental data in [2]_.
.. [2] Glasberg, Brian R., and Brian C. J. Moore.
"Derivation of auditory filter shapes from notched-noise data."
Hearing Research 47.1-2 (1990): 103-138.
bins_per_octave : int > 0 [scalar]
Number of bins per octave
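
For intuition, the derivation above can be checked numerically. Below is a minimal sketch (not part of the diff; the C1-based frequency grid and bin count are illustrative assumptions) that computes the default gamma and verifies that every bandwidth B_k is the same fraction C of ERB(f_k):

import numpy as np

bins_per_octave = 12

# alpha is the relative difference between consecutive center frequencies:
# f_{k+1} = f_k * 2**(1 / bins_per_octave), so (f_{k+1} - f_k) / f_k = alpha
alpha = 2.0 ** (1.0 / bins_per_octave) - 1.0

# Solving alpha * f_k + gamma = C * ERB(f_k) with ERB(f_k) = 24.7 + 0.108 * f_k
# forces C = alpha / 0.108 and leaves gamma = 24.7 * alpha / 0.108
C = alpha / 0.108
gamma = 24.7 * alpha / 0.108

f_k = 32.70 * 2.0 ** (np.arange(84) / bins_per_octave)  # 84 bins up from ~C1
B_k = alpha * f_k + gamma
assert np.allclose(B_k, C * (24.7 + 0.108 * f_k))
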
@@ -855,13 +868,19 @@ def vqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, gamma=None,

len_orig = len(y)

# Relative difference in frequency between any two consecutive bands
alpha = (2.0**(1. / bins_per_octave) - 1)

if fmin is None:
# C1 by default
fmin = note_to_hz('C1')

if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

if gamma is None:
gamma = 24.7 * alpha / 0.108

# Apply tuning correction
fmin = fmin * 2.0**(tuning / bins_per_octave)

@@ -873,7 +892,6 @@ def vqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, gamma=None,
fmax_t = np.max(freqs)

# Determine required resampling quality
alpha = (2.0**(1. / bins_per_octave) - 1)
Q = float(filter_scale) / alpha
filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) + 0.5 * gamma
nyquist = sr / 2.0
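
To see how gamma feeds into the resampling decision above, here is an illustrative sketch (fmin, n_bins, and the ~1.5-bin Hann window bandwidth are assumptions for the example) reproducing the cutoff computation and the Nyquist check:

sr = 22050
bins_per_octave = 12
filter_scale = 1

alpha = 2.0 ** (1.0 / bins_per_octave) - 1.0
Q = float(filter_scale) / alpha
gamma = 24.7 * alpha / 0.108

# Highest center frequency for fmin ~ C1 (32.70 Hz) and n_bins = 84
fmax_t = 32.70 * 2.0 ** ((84 - 1) / bins_per_octave)

# The gamma offset widens every filter by a constant amount
filter_cutoff = fmax_t * (1 + 0.5 * 1.5 / Q) + 0.5 * gamma
print(filter_cutoff <= sr / 2.0)  # True: the top filter fits under Nyquist
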
28 changes: 13 additions & 15 deletions librosa/filters.py
@@ -62,7 +62,6 @@
'window_sumsquare',
'diagonal_filter']


# Dictionary of window function bandwidths

WINDOW_BANDWIDTHS = {'bart': 1.3334961334912805,
@@ -136,7 +135,7 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
norm : {None, 1, 'slaney', np.inf} [scalar]
If 1 or 'slaney', divide the triangular mel weights by the width of the mel band
(area normalization).
.. warning:: `norm=1` and `norm=np.inf` behavior will change in version 0.8.0.
Otherwise, leave all the triangles aiming for a peak value of 1.0
@@ -216,17 +215,16 @@ def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i+2] / fdiff[i+1]
upper = ramps[i + 2] / fdiff[i + 1]

# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))

if norm in (1, 'slaney'):
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]


# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
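
As a quick illustration of the Slaney-style area normalization above (a sketch, not part of the diff; the parameter values are arbitrary), the per-band scaling can be computed directly from the mel band edges:

import librosa

sr, n_mels = 22050, 40

# n_mels + 2 mel band edges spanning [fmin, fmax]
mel_f = librosa.mel_frequencies(n_mels + 2, fmin=0.0, fmax=sr / 2.0)

# Divide each triangle by half its base width in Hz, giving
# approximately constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
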
@@ -354,25 +352,25 @@ def chroma(sr, n_fft, n_chroma=12, tuning=0.0, ctroct=5.0,
# Project into range -n_chroma/2 .. n_chroma/2
# add on fixed offset of 10*n_chroma to ensure all values passed to
# rem are positive
D = np.remainder(D + n_chroma2 + 10*n_chroma, n_chroma) - n_chroma2
D = np.remainder(D + n_chroma2 + 10 * n_chroma, n_chroma) - n_chroma2

# Gaussian bumps - 2*D to make them narrower
wts = np.exp(-0.5 * (2*D / np.tile(binwidthbins, (n_chroma, 1)))**2)
wts = np.exp(-0.5 * (2 * D / np.tile(binwidthbins, (n_chroma, 1))) ** 2)

# normalize each column
wts = util.normalize(wts, norm=norm, axis=0)

# Maybe apply scaling for fft bins
if octwidth is not None:
wts *= np.tile(
np.exp(-0.5 * (((frqbins/n_chroma - ctroct)/octwidth)**2)),
np.exp(-0.5 * (((frqbins / n_chroma - ctroct) / octwidth) ** 2)),
(n_chroma, 1))

if base_c:
wts = np.roll(wts, -3, axis=0)

# remove aliasing columns, copy to ensure row-contiguity
return np.ascontiguousarray(wts[:, :int(1 + n_fft/2)], dtype=dtype)
return np.ascontiguousarray(wts[:, :int(1 + n_fft / 2)], dtype=dtype)
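
A minimal usage sketch for the filterbank built above (values illustrative; the output shape follows from the return statement):

import librosa

sr, n_fft = 22050, 2048
wts = librosa.filters.chroma(sr, n_fft)

# One row per pitch class, one column per non-negative FFT bin
assert wts.shape == (12, 1 + n_fft // 2)
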


def __float_window(window_spec):
@@ -529,7 +527,7 @@ def constant_q(sr, fmin=None, n_bins=84, bins_per_octave=12, window='hann',
filters = []
for ilen, freq in zip(lengths, freqs):
# Build the filter: note, length will be ceil(ilen)
sig = np.exp(np.arange(-ilen//2, ilen//2, dtype=float) * 1j * 2 * np.pi * freq / sr)
sig = np.exp(np.arange(-ilen // 2, ilen // 2, dtype=float) * 1j * 2 * np.pi * freq / sr)

# Apply the windowing function
sig = sig * __float_window(window)(len(sig))
@@ -542,7 +540,7 @@ def constant_q(sr, fmin=None, n_bins=84, bins_per_octave=12, window='hann',
# Pad and stack
max_len = max(lengths)
if pad_fft:
max_len = int(2.0**(np.ceil(np.log2(max_len))))
max_len = int(2.0 ** (np.ceil(np.log2(max_len))))
else:
max_len = int(np.ceil(max_len))
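
The loop above builds each basis element as a windowed complex exponential. Here is a standalone sketch of a single filter following the same recipe (frequency and length values are illustrative, and the final L1 normalization stands in for librosa's util.normalize):

import numpy as np
import scipy.signal

sr, freq, ilen = 22050, 440.0, 1024.9

# Complex exponential at the center frequency; realized length is ceil(ilen)
sig = np.exp(np.arange(-ilen // 2, ilen // 2, dtype=float)
             * 1j * 2 * np.pi * freq / sr)

# Apply the window, then normalize
sig = sig * scipy.signal.get_window('hann', len(sig))
sig = sig / np.sum(np.abs(sig))
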

@@ -606,7 +604,7 @@ def constant_q_lengths(sr, fmin, n_bins=84, bins_per_octave=12,

# Q should be capitalized here, so we suppress the name warning
# pylint: disable=invalid-name
alpha = 2.0**(1. / bins_per_octave) - 1.0
alpha = 2.0 ** (1. / bins_per_octave) - 1.0
Q = float(filter_scale) / alpha

# Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 2.0**(-1./bins_per_octave))
@@ -791,7 +789,7 @@ def window_bandwidth(window, n=1000):

if key not in WINDOW_BANDWIDTHS:
win = get_window(window, n)
WINDOW_BANDWIDTHS[key] = n * np.sum(win**2) / np.sum(np.abs(win))**2
WINDOW_BANDWIDTHS[key] = n * np.sum(win ** 2) / np.sum(np.abs(win)) ** 2

return WINDOW_BANDWIDTHS[key]
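
The cached value here is the window's equivalent noise bandwidth measured in FFT bins; a quick sketch checking the formula above:

import numpy as np
from scipy.signal import get_window

n = 1000
win = get_window('hann', n)

# n * sum(w**2) / (sum |w|)**2 gives the bandwidth in bins
print(n * np.sum(win ** 2) / np.sum(np.abs(win)) ** 2)  # ~1.5 for Hann
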

@@ -1177,7 +1175,7 @@ def window_sumsquare(window, n_frames, hop_length=512, win_length=None, n_fft=20

# Compute the squared window at the desired length
win_sq = get_window(window, win_length)
win_sq = util.normalize(win_sq, norm=norm)**2
win_sq = util.normalize(win_sq, norm=norm) ** 2
win_sq = util.pad_center(win_sq, n_fft)

# Fill the envelope
@@ -1236,7 +1234,7 @@ def diagonal_filter(window, n, slope=1.0, angle=None, zero_mean=False):

win = np.diag(get_window(window, n, fftbins=False))

if not np.isclose(angle, np.pi/4):
if not np.isclose(angle, np.pi / 4):
win = scipy.ndimage.rotate(win, 45 - angle * 180 / np.pi,
order=5, prefilter=False)

126 changes: 126 additions & 0 deletions tests/test_constantq.py
@@ -46,6 +46,29 @@ def __test_cqt_size(y, sr, hop_length, fmin, n_bins, bins_per_octave, tuning, fi
return cqt_output


def __test_vqt_size(y, sr, hop_length, fmin, n_bins, gamma, bins_per_octave,
tuning, filter_scale, norm, sparsity, res_type):

vqt_output = np.abs(
librosa.vqt(
y,
sr=sr,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins,
gamma=gamma,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
res_type=res_type))

assert vqt_output.shape[0] == n_bins

return vqt_output


def make_signal(sr, duration, fmin="C1", fmax="C8"):
""" Generates a linear sine sweep """

@@ -120,6 +143,41 @@ def test_cqt(y_cqt, sr_cqt, hop_length, fmin, n_bins, bins_per_octave, tuning, f
assert C.shape[0] == n_bins


@pytest.mark.parametrize("fmin", [None, librosa.note_to_hz("C2")])
@pytest.mark.parametrize("n_bins", [1, 12, 24, 76])
@pytest.mark.parametrize("gamma", [None, 0, 2.5])
@pytest.mark.parametrize("bins_per_octave", [12, 24])
@pytest.mark.parametrize("tuning", [None, 0, 0.25])
@pytest.mark.parametrize("filter_scale", [1, 2])
@pytest.mark.parametrize("norm", [1, 2])
@pytest.mark.parametrize("res_type", [None, "polyphase"])
@pytest.mark.parametrize("sparsity", [0.01])
@pytest.mark.parametrize("hop_length", [256, 512])
def test_vqt(y_cqt, sr_cqt, hop_length, fmin, n_bins, gamma,
bins_per_octave, tuning, filter_scale, norm, res_type, sparsity):

C = librosa.vqt(
y=y_cqt,
sr=sr_cqt,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins,
gamma=gamma,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
res_type=res_type,
)

# type is complex
assert np.iscomplexobj(C)

# number of bins is correct
assert C.shape[0] == n_bins


@pytest.fixture(scope="module")
def y_hybrid():
return make_signal(11025, 5.0, None)
@@ -210,6 +268,30 @@ def test_cqt_position(y, sr, note_min):
assert np.nanmax(Cscale) < 5e-2, Cscale


@pytest.mark.parametrize("note_min", [12, 18, 24, 30, 36])
@pytest.mark.parametrize("sr", [22050])
@pytest.mark.parametrize("y", [np.sin(2 * np.pi * librosa.midi_to_hz(60) * np.arange(2 * 22050) / 22050.0)])
def test_vqt_position(y, sr, note_min):

C = np.abs(librosa.vqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min)))**2

# Average over time
Cbar = np.median(C, axis=1)

# Find the peak
idx = np.argmax(Cbar)

assert idx == 60 - note_min

# Make sure that the max outside the peak is sufficiently small
Cscale = Cbar / Cbar[idx]
Cscale[idx] = np.nan
assert np.nanmax(Cscale) < 7.5e-1, Cscale

Cscale[idx-1:idx+2] = np.nan
assert np.nanmax(Cscale) < 2.5e-1, Cscale
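
The expected peak index in this test is just bin arithmetic: with the default 12 bins per octave and fmin placed at MIDI note note_min, the C4 test tone (MIDI 60) should land on bin 60 - note_min. A worked check (illustrative, not part of the diff):

import numpy as np
import librosa

note_min, bins_per_octave = 24, 12

# Distance from fmin to C4, measured in bins
idx = bins_per_octave * np.log2(librosa.midi_to_hz(60) / librosa.midi_to_hz(note_min))
print(int(round(idx)))  # 36 == 60 - note_min
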


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_cqt_fail_short_early():

@@ -218,13 +300,28 @@ def test_cqt_fail_short_early():
librosa.cqt(y, sr=44100, n_bins=36)


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_vqt_fail_short_early():

# sampling rate is sufficiently above the top octave to trigger early downsampling
y = np.zeros(16)
librosa.vqt(y, sr=44100, n_bins=36)


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_cqt_fail_short_late():

y = np.zeros(16)
librosa.cqt(y, sr=22050)


@pytest.mark.xfail(raises=librosa.ParameterError)
def test_vqt_fail_short_late():

y = np.zeros(16)
librosa.vqt(y, sr=22050)


@pytest.fixture(scope="module", params=[11025, 16384, 22050, 32000, 44100])
def sr_impulse(request):
return request.param
@@ -257,6 +354,17 @@ def test_cqt_impulse(y_impulse, sr_impulse, hop_impulse):
assert np.max(continuity) < 5e-4, continuity


def test_vqt_impulse(y_impulse, sr_impulse, hop_impulse):

C = np.abs(librosa.vqt(y=y_impulse, sr=sr_impulse, hop_length=hop_impulse))

response = np.mean(C ** 2, axis=1)

continuity = np.abs(np.diff(response))

# Test that integrated energy is approximately constant
assert np.max(continuity) < 5e-4, continuity

def test_hybrid_cqt_impulse(y_impulse, sr_impulse, hop_impulse):
# Test to resolve issue #341
# Updated in #417 to use integrated energy instead of pointwise max
@@ -298,6 +406,24 @@ def test_cqt_white_noise(y_white, sr_white, fmin, n_bins, scale):
assert np.allclose(np.std(C, axis=1), 0.5, atol=5e-1), np.std(C, axis=1)


@pytest.mark.parametrize("scale", [False, True])
@pytest.mark.parametrize("fmin", list(librosa.note_to_hz(["C1", "C2"])))
@pytest.mark.parametrize("n_bins", [24, 36])
@pytest.mark.parametrize("gamma", [2.5])
def test_vqt_white_noise(y_white, sr_white, fmin, n_bins, gamma, scale):

C = np.abs(librosa.vqt(y=y_white, sr=sr_white, fmin=fmin, n_bins=n_bins, gamma=gamma, scale=scale))

if not scale:
lengths = librosa.filters.constant_q_lengths(sr_white, fmin, n_bins=n_bins, gamma=gamma)
C /= np.sqrt(lengths[:, np.newaxis])

# Only compare statistics across the time dimension
# we want ~ constant mean and variance across frequencies
assert np.allclose(np.mean(C, axis=1), 1.0, atol=2.5e-1), np.mean(C, axis=1)
assert np.allclose(np.std(C, axis=1), 0.5, atol=5e-1), np.std(C, axis=1)


@pytest.mark.parametrize("scale", [False, True])
@pytest.mark.parametrize("fmin", list(librosa.note_to_hz(["C1", "C2"])))
@pytest.mark.parametrize("n_bins", [72, 84])
