Merge pull request #679 from librosa/pcen

implemented PCEN. fixes #615
librosa · Feb 21, 2018 · 7fe6cd0 · 7fe6cd0
2 parents 24a081d + 27781ab
commit 7fe6cd0
Show file tree

Hide file tree

Showing 3 changed files with 246 additions and 1 deletion.
diff --git a/librosa/core/__init__.py b/librosa/core/__init__.py
@@ -51,6 +51,8 @@
     perceptual_weighting
     A_weighting
 
+    pcen
+
 Time and frequency conversion
 -----------------------------
 .. autosummary::

diff --git a/librosa/core/spectrum.py b/librosa/core/spectrum.py
@@ -6,6 +6,7 @@
 import numpy as np
 import scipy.fftpack as fft
 import scipy
+import scipy.ndimage
 import scipy.signal
 import scipy.interpolate
 import six
@@ -23,7 +24,7 @@
            'perceptual_weighting',
            'power_to_db', 'db_to_power',
            'amplitude_to_db', 'db_to_amplitude',
-           'fmt']
+           'fmt', 'pcen']
 
 
 @cache(level=20)
@@ -1281,6 +1282,177 @@ def fmt(y, t_min=0.5, n_fmt=None, kind='cubic', beta=0.5, over_sample=1, axis=-1
     return result[idx] * np.sqrt(n) / n_fmt
 
 
+@cache(level=30)
+def pcen(S, sr=22050, hop_length=512, gain=0.98, bias=2, power=0.5,
+         time_constant=0.395, eps=1e-6, b=None, max_size=1):
+    '''Per-channel energy normalization (PCEN) [1]_
+
+    This function normalizes a time-frequency representation `S` by
+    performing automatic gain control, followed by nonlinear compression:
+
+        P[f, t] = (S / (eps + M[f, t])**gain + bias)**power - bias**power
+
+    where `M` is the result of applying a low-pass, temporal IIR filter
+    to `S`:
+
+        M[f, t] = (1 - b) * M[f, t - 1] + b * S[f, t]
+
+    If `b` is not provided, it is calculated as:
+
+        b = 1 - exp(-hop_length / (sr * time_constant))
+
+    This normalization is designed to suppress background noise and
+    emphasize foreground signals, and can be used as an alternative to
+    decibel scaling (`amplitude_to_db`).
+
+    This implementation also supports smoothing across frequency bins
+    by specifying `max_size > 1`.  If this option is used, the filtered
+    spectrogram `M` is computed as
+
+        M[f, t] = (1 - b) * M[f, t - 1] + b * R[f, t]
+
+    where `R` has been max-filtered along the frequency axis, similar to
+    the SuperFlux algorithm implemented in `onset.onset_strength`:
+
+        R[f, t] = max(S[f - max_size//2: f + max_size//2, t])
+
+    This can be used to perform automatic gain control on signals that cross
+    or span multiple frequency bans, which may be desirable for spectrograms
+    with high frequency resolution.
+
+    .. [1] Wang, Y., Getreuer, P., Hughes, T., Lyon, R. F., & Saurous, R. A.
+       (2017, March). Trainable frontend for robust and far-field keyword spotting.
+       In Acoustics, Speech and Signal Processing (ICASSP), 2017
+       IEEE International Conference on (pp. 5670-5674). IEEE.
+
+    Parameters
+    ----------
+    S : np.ndarray (non-negative) [shape=(n, m)]
+        The input (magnitude) spectrogram
+
+    sr : number > 0 [scalar]
+        The audio sampling rate
+
+    hop_length : int > 0 [scalar]
+        The hop length of `S`, expressed in samples
+
+    gain : number >= 0 [scalar]
+        The gain factor.  Typical values should be slightly less than 1.
+
+    bias : number >= 0 [scalar]
+        The bias point of the nonlinear compression (default: 2)
+
+    power : number > 0 [scalar]
+        The compression exponent.  Typical values should be between 0 and 1.
+        Smaller values of `power` result in stronger compression.
+
+    time_constant : number > 0 [scalar]
+        The time constant for IIR filtering, measured in seconds.
+
+    eps : number > 0 [scalar]
+        A small constant used to ensure numerical stability of the filter.
+
+    b : number in [0, 1]  [scalar]
+        The filter coefficient for the low-pass filter.
+        If not provided, it will be inferred from `time_constant`.
+
+    max_size : int > 0 [scalar]
+        The width of the max filter applied to the frequency axis.
+        If left as `1`, no filtering is performed.
+
+    Returns
+    -------
+    P : np.ndarray, non-negative [shape=(n, m)]
+        The per-channel energy normalized version of `S`.
+
+    See Also
+    --------
+    amplitude_to_db
+    librosa.onset.onset_strength
+
+    Examples
+    --------
+
+    Compare PCEN to log amplitude (dB) scaling on Mel spectra
+
+    >>> import matplotlib.pyplot as plt
+    >>> y, sr = librosa.load(librosa.util.example_audio_file(),
+    ...                      offset=10, duration=10)
+
+    >>> # We'll use power=1 to get a magnitude spectrum
+    >>> # instead of a power spectrum
+    >>> S = librosa.feature.melspectrogram(y, sr=sr, power=1)
+    >>> log_S = librosa.amplitude_to_db(S, ref=np.max)
+    >>> pcen_S = librosa.pcen(S)
+    >>> plt.figure()
+    >>> plt.subplot(2,1,1)
+    >>> librosa.display.specshow(log_S, x_axis='time', y_axis='mel')
+    >>> plt.title('log amplitude (dB)')
+    >>> plt.colorbar()
+    >>> plt.subplot(2,1,2)
+    >>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel')
+    >>> plt.title('Per-channel energy normalization')
+    >>> plt.colorbar()
+    >>> plt.tight_layout()
+
+    Compare PCEN with and without max-filtering
+
+    >>> pcen_max = librosa.pcen(S, max_size=3)
+    >>> plt.figure()
+    >>> plt.subplot(2,1,1)
+    >>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel')
+    >>> plt.title('Per-channel energy normalization (no max-filter)')
+    >>> plt.colorbar()
+    >>> plt.subplot(2,1,2)
+    >>> librosa.display.specshow(pcen_max, x_axis='time', y_axis='mel')
+    >>> plt.title('Per-channel energy normalization (max_size=3)')
+    >>> plt.colorbar()
+    >>> plt.tight_layout()
+
+    '''
+
+    if power <= 0:
+        raise ParameterError('power={} must be strictly positive'.format(power))
+
+    if gain < 0:
+        raise ParameterError('gain={} must be non-negative'.format(gain))
+
+    if bias < 0:
+        raise ParameterError('bias={} must be non-negative'.format(bias))
+
+    if eps <= 0:
+        raise ParameterError('eps={} must be strictly positive'.format(eps))
+
+    if time_constant <= 0:
+        raise ParameterError('time_constant={} must be strictly positive'.format(time_constant))
+
+    if max_size < 1 or not isinstance(max_size, int):
+        raise ParameterError('max_size={} must be a positive integer'.format(max_size))
+
+    if b is None:
+        b = 1 - np.exp(- float(hop_length) / (time_constant * sr))
+
+    if not 0 <= b <= 1:
+        raise ParameterError('b={} must be between 0 and 1'.format(b))
+
+    if np.issubdtype(S.dtype, np.complexfloating):
+        warnings.warn('pcen was called on complex input so phase '
+                      'information will be discarded. To suppress this warning, '
+                      'call pcen(magphase(D)[0]) instead.')
+        S = np.abs(S)
+
+    if max_size == 1:
+        ref_spec = S
+    else:
+        ref_spec = scipy.ndimage.maximum_filter1d(S, max_size, axis=0)
+
+    S_smooth = scipy.signal.lfilter([b], [1, b - 1], ref_spec)
+
+    # Working in log-space gives us some stability, and a slight speedup
+    smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
+    return (S * smooth + bias)**power - bias**power
+
+
 def _spectrogram(y=None, S=None, n_fft=2048, hop_length=512, power=1):
     '''Helper function to retrieve a magnitude spectrogram.
 

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1283,3 +1283,74 @@ def test_iirt():
     mut = librosa.iirt(y, hop_length=2205, win_length=4410)
 
     assert np.allclose(mut, gt[23:108, :mut.shape[1]], atol=1.8)
+
+
+def test_pcen():
+
+    def __test(gain, bias, power, b, time_constant, eps, ms, S, Pexp):
+
+        warnings.resetwarnings()
+        warnings.simplefilter('always')
+        with warnings.catch_warnings(record=True) as out:
+
+            P = librosa.pcen(S, gain=gain, bias=bias, power=power,
+                             time_constant=time_constant, eps=eps, b=b,
+                             max_size=ms)
+
+            if np.issubdtype(S.dtype, np.complexfloating):
+                assert len(out) > 0
+                assert 'complex' in str(out[0].message).lower()
+
+        assert P.shape == S.shape
+        assert np.all(P >= 0)
+        assert np.all(np.isfinite(P))
+
+        if Pexp is not None:
+            assert np.allclose(P, Pexp)
+
+    tf = raises(librosa.ParameterError)(__test)
+
+    srand()
+    S = np.abs(np.random.randn(9, 30))
+
+    # Bounds tests (failures):
+    #   gain < 0
+    yield tf, -1, 1, 1, 0.5, 0.5, 1e-6, 1, S, S
+
+    #   bias < 0
+    yield tf, 1, -1, 1, 0.5, 0.5, 1e-6, 1, S, S
+
+    #   power <= 0
+    yield tf, 1, 1, 0, 0.5, 0.5, 1e-6, 1, S, S
+
+    #   b < 0
+    yield tf, 1, 1, 1, -2, 0.5, 1e-6, 1, S, S
+
+    #   b > 1
+    yield tf, 1, 1, 1, 2, 0.5, 1e-6, 1, S, S
+
+    #   time_constant <= 0
+    yield tf, 1, 1, 1, 0.5, -2, 1e-6, 1, S, S
+
+    #   eps <= 0
+    yield tf, 1, 1, 1, 0.5, 0.5, 0, 1, S, S
+
+    #   max_size not int, < 1
+    yield tf, 1, 1, 1, 0.5, 0.5, 1e-6, 1.5, S, S
+    yield tf, 1, 1, 1, 0.5, 0.5, 1e-6, 0, S, S
+
+    # Edge cases:
+    #   gain=0, bias=0, power=p, b=1 => S**p
+    for p in [0.5, 1, 2]:
+        yield __test, 0, 0, p, 1.0, 0.5, 1e-6, 1, S, S**p
+
+    #   gain=1, bias=0, power=1, b=1, eps=1e-20 => ones
+    yield __test, 1, 0, 1, 1.0, 0.5, 1e-20, 1, S, np.ones_like(S)
+
+    # Catch the complex warning
+    yield __test, 1, 0, 1, 1.0, 0.5, 1e-20, 1, S * 1.j, np.ones_like(S)
+
+    #   zeros to zeros
+    Z = np.zeros_like(S)
+    yield __test, 0.98, 2.0, 0.5, None, 0.395, 1e-6, 1, Z, Z
+    yield __test, 0.98, 2.0, 0.5, None, 0.395, 1e-6, 3, Z, Z