Skip to content

Commit

Permalink
Merge pull request #679 from librosa/pcen
Browse files Browse the repository at this point in the history
implemented PCEN. fixes #615
  • Loading branch information
bmcfee committed Feb 21, 2018
2 parents 24a081d + 27781ab commit 7fe6cd0
Show file tree
Hide file tree
Showing 3 changed files with 246 additions and 1 deletion.
2 changes: 2 additions & 0 deletions librosa/core/__init__.py
Expand Up @@ -51,6 +51,8 @@
perceptual_weighting
A_weighting
pcen
Time and frequency conversion
-----------------------------
.. autosummary::
Expand Down
174 changes: 173 additions & 1 deletion librosa/core/spectrum.py
Expand Up @@ -6,6 +6,7 @@
import numpy as np
import scipy.fftpack as fft
import scipy
import scipy.ndimage
import scipy.signal
import scipy.interpolate
import six
Expand All @@ -23,7 +24,7 @@
'perceptual_weighting',
'power_to_db', 'db_to_power',
'amplitude_to_db', 'db_to_amplitude',
'fmt']
'fmt', 'pcen']


@cache(level=20)
Expand Down Expand Up @@ -1281,6 +1282,177 @@ def fmt(y, t_min=0.5, n_fmt=None, kind='cubic', beta=0.5, over_sample=1, axis=-1
return result[idx] * np.sqrt(n) / n_fmt


@cache(level=30)
def pcen(S, sr=22050, hop_length=512, gain=0.98, bias=2, power=0.5,
time_constant=0.395, eps=1e-6, b=None, max_size=1):
'''Per-channel energy normalization (PCEN) [1]_
This function normalizes a time-frequency representation `S` by
performing automatic gain control, followed by nonlinear compression:
P[f, t] = (S / (eps + M[f, t])**gain + bias)**power - bias**power
where `M` is the result of applying a low-pass, temporal IIR filter
to `S`:
M[f, t] = (1 - b) * M[f, t - 1] + b * S[f, t]
If `b` is not provided, it is calculated as:
b = 1 - exp(-hop_length / (sr * time_constant))
This normalization is designed to suppress background noise and
emphasize foreground signals, and can be used as an alternative to
decibel scaling (`amplitude_to_db`).
This implementation also supports smoothing across frequency bins
by specifying `max_size > 1`. If this option is used, the filtered
spectrogram `M` is computed as
M[f, t] = (1 - b) * M[f, t - 1] + b * R[f, t]
where `R` has been max-filtered along the frequency axis, similar to
the SuperFlux algorithm implemented in `onset.onset_strength`:
R[f, t] = max(S[f - max_size//2: f + max_size//2, t])
This can be used to perform automatic gain control on signals that cross
or span multiple frequency bans, which may be desirable for spectrograms
with high frequency resolution.
.. [1] Wang, Y., Getreuer, P., Hughes, T., Lyon, R. F., & Saurous, R. A.
(2017, March). Trainable frontend for robust and far-field keyword spotting.
In Acoustics, Speech and Signal Processing (ICASSP), 2017
IEEE International Conference on (pp. 5670-5674). IEEE.
Parameters
----------
S : np.ndarray (non-negative) [shape=(n, m)]
The input (magnitude) spectrogram
sr : number > 0 [scalar]
The audio sampling rate
hop_length : int > 0 [scalar]
The hop length of `S`, expressed in samples
gain : number >= 0 [scalar]
The gain factor. Typical values should be slightly less than 1.
bias : number >= 0 [scalar]
The bias point of the nonlinear compression (default: 2)
power : number > 0 [scalar]
The compression exponent. Typical values should be between 0 and 1.
Smaller values of `power` result in stronger compression.
time_constant : number > 0 [scalar]
The time constant for IIR filtering, measured in seconds.
eps : number > 0 [scalar]
A small constant used to ensure numerical stability of the filter.
b : number in [0, 1] [scalar]
The filter coefficient for the low-pass filter.
If not provided, it will be inferred from `time_constant`.
max_size : int > 0 [scalar]
The width of the max filter applied to the frequency axis.
If left as `1`, no filtering is performed.
Returns
-------
P : np.ndarray, non-negative [shape=(n, m)]
The per-channel energy normalized version of `S`.
See Also
--------
amplitude_to_db
librosa.onset.onset_strength
Examples
--------
Compare PCEN to log amplitude (dB) scaling on Mel spectra
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.util.example_audio_file(),
... offset=10, duration=10)
>>> # We'll use power=1 to get a magnitude spectrum
>>> # instead of a power spectrum
>>> S = librosa.feature.melspectrogram(y, sr=sr, power=1)
>>> log_S = librosa.amplitude_to_db(S, ref=np.max)
>>> pcen_S = librosa.pcen(S)
>>> plt.figure()
>>> plt.subplot(2,1,1)
>>> librosa.display.specshow(log_S, x_axis='time', y_axis='mel')
>>> plt.title('log amplitude (dB)')
>>> plt.colorbar()
>>> plt.subplot(2,1,2)
>>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel')
>>> plt.title('Per-channel energy normalization')
>>> plt.colorbar()
>>> plt.tight_layout()
Compare PCEN with and without max-filtering
>>> pcen_max = librosa.pcen(S, max_size=3)
>>> plt.figure()
>>> plt.subplot(2,1,1)
>>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel')
>>> plt.title('Per-channel energy normalization (no max-filter)')
>>> plt.colorbar()
>>> plt.subplot(2,1,2)
>>> librosa.display.specshow(pcen_max, x_axis='time', y_axis='mel')
>>> plt.title('Per-channel energy normalization (max_size=3)')
>>> plt.colorbar()
>>> plt.tight_layout()
'''

if power <= 0:
raise ParameterError('power={} must be strictly positive'.format(power))

if gain < 0:
raise ParameterError('gain={} must be non-negative'.format(gain))

if bias < 0:
raise ParameterError('bias={} must be non-negative'.format(bias))

if eps <= 0:
raise ParameterError('eps={} must be strictly positive'.format(eps))

if time_constant <= 0:
raise ParameterError('time_constant={} must be strictly positive'.format(time_constant))

if max_size < 1 or not isinstance(max_size, int):
raise ParameterError('max_size={} must be a positive integer'.format(max_size))

if b is None:
b = 1 - np.exp(- float(hop_length) / (time_constant * sr))

if not 0 <= b <= 1:
raise ParameterError('b={} must be between 0 and 1'.format(b))

if np.issubdtype(S.dtype, np.complexfloating):
warnings.warn('pcen was called on complex input so phase '
'information will be discarded. To suppress this warning, '
'call pcen(magphase(D)[0]) instead.')
S = np.abs(S)

if max_size == 1:
ref_spec = S
else:
ref_spec = scipy.ndimage.maximum_filter1d(S, max_size, axis=0)

S_smooth = scipy.signal.lfilter([b], [1, b - 1], ref_spec)

# Working in log-space gives us some stability, and a slight speedup
smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
return (S * smooth + bias)**power - bias**power


def _spectrogram(y=None, S=None, n_fft=2048, hop_length=512, power=1):
'''Helper function to retrieve a magnitude spectrogram.
Expand Down
71 changes: 71 additions & 0 deletions tests/test_core.py
Expand Up @@ -1283,3 +1283,74 @@ def test_iirt():
mut = librosa.iirt(y, hop_length=2205, win_length=4410)

assert np.allclose(mut, gt[23:108, :mut.shape[1]], atol=1.8)


def test_pcen():

def __test(gain, bias, power, b, time_constant, eps, ms, S, Pexp):

warnings.resetwarnings()
warnings.simplefilter('always')
with warnings.catch_warnings(record=True) as out:

P = librosa.pcen(S, gain=gain, bias=bias, power=power,
time_constant=time_constant, eps=eps, b=b,
max_size=ms)

if np.issubdtype(S.dtype, np.complexfloating):
assert len(out) > 0
assert 'complex' in str(out[0].message).lower()

assert P.shape == S.shape
assert np.all(P >= 0)
assert np.all(np.isfinite(P))

if Pexp is not None:
assert np.allclose(P, Pexp)

tf = raises(librosa.ParameterError)(__test)

srand()
S = np.abs(np.random.randn(9, 30))

# Bounds tests (failures):
# gain < 0
yield tf, -1, 1, 1, 0.5, 0.5, 1e-6, 1, S, S

# bias < 0
yield tf, 1, -1, 1, 0.5, 0.5, 1e-6, 1, S, S

# power <= 0
yield tf, 1, 1, 0, 0.5, 0.5, 1e-6, 1, S, S

# b < 0
yield tf, 1, 1, 1, -2, 0.5, 1e-6, 1, S, S

# b > 1
yield tf, 1, 1, 1, 2, 0.5, 1e-6, 1, S, S

# time_constant <= 0
yield tf, 1, 1, 1, 0.5, -2, 1e-6, 1, S, S

# eps <= 0
yield tf, 1, 1, 1, 0.5, 0.5, 0, 1, S, S

# max_size not int, < 1
yield tf, 1, 1, 1, 0.5, 0.5, 1e-6, 1.5, S, S
yield tf, 1, 1, 1, 0.5, 0.5, 1e-6, 0, S, S

# Edge cases:
# gain=0, bias=0, power=p, b=1 => S**p
for p in [0.5, 1, 2]:
yield __test, 0, 0, p, 1.0, 0.5, 1e-6, 1, S, S**p

# gain=1, bias=0, power=1, b=1, eps=1e-20 => ones
yield __test, 1, 0, 1, 1.0, 0.5, 1e-20, 1, S, np.ones_like(S)

# Catch the complex warning
yield __test, 1, 0, 1, 1.0, 0.5, 1e-20, 1, S * 1.j, np.ones_like(S)

# zeros to zeros
Z = np.zeros_like(S)
yield __test, 0.98, 2.0, 0.5, None, 0.395, 1e-6, 1, Z, Z
yield __test, 0.98, 2.0, 0.5, None, 0.395, 1e-6, 3, Z, Z

0 comments on commit 7fe6cd0

Please sign in to comment.