simplified stream API, updated documentation

librosa · Jun 21, 2019 · 90780bf · 90780bf
1 parent 3f89801
commit 90780bf
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 59 deletions.
diff --git a/docs/examples/plot_pcen_stream.py b/docs/examples/plot_pcen_stream.py
@@ -43,11 +43,13 @@
 # fill_value pads out the last frame with zeros so that we have a
 # full frame at the end of the signal, even if the signal doesn't
 # divide evenly into full frames.
-blocks, sr = librosa.stream(filename, block_length=16,
-                            frame_length=n_fft,
-                            hop_length=hop_length,
-                            mono=True,
-                            fill_value=0)
+sr = librosa.get_samplerate(filename)
+
+stream = librosa.stream(filename, block_length=16,
+                        frame_length=n_fft,
+                        hop_length=hop_length,
+                        mono=True,
+                        fill_value=0)
 #####################################################################
 # For this example, we'll compute PCEN on each block, average over
 # frequency, and store the results in a list.
@@ -58,7 +60,7 @@
 # Initialize the PCEN filter delays to steady state
 zi = None
 
-for y_block in blocks:
+for y_block in stream:
     # Compute the STFT (without padding, so center=False)
     D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length,
                      center=False)
@@ -72,9 +74,6 @@
     # Compute the average PCEN over frequency, and append it to our list
     pcen_blocks.extend(np.mean(P, axis=0))
 
-# Close the block reader
-blocks.close()
-
 # Cast to a numpy array for use downstream
 pcen_blocks = np.asarray(pcen_blocks)
 

diff --git a/docs/ioformats.rst b/docs/ioformats.rst
@@ -15,7 +15,7 @@ For a list of codecs supported by `soundfile`, see the *libsndfile* `documentati
 
 Librosa's load function is meant for the common case where you want to load an entire (fragment of a) recording into memory, but some applications require more flexibility.
 In these cases, we recommend using `soundfile` directly.
-Reading audio files using `soundfile` is similar to the method in *librosa*. One important difference is that the read data is of shape ``(nb_samples, nb_channels)`` compared to ``(nb_channels, nb_samples)`` in :func:`<librosa.core.load>`. Also the signal is not resampled to 22050 Hz by default, hence it would need be transposed and resampled for further processing in *librosa*. The following example is equivalent to ``librosa.load(librosa.util.example_audio_file())``:
+Reading audio files using `soundfile` is similar to the method in *librosa*. One important difference is that the read data is of shape ``(nb_samples, nb_channels)`` compared to ``(nb_channels, nb_samples)`` in :func:`librosa.core.load`. Also the signal is not resampled to 22050 Hz by default, hence it would need to be transposed and resampled for further processing in *librosa*. The following example is equivalent to ``librosa.load(librosa.util.example_audio_file())``:
 
 .. code-block:: python
     :linenos:
@@ -35,34 +35,54 @@ Blockwise Reading
 -----------------
 
 For large audio signals it could be beneficial to not load the whole audio file
-into memory. *PySoundFile* supports blockwise reading. In the following example
-a block of 1024 samples of audio are read and directly fed into the chroma
-feature extractor.
+into memory.  Librosa 0.7 introduces a streaming interface, which can be used to
+work on short fragments of audio sequentially.  :func:`librosa.core.stream` cuts an input
+file into *blocks* of audio, which correspond to a given number of *frames*,
+which can be iterated over as in the following example:
+
 
 .. code-block:: python
-    :linenos:
+   :linenos:
 
-    import numpy as np
-    import soundfile as sf
-    from librosa.feature import chroma_stft
+   import librosa
+
+   sr = librosa.get_samplerate('/path/to/file.wav')
+
+   # Set the frame parameters to be equivalent to the librosa defaults
+   # in the file's native sampling rate
+   frame_length = (2048 * sr) // 22050
+   hop_length = (512 * sr) // 22050
+
+   # Stream the data, working on 128 frames at a time
+   stream = librosa.stream('/path/to/file.wav',
+                           block_length=128,
+                           frame_length=frame_length,
+                           hop_length=hop_length)
 
-    block_gen = sf.blocks('stereo_file.wav', blocksize=1024)
-    rate = sf.info('stereo_file.wav').samplerate
+   chromas = []
+   for y in stream:
+      chroma_block = librosa.feature.chroma_stft(y=y, sr=sr,
+                                                 n_fft=frame_length,
+                                                 hop_length=hop_length,
+                                                 center=False)
+      chromas.append(chroma_block)
+                                                
 
-    chromas = []
-    for bl in block_gen:
-        # downmix frame to mono (averaging out the channel dimension)
-        y=np.mean(bl, axis=1)
-        # compute chroma feature
-        chromas.append(chroma_stft(y, sr=rate))
+In this example, each audio fragment ``y`` will consist of 128 frames worth of samples,
+or more specifically, ``len(y) == frame_length + (block_length - 1) * hop_length``.
+Each fragment ``y`` will overlap with the subsequent fragment by ``frame_length - hop_length``
+samples, which ensures that stream processing will provide results equivalent to processing the
+entire sequence in one step (assuming padding / centering is disabled).
 
+For more details about the streaming interface, refer to :func:`librosa.core.stream`.
 
 
 Read file-like objects
 ----------------------
 
 If you want to read audio from file-like objects (also called *virtual files*)
-you can use `soundfile` as well.  (This will also work with `librosa.load`, provided that the underlying codec is supported by `soundfile`.)
+you can use `soundfile` as well.  (This will also work with :func:`librosa.core.load` and :func:`librosa.core.stream`, provided
+that the underlying codec is supported by `soundfile`.)
 
 E.g.: read files from zip compressed archives:
 

diff --git a/librosa/core/audio.py b/librosa/core/audio.py
@@ -300,13 +300,11 @@ def stream(path, block_length, frame_length, hop_length,
     dtype : numeric type
         data type of audio buffers to be produced
 
-    Returns
-    -------
-    stream : generator
-        A generator which produces blocks of audio.
-
-    sr : number > 0
-        The sampling rate of the audio
+    Yields
+    ------
+    y : np.ndarray
+        An audio buffer of (at most)
+        `(block_length - 1) * hop_length + frame_length` samples.
 
     See Also
     --------
@@ -320,20 +318,24 @@ def stream(path, block_length, frame_length, hop_length,
     at a time.  Note that streaming operation requires left-aligned
     frames, so we must set `center=False` to avoid padding artifacts.
 
-    >>> stream, sr = librosa.stream(librosa.util.example_audio_file(),
-    ...                             block_length=256,
-    ...                             frame_length=4096,
-    ...                             hop_length=1024)
+    >>> filename = librosa.util.example_audio_file()
+    >>> sr = librosa.get_samplerate(filename)
+    >>> stream = librosa.stream(filename,
+    ...                         block_length=256,
+    ...                         frame_length=4096,
+    ...                         hop_length=1024)
     >>> for y_block in stream:
     ...     D_block = librosa.stft(y_block, center=False)
 
     Or compute a mel spectrogram over a stream, using a shorter frame
     and non-overlapping windows
 
-    >>> stream, sr = librosa.stream(librosa.util.example_audio_file(),
-    ...                             block_length=256,
-    ...                             frame_length=2048,
-    ...                             hop_length=2048)
+    >>> filename = librosa.util.example_audio_file()
+    >>> sr = librosa.get_samplerate(filename)
+    >>> stream = librosa.stream(filename,
+    ...                         block_length=256,
+    ...                         frame_length=2048,
+    ...                         hop_length=2048)
     >>> for y_block in stream:
     ...     m_block = librosa.feature.melspectrogram(y_block, sr=sr,
     ...                                              n_fft=2048,
@@ -353,16 +355,6 @@ def stream(path, block_length, frame_length, hop_length,
     sr = sf.info(path).samplerate
 
     # Construct the stream
-    block_stream = __stream(path, sr, block_length, frame_length, hop_length,
-                            mono, offset, duration, fill_value, dtype)
-
-    return block_stream, sr
-
-
-def __stream(path, sr, block_length, frame_length, hop_length,
-             mono, offset, duration, fill_value, dtype):
-    '''Private function for wrapping sf.blocks in a librosa interface.'''
-
     if offset:
         start = int(offset * sr)
     else:

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1690,17 +1690,17 @@ def test_stream(block_length, frame_length, hop_length, mono, offset,
     # test data is stereo, int 16
     path = os.path.join('tests', 'data', 'test1_22050.wav')
 
-    blocks, sr_stream = librosa.stream(path, block_length=block_length,
-                                       frame_length=frame_length,
-                                       hop_length=hop_length,
-                                       dtype=dtype, mono=mono,
-                                       offset=offset, duration=duration,
-                                       fill_value=fill_value)
+    stream = librosa.stream(path, block_length=block_length,
+                            frame_length=frame_length,
+                            hop_length=hop_length,
+                            dtype=dtype, mono=mono,
+                            offset=offset, duration=duration,
+                            fill_value=fill_value)
 
     y_frame_stream = []
     target_length = frame_length + (block_length - 1) * hop_length
 
-    for y_block in blocks:
+    for y_block in stream:
         # Check the dtype
         assert y_block.dtype == dtype
 
@@ -1732,7 +1732,6 @@ def test_stream(block_length, frame_length, hop_length, mono, offset,
     y_full, sr = librosa.load(path, sr=None, dtype=dtype, mono=True,
                               offset=offset, duration=duration)
     # First, check the rate
-    assert sr == sr_stream
     y_frame = librosa.util.frame(y_full, frame_length, hop_length)
 
     # Raw audio will not be padded