Documentation for random seeds in lhotse + extended support of lazy r… #1291

Merged
merged 4 commits into from Feb 29, 2024
6 changes: 4 additions & 2 deletions docs/api.rst
@@ -3,10 +3,10 @@ API Reference

This page contains a comprehensive list of all classes and functions within `lhotse`.

Recording manifests
Audio loading, saving, and manifests
-------------------

Data structures used for describing audio recordings in a dataset.
Data structures and utilities used for describing and manipulating audio recordings.

.. automodule:: lhotse.audio
:members:
@@ -24,6 +24,8 @@ Data structures used for describing supervisions in a dataset.
Lhotse Shar -- sequential storage
---------------------------------

Documentation for Lhotse Shar multi-tarfile sequential I/O format.

Lhotse Shar readers
*******************

2 changes: 1 addition & 1 deletion docs/cli.rst
@@ -3,4 +3,4 @@ Command-line interface

.. click:: lhotse.bin:cli
:prog: lhotse
:show-nested:
:nested: full
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -78,4 +78,4 @@
"exclude-members": "__weakref__",
}

autodoc_mock_imports = ["torchaudio", "SoundFile"]
autodoc_mock_imports = ["torchaudio", "SoundFile", "soundfile"]
18 changes: 18 additions & 0 deletions docs/datasets.rst
@@ -128,6 +128,19 @@ In general, pre-computed features can be greatly compressed (we achieve 70% size

When I/O is not the issue, it might be preferable to use on-the-fly computation as it shouldn't require any prior steps to perform the network training. It is also simpler to apply a vast range of data augmentation methods in a fully randomized way (e.g. reverberation), although Lhotse provides support for approximate feature-domain signal mixing (e.g. for additive noise augmentation) to alleviate that to some extent.

Handling random seeds
---------------------

Lhotse provides several mechanisms for controlling randomness. At a basic level, there is a function :func:`lhotse.utils.fix_random_seed` which seeds Python's, numpy's and torch's RNGs with the provided number.

However, many functions and classes in Lhotse accept either a random seed or an RNG instance to provide finer control over randomness. Whenever a random seed is accepted, it can be either an integer or one of two strings: ``"randomized"`` or ``"trng"``.

* The ``"randomized"`` seed is resolved lazily at the moment it's needed and is intended as a mechanism to provide a different seed to each dataloading worker. For ``"randomized"`` to work, you have to first invoke :func:`lhotse.dataset.dataloading.worker_init_fn` in a given subprocess, which sets the right environment variables. With a PyTorch ``DataLoader`` you can pass the keyword argument ``worker_init_fn=make_worker_init_fn(seed=int_seed, rank=..., world_size=...)`` using :func:`lhotse.dataset.dataloading.make_worker_init_fn`, which will set the right seeds for you in multiprocessing and multi-node training. Note that if you resume training, you should change the ``seed`` passed to ``make_worker_init_fn`` on each resumed run to make the model train on different data.
* The ``"trng"`` seed is also resolved lazily at runtime, but it uses a true RNG (if available on your OS; consult Python's ``secrets`` module documentation). It's an easy way to ensure that every iteration over the data happens in a different order, but it may make debugging data issues more difficult.

.. note:: The lazy seed resolution is done by calling :func:`lhotse.dataset.dataloading.resolve_seed`.

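A minimal, self-contained sketch of how this lazy seed resolution can work (the real logic lives in :func:`lhotse.dataset.dataloading.resolve_seed`; the ``LHOTSE_WORKER_SEED`` environment variable name below is an illustrative assumption, not Lhotse's actual mechanism):

```python
import os
import random
import secrets
from typing import Union


def resolve_seed_sketch(seed: Union[int, str]) -> int:
    """Toy re-implementation sketch of lazy seed resolution."""
    if isinstance(seed, int):
        return seed  # fixed integer seeds pass through unchanged
    if seed == "trng":
        # Draw from a true RNG if the OS provides one (see the `secrets` docs).
        return secrets.SystemRandom().randint(0, 2**32 - 1)
    if seed == "randomized":
        # Illustrative: read a per-worker seed previously stored by a
        # worker_init_fn; the env var name is an assumption for this sketch.
        env_seed = os.environ.get("LHOTSE_WORKER_SEED")
        if env_seed is not None:
            return int(env_seed)
        # Outside a dataloading worker: fall back to a global RNG draw.
        return random.randint(0, 2**32 - 1)
    raise ValueError(f"Unsupported seed: {seed!r}")
```

With this resolution in place, every consumer of a seed can simply construct `random.Random(resolve_seed_sketch(seed))` and handle integers, `"trng"`, and `"randomized"` uniformly.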

Dataset's list
--------------

@@ -185,3 +198,8 @@ Collation utilities for building custom Datasets
------------------------------------------------

.. automodule:: lhotse.dataset.collation

Dataloading seeding utilities
-----------------------------

.. automodule:: lhotse.dataset.dataloading
8 changes: 4 additions & 4 deletions docs/requirements.txt
@@ -1,5 +1,5 @@
numpy>=1.18.1
sphinx_rtd_theme
sphinx==4.2.0
sphinx-click==3.0.1
sphinx-autodoc-typehints==1.12.0
sphinx_rtd_theme==2.0.0
sphinx==7.2.6
sphinx-click==5.1.0
sphinx-autodoc-typehints==2.0.0
24 changes: 24 additions & 0 deletions lhotse/audio/__init__.py
@@ -6,6 +6,7 @@
get_ffmpeg_torchaudio_info_enabled,
info,
read_audio,
save_audio,
set_current_audio_backend,
set_ffmpeg_torchaudio_info_enabled,
)
@@ -21,3 +22,26 @@
set_audio_duration_mismatch_tolerance,
suppress_audio_loading_errors,
)

__all__ = [
"AudioSource",
"Recording",
"RecordingSet",
"AudioLoadingError",
"DurationMismatchError",
"VideoInfo",
"audio_backend",
"available_audio_backends",
"get_current_audio_backend",
"get_default_audio_backend",
"get_audio_duration_mismatch_tolerance",
"get_ffmpeg_torchaudio_info_enabled",
"info",
"read_audio",
"save_audio",
"set_current_audio_backend",
"set_audio_duration_mismatch_tolerance",
"set_ffmpeg_torchaudio_info_enabled",
"null_result_on_audio_loading_error",
"suppress_audio_loading_errors",
]
20 changes: 14 additions & 6 deletions lhotse/audio/recording.py
@@ -55,13 +55,19 @@ class Recording:
and a 1-hour session with multiple channels and speakers (e.g., in AMI).
In the latter case, it is partitioned into data suitable for model training using :class:`~lhotse.cut.Cut`.

.. hint::
Lhotse reads audio recordings using `pysoundfile`_ and `audioread`_, similarly to librosa,
to support multiple audio formats. For OPUS files we require ffmpeg to be installed.
Internally, Lhotse supports multiple audio backends to read audio files.
By default, we try to use libsoundfile, then torchaudio (with FFMPEG integration starting with torchaudio 2.1),
and then audioread (which is an ffmpeg CLI wrapper).
For SPHERE files we prefer to use the sph2pipe binary, as it can handle certain unique encodings such as "shorten".

Audio backends in Lhotse are configurable. See:

* :func:`~lhotse.audio.backend.available_audio_backends`
* :func:`~lhotse.audio.backend.audio_backend`
* :func:`~lhotse.audio.backend.get_current_audio_backend`
* :func:`~lhotse.audio.backend.set_current_audio_backend`
* :func:`~lhotse.audio.backend.get_default_audio_backend`

.. hint::
Since we support importing Kaldi data dirs, if ``wav.scp`` contains unix pipes,
:class:`~lhotse.audio.Recording` will also handle them correctly.

Examples

@@ -110,6 +116,8 @@ class Recording:
>>> assert samples.shape == (1, 16000)
>>> samples2 = recording.load_audio(offset=0.5)
>>> assert samples2.shape == (1, 8000)

See also: :class:`~lhotse.audio.recording.Recording`, :class:`~lhotse.cut.Cut`, :class:`~lhotse.cut.CutSet`.
"""

id: str
23 changes: 0 additions & 23 deletions lhotse/bin/modes/cut.py
@@ -134,29 +134,6 @@ def trim_to_supervisions(
Splits each input cut into as many cuts as there are supervisions.
These cuts have identical start times and durations as the supervisions.
When there are overlapping supervisions, they can be kept or discarded with options.

\b
For example, the following cut:
Cut
|-----------------|
Sup1
|----| Sup2
|-----------|

\b
is transformed into two cuts:
Cut1
|----|
Sup1
|----|
Sup2
|-|
Cut2
|-----------|
Sup1
|-|
Sup2
|-----------|
"""
cuts = CutSet.from_file(cuts)

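The trimming rule described in the docstring above can be sketched as a pure-Python toy (the real command operates on `Cut` objects; the `(start, duration)` pair representation here is an illustrative simplification):

```python
from typing import List, Tuple

Span = Tuple[float, float]  # (start, duration) -- illustrative representation


def trim_to_supervisions_sketch(supervisions: List[Span]) -> List[Span]:
    # One output cut per supervision, copying its start time and duration.
    return [(start, duration) for (start, duration) in supervisions]


# Two (possibly overlapping) supervisions yield two cuts with matching spans.
cuts = trim_to_supervisions_sketch([(0.0, 5.0), (3.0, 11.0)])
```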
11 changes: 5 additions & 6 deletions lhotse/cut/set.py
@@ -1667,7 +1667,7 @@ def mix(
snr: Optional[Union[Decibels, Sequence[Decibels]]] = 20,
preserve_id: Optional[str] = None,
mix_prob: float = 1.0,
seed: Union[int, Literal["trng"]] = 42,
seed: Union[int, Literal["trng", "randomized"]] = 42,
random_mix_offset: bool = False,
) -> "CutSet":
"""
@@ -3440,7 +3440,7 @@ def __init__(
snr: Optional[Union[Decibels, Sequence[Decibels]]] = 20,
preserve_id: Optional[str] = None,
mix_prob: float = 1.0,
seed: Union[int, Literal["trng"]] = 42,
seed: Union[int, Literal["trng", "randomized"]] = 42,
random_mix_offset: bool = False,
) -> None:
self.source = cuts
@@ -3463,10 +3463,9 @@ def __init__(
assert isinstance(self.snr, (type(None), int, float))

def __iter__(self):
if self.seed == "trng":
rng = secrets.SystemRandom()
else:
rng = random.Random(self.seed)
from lhotse.dataset.dataloading import resolve_seed

rng = random.Random(resolve_seed(self.seed))
mix_in_cuts = iter(self.mix_in_cuts.repeat().shuffle(rng=rng, buffer_size=100))

for cut in self.source:
2 changes: 1 addition & 1 deletion lhotse/dataset/cut_transforms/mix.py
@@ -36,7 +36,7 @@ def __init__(
Otherwise, new random IDs are generated for the augmented cuts (default).
:param random_mix_offset: an optional bool.
When ``True`` and the duration of the cut to be mixed in is longer than the original cut,
select a random sub-region from the to be mixed in cut.
select a random sub-region from the to be mixed in cut.
"""
self.cuts = cuts
if len(self.cuts) == 0:
3 changes: 1 addition & 2 deletions lhotse/dataset/cut_transforms/reverberate.py
@@ -10,8 +10,7 @@ class ReverbWithImpulseResponse:
response with some probability :attr:`p`.
The impulse response is chosen randomly from a specified CutSet of RIRs :attr:`rir_cuts`.
If no RIRs are specified, we will generate them using a fast random generator (https://arxiv.org/abs/2208.04101).
If `early_only` is set to True, convolution is performed only with the first 50ms of
the impulse response.
If `early_only` is set to True, convolution is performed only with the first 50ms of the impulse response.
"""

def __init__(
10 changes: 7 additions & 3 deletions lhotse/dataset/dataloading.py
@@ -22,9 +22,10 @@ def make_worker_init_fn(
Calling this function creates a worker_init_fn suitable to pass to PyTorch's DataLoader.

It helps with two issues:
- sets the random seeds differently for each worker and node, which helps with

* sets the random seeds differently for each worker and node, which helps with
avoiding duplication in randomized data augmentation techniques.
- sets environment variables that help WebDataset detect it's inside multi-GPU (DDP)
* sets environment variables that help WebDataset detect it's inside multi-GPU (DDP)
training, so that it correctly de-duplicates the data across nodes.
"""
return partial(
@@ -43,6 +44,9 @@ def worker_init_fn(
set_different_node_and_worker_seeds: bool = True,
seed: Optional[int] = 42,
) -> None:
"""
Function created by :func:`~lhotse.dataset.dataloading.make_worker_init_fn`, refer to its documentation for details.
"""
if set_different_node_and_worker_seeds:
process_seed = seed + 100 * worker_id
if rank is not None:
@@ -74,7 +78,7 @@ def resolve_seed(seed: Union[int, Literal["trng", "randomized"]]) -> int:
using a true RNG (to the extent supported by the OS).

If it's "randomized", we'll check whether we're in a dataloading worker of ``torch.utils.data.DataLoader``.
If we are, we expect that it was passed the result of :func:``lhotse.dataset.dataloading.make_worker_init_fn``
If we are, we expect that it was passed the result of :func:`~lhotse.dataset.dataloading.make_worker_init_fn`
into its ``worker_init_fn`` argument, in which case we'll return a special seed exclusive to that worker.
If we are not in a dataloading worker (or ``num_workers`` was set to ``0``), we'll return Python's ``random``
module global seed.
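The per-process seed derivation visible in ``worker_init_fn`` above can be sketched as follows. The ``100 * worker_id`` term mirrors the code shown in this diff; the rank-dependent offset is an assumption for illustration, since the exact multiplier is truncated out of the hunk:

```python
from typing import Optional


def derive_process_seed(seed: int, worker_id: int, rank: Optional[int] = None) -> int:
    # Each dataloading worker gets a distinct offset from the base seed,
    # so randomized augmentations are not duplicated across workers.
    process_seed = seed + 100 * worker_id
    if rank is not None:
        # Assumption: some rank-dependent offset keeps DDP nodes distinct too;
        # the exact value Lhotse uses is not visible in this diff.
        process_seed += 1000 * rank
    return process_seed
```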
10 changes: 5 additions & 5 deletions lhotse/dataset/input_strategies.py
@@ -61,7 +61,7 @@ def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:

Depending on the strategy, the dict should look like:

.. code-block:
.. code-block::

{
"sequence_idx": tensor(shape=(S,)),
@@ -71,7 +71,7 @@ def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:

or

.. code-block:
.. code-block::

{
"sequence_idx": tensor(shape=(S,)),
@@ -127,7 +127,7 @@ def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
Returns a dict that specifies the start and end bounds for each supervision,
as a 1-D int tensor, in terms of frames:

.. code-block:
.. code-block::

{
"sequence_idx": tensor(shape=(S,)),
@@ -233,7 +233,7 @@ def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
Returns a dict that specifies the start and end bounds for each supervision,
as a 1-D int tensor, in terms of samples:

.. code-block:
.. code-block::

{
"sequence_idx": tensor(shape=(S,)),
@@ -410,7 +410,7 @@ def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
Returns a dict that specifies the start and end bounds for each supervision,
as a 1-D int tensor, in terms of frames:

.. code-block:
.. code-block::

{
"sequence_idx": tensor(shape=(S,)),
1 change: 1 addition & 0 deletions lhotse/dataset/unsupervised.py
@@ -131,6 +131,7 @@ class RecordingChunkIterableDataset(IterableDataset):
overlapping audio chunks.

The format of yielded items is the following::

{
"recording_id": str
"begin_time": tensor with dtype=float32 shape=(1,)
23 changes: 14 additions & 9 deletions lhotse/lazy.py
@@ -68,8 +68,7 @@ def mux(
*manifests,
stop_early: bool = False,
weights: Optional[List[Union[int, float]]] = None,
seed: Union[int, Literal["trng"]] = 0,
max_open_streams: Optional[int] = None,
seed: Union[int, Literal["trng", "randomized"]] = 0,
):
"""
Merges multiple manifest iterables into a new iterable by lazily multiplexing them during iteration time.
@@ -96,7 +95,7 @@ def infinite_mux(
cls,
*manifests,
weights: Optional[List[Union[int, float]]] = None,
seed: Union[int, Literal["trng"]] = 0,
seed: Union[int, Literal["trng", "randomized"]] = 0,
max_open_streams: Optional[int] = None,
):
"""
@@ -315,7 +314,7 @@ def __init__(
self,
*iterators: Iterable,
shuffle_iters: bool = False,
seed: Optional[int] = None,
seed: Optional[Union[int, Literal["trng", "randomized"]]] = None,
) -> None:
self.iterators = []
self.shuffle_iters = shuffle_iters
@@ -330,12 +329,14 @@
self.iterators.append(it)

def __iter__(self):
from lhotse.dataset.dataloading import resolve_seed

iterators = self.iterators
if self.shuffle_iters:
if self.seed is None:
rng = random # global Python RNG
else:
rng = random.Random(self.seed + self.num_iters)
rng = random.Random(resolve_seed(self.seed) + self.num_iters)
rng.shuffle(iterators)
self.num_iters += 1
for it in iterators:
@@ -367,7 +368,7 @@ def __init__(
*iterators: Iterable,
stop_early: bool = False,
weights: Optional[List[Union[int, float]]] = None,
seed: Union[int, Literal["trng"]] = 0,
seed: Union[int, Literal["trng", "randomized"]] = 0,
) -> None:
self.iterators = list(iterators)
self.stop_early = stop_early
@@ -385,7 +386,9 @@
assert len(self.iterators) == len(self.weights)

def __iter__(self):
rng = build_rng(self.seed)
from lhotse.dataset.dataloading import resolve_seed

rng = random.Random(resolve_seed(self.seed))
iters = [iter(it) for it in self.iterators]
exhausted = [False for _ in range(len(iters))]

@@ -447,7 +450,7 @@ def __init__(
*iterators: Iterable,
stop_early: bool = False,
weights: Optional[List[Union[int, float]]] = None,
seed: Union[int, Literal["trng"]] = 0,
seed: Union[int, Literal["trng", "randomized"]] = 0,
max_open_streams: Optional[int] = None,
) -> None:
self.iterators = list(iterators)
@@ -475,7 +478,9 @@
- each stream may be interpreted as a shard belonging to some larger group of streams
(e.g. multiple shards of a given dataset).
"""
rng = build_rng(self.seed)
from lhotse.dataset.dataloading import resolve_seed

rng = random.Random(resolve_seed(self.seed))

def shuffled_streams():
# Create an infinite iterable of our streams.
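The weighted multiplexing loop these `__iter__` methods implement can be sketched in plain Python. This is a toy version under the assumption that streams are sampled with probability proportional to their weights until exhausted; the real implementation in `lhotse.lazy` adds seed resolution and (for the infinite variant) capped open-stream counts:

```python
import random
from typing import Iterable, Sequence


def mux_sketch(
    rng: random.Random,
    streams: Sequence[Iterable],
    weights: Sequence[float],
    stop_early: bool = False,
):
    """Toy weighted multiplexer: repeatedly pick a live stream with
    probability proportional to its weight and yield its next item."""
    iters = [iter(s) for s in streams]
    exhausted = [False] * len(iters)
    while not all(exhausted):
        # Zero out the weights of exhausted streams before sampling.
        live = [w if not e else 0.0 for w, e in zip(weights, exhausted)]
        idx = rng.choices(range(len(iters)), weights=live, k=1)[0]
        try:
            yield next(iters[idx])
        except StopIteration:
            exhausted[idx] = True
            if stop_early:
                return  # stop as soon as any single stream runs out
```

For example, `mux_sketch(random.Random(0), [cuts_a, cuts_b], weights=[3, 1])` would interleave the two sources, drawing from `cuts_a` roughly three times as often.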