Utils for discovering attached data and dropping in-memory data (#1361)

lhotse-speech · Jun 24, 2024 · e3bed73 · e3bed73
1 parent d1f94c0
commit e3bed73
Show file tree

Hide file tree

Showing 10 changed files with 274 additions and 9 deletions.
diff --git a/lhotse/array.py b/lhotse/array.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 
-from lhotse.utils import Pathlike, Seconds, fastcopy, ifnone
+from lhotse.utils import Pathlike, Seconds, fastcopy
 
 
 @dataclass
@@ -51,6 +51,16 @@ class Array:
     def ndim(self) -> int:
         return len(self.shape)
 
+    @property
+    def is_in_memory(self) -> bool:
+        """Whether ``self.storage_type`` is classified as in-memory storage by
+        :func:`lhotse.features.io.is_in_memory`."""
+        # Imported inside the property instead of at module level; presumably this
+        # avoids a circular import with ``lhotse.features.io`` -- TODO confirm.
+        from lhotse.features.io import is_in_memory
+
+        return is_in_memory(self.storage_type)
+
+    @property
+    def is_placeholder(self) -> bool:
+        """Whether ``self.storage_type`` is exactly the string ``"shar"``."""
+        return self.storage_type == "shar"
+
     def to_dict(self) -> dict:
         return asdict(self)
 
@@ -157,6 +167,14 @@ class TemporalArray:
     # the shape, temporal_dim, and frame_shift.
     start: Seconds
 
+    @property
+    def is_in_memory(self) -> bool:
+        """Forward the ``is_in_memory`` flag of the wrapped ``self.array``."""
+        return self.array.is_in_memory
+
+    @property
+    def is_placeholder(self) -> bool:
+        """Forward the ``is_placeholder`` flag of the wrapped ``self.array``."""
+        return self.array.is_placeholder
+
     @property
     def shape(self) -> List[int]:
         return self.array.shape

diff --git a/lhotse/audio/recording.py b/lhotse/audio/recording.py
@@ -155,6 +155,14 @@ def _video_source(self) -> Optional[AudioSource]:
                 return s
         return None
 
+    @property
+    def is_in_memory(self) -> bool:
+        """Whether at least one entry of ``self.sources`` has ``type == "memory"``."""
+        return any(s.type == "memory" for s in self.sources)
+
+    @property
+    def is_placeholder(self) -> bool:
+        """Whether at least one entry of ``self.sources`` has ``type == "shar"``."""
+        return any(s.type == "shar" for s in self.sources)
+
     @property
     def num_channels(self) -> int:
         return len(self.channel_ids)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
@@ -177,6 +177,9 @@ class Cut:
     drop_features: Callable
     drop_recording: Callable
     drop_supervisions: Callable
+    drop_alignments: Callable
+    drop_in_memory_data: Callable
+    iter_data: Callable
     truncate: Callable
     pad: Callable
     extend_by: Callable

diff --git a/lhotse/cut/data.py b/lhotse/cut/data.py
@@ -3,12 +3,23 @@
 from dataclasses import dataclass, field
 from decimal import ROUND_DOWN
 from math import isclose
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import numpy as np
 import torch
 from intervaltree import IntervalTree
 
+from lhotse.array import Array, TemporalArray
 from lhotse.audio import Recording, VideoInfo
 from lhotse.augmentation import AugmentFn
 from lhotse.custom import CustomFieldMixin
@@ -81,6 +92,32 @@ def to_dict(self) -> dict:
                     d["custom"][k] = v.to_dict()
         return {**d, "type": type(self).__name__}
 
+    def iter_data(
+        self,
+    ) -> Generator[
+        Tuple[str, Union[Recording, Features, Array, TemporalArray]], None, None
+    ]:
+        """
+        Iterate over each data piece attached to this cut.
+        Returns a generator yielding tuples of ``(key, manifest)``, where
+        ``key`` is the name of the attribute under which ``manifest`` is found.
+        ``manifest`` is of type :class:`~lhotse.Recording`, :class:`~lhotse.Features`,
+        :class:`~lhotse.TemporalArray`, or :class:`~lhotse.Array`.
+
+        For example, if ``key`` is ``recording``, then ``manifest`` is ``self.recording``.
+
+        The recording (if present) is yielded first, then the features (if present),
+        then every matching value of ``self.custom`` under its dictionary key.
+        """
+        if self.has_recording:
+            yield "recording", self.recording
+        if self.has_features:
+            yield "features", self.features
+        # ``self.custom`` may be falsy (e.g. ``None``); treat that as "no custom fields".
+        for k, v in (self.custom or {}).items():
+            # Custom values of any other type are not reported as data.
+            if isinstance(v, (Recording, Features, Array, TemporalArray)):
+                yield k, v
+
+    @property
+    def is_in_memory(self) -> bool:
+        """Whether any manifest yielded by :meth:`iter_data` reports ``is_in_memory``."""
+        return any(v.is_in_memory for k, v in self.iter_data())
+
     @property
     def recording_id(self) -> str:
         return self.recording.id if self.has_recording else self.features.recording_id
@@ -327,6 +364,35 @@ def drop_alignments(self) -> "DataCut":
             self, supervisions=[fastcopy(s, alignment={}) for s in self.supervisions]
         )
 
+    def drop_in_memory_data(self) -> "DataCut":
+        """
+        Return a copy of the current :class:`.DataCut`, detached from any in-memory data.
+        The manifests for in-memory data are converted into placeholders that can still be looked up for
+        metadata, but will fail on attempts to load the data.
+
+        Manifests that do not report ``is_in_memory`` are carried over unchanged.
+        """
+        # Imported inside the method instead of at module level; presumably this
+        # avoids a circular import with ``lhotse.shar`` -- TODO confirm.
+        from lhotse.shar.utils import to_shar_placeholder
+
+        custom = None
+        if self.custom is not None:
+            # Work on a copy so the reassignments below leave ``self.custom`` untouched.
+            custom = self.custom.copy()
+            # Only existing keys are reassigned (none added or removed), so updating
+            # the dict while iterating over it is safe.
+            for k in custom:
+                v = custom[k]
+                if (
+                    isinstance(v, (Recording, Features, Array, TemporalArray))
+                    and v.is_in_memory
+                ):
+                    custom[k] = to_shar_placeholder(v)
+        # ``has_recording`` / ``has_features`` are checked first, so ``is_in_memory``
+        # is only read from a recording / features manifest that is present.
+        return fastcopy(
+            self,
+            recording=to_shar_placeholder(self.recording)
+            if self.has_recording and self.recording.is_in_memory
+            else self.recording,
+            features=to_shar_placeholder(self.features)
+            if self.has_features and self.features.is_in_memory
+            else self.features,
+            custom=custom,
+        )
+
     def fill_supervision(
         self, add_empty: bool = True, shrink_ok: bool = False
     ) -> "DataCut":

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
@@ -4,12 +4,23 @@
 from functools import partial, reduce
 from io import BytesIO
 from operator import add
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import numpy as np
 import torch
 from intervaltree import IntervalTree
 
+from lhotse.array import Array, TemporalArray
 from lhotse.audio import Recording, VideoInfo, get_audio_duration_mismatch_tolerance
 from lhotse.audio.backend import save_audio
 from lhotse.audio.mixer import AudioMixer, VideoMixer, audio_energy
@@ -27,6 +38,7 @@
     FeatureMixer,
     create_default_feature_extractor,
 )
+from lhotse.features.base import Features
 from lhotse.features.io import FeaturesWriter
 from lhotse.supervision import SupervisionSegment
 from lhotse.utils import (
@@ -153,6 +165,10 @@ def has_recording(self) -> bool:
     def has_video(self) -> bool:
         return self._first_non_padding_cut.has_video
 
+    @property
+    def is_in_memory(self) -> bool:
+        """Whether the cut of at least one track in ``self.tracks`` reports ``is_in_memory``."""
+        return any(track.cut.is_in_memory for track in self.tracks)
+
     def has(self, field: str) -> bool:
         return self._first_non_padding_cut.has(field)
 
@@ -191,6 +207,22 @@ def num_channels(self) -> Optional[int]:
     def features_type(self) -> Optional[str]:
         return self._first_non_padding_cut.features.type if self.has_features else None
 
+    def iter_data(
+        self,
+    ) -> Generator[
+        Tuple[str, Union[Recording, Features, Array, TemporalArray]], None, None
+    ]:
+        """
+        Iterate over each data piece attached to this cut.
+        Returns a generator yielding tuples of ``(key, manifest)``, where
+        ``key`` is the name of the attribute under which ``manifest`` is found.
+        ``manifest`` is of type :class:`~lhotse.Recording`, :class:`~lhotse.Features`,
+        :class:`~lhotse.TemporalArray`, or :class:`~lhotse.Array`.
+
+        For example, if ``key`` is ``recording``, then ``manifest`` is ``self.recording``.
+
+        Only the first non-padding cut is consulted; data attached to the cuts
+        of the remaining tracks is not yielded.
+        """
+        return self._first_non_padding_cut.iter_data()
+
     def __getattr__(self, name: str) -> Any:
         """
         This magic function is called when the user tries to access an attribute
@@ -1212,6 +1244,13 @@ def drop_alignments(self) -> "MixedCut":
             tracks=[fastcopy(t, cut=t.cut.drop_alignments()) for t in self.tracks],
         )
 
+    def drop_in_memory_data(self) -> "MixedCut":
+        """Return a copy of the current :class:`MixedCut`, which doesn't contain any in-memory data."""
+        # Every track is copied with its cut replaced by that cut's own
+        # ``drop_in_memory_data()`` result; ``self.tracks`` itself is not modified.
+        return fastcopy(
+            self,
+            tracks=[fastcopy(t, cut=t.cut.drop_in_memory_data()) for t in self.tracks],
+        )
+
     def compute_and_store_features(
         self,
         extractor: FeatureExtractor,
@@ -1540,9 +1579,19 @@ def with_recording_path_prefix(self, path: Pathlike) -> "MixedCut":
         )
 
     @property
-    def _first_non_padding_cut(self) -> DataCut:
+    def first_non_padding_cut(self) -> DataCut:
+        """The cut held by the first track whose cut is not a :class:`PaddingCut`."""
+        # NOTE(review): this still goes through the private alias
+        # ``_first_non_padding_track``, which forwards to ``first_non_padding_track``.
         return self._first_non_padding_track.cut
 
     @property
-    def _first_non_padding_track(self) -> MixTrack:
+    def first_non_padding_track(self) -> MixTrack:
+        """
+        The first track in ``self.tracks`` whose cut is not a :class:`PaddingCut`.
+
+        Indexing ``[0]`` raises ``IndexError`` when every track holds a padding cut.
+        """
         return [t for t in self.tracks if not isinstance(t.cut, PaddingCut)][0]
+
+    # Note: the private properties below are kept for backward compatibility.
+
+    @property
+    def _first_non_padding_cut(self) -> DataCut:
+        """Alias forwarding to :attr:`first_non_padding_cut`."""
+        return self.first_non_padding_cut
+
+    @property
+    def _first_non_padding_track(self) -> MixTrack:
+        """Alias forwarding to :attr:`first_non_padding_track`."""
+        return self.first_non_padding_track
diff --git a/lhotse/cut/padding.py b/lhotse/cut/padding.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -85,6 +85,10 @@ def has_video(self) -> bool:
     def num_channels(self) -> int:
         return 1
 
+    @property
+    def is_in_memory(self) -> bool:
+        """Always ``False`` for a padding cut."""
+        return False
+
     def has(self, field: str) -> bool:
         if field == "recording":
             return self.has_recording
@@ -99,6 +103,10 @@ def has(self, field: str) -> bool:
     def recording_id(self) -> str:
         return "PAD"
 
+    def iter_data(self) -> Iterable:
+        """Return an empty tuple, i.e. an iterable with no ``(key, manifest)`` pairs."""
+        return ()
+
     # noinspection PyUnusedLocal
     def load_features(self, *args, **kwargs) -> Optional[np.ndarray]:
         if self.has_features:
@@ -421,11 +429,15 @@ def drop_recording(self) -> "PaddingCut":
         return fastcopy(self, num_samples=None)
 
     def drop_supervisions(self) -> "PaddingCut":
-        """Return a copy of the current :class:`.PaddingCut`, detached from ``supervisions``."""
+        """No-op: returns ``self`` unchanged (no copy is made)."""
         return self
 
     def drop_alignments(self) -> "PaddingCut":
-        """Return a copy of the current :class:`.PaddingCut`, detached from ``alignments``."""
+        """No-op: returns ``self`` unchanged (no copy is made)."""
+        return self
+
+    def drop_in_memory_data(self) -> "PaddingCut":
+        """No-op: returns ``self`` unchanged (no copy is made)."""
         return self
 
     def compute_and_store_features(

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
@@ -1743,6 +1743,14 @@ def drop_alignments(self) -> "CutSet":
         """
         return self.map(_drop_alignments)
 
+    def drop_in_memory_data(self) -> "CutSet":
+        """
+        Return a new :class:`.CutSet`, where each :class:`.Cut` is copied and detached from any in-memory data it held.
+        The manifests for in-memory data are converted into placeholders that can still be looked up for
+        metadata, but will fail on attempts to load the data.
+        """
+        # Maps the module-level ``_drop_in_memory_data`` helper over the cuts
+        # (presumably a named function rather than a lambda so that the mapped
+        # set stays picklable -- TODO confirm).
+        return self.map(_drop_in_memory_data)
+
     def compute_and_store_features(
         self,
         extractor: FeatureExtractor,
@@ -3355,6 +3363,10 @@ def _drop_supervisions(cut, *args, **kwargs):
     return cut.drop_supervisions(*args, **kwargs)
 
 
+def _drop_in_memory_data(cut, *args, **kwargs):
+    """Call ``cut.drop_in_memory_data`` with the given arguments and return its result."""
+    return cut.drop_in_memory_data(*args, **kwargs)
+
+
 def _truncate_single(
     cut: Cut,
     max_duration: Seconds,

diff --git a/lhotse/features/base.py b/lhotse/features/base.py
@@ -16,7 +16,7 @@
 
 from lhotse.audio import Recording
 from lhotse.augmentation import AugmentFn
-from lhotse.features.io import FeaturesWriter, get_reader
+from lhotse.features.io import FeaturesWriter, get_reader, is_in_memory
 from lhotse.lazy import AlgorithmMixin
 from lhotse.serialization import LazyMixin, Serializable, load_yaml, save_to_yaml
 from lhotse.utils import (
@@ -458,6 +458,14 @@ class Features:
     def end(self) -> Seconds:
         return self.start + self.duration
 
+    @property
+    def is_in_memory(self) -> bool:
+        """Whether ``self.storage_type`` is classified as in-memory storage by ``is_in_memory``."""
+        return is_in_memory(self.storage_type)
+
+    @property
+    def is_placeholder(self) -> bool:
+        """Whether ``self.storage_type`` is exactly the string ``"shar"``."""
+        return self.storage_type == "shar"
+
     def load(
         self,
         start: Optional[Seconds] = None,

diff --git a/lhotse/features/io.py b/lhotse/features/io.py
@@ -1108,6 +1108,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 """
 
 
+def is_in_memory(storage_type: str) -> bool:
+    """
+    Tell whether a storage type name denotes in-memory storage.
+
+    :param storage_type: the storage type name to inspect.
+    :return: ``True`` if the substring ``"memory"`` occurs anywhere in ``storage_type``.
+    """
+    return "memory" in storage_type
+
+
 def get_memory_writer(name: str):
     assert "memory" in name
     return get_writer(name)