From ce0f5c1c80fe271e75de0a720bfec09ca27c19bc Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 20 Apr 2023 11:26:40 -0400
Subject: [PATCH 01/32] add transform attribute for MixedCut

---
 lhotse/cut/mixed.py               | 120 ++++++++++++++++++++++++++----
 test/cut/test_cut_augmentation.py |  41 +++++++++-
 2 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
index b0ec1026d..005a953d6 100644
--- a/lhotse/cut/mixed.py
+++ b/lhotse/cut/mixed.py
@@ -10,7 +10,12 @@
 from intervaltree import IntervalTree
 
 from lhotse.audio import AudioMixer, Recording, audio_energy, torchaudio_save_flac_safe
-from lhotse.augmentation import AugmentFn
+from lhotse.augmentation import (
+    AudioTransform,
+    AugmentFn,
+    LoudnessNormalization,
+    ReverbWithImpulseResponse,
+)
 from lhotse.cut.base import Cut
 from lhotse.cut.data import DataCut
 from lhotse.cut.padding import PaddingCut
@@ -93,6 +98,9 @@ class MixedCut(Cut):
 
     .. note:: Each track in a MixedCut can be either a MonoCut, MultiCut, or PaddingCut.
 
+    .. note:: The ``transforms`` field is a list of dictionaries that describe the transformations
+        that should be applied to the track after mixing.
+
     See also:
 
         - :class:`lhotse.cut.Cut`
@@ -103,6 +111,7 @@ class MixedCut(Cut):
 
     id: str
     tracks: List[MixTrack]
+    transforms: Optional[List[Dict]] = None
 
     @property
     def supervisions(self) -> List[SupervisionSegment]:
@@ -713,6 +722,35 @@ def perturb_volume(self, factor: float, affix_id: bool = True) -> "MixedCut":
             ],
         )
 
+    def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut":
+        """
+        Return a new ``MixedCut`` that will lazily apply loudness normalization.
+
+        :param target: The target loudness in dBFS.
+        :param affix_id: When true, we will modify the ``DataCut.id`` field
+            by affixing it with "_ln{target}".
+        :return: a modified copy of the current ``DataCut``.
+        """
+        # Pre-conditions
+        assert (
+            self.has_recording
+        ), "Cannot apply loudness normalization on a MixedCut without Recording."
+        if self.has_features:
+            logging.warning(
+                "Attempting to normalize loudness on a MixedCut that references pre-computed features. "
+                "The feature manifest will be detached, as we do not support feature-domain "
+                "loudness normalization."
+            )
+            self.features = None
+
+        transforms = self.transforms.copy() if self.transforms is not None else []
+        transforms.append(LoudnessNormalization(target=target).to_dict())
+        return fastcopy(
+            self,
+            id=f"{self.id}_ln{target}" if affix_id else self.id,
+            transforms=transforms,
+        )
+
     def reverb_rir(
         self,
         rir_recording: Optional["Recording"] = None,
@@ -722,6 +760,7 @@ def reverb_rir(
         rir_channels: List[int] = [0],
         room_rng_seed: Optional[int] = None,
         source_rng_seed: Optional[int] = None,
+        mix_first: bool = True,
     ) -> "MixedCut":
         """
         Return a new ``MixedCut`` that will convolve the audio with the provided impulse response.
@@ -739,6 +778,9 @@ def reverb_rir(
             be convolved with one of the specified channels.
         :param room_rng_seed: Seed for the room configuration.
         :param source_rng_seed: Seed for the source position.
+        :param mix_first: When true, the mixing will be done first before convolving with the RIR.
+            This effectively means that all tracks will be convolved with the same RIR. If you
+            are simulating multi-speaker mixtures, you should set this to False.
         :return: a modified copy of the current ``MixedCut``.
         """
         # Pre-conditions
@@ -760,23 +802,64 @@ def reverb_rir(
             self.tracks
         ), "Invalid number of channels in `rir_channels`, must be either 1 or equal to the number of tracks."
 
+        # There are 2 ways to apply RIRs:
+        # 1. Mix the tracks first, then apply RIRs. This applies the same RIR
+        #    to all tracks. It does not make sense if all tracks belong to different speakers,
+        #    but it is useful for cases when we have a mixture of MonoCut and PaddingCut,
+        #    and we want to apply the same RIR to all of them.
+        # 2. Apply RIRs to each track separately. This is useful when we want to simulate
+        #    different speakers in the same room.
+
+        # First simulate the room config (will only be used if RIR is not provided)
+        uuid4_str = str(uuid4())
+        # The room RNG seed is based on the cut ID. This ensures that all tracks in the
+        # mixed cut will have the same room configuration.
+        if room_rng_seed is None:
+            room_rng_seed = hash_str_to_int(uuid4_str + self.id)
+        # The source RNG seed is based on the track ID. This ensures that each track
+        # will have a different source position.
+        source_rng_seeds = [source_rng_seed] * len(self.tracks)
+        if source_rng_seed is None:
+            source_rng_seeds = [
+                hash_str_to_int(uuid4_str + track.cut.id) for track in self.tracks
+            ]
+            source_rng_seed = source_rng_seeds[0]
+
+        # Apply same RIR to all tracks after mixing (default)
+        if mix_first:
+            if rir_recording is None:
+                from lhotse.augmentation.utils import FastRandomRIRGenerator
+
+                rir_generator = FastRandomRIRGenerator(
+                    sr=self.sampling_rate,
+                    room_seed=room_rng_seed,
+                    source_seed=source_rng_seed,
+                )
+            else:
+                rir_generator = None
+
+            transforms = self.transforms.copy() if self.transforms is not None else []
+            transforms.append(
+                ReverbWithImpulseResponse(
+                    rir=rir_recording,
+                    normalize_output=normalize_output,
+                    early_only=early_only,
+                    rir_channels=rir_channels if rir_channels is not None else [0],
+                    rir_generator=rir_generator,
+                ).to_dict()
+            )
+            return fastcopy(
+                self,
+                id=f"{self.id}_rvb" if affix_id else self.id,
+                transforms=transforms,
+            )
+
+        # Apply RIRs to each track separately. Note that we do not pass a `mix_first`
+        # argument below since it is True by default.
+
         if len(rir_channels) == 1:
             rir_channels = rir_channels * len(self.tracks)
 
-        source_rng_seeds = [source_rng_seed] * len(self.tracks)
-        if rir_recording is None:
-            uuid4_str = str(uuid4())
-            # The room RNG seed is based on the cut ID. This ensures that all tracks in the
-            # mixed cut will have the same room configuration.
-            if room_rng_seed is None:
-                room_rng_seed = hash_str_to_int(uuid4_str + self.id)
-            # The source RNG seed is based on the track ID. This ensures that each track
-            # will have a different source position.
-            if source_rng_seed is None:
-                source_rng_seeds = [
-                    hash_str_to_int(uuid4_str + track.cut.id) for track in self.tracks
-                ]
-
         return MixedCut(
             id=f"{self.id}_rvb" if affix_id else self.id,
             tracks=[
@@ -977,6 +1060,13 @@ def load_audio(
                 f"this issue at https://github.com/lhotse-speech/lhotse/issues "
                 f"showing the cut below. MixedCut:\n{self}"
             )
+
+            # We'll apply the transforms now (if any).
+            transforms = [
+                AudioTransform.from_dict(params) for params in self.transforms or []
+            ]
+            for tfn in transforms:
+                audio = tfn(audio, self.sampling_rate)
         else:
             audio = mixer.unmixed_audio
 
diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py
index 7525ed33b..f98f21083 100644
--- a/test/cut/test_cut_augmentation.py
+++ b/test/cut/test_cut_augmentation.py
@@ -349,7 +349,7 @@ def test_mixed_cut_start01_perturb_volume(cut_with_supervision_start01):
 def test_mixed_cut_start01_reverb_rir(cut_with_supervision_start01, rir):
     mixed_rvb = cut_with_supervision_start01.append(
         cut_with_supervision_start01
-    ).reverb_rir(rir_recording=rir)
+    ).reverb_rir(rir_recording=rir, mix_first=False)
     assert mixed_rvb.start == 0  # MixedCut always starts at 0
     assert mixed_rvb.duration == cut_with_supervision_start01.duration * 2
     assert mixed_rvb.end == cut_with_supervision_start01.duration * 2
@@ -396,6 +396,24 @@ def test_mixed_cut_start01_reverb_rir(cut_with_supervision_start01, rir):
     )
 
 
+def test_mixed_cut_start01_reverb_rir_mix_first(cut_with_supervision_start01, rir):
+    mixed_rvb = cut_with_supervision_start01.pad(duration=0.5).reverb_rir(
+        rir_recording=rir, mix_first=True
+    )
+    assert mixed_rvb.start == 0  # MixedCut always starts at 0
+    assert mixed_rvb.duration == 0.5
+    assert mixed_rvb.end == 0.5
+    assert mixed_rvb.num_samples == 4000
+
+    # Check that the padding part is not all zeros after reverberation
+    np.testing.assert_raises(
+        AssertionError,
+        np.testing.assert_array_almost_equal,
+        mixed_rvb.load_audio()[:, 3200:],
+        np.zeros((1, 800)),
+    )
+
+
 def test_mixed_cut_start01_reverb_rir_with_fast_random(
     cut_with_supervision_start01, rir
 ):
@@ -449,6 +467,23 @@ def test_mixed_cut_start01_reverb_rir_multi_channel(
             mixed_cut.reverb_rir(multi_channel_rir, rir_channels=rir_channels)
 
 
+@pytest.mark.skipif(
+    not is_module_available("pyloudnorm"),
+    reason="This test requires pyloudnorm to be installed.",
+)
+@pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
+def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target):
+    mixed_cut = cut_with_supervision_start01.append(cut_with_supervision_start01)
+    mixed_cut_ln = mixed_cut.normalize_loudness(target)
+
+    import pyloudnorm as pyln
+
+    # check if loudness is correct
+    meter = pyln.Meter(mixed_cut_ln.sampling_rate)  # create BS.1770 meter
+    loudness = meter.integrated_loudness(mixed_cut_ln.load_audio().T)
+    assert loudness == pytest.approx(target, abs=0.5)
+
+
 @pytest.mark.skipif(
     not is_module_available("nara_wpe"),
     reason="This test requires nara_wpe to be installed.",
@@ -587,6 +622,10 @@ def test_cut_perturb_volume(cut_set, cut_id, scale):
     )
 
 
+@pytest.mark.skipif(
+    not is_module_available("pyloudnorm"),
+    reason="This test requires pyloudnorm to be installed.",
+)
 @pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
 def test_cut_normalize_loudness(libri_cut_set, target):
     cut_set_ln = libri_cut_set.normalize_loudness(target)

From ab18682257962740ee0825ba170689c9cc877e28 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 20 Apr 2023 11:36:54 -0400
Subject: [PATCH 02/32] add mix_first option in normalize_loudness

---
 lhotse/cut/mixed.py               | 35 ++++++++++++++++++++++++-------
 test/cut/test_cut_augmentation.py | 11 ++++++----
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
index 005a953d6..5cfd4a3d2 100644
--- a/lhotse/cut/mixed.py
+++ b/lhotse/cut/mixed.py
@@ -722,11 +722,16 @@ def perturb_volume(self, factor: float, affix_id: bool = True) -> "MixedCut":
             ],
         )
 
-    def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut":
+    def normalize_loudness(
+        self, target: float, mix_first: bool = True, affix_id: bool = False
+    ) -> "DataCut":
         """
         Return a new ``MixedCut`` that will lazily apply loudness normalization.
 
         :param target: The target loudness in dBFS.
+        :param mix_first: If true, we will mix the underlying cuts before applying
+            loudness normalization. If false, we cannot guarantee that the resulting
+            cut will have the target loudness.
         :param affix_id: When true, we will modify the ``DataCut.id`` field
             by affixing it with "_ln{target}".
         :return: a modified copy of the current ``DataCut``.
@@ -743,13 +748,27 @@ def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut"
             )
             self.features = None
 
-        transforms = self.transforms.copy() if self.transforms is not None else []
-        transforms.append(LoudnessNormalization(target=target).to_dict())
-        return fastcopy(
-            self,
-            id=f"{self.id}_ln{target}" if affix_id else self.id,
-            transforms=transforms,
-        )
+        if mix_first:
+            transforms = self.transforms.copy() if self.transforms is not None else []
+            transforms.append(LoudnessNormalization(target=target).to_dict())
+            return fastcopy(
+                self,
+                id=f"{self.id}_ln{target}" if affix_id else self.id,
+                transforms=transforms,
+            )
+        else:
+            return MixedCut(
+                id=f"{self.id}_ln{target}" if affix_id else self.id,
+                tracks=[
+                    fastcopy(
+                        track,
+                        cut=track.cut.normalize_loudness(
+                            target=target, affix_id=affix_id
+                        ),
+                    )
+                    for track in self.tracks
+                ],
+            )
 
     def reverb_rir(
         self,
diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py
index f98f21083..bc60a075a 100644
--- a/test/cut/test_cut_augmentation.py
+++ b/test/cut/test_cut_augmentation.py
@@ -471,17 +471,20 @@ def test_mixed_cut_start01_reverb_rir_multi_channel(
     not is_module_available("pyloudnorm"),
     reason="This test requires pyloudnorm to be installed.",
 )
-@pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
-def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target):
+@pytest.mark.parametrize(
+    "target, mix_first", [(-15.0, True), (-20.0, True), (-25.0, False)]
+)
+def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target, mix_first):
     mixed_cut = cut_with_supervision_start01.append(cut_with_supervision_start01)
-    mixed_cut_ln = mixed_cut.normalize_loudness(target)
+    mixed_cut_ln = mixed_cut.normalize_loudness(target, mix_first=mix_first)
 
     import pyloudnorm as pyln
 
     # check if loudness is correct
     meter = pyln.Meter(mixed_cut_ln.sampling_rate)  # create BS.1770 meter
     loudness = meter.integrated_loudness(mixed_cut_ln.load_audio().T)
-    assert loudness == pytest.approx(target, abs=0.5)
+    if mix_first:
+        assert loudness == pytest.approx(target, abs=0.5)
 
 
 @pytest.mark.skipif(

From e4bca7421a7a4506fc3da82d639935233689b36a Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 20 Apr 2023 15:40:46 -0400
Subject: [PATCH 03/32] handle the case when mix is called on MixedCut with
 existing transforms

---
 lhotse/cut/set.py | 57 +++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index 1f5ad02b3..88532a1ee 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -2614,10 +2614,14 @@ def mix(
     if offset > reference_cut.duration:
         reference_cut = reference_cut.pad(duration=offset)
 
-    # When the left_cut is a MixedCut, take its existing tracks, otherwise create a new track.
-    if isinstance(reference_cut, MixedCut):
+    # When the left_cut is a MixedCut and it does not have existing transforms,
+    # take its existing tracks, otherwise create a new track.
+    if (
+        isinstance(reference_cut, MixedCut)
+        and len(ifnone(reference_cut.transforms, [])) == 0
+    ):
         old_tracks = reference_cut.tracks
-    elif isinstance(reference_cut, (DataCut, PaddingCut)):
+    elif isinstance(reference_cut, (DataCut, PaddingCut, MixedCut)):
         old_tracks = [MixTrack(cut=reference_cut)]
     else:
         raise ValueError(f"Unsupported type of cut in mix(): {type(reference_cut)}")
@@ -2625,27 +2629,32 @@ def mix(
     # When the right_cut is a MixedCut, adapt its existing tracks with the new offset and snr,
     # otherwise create a new track.
     if isinstance(mixed_in_cut, MixedCut):
-        new_tracks = [
-            MixTrack(
-                cut=track.cut,
-                offset=round(track.offset + offset, ndigits=8),
-                snr=(
-                    # When no new SNR is specified, retain whatever was there in the first place.
-                    track.snr
-                    if snr is None
-                    # When new SNR is specified but none was specified before, assign the new SNR value.
-                    else snr
-                    if track.snr is None
-                    # When both new and previous SNR were specified, assign their sum,
-                    # as the SNR for each track is defined with regard to the first track energy.
-                    else track.snr + snr
-                    if snr is not None and track is not None
-                    # When no SNR was specified whatsoever, use none.
-                    else None
-                ),
-            )
-            for track in mixed_in_cut.tracks
-        ]
+        # Similarly for mixed_in_cut, if it is a MixedCut and it does not have existing transforms,
+        # take its existing tracks, otherwise create a new track.
+        if len(ifnone(mixed_in_cut.transforms, [])) > 0:
+            new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)]
+        else:
+            new_tracks = [
+                MixTrack(
+                    cut=track.cut,
+                    offset=round(track.offset + offset, ndigits=8),
+                    snr=(
+                        # When no new SNR is specified, retain whatever was there in the first place.
+                        track.snr
+                        if snr is None
+                        # When new SNR is specified but none was specified before, assign the new SNR value.
+                        else snr
+                        if track.snr is None
+                        # When both new and previous SNR were specified, assign their sum,
+                        # as the SNR for each track is defined with regard to the first track energy.
+                        else track.snr + snr
+                        if snr is not None and track is not None
+                        # When no SNR was specified whatsoever, use none.
+                        else None
+                    ),
+                )
+                for track in mixed_in_cut.tracks
+            ]
     elif isinstance(mixed_in_cut, (DataCut, PaddingCut)):
         new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)]
     else:

From 71a92367f02da57655e7b3f633f1e7990b60fa6e Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 20 Apr 2023 16:08:03 -0400
Subject: [PATCH 04/32] add test for mixing with transformed MixedCut

---
 test/cut/test_cut_mixing.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py
index c38753ecb..3404d4edf 100644
--- a/test/cut/test_cut_mixing.py
+++ b/test/cut/test_cut_mixing.py
@@ -345,6 +345,28 @@ def test_mix_cut_snr_pad_both(libri_cut):
     assert E(feats_nosnr) > E(feats_snr)
 
 
+@pytest.mark.parametrize("mix_first", [True, False])
+def test_mix_cut_with_transform(libri_cut, mix_first):
+    # Create original mixed cut
+    padded = libri_cut.pad(duration=20, direction="right")
+    # Create transformed mixed cut
+    padded = padded.reverb_rir(mix_first=mix_first)
+    # Mix another cut
+    mixed1 = padded.mix(libri_cut)
+    mixed2 = libri_cut.mix(padded)
+
+    assert isinstance(padded, MixedCut)
+    assert len(padded.tracks) == 2
+    assert isinstance(mixed1, MixedCut)
+    assert isinstance(mixed2, MixedCut)
+    if mix_first:
+        assert len(mixed1.tracks) == 2
+        assert len(mixed2.tracks) == 2
+    else:
+        assert len(mixed1.tracks) == 3
+        assert len(mixed2.tracks) == 3
+
+
 def test_cut_set_mix_snr_is_deterministic():
     cuts = DummyManifest(CutSet, begin_id=0, end_id=2)
 

From 2e54646b5e1219a5ef001814f4305540cd0ff414 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 15 May 2023 20:25:43 -0400
Subject: [PATCH 05/32] enhancements and bug fixes

---
 lhotse/bin/modes/recipes/ami.py  |  9 ++++++
 lhotse/bin/modes/recipes/icsi.py |  9 +++---
 lhotse/recipes/ami.py            | 49 ++++++++++++++++++++++++++------
 lhotse/recipes/icsi.py           | 16 ++++++-----
 lhotse/recipes/utils.py          |  1 +
 5 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/lhotse/bin/modes/recipes/ami.py b/lhotse/bin/modes/recipes/ami.py
index 992e20474..b129de572 100644
--- a/lhotse/bin/modes/recipes/ami.py
+++ b/lhotse/bin/modes/recipes/ami.py
@@ -53,6 +53,13 @@
         " segmentation). If None, no segmentation is performed."
     ),
 )
+@click.option(
+    "--merge-consecutive",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Merge consecutive segments from the same speaker.",
+)
 def ami(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
@@ -61,6 +68,7 @@ def ami(
     partition: str,
     normalize_text: bool,
     max_words_per_segment: int,
+    merge_consecutive: bool,
 ):
     """AMI data preparation."""
     prepare_ami(
@@ -71,6 +79,7 @@ def ami(
         partition=partition,
         normalize_text=normalize_text,
         max_words_per_segment=max_words_per_segment,
+        merge_consecutive=merge_consecutive,
     )
 
 
diff --git a/lhotse/bin/modes/recipes/icsi.py b/lhotse/bin/modes/recipes/icsi.py
index 7d5a033bc..29c7f4edf 100644
--- a/lhotse/bin/modes/recipes/icsi.py
+++ b/lhotse/bin/modes/recipes/icsi.py
@@ -64,12 +64,13 @@ def icsi(
 )
 @click.option(
     "--normalize-text",
-    is_flag=True,
-    help="If set, convert all text annotations to upper case (similar to Kaldi)",
+    type=click.Choice(["none", "upper", "kaldi"], case_sensitive=False),
+    default="kaldi",
+    help="Type of text normalization to apply (kaldi style, by default)",
 )
 def icsi(
     audio_dir: Pathlike,
-    transcript_dir: Pathlike,
+    transcripts_dir: Pathlike,
     output_dir: Pathlike,
     mic: str,
     normalize_text: bool,
@@ -77,7 +78,7 @@ def icsi(
     """AMI data preparation."""
     prepare_icsi(
         audio_dir,
-        transcript_dir,
+        transcripts_dir,
         output_dir=output_dir,
         mic=mic,
         normalize_text=normalize_text,
diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
index df6105dee..d5ff85361 100644
--- a/lhotse/recipes/ami.py
+++ b/lhotse/recipes/ami.py
@@ -276,7 +276,8 @@ class AmiSegmentAnnotation(NamedTuple):
 def parse_ami_annotations(
     annotations_dir: Pathlike,
     normalize: str = "upper",
-    max_words_per_segment: int = None,
+    max_words_per_segment: Optional[int] = None,
+    merge_consecutive: bool = False,
 ) -> Dict[str, List[SupervisionSegment]]:
 
     # Extract if zipped file
@@ -355,7 +356,9 @@ def parse_ami_annotations(
             seg_words = list(
                 filter(lambda w: w[0] >= seg_start and w[1] <= seg_end, spk_words)
             )
-            subsegments = split_segment(seg_words, max_words_per_segment)
+            subsegments = split_segment(
+                seg_words, max_words_per_segment, merge_consecutive
+            )
             for subseg in subsegments:
                 start, end, text = subseg
                 annotations[key].append(
@@ -372,26 +375,49 @@ def parse_ami_annotations(
 
 
 def split_segment(
-    words: List[Tuple[float, float, str]], max_words_per_segment: Optional[int]
+    words: List[Tuple[float, float, str]],
+    max_words_per_segment: Optional[int] = None,
+    merge_consecutive: bool = False,
 ):
     def split_(sequence, sep):
         chunk = []
         for val in sequence:
             if val[-1] == sep:
-                yield chunk
+                if len(chunk) > 0:
+                    yield chunk
                 chunk = []
             else:
                 chunk.append(val)
-        yield chunk
+        if len(chunk) > 0:
+            yield chunk
 
     def split_on_fullstop_(sequence):
-        return split_(sequence, ".")
-
-    def split_on_comma_(segment, max_words_per_segment):
+        subsegs = list(split_(sequence, "."))
+        if len(subsegs) < 2:
+            return subsegs
+        # Set a large default value for max_words_per_segment if not provided
+        max_segment_length = max_words_per_segment if max_words_per_segment else 100000
+        if merge_consecutive:
+            # Merge consecutive subsegments if their length is less than max_words_per_segment
+            merged_subsegs = [subsegs[0]]
+            for subseg in subsegs[1:]:
+                if (
+                    merged_subsegs[-1][-1][1] == subseg[0][0]
+                    and len(merged_subsegs[-1]) + len(subseg) <= max_segment_length
+                ):
+                    merged_subsegs[-1].extend(subseg)
+                else:
+                    merged_subsegs.append(subseg)
+            subsegs = merged_subsegs
+        return subsegs
+
+    def split_on_comma_(segment):
         # This function smartly splits a segment on commas such that the number of words
         # in each subsegment is as close to max_words_per_segment as possible.
         # First we create subsegments by splitting on commas
         subsegs = list(split_(segment, ","))
+        if len(subsegs) < 2:
+            return subsegs
         # Now we merge subsegments while ensuring that the number of words in each
         # subsegment is less than max_words_per_segment
         merged_subsegs = [subsegs[0]]
@@ -409,7 +435,7 @@ def split_on_comma_(segment, max_words_per_segment):
         # Now we split each subsegment based on commas to get at most max_words_per_segment
         # words per subsegment.
         subsegments = [
-            list(split_on_comma_(subseg, max_words_per_segment))
+            list(split_on_comma_(subseg))
             if len(subseg) > max_words_per_segment
             else [subseg]
             for subseg in subsegments
@@ -614,6 +640,7 @@ def prepare_ami(
     partition: Optional[str] = "full-corpus",
     normalize_text: str = "kaldi",
     max_words_per_segment: Optional[int] = None,
+    merge_consecutive: bool = False,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
     Returns the manifests which consist of the Recordings and Supervisions
@@ -625,6 +652,9 @@ def prepare_ami(
     :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
     :param max_words_per_segment: int, maximum number of words per segment. If not None, we will split
         longer segments similar to Kaldi's data prep scripts, i.e., split on full-stop and comma.
+    :param merge_consecutive: bool, if True, merge consecutive segments split on full-stop.
+        We will only merge segments if the number of words in the merged segment is less than
+        max_words_per_segment.
     :return: a Dict whose key is ('train', 'dev', 'eval'), and the values are dicts of manifests under keys
         'recordings' and 'supervisions'.
 
@@ -662,6 +692,7 @@ def prepare_ami(
         annotations_dir,
         normalize=normalize_text,
         max_words_per_segment=max_words_per_segment,
+        merge_consecutive=merge_consecutive,
     )
 
     # Audio
diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 29734e84b..604d7852e 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -234,7 +234,6 @@ def download_icsi(
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
 ) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
-
     annotations = defaultdict(list)
     # In Lhotse, channels are integers, so we map channel ids to integers for each session
     channel_to_idx_map = defaultdict(dict)
@@ -299,7 +298,6 @@ def prepare_audio_grouped(
     audio_paths: List[Pathlike],
     channel_to_idx_map: Dict[str, Dict[str, int]] = None,
 ) -> RecordingSet:
-
     # Group together multiple channels from the same session.
     # We will use that to create a Recording with multiple sources (channels).
     from cytoolz import groupby
@@ -474,7 +472,7 @@ def prepare_supervision_other(
 
 def prepare_icsi(
     audio_dir: Pathlike,
-    transcripts_dir: Pathlike,
+    transcripts_dir: Optional[Pathlike] = None,
     output_dir: Optional[Pathlike] = None,
     mic: Optional[str] = "ihm",
     normalize_text: str = "kaldi",
@@ -490,7 +488,11 @@ def prepare_icsi(
         'recordings' and 'supervisions'.
     """
     audio_dir = Path(audio_dir)
-    transcripts_dir = Path(transcripts_dir)
+    transcripts_dir = (
+        Path(transcripts_dir)
+        if transcripts_dir is not None
+        else audio_dir / "transcripts"
+    )
 
     assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
     assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
@@ -539,6 +541,9 @@ def prepare_icsi(
             lambda x: x.recording_id in PARTITIONS[part]
         )
 
+        audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
+        validate_recordings_and_supervisions(audio_part, supervision_part)
+
         # Write to output directory if a path is provided
         if output_dir is not None:
             audio_part.to_file(output_dir / f"icsi-{mic}_recordings_{part}.jsonl.gz")
@@ -546,9 +551,6 @@ def prepare_icsi(
                 output_dir / f"icsi-{mic}_supervisions_{part}.jsonl.gz"
             )
 
-        audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
-        validate_recordings_and_supervisions(audio_part, supervision_part)
-
         # Combine all manifests into one dictionary
         manifests[part] = {"recordings": audio_part, "supervisions": supervision_part}
 
diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py
index ab3d7b2d5..37fc00a99 100644
--- a/lhotse/recipes/utils.py
+++ b/lhotse/recipes/utils.py
@@ -146,6 +146,7 @@ def normalize_text_ami(text: str, normalize: str = "upper") -> str:
         text = re.sub(r"MM HMM", "MM-HMM", text)
         text = re.sub(r"UH HUH", "UH-HUH", text)
         text = re.sub(r"(\b)O K(\b)", r"\g<1>OK\g<2>", text)
+        text = re.sub(r"(\b)O_K(\b)", r"\g<1>OK\g<2>", text)
         return text
 
 

From db37a752466c722997cfade9d5a5192f289ab7b0 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 15 May 2023 20:38:08 -0400
Subject: [PATCH 06/32] small changes in some cutset methods

---
 lhotse/cut/base.py | 8 +++++---
 lhotse/cut/set.py  | 9 ++++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index 4df14a017..c9a989cf1 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -652,6 +652,7 @@ def trim_to_supervision_groups(
         supervision_group = [supervisions[0]]
         cur_end = supervisions[0].end
         new_cuts = []
+        group_idx = 0
         for sup in supervisions[1:]:
             if sup.start - cur_end <= max_pause:
                 supervision_group.append(sup)
@@ -666,8 +667,9 @@ def trim_to_supervision_groups(
                         offset=offset,
                         duration=duration,
                         keep_excessive_supervisions=False,
-                    )
+                    ).with_id(f"{self.id}-{max_pause}-{group_idx}")
                 )
+                group_idx += 1
                 supervision_group = [sup]
                 cur_end = sup.end
 
@@ -680,7 +682,7 @@ def trim_to_supervision_groups(
                     offset=offset,
                     duration=duration,
                     keep_excessive_supervisions=False,
-                )
+                ).with_id(f"{self.id}-{max_pause}-{group_idx}")
             )
         # The total number of supervisions should be the same.
         assert sum(len(c.supervisions) for c in new_cuts) == len(self.supervisions), (
@@ -724,7 +726,7 @@ def cut_into_windows(
                     offset=hop * i,
                     duration=duration,
                     keep_excessive_supervisions=keep_excessive_supervisions,
-                )
+                ).with_id(f"{self.id}-{i}")
             )
         return CutSet.from_cuts(new_cuts)
 
diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index 8b474e0fa..81851794c 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -590,6 +590,7 @@ def to_shar(
                 warn_unused_fields=warn_unused_fields,
                 include_cuts=include_cuts,
                 shard_suffix=None,
+                verbose=verbose,
             )
 
         progbar = partial(tqdm, desc="Shard progress") if verbose else lambda x: x
@@ -610,6 +611,7 @@ def to_shar(
                         warn_unused_fields=warn_unused_fields,
                         include_cuts=True,
                         shard_suffix=f".{idx:06d}",
+                        verbose=False,
                     )
                 )
             for f in progbar(as_completed(futures)):
@@ -2706,7 +2708,7 @@ def mix(
     elif isinstance(mixed_in_cut, (DataCut, PaddingCut)):
         new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)]
     else:
-        raise ValueError(f"Unsupported type of cut in mix(): {type(reference_cut)}")
+        raise ValueError(f"Unsupported type of cut in mix(): {type(mixed_in_cut)}")
 
     return MixedCut(id=mixed_cut_id, tracks=old_tracks + new_tracks)
 
@@ -3386,9 +3388,12 @@ def _export_to_shar_single(
     warn_unused_fields: bool,
     include_cuts: bool,
     shard_suffix: Optional[str],
+    verbose: bool,
 ) -> Dict[str, List[str]]:
     from lhotse.shar import SharWriter
 
+    pbar = tqdm(desc="Exporting to SHAR", disable=not verbose)
+
     with SharWriter(
         output_dir=output_dir,
         fields=fields,
@@ -3399,5 +3404,7 @@ def _export_to_shar_single(
     ) as writer:
         for cut in cuts:
             writer.write(cut)
+            pbar.update()
 
+    # Finally, return the list of output files.
     return writer.output_paths

From 7b59ecdfb25118dbc9e5841bf414d8ab631d27d0 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 15 May 2023 20:42:17 -0400
Subject: [PATCH 07/32] small fix in error message

---
 lhotse/cut/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index c9a989cf1..8c2519687 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -490,7 +490,7 @@ def trim_to_supervisions(
                     len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1
                 ), (
                     "Trimmed cut has supervisions with different channels. Either set "
-                    "`ignore_channel=True` to keep original channels or `keep_overlapping=False` "
+                    "`keep_all_channels=True` to keep original channels or `keep_overlapping=False` "
                     "to retain only 1 supervision per trimmed cut."
                 )
                 trimmed.channel = trimmed.supervisions[0].channel

From a64727a2b5aad58e908cb5028813a09afe8e79a4 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Tue, 16 May 2023 20:26:32 -0400
Subject: [PATCH 08/32] return word alignments from ami recipe

---
 lhotse/recipes/ami.py | 47 +++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
index d5ff85361..73b47cc54 100644
--- a/lhotse/recipes/ami.py
+++ b/lhotse/recipes/ami.py
@@ -38,8 +38,8 @@
 from lhotse.audio import AudioSource, Recording, RecordingSet
 from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import normalize_text_ami
-from lhotse.supervision import SupervisionSegment, SupervisionSet
-from lhotse.utils import Pathlike, Seconds, resumable_download
+from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
+from lhotse.utils import Pathlike, Seconds, add_durations, resumable_download
 
 # fmt: off
 MEETINGS = {
@@ -271,6 +271,7 @@ class AmiSegmentAnnotation(NamedTuple):
     gender: str
     start_time: Seconds
     end_time: Seconds
+    words: List[AlignmentItem]
 
 
 def parse_ami_annotations(
@@ -279,7 +280,6 @@ def parse_ami_annotations(
     max_words_per_segment: Optional[int] = None,
     merge_consecutive: bool = False,
 ) -> Dict[str, List[SupervisionSegment]]:
-
     # Extract if zipped file
     if str(annotations_dir).endswith(".zip"):
         import zipfile
@@ -360,14 +360,27 @@ def parse_ami_annotations(
                 seg_words, max_words_per_segment, merge_consecutive
             )
             for subseg in subsegments:
-                start, end, text = subseg
+                start = subseg[0][0]
+                end = subseg[-1][1]
+                word_alignments = [
+                    AlignmentItem(
+                        start=round(w[0], ndigits=4),
+                        duration=add_durations(w[1], -w[0], sampling_rate=16000),
+                        symbol=normalize_text_ami(w[2], normalize=normalize),
+                    )
+                    for w in subseg
+                ]
+                # Filter out empty words
+                word_alignments = [w for w in word_alignments if w.symbol]
+                text = " ".join(w.symbol for w in word_alignments)
                 annotations[key].append(
                     AmiSegmentAnnotation(
-                        text=normalize_text_ami(text, normalize=normalize),
+                        text=text,
                         speaker=key[1],
                         gender=key[1][0],
                         start_time=start,
                         end_time=end,
+                        words=word_alignments,
                     )
                 )
 
@@ -378,7 +391,14 @@ def split_segment(
     words: List[Tuple[float, float, str]],
     max_words_per_segment: Optional[int] = None,
     merge_consecutive: bool = False,
-):
+) -> List[List[Tuple[float, float, str]]]:
+    """
+    Given a list of words, return a list of segments (each segment is a list of words)
+    where each segment has at most max_words_per_segment words. If merge_consecutive
+    is True, then consecutive segments with less than max_words_per_segment words
+    will be merged together.
+    """
+
     def split_(sequence, sep):
         chunk = []
         for val in sequence:
@@ -443,11 +463,8 @@ def split_on_comma_(segment):
         # flatten the list of lists
         subsegments = [item for sublist in subsegments for item in sublist]
 
-    # For each subsegment, we create a tuple of (start_time, end_time, text)
-    subsegments = [
-        (subseg[0][0], subseg[-1][1], " ".join([w[2] for w in subseg]))
-        for subseg in filter(lambda s: len(s) > 0, subsegments)
-    ]
+    # Filter out empty subsegments
+    subsegments = list(filter(lambda s: len(s) > 0, subsegments))
     return subsegments
 
 
@@ -563,7 +580,9 @@ def prepare_supervision_ihm(
                 continue
 
             for seg_idx, seg_info in enumerate(annotation):
-                duration = seg_info.end_time - seg_info.start_time
+                duration = add_durations(
+                    seg_info.end_time, -seg_info.start_time, sampling_rate=16000
+                )
                 # Some annotations in IHM setting exceed audio duration, so we
                 # ignore such segments
                 if seg_info.end_time > recording.duration:
@@ -577,13 +596,14 @@ def prepare_supervision_ihm(
                         SupervisionSegment(
                             id=f"{recording.id}-{channel}-{seg_idx}",
                             recording_id=recording.id,
-                            start=seg_info.start_time,
+                            start=round(seg_info.start_time, ndigits=4),
                             duration=duration,
                             channel=channel,
                             language="English",
                             speaker=seg_info.speaker,
                             gender=seg_info.gender,
                             text=seg_info.text,
+                            alignment={"word": seg_info.words},
                         )
                     )
 
@@ -627,6 +647,7 @@ def prepare_supervision_other(
                         speaker=seg_info.speaker,
                         gender=seg_info.gender,
                         text=seg_info.text,
+                        alignment={"word": seg_info.words},
                     )
                 )
     return SupervisionSet.from_segments(segments)

From 850ce2c5fd9b511b95137dc7f1f59a4577828abb Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 18 May 2023 09:23:46 -0400
Subject: [PATCH 09/32] add word alignments for ICSI

---
 lhotse/recipes/icsi.py | 204 ++++++++++++++++++++++++++++-------------
 1 file changed, 142 insertions(+), 62 deletions(-)

diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 604d7852e..757461ac5 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -106,8 +106,8 @@
 from lhotse.audio import AudioSource, Recording, RecordingSet, read_sph
 from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import normalize_text_ami
-from lhotse.supervision import SupervisionSegment, SupervisionSet
-from lhotse.utils import Pathlike, Seconds, resumable_download
+from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
+from lhotse.utils import Pathlike, Seconds, add_durations, resumable_download
 
 # fmt:off
 PARTITIONS = {
@@ -135,15 +135,6 @@
 # fmt:on
 
 
-class IcsiSegmentAnnotation(NamedTuple):
-    text: str
-    speaker: str
-    channel: str
-    gender: str
-    start_time: Seconds
-    end_time: Seconds
-
-
 def download_audio(
     target_dir: Path,
     force_download: Optional[bool] = False,
@@ -205,22 +196,30 @@ def download_icsi(
     download_audio(audio_dir, force_download, url, mic)
 
     # Annotations
-    logging.info("Downloading AMI annotations")
+    logging.info("Downloading ICSI annotations")
 
     if transcripts_dir.exists() and not force_download:
         logging.info(
             f"Skip downloading transcripts as they exist in: {transcripts_dir}"
         )
         return target_dir
-    annotations_url = f"{url}/ICSICorpusAnnotations/ICSI_original_transcripts.zip"
+
+    # We need the MRT transcripts for the speaker-to-channel mapping. The NXT transcripts
+    # are used for the actual annotations (since they contain word alignments)
+    annotations_url_mrt = f"{url}/ICSICorpusAnnotations/ICSI_original_transcripts.zip"
+    annotations_url_nxt = f"{url}/ICSICorpusAnnotations/ICSI_core_NXT.zip"
     resumable_download(
-        annotations_url,
+        annotations_url_mrt,
         filename=target_dir / "ICSI_original_transcripts.zip",
         force_download=force_download,
     )
+    resumable_download(
+        annotations_url_nxt,
+        filename=target_dir / "ICSI_core_NXT.zip",
+        force_download=force_download,
+    )
 
-    # Unzip annotations zip file
-    with zipfile.ZipFile(target_dir / "ICSI_original_transcripts.zip") as z:
+    with zipfile.ZipFile(target_dir / "ICSI_core_NXT.zip") as z:
         # Unzips transcripts to <target_dir>/'transcripts'
         # zip file also contains some documentation which will be unzipped to <target_dir>
         z.extractall(target_dir)
@@ -228,9 +227,22 @@ def download_icsi(
         if transcripts_dir:
             Path(target_dir / "transcripts").rename(transcripts_dir)
 
+    # From the MRT transcripts, we only need the transcripts/preambles.mrt file
+    with zipfile.ZipFile(target_dir / "ICSI_original_transcripts.zip") as z:
+        z.extract("transcripts/preambles.mrt", transcripts_dir)
+
     return target_dir
 
 
+class IcsiSegmentAnnotation(NamedTuple):
+    text: str
+    speaker: str
+    gender: str
+    start_time: Seconds
+    end_time: Seconds
+    words: List[AlignmentItem]
+
+
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
 ) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
@@ -240,53 +252,119 @@ def parse_icsi_annotations(
     spk_to_channel_map = defaultdict(dict)
 
     # First we get global speaker ids and channels
-    for meeting_file in tqdm(
-        transcripts_dir.rglob("./*.mrt"), desc="Parsing ICSI mrt files"
-    ):
-        if meeting_file.stem == "preambles":
+    with open(transcripts_dir / "preambles.mrt") as f:
+        root = ET.parse(f).getroot()  # <Meetings>
+        for child in root:
+            if child.tag == "Meeting":
+                meeting_id = child.attrib["Session"]
+                for grandchild in child:
+                    if grandchild.tag == "Preamble":
+                        for greatgrandchild in grandchild:
+                            if greatgrandchild.tag == "Channels":
+                                channel_to_idx_map[meeting_id] = {
+                                    channel.attrib["Name"]: idx
+                                    for idx, channel in enumerate(greatgrandchild)
+                                }
+                            elif greatgrandchild.tag == "Participants":
+                                for speaker in greatgrandchild:
+                                    # some speakers may not have an associated channel in some meetings, so we
+                                    # assign them the SDM channel
+                                    spk_to_channel_map[meeting_id][
+                                        speaker.attrib["Name"]
+                                    ] = (
+                                        speaker.attrib["Channel"]
+                                        if "Channel" in speaker.attrib
+                                        else "chan6"
+                                    )
+
+    # Get the speaker segment times from the segments file
+    segments = {}
+    for file in (transcripts_dir / "Segments").glob("*.xml"):
+        meet_id, local_id, _ = file.stem.split(".")
+        spk_segments = []
+        spk_id = None
+        with open(file) as f:
+            tree = ET.parse(f)
+            for seg in tree.getroot():
+                if seg.tag != "segment":
+                    continue
+                if spk_id is None and "participant" in seg.attrib:
+                    spk_id = seg.attrib["participant"]
+                start_time = float(seg.attrib["starttime"])
+                end_time = float(seg.attrib["endtime"])
+                spk_segments.append((start_time, end_time))
+        if spk_id is None or len(spk_segments) == 0:
             continue
-        with open(meeting_file) as f:
-            meeting_id = meeting_file.stem
-            root = ET.parse(f).getroot()  # <Meeting>
-            for child in root:
-                if child.tag == "Preamble":
-                    for grandchild in child:
-                        if grandchild.tag == "Channels":
-                            channel_to_idx_map[meeting_id] = {
-                                channel.attrib["Name"]: idx
-                                for idx, channel in enumerate(grandchild)
-                            }
-                        elif grandchild.tag == "Participants":
-                            for speaker in grandchild:
-                                # some speakers may not have an associated channel in some meetings, so we
-                                # assign them the SDM channel
-                                spk_to_channel_map[meeting_id][
-                                    speaker.attrib["Name"]
-                                ] = (
-                                    speaker.attrib["Channel"]
-                                    if "Channel" in speaker.attrib
-                                    else "chan6"
-                                )
-                elif child.tag == "Transcript":
-                    for segment in child:
-                        if len(list(segment)) == 0 and "Participant" in segment.attrib:
-                            start_time = float(segment.attrib["StartTime"])
-                            end_time = float(segment.attrib["EndTime"])
-                            speaker = segment.attrib["Participant"]
-                            channel = spk_to_channel_map[meeting_id][speaker]
-                            text = normalize_text_ami(
-                                segment.text.strip(), normalize=normalize
-                            )
-                            annotations[(meeting_id, speaker, channel)].append(
-                                IcsiSegmentAnnotation(
-                                    text,
-                                    speaker,
-                                    channel,
-                                    speaker[0],
-                                    start_time,
-                                    end_time,
-                                )
-                            )
+        key = (meet_id, local_id)
+        channel = spk_to_channel_map[meet_id][spk_id]
+        segments[key] = (spk_id, channel, spk_segments)
+
+    # Now we go through each speaker's word-level annotations and store them
+    words = {}
+    for file in (transcripts_dir / "Words").glob("*.xml"):
+        meet_id, local_id, _ = file.stem.split(".")
+        key = (meet_id, local_id)
+        if key not in segments:
+            continue
+        else:
+            spk_id, channel, spk_segments = segments[key]
+
+        seg_words = []
+        combine_with_next = False
+        with open(file) as f:
+            tree = ET.parse(f)
+            for i, word in enumerate(tree.getroot()):
+                if (
+                    word.tag != "w"
+                    or "starttime" not in word.attrib
+                    or word.attrib["starttime"] == ""
+                    or "endtime" not in word.attrib
+                    or word.attrib["endtime"] == ""
+                ):
+                    continue
+                start_time = float(word.attrib["starttime"])
+                end_time = float(word.attrib["endtime"])
+                seg_words.append((start_time, end_time, word.text))
+        words[key] = (spk_id, channel, seg_words)
+
+    # Now we create segment-level annotations by combining the word-level
+    # annotations with the speaker segment times. We also normalize the text
+    # (if requested).
+    annotations = defaultdict(list)
+
+    for key, (spk_id, channel, spk_segments) in segments.items():
+        # Get the words for this speaker
+        _, _, spk_words = words[key]
+        # Now iterate over the speaker segments and create segment annotations
+        for seg_start, seg_end in spk_segments:
+            seg_words = list(
+                filter(lambda w: w[0] >= seg_start and w[1] <= seg_end, spk_words)
+            )
+            if len(seg_words) == 0:
+                continue
+            start = seg_words[0][0]
+            end = seg_words[-1][1]
+            word_alignments = [
+                AlignmentItem(
+                    start=round(w[0], ndigits=4),
+                    duration=add_durations(w[1], -w[0], sampling_rate=16000),
+                    symbol=normalize_text_ami(w[2], normalize=normalize),
+                )
+                for w in seg_words
+            ]
+            # Filter out empty words
+            word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
+            text = " ".join(w.symbol for w in word_alignments)
+            annotations[key].append(
+                IcsiSegmentAnnotation(
+                    text=text,
+                    speaker=spk_id,
+                    gender=spk_id[0],
+                    start_time=start,
+                    end_time=end,
+                    words=word_alignments,
+                )
+            )
     return annotations, channel_to_idx_map
 
 
@@ -422,6 +500,7 @@ def prepare_supervision_ihm(
                             speaker=seg_info.speaker,
                             gender=seg_info.gender,
                             text=seg_info.text,
+                            alignment={"word": seg_info.words},
                         )
                     )
 
@@ -465,6 +544,7 @@ def prepare_supervision_other(
                         speaker=seg_info.speaker,
                         gender=seg_info.gender,
                         text=seg_info.text,
+                        alignment={"word": seg_info.words},
                     )
                 )
     return SupervisionSet.from_segments(segments)

From 4b39c6fe9b27188b0f97c980eef7b6e3d0dc97b0 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 18 May 2023 09:26:05 -0400
Subject: [PATCH 10/32] remove unwanted whitespace

---
 lhotse/recipes/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py
index 37fc00a99..b63ad0c66 100644
--- a/lhotse/recipes/utils.py
+++ b/lhotse/recipes/utils.py
@@ -147,7 +147,7 @@ def normalize_text_ami(text: str, normalize: str = "upper") -> str:
         text = re.sub(r"UH HUH", "UH-HUH", text)
         text = re.sub(r"(\b)O K(\b)", r"\g<1>OK\g<2>", text)
         text = re.sub(r"(\b)O_K(\b)", r"\g<1>OK\g<2>", text)
-        return text
+        return text.strip()
 
 
 def normalize_text_chime6(text: str, normalize: str = "upper") -> str:

From 3c16b906d7d7ff7a22541e04f1f118509005d544 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 18 May 2023 10:22:09 -0400
Subject: [PATCH 11/32] fix IHM preparation

---
 lhotse/recipes/icsi.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 757461ac5..6f42896e6 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -245,7 +245,9 @@ class IcsiSegmentAnnotation(NamedTuple):
 
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
-) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
+) -> Tuple[
+    Dict[Tuple[str, str, str], List[SupervisionSegment]], Dict[str, Dict[str, int]]
+]:
     annotations = defaultdict(list)
     # In Lhotse, channels are integers, so we map channel ids to integers for each session
     channel_to_idx_map = defaultdict(dict)
@@ -329,12 +331,13 @@ def parse_icsi_annotations(
 
     # Now we create segment-level annotations by combining the word-level
     # annotations with the speaker segment times. We also normalize the text
-    # (if requested).
+    # (if requested). The annotations is a dict indexed by (meeting_id, spk_id, channel).
     annotations = defaultdict(list)
 
     for key, (spk_id, channel, spk_segments) in segments.items():
         # Get the words for this speaker
         _, _, spk_words = words[key]
+        new_key = (key[0], spk_id, channel)
         # Now iterate over the speaker segments and create segment annotations
         for seg_start, seg_end in spk_segments:
             seg_words = list(
@@ -355,7 +358,7 @@ def parse_icsi_annotations(
             # Filter out empty words
             word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
             text = " ".join(w.symbol for w in word_alignments)
-            annotations[key].append(
+            annotations[new_key].append(
                 IcsiSegmentAnnotation(
                     text=text,
                     speaker=spk_id,

From 9921575d85b6eaa5f58a2b98c69fd4bc81e7afef Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 18 May 2023 11:36:32 -0400
Subject: [PATCH 12/32] remove words with zero or negative duration

---
 lhotse/recipes/ami.py  | 11 +++++++++--
 lhotse/recipes/icsi.py |  8 ++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
index 73b47cc54..f668cb0b5 100644
--- a/lhotse/recipes/ami.py
+++ b/lhotse/recipes/ami.py
@@ -370,8 +370,15 @@ def parse_ami_annotations(
                     )
                     for w in subseg
                 ]
-                # Filter out empty words
-                word_alignments = [w for w in word_alignments if w.symbol]
+                word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
+                if any(w.duration <= 0 for w in word_alignments):
+                    logging.warning(
+                        f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} "
+                        f"has a word with zero or negative duration."
+                    )
+                    word_alignments = [
+                        w for w in word_alignments if w.duration > 0
+                    ]  # type: ignore
                 text = " ".join(w.symbol for w in word_alignments)
                 annotations[key].append(
                     AmiSegmentAnnotation(
diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 6f42896e6..18699b0dc 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -357,6 +357,14 @@ def parse_icsi_annotations(
             ]
             # Filter out empty words
             word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
+            if any(w.duration <= 0 for w in word_alignments):
+                logging.warning(
+                    f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} "
+                    f"has a word with zero or negative duration."
+                )
+                word_alignments = [
+                    w for w in word_alignments if w.duration > 0
+                ]  # type: ignore
             text = " ".join(w.symbol for w in word_alignments)
             annotations[new_key].append(
                 IcsiSegmentAnnotation(

From dba413f3bf844acb58f5bdab349334094d32de5b Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 18 May 2023 13:14:14 -0400
Subject: [PATCH 13/32] ensure word alignments respect segment boundary

---
 lhotse/recipes/ami.py  | 32 ++++++++++++++++----------------
 lhotse/recipes/icsi.py | 33 ++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
index f668cb0b5..c71db9e4a 100644
--- a/lhotse/recipes/ami.py
+++ b/lhotse/recipes/ami.py
@@ -362,23 +362,23 @@ def parse_ami_annotations(
             for subseg in subsegments:
                 start = subseg[0][0]
                 end = subseg[-1][1]
-                word_alignments = [
-                    AlignmentItem(
-                        start=round(w[0], ndigits=4),
-                        duration=add_durations(w[1], -w[0], sampling_rate=16000),
-                        symbol=normalize_text_ami(w[2], normalize=normalize),
-                    )
-                    for w in subseg
-                ]
-                word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
-                if any(w.duration <= 0 for w in word_alignments):
-                    logging.warning(
-                        f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} "
-                        f"has a word with zero or negative duration."
+                word_alignments = []
+                for w in subseg:
+                    w_start = max(start, round(w[0], ndigits=4))
+                    w_end = min(end, round(w[1], ndigits=4))
+                    w_dur = add_durations(w_end, -w_start, sampling_rate=16000)
+                    w_symbol = normalize_text_ami(w[2], normalize=normalize)
+                    if len(w_symbol) == 0:
+                        continue
+                    if w_dur <= 0:
+                        logging.warning(
+                            f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} "
+                            f"has a word with zero or negative duration. Skipping."
+                        )
+                        continue
+                    word_alignments.append(
+                        AlignmentItem(start=w_start, duration=w_dur, symbol=w_symbol)
                     )
-                    word_alignments = [
-                        w for w in word_alignments if w.duration > 0
-                    ]  # type: ignore
                 text = " ".join(w.symbol for w in word_alignments)
                 annotations[key].append(
                     AmiSegmentAnnotation(
diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 18699b0dc..72b92a407 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -347,24 +347,23 @@ def parse_icsi_annotations(
                 continue
             start = seg_words[0][0]
             end = seg_words[-1][1]
-            word_alignments = [
-                AlignmentItem(
-                    start=round(w[0], ndigits=4),
-                    duration=add_durations(w[1], -w[0], sampling_rate=16000),
-                    symbol=normalize_text_ami(w[2], normalize=normalize),
-                )
-                for w in seg_words
-            ]
-            # Filter out empty words
-            word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
-            if any(w.duration <= 0 for w in word_alignments):
-                logging.warning(
-                    f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} "
-                    f"has a word with zero or negative duration."
+            word_alignments = []
+            for w in seg_words:
+                w_start = max(start, round(w[0], ndigits=4))
+                w_end = min(end, round(w[1], ndigits=4))
+                w_dur = add_durations(w_end, -w_start, sampling_rate=16000)
+                w_symbol = normalize_text_ami(w[2], normalize=normalize)
+                if len(w_symbol) == 0:
+                    continue
+                if w_dur <= 0:
+                    logging.warning(
+                        f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} "
+                        f"has a word with zero or negative duration. Skipping."
+                    )
+                    continue
+                word_alignments.append(
+                    AlignmentItem(start=w_start, duration=w_dur, symbol=w_symbol)
                 )
-                word_alignments = [
-                    w for w in word_alignments if w.duration > 0
-                ]  # type: ignore
             text = " ".join(w.symbol for w in word_alignments)
             annotations[new_key].append(
                 IcsiSegmentAnnotation(

From 12be4242c7b28233dfcd9a569b52c3c2104015ab Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 22 May 2023 10:16:42 -0400
Subject: [PATCH 14/32] add save-to-wav option for icsi

---
 lhotse/bin/modes/recipes/icsi.py | 10 +++++++++-
 lhotse/recipes/icsi.py           | 27 ++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/lhotse/bin/modes/recipes/icsi.py b/lhotse/bin/modes/recipes/icsi.py
index 29c7f4edf..78a18473c 100644
--- a/lhotse/bin/modes/recipes/icsi.py
+++ b/lhotse/bin/modes/recipes/icsi.py
@@ -68,18 +68,26 @@ def icsi(
     default="kaldi",
     help="Type of text normalization to apply (kaldi style, by default)",
 )
+@click.option(
+    "--save-to-wav",
+    is_flag=True,
+    default=False,
+    help="If True and `mic` is sdm/ihm/mdm, save the recordings as WAV for faster processing.",
+)
 def icsi(
     audio_dir: Pathlike,
     transcripts_dir: Pathlike,
     output_dir: Pathlike,
     mic: str,
     normalize_text: bool,
+    save_to_wav: bool,
 ):
-    """AMI data preparation."""
+    """ICSI data preparation."""
     prepare_icsi(
         audio_dir,
         transcripts_dir,
         output_dir=output_dir,
         mic=mic,
         normalize_text=normalize_text,
+        save_to_wav=save_to_wav,
     )
diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 72b92a407..124064009 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -100,6 +100,7 @@
 from pathlib import Path
 from typing import Dict, List, NamedTuple, Optional, Tuple, Union
 
+import soundfile as sf
 from tqdm.auto import tqdm
 
 from lhotse import validate_recordings_and_supervisions
@@ -385,6 +386,8 @@ def parse_icsi_annotations(
 def prepare_audio_grouped(
     audio_paths: List[Pathlike],
     channel_to_idx_map: Dict[str, Dict[str, int]] = None,
+    save_to_wav: bool = False,
+    output_dir: Pathlike = None,
 ) -> RecordingSet:
     # Group together multiple channels from the same session.
     # We will use that to create a Recording with multiple sources (channels).
@@ -404,6 +407,16 @@ def prepare_audio_grouped(
             }
         audio_sf, samplerate = read_sph(channel_paths[0])
 
+        if save_to_wav:
+            session_dir = Path(output_dir) / "wavs" / session_name
+            session_dir.mkdir(parents=True, exist_ok=True)
+            for i, audio_path in enumerate(channel_paths):
+                audio, _ = read_sph(audio_path)
+                wav_path = session_dir / f"{audio_path.stem}.wav"
+                sf.write(wav_path, audio.T, samplerate)
+                # Replace the sph path with the wav path
+                channel_paths[i] = wav_path
+
         recordings.append(
             Recording(
                 id=session_name,
@@ -436,7 +449,7 @@ def prepare_audio_single(
     for audio_path in tqdm(audio_paths, desc="Preparing audio"):
         session_name = audio_path.parts[-2]
         if audio_path.suffix == ".wav":
-            audio_sf = sf.SoundFile(str(audio_path))
+            audio_sf = sf.SoundFile(audio_path)
             num_frames = audio_sf.frames
             num_channels = audio_sf.channels
             samplerate = audio_sf.samplerate
@@ -566,6 +579,7 @@ def prepare_icsi(
     output_dir: Optional[Pathlike] = None,
     mic: Optional[str] = "ihm",
     normalize_text: str = "kaldi",
+    save_to_wav: bool = False,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
     Returns the manifests which consist of the Recordings and Supervisions
@@ -574,6 +588,7 @@ def prepare_icsi(
     :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk.
     :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
     :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
+    :param save_to_wav: bool, whether to save the sph audio to wav format
     :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys
         'recordings' and 'supervisions'.
     """
@@ -588,6 +603,9 @@ def prepare_icsi(
     assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
     assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported"
 
+    if save_to_wav:
+        assert output_dir is not None, "output_dir must be specified when saving to wav"
+
     if output_dir is not None:
         output_dir = Path(output_dir)
         output_dir.mkdir(parents=True, exist_ok=True)
@@ -604,7 +622,10 @@ def prepare_icsi(
     if mic == "ihm" or mic == "mdm":
         audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
         audio = prepare_audio_grouped(
-            list(audio_paths), channel_to_idx_map if mic == "ihm" else None
+            list(audio_paths),
+            channel_to_idx_map if mic == "ihm" else None,
+            save_to_wav,
+            output_dir,
         )
     elif mic == "sdm" or mic == "ihm-mix":
         audio_paths = (
@@ -612,7 +633,7 @@ def prepare_icsi(
             if len(channels)
             else audio_dir.rglob("*.wav")
         )
-        audio = prepare_audio_single(list(audio_paths))
+        audio = prepare_audio_single(list(audio_paths), save_to_wav, output_dir)
 
     # Supervisions
     logging.info("Preparing supervision manifests")

From c4b957df6387d7c13ba75a73998b0d54709f49a9 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 22 May 2023 10:37:07 -0400
Subject: [PATCH 15/32] add test for mixing cut with recording

---
 lhotse/cut/set.py           | 2 +-
 test/cut/test_cut_mixing.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index 81851794c..ad5b3e1a2 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -2610,7 +2610,7 @@ def mix(
         )
         snr = None
 
-    if reference_cut.num_features is not None:
+    if reference_cut.num_features is not None and mixed_in_cut.num_features is not None:
         assert (
             reference_cut.num_features == mixed_in_cut.num_features
         ), "Cannot mix cuts with different feature dimensions."
diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py
index 8cac69d48..2f0b06ab1 100644
--- a/test/cut/test_cut_mixing.py
+++ b/test/cut/test_cut_mixing.py
@@ -324,6 +324,12 @@ def test_mix_cut_snr(libri_cut):
     assert E(feats) > E(feats_snr)
 
 
+def test_mix_cut_with_other_raises_error(libri_cut):
+    libri_cut = libri_cut.drop_features()
+    with pytest.raises(ValueError):
+        _ = libri_cut.mix(libri_cut.recording)
+
+
 def test_mix_cut_snr_truncate_snr_reference(libri_cut):
     mixed = libri_cut.pad(duration=20).mix(libri_cut, offset_other_by=10)
     mixed_snr = libri_cut.pad(duration=20).mix(libri_cut, offset_other_by=10, snr=10)

From 752be696df73e6e8907e62dd4def04cea636cc60 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 8 Jun 2023 09:51:11 -0400
Subject: [PATCH 16/32] style fix

---
 lhotse/augmentation/loudness.py | 4 ++--
 lhotse/cut/set.py               | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/lhotse/augmentation/loudness.py b/lhotse/augmentation/loudness.py
index 590167ecd..e937c1e57 100644
--- a/lhotse/augmentation/loudness.py
+++ b/lhotse/augmentation/loudness.py
@@ -6,7 +6,7 @@
 import torch
 
 from lhotse.augmentation.transform import AudioTransform
-from lhotse.utils import Seconds, is_module_available
+from lhotse.utils import EPSILON, Seconds, is_module_available
 
 
 @dataclass
@@ -60,7 +60,7 @@ def normalize_loudness(
 
     # measure the loudness first
     meter = pyln.Meter(
-        sampling_rate, block_size=min(0.4, duration)
+        sampling_rate, block_size=min(0.4, duration - EPSILON)
     )  # create BS.1770 meter
     loudness = meter.integrated_loudness(audio.T)
 
diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index ec6168b3c..105deccd4 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -2242,7 +2242,8 @@ def _save_worker(cuts: List[Cut], features: List[np.ndarray]) -> None:
                 if isinstance(cut, MixedCut):
                     # If this was a mixed cut, we will just discard its
                     # recordings and create a new mono cut that has just
-                    # the features attached.
+                    # the features attached. We will also set its `channel`
+                    # to 0, since we are creating a mono cut.
                     feat_manifest.recording_id = cut.id
                     cut = MonoCut(
                         id=cut.id,
@@ -2251,7 +2252,8 @@ def _save_worker(cuts: List[Cut], features: List[np.ndarray]) -> None:
                         channel=0,
                         # Update supervisions recording_id for consistency
                         supervisions=[
-                            fastcopy(s, recording_id=cut.id) for s in cut.supervisions
+                            fastcopy(s, recording_id=cut.id, channel=0)
+                            for s in cut.supervisions
                         ],
                         features=feat_manifest,
                         recording=None,

From 2171d7ecfdf51d6d18c72e6c0f8862831f894250 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 12 Jun 2023 07:31:17 -0400
Subject: [PATCH 17/32] add data prep for voxpopuli

---
 docs/corpus.rst                       |   2 +
 lhotse/bin/modes/recipes/__init__.py  |   1 +
 lhotse/bin/modes/recipes/voxpopuli.py |  84 ++++++++
 lhotse/recipes/__init__.py            |   1 +
 lhotse/recipes/voxpopuli.py           | 286 ++++++++++++++++++++++++++
 5 files changed, 374 insertions(+)
 create mode 100644 lhotse/bin/modes/recipes/voxpopuli.py
 create mode 100644 lhotse/recipes/voxpopuli.py

diff --git a/docs/corpus.rst b/docs/corpus.rst
index e00bead1d..174f0e2b1 100644
--- a/docs/corpus.rst
+++ b/docs/corpus.rst
@@ -167,6 +167,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_vctk`
   * - VoxCeleb
     - :func:`lhotse.recipes.prepare_voxceleb`
+  * - VoxPopuli
+    - :func:`lhotse.recipes.prepare_voxpopuli`
   * - WenetSpeech
     - :func:`lhotse.recipes.prepare_wenet_speech`
   * - YesNo
diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
index d5efb5a38..c82791b81 100644
--- a/lhotse/bin/modes/recipes/__init__.py
+++ b/lhotse/bin/modes/recipes/__init__.py
@@ -67,6 +67,7 @@
 from .uwb_atcc import *
 from .vctk import *
 from .voxceleb import *
+from .voxpopuli import *
 from .wenet_speech import *
 from .xbmu_amdo31 import *
 from .yesno import *
diff --git a/lhotse/bin/modes/recipes/voxpopuli.py b/lhotse/bin/modes/recipes/voxpopuli.py
new file mode 100644
index 000000000..08bffd54f
--- /dev/null
+++ b/lhotse/bin/modes/recipes/voxpopuli.py
@@ -0,0 +1,87 @@
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes import download_voxpopuli, prepare_voxpopuli
+from lhotse.recipes.voxpopuli import (
+    LANGUAGES,
+    LANGUAGES_V2,
+    S2S_SRC_LANGUAGES,
+    S2S_TGT_LANGUAGES,
+)
+from lhotse.utils import Pathlike
+
+__all__ = ["voxpopuli"]
+
+
+@prepare.command()
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "--task",
+    type=click.Choice(["asr", "s2s", "lm"]),
+    default="asr",
+    help="The task for which to prepare the VoxPopuli data.",
+    show_default=True,
+)
+@click.option(
+    "--lang",
+    type=click.Choice(LANGUAGES + LANGUAGES_V2),
+    default="en",
+    help="The language to prepare (only used if task is asr or lm).",
+    show_default=True,
+)
+@click.option(
+    "--src-lang",
+    type=click.Choice(S2S_SRC_LANGUAGES),
+    default=None,
+    help="The source language (only used if task is s2s).",
+    show_default=True,
+)
+@click.option(
+    "--tgt-lang",
+    type=click.Choice(S2S_TGT_LANGUAGES),
+    default=None,
+    help="The target language (only used if task is s2s).",
+    show_default=True,
+)
+@click.option(
+    "--num-jobs",
+    "-j",
+    type=int,
+    default=1,
+    help="Number of parallel jobs (can provide small speed-ups).",
+    show_default=True,
+)
+def voxpopuli(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    task: str,
+    lang: str,
+    src_lang: str,
+    tgt_lang: str,
+    num_jobs: int,
+):
+    """voxpopuli data preparation."""
+    prepare_voxpopuli(
+        corpus_dir,
+        output_dir=output_dir,
+        task=task,
+        lang=lang,
+        source_lang=src_lang,
+        target_lang=tgt_lang,
+        num_jobs=num_jobs,
+    )
+
+
+# NOTE: reusing the name ``voxpopuli`` is intentional; each command is attached
+# to its click group (``prepare`` / ``download``) when the decorator runs.
+@download.command()
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "--subset",
+    type=click.Choice(["asr", "10k", "100k", "400k"] + LANGUAGES + LANGUAGES_V2),
+    default="asr",
+)
+def voxpopuli(target_dir: Pathlike, subset: str):
+    """voxpopuli download."""
+    download_voxpopuli(target_dir, subset=subset)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
index 77df2e8b3..b07706d60 100644
--- a/lhotse/recipes/__init__.py
+++ b/lhotse/recipes/__init__.py
@@ -67,6 +67,7 @@
 from .uwb_atcc import download_uwb_atcc, prepare_uwb_atcc
 from .vctk import download_vctk, prepare_vctk
 from .voxceleb import download_voxceleb1, download_voxceleb2, prepare_voxceleb
+from .voxpopuli import download_voxpopuli, prepare_voxpopuli
 from .wenet_speech import prepare_wenet_speech
 from .xbmu_amdo31 import download_xbmu_amdo31, prepare_xbmu_amdo31
 from .yesno import download_yesno, prepare_yesno
diff --git a/lhotse/recipes/voxpopuli.py b/lhotse/recipes/voxpopuli.py
new file mode 100644
index 000000000..5a29c2931
--- /dev/null
+++ b/lhotse/recipes/voxpopuli.py
@@ -0,0 +1,286 @@
+"""
+VoxPopuli provides
+
+- 400K hours of unlabelled speech data for 23 languages
+- 1.8K hours of transcribed speech data for 16 languages
+- 17.3K hours of speech-to-speech interpretation data for 15x15 directions
+- 29 hours of transcribed speech data of non-native English intended for research in ASR
+for accented speech (15 L2 accents)
+
+The raw data is collected from 2009-2020 European Parliament event recordings.
+For details about the corpus, please refer to the website:
+https://github.com/facebookresearch/voxpopuli
+
+Reference:
+Wang, Changhan et al. “VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
+Learning, Semi-Supervised Learning and Interpretation.” Annual Meeting of the Association
+for Computational Linguistics (2021).
+
+This script is based on code from the repository linked above.
+
+NOTE: Our data preparation is slightly different from the original repository. In particular,
+we only use the metadata to create manifests, i.e., we do not create segment-level wav files,
+unlike the original repository. In this way, we can avoid duplicating the audio files.
+"""
+import csv
+import gzip
+import logging
+import shutil
+import tempfile
+from ast import literal_eval
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torchaudio
+from torch.hub import download_url_to_file
+from torchaudio.datasets.utils import _extract_tar
+from tqdm import tqdm
+
+from lhotse import (
+    RecordingSet,
+    SupervisionSegment,
+    SupervisionSet,
+    validate_recordings_and_supervisions,
+)
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
+from lhotse.utils import Pathlike
+
+# fmt: off
+LANGUAGES = [
+    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
+    "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da"
+]
+LANGUAGES_V2 = [f"{x}_v2" for x in LANGUAGES]
+
+YEARS = list(range(2009, 2020 + 1))
+
+ASR_LANGUAGES = [
+    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
+    "sk", "sl", "et", "lt"
+]
+ASR_ACCENTED_LANGUAGES = [
+    "en_accented"
+]
+
+S2S_SRC_LANGUAGES = ASR_LANGUAGES
+
+S2S_TGT_LANGUAGES = [
+    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
+    "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da"
+]
+
+S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTION = ["en", "fr", "es"]
+
+DOWNLOAD_BASE_URL = "https://dl.fbaipublicfiles.com/voxpopuli"
+# fmt: on
+
+
+def download_voxpopuli(
+    target_dir: Pathlike = ".",
+    subset: Optional[str] = "asr",
+) -> Path:
+    """
+    Download and untar/unzip the VoxPopuli dataset.
+
+    :param target_dir: Pathlike, the path of the dir to store the dataset.
+    :param subset: str, the subset of the dataset to download, can be one of "400k", "100k",
+        "10k", "asr", or any of the languages in LANGUAGES or LANGUAGES_V2.
+    :return: the path to downloaded and extracted directory with data.
+    :raises ValueError: if ``subset`` is not one of the supported values.
+    """
+    target_dir = Path(target_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    if subset in LANGUAGES_V2:
+        languages = [subset.split("_")[0]]
+        years = YEARS + [f"{y}_2" for y in YEARS]
+    elif subset in LANGUAGES:
+        languages = [subset]
+        years = YEARS
+    else:
+        # Named subsets map to a (languages, years) pair; reject anything else.
+        named_subsets = {
+            "400k": (LANGUAGES, YEARS + [f"{y}_2" for y in YEARS]),
+            "100k": (LANGUAGES, YEARS),
+            "10k": (LANGUAGES, [2019, 2020]),
+            "asr": (["original"], YEARS),
+        }
+        if subset not in named_subsets:
+            raise ValueError(f"Unsupported VoxPopuli subset: {subset}")
+        languages, years = named_subsets[subset]
+
+    url_list = []
+    for l in languages:
+        for y in years:
+            url_list.append(f"{DOWNLOAD_BASE_URL}/audios/{l}_{y}.tar")
+
+    out_root = target_dir / "raw_audios"
+    out_root.mkdir(exist_ok=True, parents=True)
+    logging.info(f"{len(url_list)} files to download...")
+    for url in tqdm(url_list):
+        tar_path = out_root / Path(url).name
+        # ``dst`` must be the full file path; the third argument is a hash prefix.
+        download_url_to_file(url, tar_path.as_posix())
+        _extract_tar(tar_path.as_posix())
+        tar_path.unlink()
+
+    return target_dir
+
+
+def prepare_voxpopuli(
+    corpus_dir: Pathlike,
+    output_dir: Optional[Pathlike] = None,
+    task: str = "asr",
+    lang: str = "en",
+    source_lang: Optional[str] = None,
+    target_lang: Optional[str] = None,
+    num_jobs: int = 1,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Prepares and returns the VoxPopuli manifests which consist of Recordings and Supervisions.
+
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :param task: str, one of "asr", "s2s", "lm"; any other value raises ValueError.
+    :param lang: str, the language to prepare the manifests for, must be one of
+        ASR_LANGUAGES. This is used for "asr" and "lm" tasks.
+    :param source_lang: str, the source language for the s2s task, can be one of S2S_SRC_LANGUAGES.
+    :param target_lang: str, the target language for the s2s task, can be one of S2S_TGT_LANGUAGES.
+    :param num_jobs: int, the number of parallel jobs to use for preparing the manifests.
+    :return: Dict mapping each key to a dict with "recordings" and "supervisions" manifests.
+    """
+    corpus_dir = Path(corpus_dir)
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(exist_ok=True, parents=True)
+
+    if task == "asr":
+        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
+        manifests = _prepare_voxpopuli_asr(
+            corpus_dir, output_dir, lang, num_jobs=num_jobs
+        )
+    elif task == "s2s":
+        assert (
+            source_lang in S2S_SRC_LANGUAGES
+        ), f"Unsupported source language: {source_lang}"
+        assert (
+            target_lang in S2S_TGT_LANGUAGES
+        ), f"Unsupported target language: {target_lang}"
+        manifests = _prepare_voxpopuli_s2s(corpus_dir, source_lang, target_lang)
+    elif task == "lm":
+        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
+        manifests = _prepare_voxpopuli_lm(corpus_dir, lang)
+    else:
+        raise ValueError(f"Unsupported task: {task}")
+
+    lang_affix = f"{source_lang}-{target_lang}" if task == "s2s" else lang
+    for k, v in manifests.items():
+        recordings, supervisions = fix_manifests(**v)
+        validate_recordings_and_supervisions(
+            recordings=recordings, supervisions=supervisions
+        )
+        v.update(recordings=recordings, supervisions=supervisions)
+        if output_dir is not None:
+            recordings.to_file(
+                output_dir / f"voxpopuli-{task}-{lang_affix}_recordings_{k}.jsonl.gz"
+            )
+            supervisions.to_file(
+                output_dir / f"voxpopuli-{task}-{lang_affix}_supervisions_{k}.jsonl.gz"
+            )
+
+    return manifests
+
+
+def _prepare_voxpopuli_asr(
+    corpus_dir: Path, output_dir: Optional[Path], lang: str, num_jobs: int = 1
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Download the ASR metadata TSV and prepare manifests for the ASR task.
+
+    :return: a dict: split ("train", "dev", "test") -> "recordings"/"supervisions" manifests.
+    """
+    # First create recordings. We remove the affix "_original" from the recording ID
+    logging.info("Preparing recordings (this may take a few minutes)...")
+    in_root = corpus_dir / "raw_audios" / "original"
+    recordings = RecordingSet.from_dir(
+        in_root,
+        "*.ogg",
+        num_jobs=num_jobs,
+        recording_id=lambda x: x.stem.replace("_original", ""),
+    )
+
+    # Now create supervisions. The metadata TSV is fetched into a temporary
+    # directory that is removed right after reading, even if an error occurs.
+    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{lang}.tsv.gz"
+    with tempfile.TemporaryDirectory(
+        prefix="voxpopuli_asr_", dir=output_dir
+    ) as temp_dir:
+        tsv_path = Path(temp_dir) / Path(url).name
+        download_url_to_file(url, tsv_path)
+        with gzip.open(tsv_path, "rt") as f:
+            metadata = list(csv.DictReader(f, delimiter="|"))
+
+    # Get segment into list (train, dev, test)
+    segments = defaultdict(list)
+    # We also keep a count of the number of segments per recording
+    num_segments = defaultdict(int)
+
+    for r in tqdm(metadata):
+        split = r["split"]
+        if split not in ["train", "dev", "test"]:
+            continue
+        reco_id = r["session_id"]
+        start_time = float(r["start_time"])
+        duration = float(r["end_time"]) - start_time
+
+        num_segments[reco_id] += 1
+        segments[split].append(
+            SupervisionSegment(
+                id=f"{reco_id}-{num_segments[reco_id]}",
+                recording_id=reco_id,
+                start=round(start_time, ndigits=8),
+                duration=round(duration, ndigits=8),
+                channel=0,
+                language=lang,
+                speaker=r["speaker_id"],
+                gender=r["gender"],
+                text=r["normed_text"],
+                custom={
+                    "orig_text": r["original_text"],
+                },
+            )
+        )
+
+    # Recording IDs per split, kept as sets for fast membership tests below
+    reco_ids = defaultdict(set)
+    for split, segs in segments.items():
+        reco_ids[split] = {s.recording_id for s in segs}
+
+    manifests = defaultdict(dict)
+    for split in ["train", "dev", "test"]:
+        manifests[split]["recordings"] = recordings.filter(
+            lambda r, ids=reco_ids[split]: r.id in ids
+        )
+        manifests[split]["supervisions"] = SupervisionSet.from_segments(segments[split])
+
+    return manifests
+
+
+def _prepare_voxpopuli_s2s(
+    corpus_dir: Path, source_lang: str, target_lang: str
+) -> Tuple[RecordingSet, SupervisionSet]:
+    """
+    Prepare the manifests for the s2s task (not implemented yet; always raises).
+    """
+    raise NotImplementedError
+
+
+def _prepare_voxpopuli_lm(corpus_dir: Path, lang: str) -> Tuple[RecordingSet, None]:
+    """
+    Prepare the manifests for the lm task (not implemented yet; always raises).
+    """
+    raise NotImplementedError

From c5efbd7e707a4e4146f4e0a4eebec8eb7c8ce99e Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Sun, 30 Jul 2023 03:39:06 -0400
Subject: [PATCH 18/32] small changes in recipes

---
 lhotse/bin/modes/recipes/tedlium.py | 14 +++++++++++++-
 lhotse/recipes/earnings21.py        |  5 ++++-
 lhotse/recipes/tedlium.py           | 10 +++++++---
 lhotse/recipes/utils.py             | 24 ++++++++++++++++++++++++
 4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/lhotse/bin/modes/recipes/tedlium.py b/lhotse/bin/modes/recipes/tedlium.py
index 45893eb82..8370789cc 100644
--- a/lhotse/bin/modes/recipes/tedlium.py
+++ b/lhotse/bin/modes/recipes/tedlium.py
@@ -27,8 +27,19 @@
     default=1,
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
+@click.option(
+    "--normalize-text",
+    type=click.Choice(["none", "upper", "kaldi"], case_sensitive=False),
+    default="none",
+    help="Type of text normalization to apply (no normalization, by default). "
+    "Selecting `kaldi` will remove <unk> tokens and join suffixes.",
+)
 def tedlium(
-    tedlium_dir: Pathlike, output_dir: Pathlike, parts: List[str], num_jobs: int
+    tedlium_dir: Pathlike,
+    output_dir: Pathlike,
+    parts: List[str],
+    num_jobs: int,
+    normalize_text: str,
 ):
     """
     TED-LIUM v3 recording and supervision manifest preparation.
@@ -38,6 +49,7 @@ def tedlium(
         output_dir=output_dir,
         dataset_parts=parts,
         num_jobs=num_jobs,
+        normalize_text=normalize_text,
     )
 
 
diff --git a/lhotse/recipes/earnings21.py b/lhotse/recipes/earnings21.py
index 2409d39fd..1ae483f22 100644
--- a/lhotse/recipes/earnings21.py
+++ b/lhotse/recipes/earnings21.py
@@ -82,7 +82,10 @@ def download_earnings21(
             if "earnings21" in f:
                 zip.extract(f, path=target_dir)
 
-    shutil.move(target_dir / "speech-datasets-main" / "earnings21", target_dir)
+    # For Python < 3.9, shutil.move() gives error with PosixPath
+    shutil.move(
+        str(target_dir / "speech-datasets-main" / "earnings21"), str(target_dir)
+    )
     shutil.rmtree(target_dir / "speech-datasets-main")
 
     completed_detector.touch()
diff --git a/lhotse/recipes/tedlium.py b/lhotse/recipes/tedlium.py
index 7df4fa8a4..15c132e38 100644
--- a/lhotse/recipes/tedlium.py
+++ b/lhotse/recipes/tedlium.py
@@ -45,6 +45,7 @@
 import shutil
 import tarfile
 from concurrent.futures.thread import ThreadPoolExecutor
+from functools import partial
 from pathlib import Path
 from typing import Dict, Optional, Sequence, Union
 
@@ -55,6 +56,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.qa import fix_manifests
+from lhotse.recipes.utils import normalize_text_tedlium
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
 TEDLIUM_PARTS = ("train", "dev", "test")
@@ -88,6 +90,7 @@ def prepare_tedlium(
     output_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = TEDLIUM_PARTS,
     num_jobs: int = 1,
+    normalize_text: str = "none",
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
     Prepare manifests for the TED-LIUM v3 corpus.
@@ -122,8 +125,9 @@ def prepare_tedlium(
                 f"You might be missing some parts of TEDLIUM..."
             )
             futures = []
+            _parse_stm_worker = partial(_parse_stm_file, normalize_text=normalize_text)
             for stm in stms:
-                futures.append(ex.submit(_parse_stm_file, stm))
+                futures.append(ex.submit(_parse_stm_worker, stm))
 
             segments = []
             for future in futures:
@@ -144,7 +148,7 @@ def prepare_tedlium(
     return corpus
 
 
-def _parse_stm_file(stm: str) -> SupervisionSegment:
+def _parse_stm_file(stm: str, normalize_text: str = "none") -> SupervisionSegment:
     """Helper function to parse a single STM file."""
     segments = []
     with stm.open() as f:
@@ -161,7 +165,7 @@ def _parse_stm_file(stm: str) -> SupervisionSegment:
                     start=start,
                     duration=round(end - start, ndigits=8),
                     channel=0,
-                    text=text,
+                    text=normalize_text_tedlium(text, normalize_text),
                     language="English",
                     speaker=rec_id,
                 )
diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py
index aea7fc513..efd50030e 100644
--- a/lhotse/recipes/utils.py
+++ b/lhotse/recipes/utils.py
@@ -180,6 +180,30 @@ def normalize_text_chime6(text: str, normalize: str = "upper") -> str:
         return text
 
 
+def normalize_text_tedlium(text: str, normalize: str = "upper") -> str:
+    """
+    Text normalization similar to Kaldi's TEDLIUM-3 recipe.
+    ``normalize`` is "none", "upper" or "kaldi"; other values raise ValueError.
+    """
+    if normalize == "none":
+        return text
+    if normalize == "upper":
+        return text.upper()
+    if normalize != "kaldi":
+        raise ValueError(f"Unsupported normalization type: {normalize}")
+    import re
+
+    # remove tokens such as "[NOISE]"
+    text = re.sub(r"\[[^\]]+\]", "", text)
+    # remove "<unk>"
+    text = re.sub(r"<unk>", "", text)
+    # join suffixes with words, e.g. they 're -> they're
+    text = re.sub(r"(\w+) '(\w+)", r"\1'\2", text)
+    # join dangling "'" with next word, e.g. ' 60s -> '60s, ' cause -> 'cause
+    text = re.sub(r"' (\w+)", r"'\1", text)
+    return text.strip()
+
+
 class TimeFormatConverter:
     @staticmethod
     def hms_to_seconds(time: str) -> float:

From 826dbc260ee4a71becd222e3c1b1048876ff62be Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 3 Aug 2023 08:49:20 -0400
Subject: [PATCH 19/32] changes for max segment duration

---
 lhotse/bin/modes/__init__.py    |  1 +
 lhotse/bin/modes/supervision.py | 73 +++++++++++++++++++++++++++++++++
 lhotse/cut/base.py              | 47 ++++++++++++++++++++-
 lhotse/cut/data.py              |  5 +++
 lhotse/cut/mono.py              | 31 +++++++++-----
 lhotse/cut/multi.py             | 34 +++++++--------
 lhotse/cut/set.py               | 38 +++++++++++++----
 lhotse/supervision.py           | 22 ++++++++--
 lhotse/utils.py                 |  8 +++-
 9 files changed, 217 insertions(+), 42 deletions(-)
 create mode 100644 lhotse/bin/modes/supervision.py

diff --git a/lhotse/bin/modes/__init__.py b/lhotse/bin/modes/__init__.py
index 6df392373..5ab11db32 100644
--- a/lhotse/bin/modes/__init__.py
+++ b/lhotse/bin/modes/__init__.py
@@ -6,5 +6,6 @@
 from .manipulation import *
 from .recipes import *
 from .shar import *
+from .supervision import *
 from .validate import *
 from .workflows import *
diff --git a/lhotse/bin/modes/supervision.py b/lhotse/bin/modes/supervision.py
new file mode 100644
index 000000000..0c49a2b18
--- /dev/null
+++ b/lhotse/bin/modes/supervision.py
@@ -0,0 +1,73 @@
+import click
+from tqdm import tqdm
+
+from lhotse.bin.modes.cli_base import cli
+from lhotse.serialization import load_manifest_lazy_or_eager
+from lhotse.supervision import SupervisionSet
+from lhotse.utils import Pathlike
+
+
+@cli.group()
+def supervision():
+    """Commands related to manipulating supervision manifests."""
+    pass
+
+
+@supervision.command()
+@click.argument("in_supervision_manifest", type=click.Path(allow_dash=True))
+@click.argument("out_supervision_manifest", type=click.Path(allow_dash=True))
+@click.option(
+    "--ctm-file",
+    type=click.Path(exists=True, dir_okay=False),
+    help="CTM file containing alignments to add.",
+)
+@click.option(
+    "--alignment-type",
+    type=str,
+    default="word",
+    help="Type of alignment to add (default = `word`).",
+)
+@click.option(
+    "--match-channel/--no-match-channel",
+    default=False,
+    help="Whether to match channel between CTM and SupervisionSegment (default = False).",
+)
+@click.option(
+    "--verbose",
+    "-v",
+    is_flag=True,
+    default=False,
+    help="Whether to print verbose output.",
+)
+def with_alignment_from_ctm(
+    in_supervision_manifest: Pathlike,
+    out_supervision_manifest: Pathlike,
+    ctm_file: Pathlike,
+    alignment_type: str,
+    match_channel: bool,
+    verbose: bool,
+):
+    """
+    Add alignments from CTM file to the supervision set.
+
+    :param in_supervision_manifest: Path to input supervision manifest.
+    :param out_supervision_manifest: Path to output supervision manifest.
+    :param ctm_file: Path to CTM file.
+    :param alignment_type: Alignment type (optional, default = `word`).
+    :param match_channel: if True, also match channel between CTM and SupervisionSegment
+    :param verbose: Whether to print verbose output.
+    The resulting supervisions are written to ``out_supervision_manifest``; nothing is returned.
+    """
+    supervisions = load_manifest_lazy_or_eager(in_supervision_manifest, SupervisionSet)
+    supervisions = supervisions.with_alignment_from_ctm(
+        ctm_file=ctm_file,
+        type=alignment_type,
+        match_channel=match_channel,
+        verbose=verbose,
+    )
+    with SupervisionSet.open_writer(out_supervision_manifest, overwrite=True) as writer:
+        supervisions = (
+            tqdm(supervisions, desc="Writing supervisions") if verbose else supervisions
+        )
+        for s in supervisions:
+            writer.write(s)
diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index 8c2519687..f7dede029 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -510,6 +510,7 @@ def trim_to_alignments(
         self,
         type: str,
         max_pause: Optional[Seconds] = None,
+        max_segment_duration: Optional[Seconds] = None,
         delimiter: str = " ",
         keep_all_channels: bool = False,
     ) -> "CutSet":  # noqa: F821
@@ -517,7 +518,8 @@ def trim_to_alignments(
         Splits the current :class:`.Cut` into its constituent alignment items (:class:`.AlignmentItem`).
         These cuts have identical start times and durations as the alignment item. Additionally,
         the `max_pause` option can be used to merge alignment items that are separated by a pause
-        shorter than `max_pause`.
+        shorter than `max_pause`. If `max_segment_duration` is specified, the merging will be
+        performed only if the resulting segment is shorter than `max_segment_duration`.
 
         For the case of a multi-channel cut with multiple alignments, we can either trim
         while respecting the supervision channels (in which case output cut has the same channels
@@ -531,6 +533,21 @@ def trim_to_alignments(
         .. hint:: If a MultiCut is trimmed and the resulting trimmed cut contains a single channel,
             we convert it to a MonoCut.
 
+        .. hint:: If you have a Cut with multiple supervision segments and you want to trim it to
+            the word-level alignment, you can use the :meth:`.Cut.merge_supervisions` method
+            first to merge the supervisions into a single one, followed by the
+            :meth:`.Cut.trim_to_alignments` method. For example::
+
+                >>> cut = cut.merge_supervisions()
+                >>> cuts = cut.trim_to_alignments(type='word', max_pause=1.0)
+
+        .. hint:: The above technique can also be used to segment long cuts into roughly equal
+            duration segments, while respecting alignment boundaries. For example, to split a
+            Cut into 10s segments, you can do::
+
+                >>> cut = cut.merge_supervisions()
+                >>> cuts = cut.trim_to_alignments(type='word', max_pause=10.0, max_segment_duration=10.0)
+
         :param type: The type of the alignment to trim to (e.g. "word").
         :param max_pause: The maximum pause allowed between the alignments to merge them. If ``None``,
             no merging will be performed. [default: None]
@@ -546,6 +563,10 @@ def trim_to_alignments(
             # Set to a negative value so that no merging is performed.
             max_pause = -1.0
 
+        if max_segment_duration is None:
+            # Set to the cut duration so that resulting segments are always smaller.
+            max_segment_duration = self.duration
+
         # For the implementation, we first create new supervisions for the cut, and then
         # use the `trim_to_supervisions` method to do the actual trimming.
         new_supervisions = []
@@ -561,6 +582,9 @@ def trim_to_alignments(
             # Merge the alignments if needed. We also keep track of the indices of the
             # merged alignments in the original list. This is needed to create the
             # `alignment` field in the new supervisions.
+            # NOTE: We use the `AlignmentItem` class here for convenience --- the merged
+            # alignments are not actual alignment items, but rather just a way to keep
+            # track of merged segments.
             merged_alignments = [(alignments[0], [0])]
             for i, item in enumerate(alignments[1:]):
                 # If alignment item is blank, skip it. Sometimes, blank alignment items
@@ -568,7 +592,10 @@ def trim_to_alignments(
                 if item.symbol.strip() == "":
                     continue
                 prev_item, prev_indices = merged_alignments[-1]
-                if item.start - prev_item.end <= max_pause:
+                if (
+                    item.start - prev_item.end <= max_pause
+                    and item.end - prev_item.start <= max_segment_duration
+                ):
                     new_item = AlignmentItem(
                         symbol=delimiter.join([prev_item.symbol, item.symbol]),
                         start=prev_item.start,
@@ -700,6 +727,7 @@ def cut_into_windows(
         duration: Seconds,
         hop: Optional[Seconds] = None,
         keep_excessive_supervisions: bool = True,
+        use_alignment_if_exists: Optional[str] = None,
     ) -> "CutSet":  # noqa: F821
         """
         Return a list of shorter cuts, made by traversing this cut in windows of
@@ -712,10 +740,25 @@ def cut_into_windows(
         :param hop: Shift between the windows in the new cuts in seconds.
         :param keep_excessive_supervisions: bool. When a cut is truncated in the
             middle of a supervision segment, should the supervision be kept.
+        :param use_alignment_if_exists: Optional str. If provided, the corresponding alignments will
+            be used to cut the supervisions according to the time. This could mean that resulting
+            cut durations are slightly different than the requested ``duration``, since we will
+            try to align the supervisions to the alignment boundaries.
         :return: a list of cuts made from shorter duration windows.
         """
         from .set import CutSet
 
+        if use_alignment_if_exists is not None:
+            # Only check the first supervision (checking all would be too slow)
+            assert (
+                use_alignment_if_exists in self.supervisions[0].alignment
+            ), f"Supervision does not have alignment of type {use_alignment_if_exists}"
+            return self.cut_into_windows_with_alignment(
+                alignment_type=use_alignment_if_exists,
+                duration=duration,
+                hop=hop,
+            )
+
         if not hop:
             hop = duration
         new_cuts = []
diff --git a/lhotse/cut/data.py b/lhotse/cut/data.py
index c13fa24cf..2f4796193 100644
--- a/lhotse/cut/data.py
+++ b/lhotse/cut/data.py
@@ -493,6 +493,10 @@ def truncate(
             By default, the duration is (end of the cut before truncation) - (offset).
         :param keep_excessive_supervisions: bool. Since trimming may happen inside a SupervisionSegment,
             the caller has an option to either keep or discard such supervisions.
+        :param use_alignment_if_exists: Optional str. If provided, the corresponding alignments will
+            be used to cut the supervisions according to the time. This could mean that resulting
+            cut durations are slightly different than the requested ``duration``, since we will
+            try to align the supervisions to the alignment boundaries.
         :param preserve_id: bool. Should the truncated cut keep the same ID or get a new, random one.
         :param _supervisions_index: an IntervalTree; when passed, allows to speed up processing of Cuts with a very
             large number of supervisions. Intended as an internal parameter.
@@ -1042,6 +1046,7 @@ def filter_supervisions(
     @abstractmethod
     def merge_supervisions(
         self,
+        merge_policy: str = "delimiter",
         custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None,
         **kwargs,
     ) -> "DataCut":
diff --git a/lhotse/cut/mono.py b/lhotse/cut/mono.py
index 239245ebf..38960767c 100644
--- a/lhotse/cut/mono.py
+++ b/lhotse/cut/mono.py
@@ -2,7 +2,7 @@
 import math
 import warnings
 from dataclasses import dataclass
-from functools import reduce
+from functools import partial, reduce
 from operator import add
 from typing import Any, Callable, Iterable, List, Optional
 
@@ -194,19 +194,22 @@ def reverb_rir(
             )
 
     def merge_supervisions(
-        self, custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None
+        self,
+        merge_policy: str = "delimiter",
+        custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None,
     ) -> "MonoCut":
         """
         Return a copy of the cut that has all of its supervisions merged into
         a single segment.
 
         The new start is the start of the earliest superivion, and the new duration
-        is a minimum spanning duration for all the supervisions.
-
-        The text fields are concatenated with a whitespace, and all other string fields
-        (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
-        This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
+        is a minimum spanning duration for all the supervisions. The text fields of
+        all segments are concatenated with a whitespace.
 
+        :param merge_policy: one of "keep_first" or "delimiter". If "keep_first", we
+            keep only the first segment's field value, otherwise all string fields
+            (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
+            This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
         :param custom_merge_fn: a function that will be called to merge custom fields values.
             We expect ``custom_merge_fn`` to handle all possible custom keys.
             When not provided, we will treat all custom values as strings.
@@ -248,16 +251,22 @@ def merge_supervisions(
                 f"recognition models (cut id: {self.id})."
             )
 
+        merge_func_ = partial(
+            merge_items_with_delimiter,
+            delimiter="#",
+            return_first=(merge_policy == "keep_first"),
+        )
+
         msup = SupervisionSegment(
-            id=merge_items_with_delimiter(s.id for s in sups),
+            id=merge_func_(s.id for s in sups),
             recording_id=sups[0].recording_id,
             start=mstart,
             duration=mduration,
             channel=sups[0].channel,
             text=" ".join(s.text for s in sups if s.text),
-            speaker=merge_items_with_delimiter(s.speaker for s in sups if s.speaker),
-            language=merge_items_with_delimiter(s.language for s in sups if s.language),
-            gender=merge_items_with_delimiter(s.gender for s in sups if s.gender),
+            speaker=merge_func_(s.speaker for s in sups if s.speaker),
+            language=merge_func_(s.language for s in sups if s.language),
+            gender=merge_func_(s.gender for s in sups if s.gender),
             custom={
                 k: merge_custom(
                     k,
diff --git a/lhotse/cut/multi.py b/lhotse/cut/multi.py
index c81921c0e..a370ca27d 100644
--- a/lhotse/cut/multi.py
+++ b/lhotse/cut/multi.py
@@ -1,7 +1,7 @@
 import logging
 import warnings
 from dataclasses import dataclass
-from functools import reduce
+from functools import partial, reduce
 from itertools import groupby
 from operator import add
 from typing import Any, Callable, Iterable, List, Optional, Union
@@ -211,6 +211,7 @@ def reverb_rir(
 
     def merge_supervisions(
         self,
+        merge_policy: str = "delimiter",
         merge_channels: bool = True,
         custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None,
     ) -> "MultiCut":
@@ -222,12 +223,13 @@ def merge_supervisions(
         ``channel`` attribute will not change in this case.
 
         The new start is the start of the earliest superivion, and the new duration
-        is a minimum spanning duration for all the supervisions.
-
-        The text fields are concatenated with a whitespace, and all other string fields
-        (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
-        This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
+        is a minimum spanning duration for all the supervisions. The text fields of
+        all segments are concatenated with a whitespace.
 
+        :param merge_policy: one of "keep_first" or "delimiter". If "keep_first", we
+            keep only the first segment's field value, otherwise all string fields
+            (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
+            This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
         :param merge_channels: If true, we will merge all supervisions into a single segment.
             If false, we will merge supervisions per channel group. Default: True.
         :param custom_merge_fn: a function that will be called to merge custom fields values.
@@ -265,6 +267,12 @@ def merge_supervisions(
                 )
             }
 
+        merge_func_ = partial(
+            merge_items_with_delimiter,
+            delimiter="#",
+            return_first=(merge_policy == "keep_first"),
+        )
+
         msups = []
         text_overlap_warning = False
         for channel, csups in sups_by_channel.items():
@@ -293,21 +301,15 @@ def merge_supervisions(
 
             msups.append(
                 SupervisionSegment(
-                    id=merge_items_with_delimiter(s.id for s in csups),
+                    id=merge_func_(s.id for s in csups),
                     recording_id=csups[0].recording_id,
                     start=mstart,
                     duration=mduration,
                     channel=list(channel),
                     text=" ".join(s.text for s in csups if s.text),
-                    speaker=merge_items_with_delimiter(
-                        s.speaker for s in csups if s.speaker
-                    ),
-                    language=merge_items_with_delimiter(
-                        s.language for s in csups if s.language
-                    ),
-                    gender=merge_items_with_delimiter(
-                        s.gender for s in csups if s.gender
-                    ),
+                    speaker=merge_func_(s.speaker for s in csups if s.speaker),
+                    language=merge_func_(s.language for s in csups if s.language),
+                    gender=merge_func_(s.gender for s in csups if s.gender),
                     custom={
                         k: merge_custom(
                             k,
diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index 105deccd4..07e75ac7e 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -1119,19 +1119,22 @@ def filter_supervisions(
         return self.map(lambda cut: cut.filter_supervisions(predicate))
 
     def merge_supervisions(
-        self, custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None
+        self,
+        merge_policy: str = "delimiter",
+        custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None,
     ) -> "CutSet":
         """
         Return a copy of the cut that has all of its supervisions merged into
         a single segment.
 
         The new start is the start of the earliest superivion, and the new duration
-        is a minimum spanning duration for all the supervisions.
-
-        The text fields are concatenated with a whitespace, and all other string fields
-        (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
-        This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
+        is a minimum spanning duration for all the supervisions. The text fields of
+        all segments are concatenated with a whitespace.
 
+        :param merge_policy: one of "keep_first" or "delimiter". If "keep_first", we
+            keep only the first segment's field value, otherwise all string fields
+            (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
+            This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
         :param custom_merge_fn: a function that will be called to merge custom fields values.
             We expect ``custom_merge_fn`` to handle all possible custom keys.
             When not provided, we will treat all custom values as strings.
@@ -1139,7 +1142,9 @@ def merge_supervisions(
             ``custom_merge_fn(custom_key, [s.custom[custom_key] for s in sups])``
         """
         return self.map(
-            lambda cut: cut.merge_supervisions(custom_merge_fn=custom_merge_fn)
+            lambda cut: cut.merge_supervisions(
+                merge_policy=merge_policy, custom_merge_fn=custom_merge_fn
+            )
         )
 
     def trim_to_supervisions(
@@ -1233,6 +1238,7 @@ def trim_to_alignments(
         self,
         type: str,
         max_pause: Seconds = 0.0,
+        max_segment_duration: Optional[Seconds] = None,
         delimiter: str = " ",
         keep_all_channels: bool = False,
         num_jobs: int = 1,
@@ -1267,6 +1273,7 @@ def trim_to_alignments(
                             _trim_to_alignments_single,
                             type=type,
                             max_pause=max_pause,
+                            max_segment_duration=max_segment_duration,
                             delimiter=delimiter,
                             keep_all_channels=keep_all_channels,
                         ),
@@ -1282,6 +1289,7 @@ def trim_to_alignments(
             _trim_to_alignments_single,
             type=type,
             max_pause=max_pause,
+            max_segment_duration=max_segment_duration,
             delimiter=delimiter,
             keep_all_channels=keep_all_channels,
         )
@@ -1580,6 +1588,7 @@ def cut_into_windows(
         duration: Seconds,
         hop: Optional[Seconds] = None,
         keep_excessive_supervisions: bool = True,
+        use_alignment_if_exists: Optional[str] = None,
         num_jobs: int = 1,
     ) -> "CutSet":
         """
@@ -1593,6 +1602,10 @@ def cut_into_windows(
         :param hop: Shift between the windows in the new cuts in seconds.
         :param keep_excessive_supervisions: bool. When a cut is truncated in the middle of a supervision segment,
             should the supervision be kept.
+        :param use_alignment_if_exists: Optional str. If provided, the corresponding alignments will
+            be used to cut the supervisions according to the time. This could mean that resulting
+            cut durations are slightly different than the requested ``duration``, since we will
+            try to align the supervisions to the alignment boundaries.
         :param num_jobs: The number of parallel workers.
         :return: a new CutSet with cuts made from shorter duration windows.
         """
@@ -1610,6 +1623,7 @@ def cut_into_windows(
                             duration=duration,
                             hop=hop,
                             keep_excessive_supervisions=keep_excessive_supervisions,
+                            use_alignment_if_exists=use_alignment_if_exists,
                         ),
                     )
                 )
@@ -1624,6 +1638,7 @@ def cut_into_windows(
             duration=duration,
             hop=hop,
             keep_excessive_supervisions=keep_excessive_supervisions,
+            use_alignment_if_exists=use_alignment_if_exists,
         )
         return result
 
@@ -3352,12 +3367,17 @@ def find_segments_with_speaker_count(
 
 
 def _cut_into_windows_single(
-    cuts: CutSet, duration, hop, keep_excessive_supervisions
+    cuts: CutSet,
+    duration,
+    hop,
+    keep_excessive_supervisions,
+    use_alignment_if_exists,
 ) -> CutSet:
     return cuts.cut_into_windows(
         duration=duration,
         hop=hop,
         keep_excessive_supervisions=keep_excessive_supervisions,
+        use_alignment_if_exists=use_alignment_if_exists,
     ).to_eager()
 
 
@@ -3380,12 +3400,14 @@ def _trim_to_alignments_single(
     cuts: CutSet,
     type,
     max_pause,
+    max_segment_duration,
     delimiter,
     keep_all_channels,
 ) -> CutSet:
     return cuts.trim_to_alignments(
         type=type,
         max_pause=max_pause,
+        max_segment_duration=max_segment_duration,
         delimiter=delimiter,
         keep_all_channels=keep_all_channels,
     ).to_eager()
diff --git a/lhotse/supervision.py b/lhotse/supervision.py
index 4ba5acb05..6b4b02469 100644
--- a/lhotse/supervision.py
+++ b/lhotse/supervision.py
@@ -14,6 +14,8 @@
     Union,
 )
 
+from tqdm import tqdm
+
 from lhotse.lazy import AlgorithmMixin
 from lhotse.serialization import Serializable
 from lhotse.utils import (
@@ -637,7 +639,11 @@ def from_rttm(path: Union[Pathlike, Iterable[Pathlike]]) -> "SupervisionSet":
         return SupervisionSet.from_segments(segments)
 
     def with_alignment_from_ctm(
-        self, ctm_file: Pathlike, type: str = "word", match_channel: bool = False
+        self,
+        ctm_file: Pathlike,
+        type: str = "word",
+        match_channel: bool = False,
+        verbose: bool = False,
     ) -> "SupervisionSet":
         """
         Add alignments from CTM file to the supervision set.
@@ -645,14 +651,20 @@ def with_alignment_from_ctm(
         :param ctm: Path to CTM file.
         :param type: Alignment type (optional, default = `word`).
         :param match_channel: if True, also match channel between CTM and SupervisionSegment
+        :param verbose: if True, show progress bar
         :return: A new SupervisionSet with AlignmentItem objects added to the segments.
         """
         ctm_words = []
+        # Sometimes the channels may not be integers, so we map them here.
+        channel_to_int = {}
         with open(ctm_file) as f:
+            f = tqdm(f, desc="Reading words from CTM file") if verbose else f
             for line in f:
                 reco_id, channel, start, duration, symbol = line.strip().split()
+                # setdefault: a repeated label keeps its first-assigned index.
+                channel = channel_to_int.setdefault(channel, len(channel_to_int))
                 ctm_words.append(
-                    (reco_id, int(channel), float(start), float(duration), symbol)
+                    (reco_id, channel, float(start), float(duration), symbol)
                 )
         ctm_words = sorted(ctm_words, key=lambda x: (x[0], x[2]))
         reco_to_ctm = defaultdict(
@@ -661,7 +673,11 @@ def with_alignment_from_ctm(
         segments = []
         num_total = len(ctm_words)
         num_overspanned = 0
-        for reco_id in set([s.recording_id for s in self]):
+        recordings = set([s.recording_id for s in self])
+        recordings = (
+            tqdm(recordings, desc="Adding alignments") if verbose else recordings
+        )
+        for reco_id in recordings:
             if reco_id in reco_to_ctm:
                 for seg in self.find(recording_id=reco_id):
                     alignment = [
diff --git a/lhotse/utils.py b/lhotse/utils.py
index 37cc6c337..fa3b2f6bd 100644
--- a/lhotse/utils.py
+++ b/lhotse/utils.py
@@ -659,15 +659,19 @@ def compute_start_duration_for_extended_cut(
 
 
 def merge_items_with_delimiter(
-    values: Iterable[str], prefix: str = "cat", delimiter: str = "#"
+    values: Iterable[str],
+    prefix: str = "cat",
+    delimiter: str = "#",
+    return_first: bool = False,
 ) -> Optional[str]:
     # e.g.
     # values = ["1125-76840-0001", "1125-53670-0003"]
     # return "cat#1125-76840-0001#1125-53670-0003"
+    # if return_first is True, return "1125-76840-0001"
     values = list(values)
     if len(values) == 0:
         return None
-    if len(values) == 1:
+    if len(values) == 1 or return_first:
         return values[0]
     return delimiter.join(chain([prefix], values))
 

From e297c13b25172c9812d156195553576b7c3574e7 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 3 Aug 2023 09:02:31 -0400
Subject: [PATCH 20/32] remove extra code

---
 lhotse/cut/data.py | 4 ----
 lhotse/cut/set.py  | 7 -------
 2 files changed, 11 deletions(-)

diff --git a/lhotse/cut/data.py b/lhotse/cut/data.py
index 2f4796193..3cb0ccead 100644
--- a/lhotse/cut/data.py
+++ b/lhotse/cut/data.py
@@ -493,10 +493,6 @@ def truncate(
             By default, the duration is (end of the cut before truncation) - (offset).
         :param keep_excessive_supervisions: bool. Since trimming may happen inside a SupervisionSegment,
             the caller has an option to either keep or discard such supervisions.
-        :param use_alignment_if_exists: Optional str. If provided, the corresponding alignments will
-            be used to cut the supervisions according to the time. This could mean that resulting
-            cut durations are slightly different than the requested ``duration``, since we will
-            try to align the supervisions to the alignment boundaries.
         :param preserve_id: bool. Should the truncated cut keep the same ID or get a new, random one.
         :param _supervisions_index: an IntervalTree; when passed, allows to speed up processing of Cuts with a very
             large number of supervisions. Intended as an internal parameter.
diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index 07e75ac7e..fbc1ab0b8 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -1588,7 +1588,6 @@ def cut_into_windows(
         duration: Seconds,
         hop: Optional[Seconds] = None,
         keep_excessive_supervisions: bool = True,
-        use_alignment_if_exists: Optional[str] = None,
         num_jobs: int = 1,
     ) -> "CutSet":
         """
@@ -1602,10 +1601,6 @@ def cut_into_windows(
         :param hop: Shift between the windows in the new cuts in seconds.
         :param keep_excessive_supervisions: bool. When a cut is truncated in the middle of a supervision segment,
             should the supervision be kept.
-        :param use_alignment_if_exists: Optional str. If provided, the corresponding alignments will
-            be used to cut the supervisions according to the time. This could mean that resulting
-            cut durations are slightly different than the requested ``duration``, since we will
-            try to align the supervisions to the alignment boundaries.
         :param num_jobs: The number of parallel workers.
         :return: a new CutSet with cuts made from shorter duration windows.
         """
@@ -1623,7 +1618,6 @@ def cut_into_windows(
                             duration=duration,
                             hop=hop,
                             keep_excessive_supervisions=keep_excessive_supervisions,
-                            use_alignment_if_exists=use_alignment_if_exists,
                         ),
                     )
                 )
@@ -1638,7 +1632,6 @@ def cut_into_windows(
             duration=duration,
             hop=hop,
             keep_excessive_supervisions=keep_excessive_supervisions,
-            use_alignment_if_exists=use_alignment_if_exists,
         )
         return result
 

From 4711576f0cb6fe44a5be623e0f1e2a86bcce61c9 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 3 Aug 2023 09:27:16 -0400
Subject: [PATCH 21/32] add alignment scores from CTM

---
 lhotse/supervision.py                     | 30 +++++++++++++++++-----
 test/fixtures/supervision_with_scores.ctm |  5 ++++
 test/test_supervision_set.py              | 31 +++++++++++++++++++++++
 3 files changed, 60 insertions(+), 6 deletions(-)
 create mode 100644 test/fixtures/supervision_with_scores.ctm

diff --git a/lhotse/supervision.py b/lhotse/supervision.py
index 4ba5acb05..f6a52e60c 100644
--- a/lhotse/supervision.py
+++ b/lhotse/supervision.py
@@ -650,9 +650,17 @@ def with_alignment_from_ctm(
         ctm_words = []
         with open(ctm_file) as f:
             for line in f:
-                reco_id, channel, start, duration, symbol = line.strip().split()
+                reco_id, channel, start, duration, symbol, *score = line.strip().split()
+                score = float(score[0]) if score else None
                 ctm_words.append(
-                    (reco_id, int(channel), float(start), float(duration), symbol)
+                    (
+                        reco_id,
+                        int(channel),
+                        float(start),
+                        float(duration),
+                        symbol,
+                        score,
+                    )
                 )
         ctm_words = sorted(ctm_words, key=lambda x: (x[0], x[2]))
         reco_to_ctm = defaultdict(
@@ -665,7 +673,12 @@ def with_alignment_from_ctm(
             if reco_id in reco_to_ctm:
                 for seg in self.find(recording_id=reco_id):
                     alignment = [
-                        AlignmentItem(symbol=word[4], start=word[2], duration=word[3])
+                        AlignmentItem(
+                            symbol=word[4],
+                            start=word[2],
+                            duration=word[3],
+                            score=word[5],
+                        )
                         for word in reco_to_ctm[reco_id]
                         if overspans(seg, TimeSpan(word[2], word[2] + word[3]))
                         and (seg.channel == word[1] or not match_channel)
@@ -697,9 +710,14 @@ def write_alignment_to_ctm(self, ctm_file: Pathlike, type: str = "word") -> None
                 if type in s.alignment:
                     for ali in s.alignment[type]:
                         c = s.channel[0] if isinstance(s.channel, list) else s.channel
-                        f.write(
-                            f"{s.recording_id} {c} {ali.start:.02f} {ali.duration:.02f} {ali.symbol}\n"
-                        )
+                        if ali.score is None:
+                            f.write(
+                                f"{s.recording_id} {c} {ali.start:.02f} {ali.duration:.02f} {ali.symbol}\n"
+                            )
+                        else:
+                            f.write(
+                                f"{s.recording_id} {c} {ali.start:.02f} {ali.duration:.02f} {ali.symbol} {ali.score:.02f}\n"
+                            )
 
     def to_dicts(self) -> Iterable[dict]:
         return (s.to_dict() for s in self)
diff --git a/test/fixtures/supervision_with_scores.ctm b/test/fixtures/supervision_with_scores.ctm
new file mode 100644
index 000000000..6a1e25c91
--- /dev/null
+++ b/test/fixtures/supervision_with_scores.ctm
@@ -0,0 +1,5 @@
+recording-1 0 0.10 0.08 transcript 0.9
+recording-1 0 0.18 0.02 of 0.8
+recording-1 0 0.20 0.03 the 0.85
+recording-1 0 0.23 0.07 first 0.7
+recording-1 0 0.30 0.10 segment 0.98
diff --git a/test/test_supervision_set.py b/test/test_supervision_set.py
index 3311c769a..8adc8b151 100644
--- a/test/test_supervision_set.py
+++ b/test/test_supervision_set.py
@@ -20,6 +20,13 @@ def external_supervision_set() -> SupervisionSet:
     ).with_alignment_from_ctm("test/fixtures/supervision.ctm")
 
 
+@pytest.fixture
+def external_supervision_set_with_scores() -> SupervisionSet:
+    return SupervisionSet.from_json(
+        "test/fixtures/supervision.json"
+    ).with_alignment_from_ctm("test/fixtures/supervision_with_scores.ctm")
+
+
 @pytest.fixture
 def external_alignment() -> Dict[str, List[AlignmentItem]]:
     return {
@@ -33,6 +40,19 @@ def external_alignment() -> Dict[str, List[AlignmentItem]]:
     }
 
 
+@pytest.fixture
+def external_alignment_with_scores() -> Dict[str, List[AlignmentItem]]:
+    return {
+        "word": [
+            AlignmentItem("transcript", 0.1, 0.08, 0.9),
+            AlignmentItem("of", 0.18, 0.02, 0.8),
+            AlignmentItem("the", 0.2, 0.03, 0.85),
+            AlignmentItem("first", 0.23, 0.07, 0.7),
+            AlignmentItem("segment", 0.3, 0.1, 0.98),
+        ]
+    }
+
+
 def test_supervision_map(external_supervision_set):
     for s in external_supervision_set.map(remove_spaces_from_segment_text):
         if s.text is not None:
@@ -186,6 +206,17 @@ def test_supervision_set_with_alignment_from_ctm(
         assert type(seg) == SupervisionSegment
 
 
+def test_supervision_set_with_alignment_from_ctm_with_scores(
+    external_supervision_set_with_scores, external_alignment_with_scores
+):
+    sups = external_supervision_set_with_scores
+    # "segment-1" receives every CTM word; the other two get empty word lists.
+    assert external_alignment_with_scores == sups["segment-1"].alignment
+    for empty_id in ("segment-2", "segment-3"):
+        assert sups[empty_id].alignment == {"word": []}
+    assert all(type(seg) == SupervisionSegment for seg in sups)
+
+
 def test_supervision_set_write_alignment_to_ctm(external_supervision_set, tmp_path):
     tmp_ctm_file = tmp_path / "alignment.ctm"
     external_supervision_set.write_alignment_to_ctm(tmp_ctm_file)

From 128374f50dacf2bc6700d50aae462a94133bb0f4 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 3 Aug 2023 09:29:39 -0400
Subject: [PATCH 22/32] inline CTM score parsing in with_alignment_from_ctm

---
 lhotse/supervision.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lhotse/supervision.py b/lhotse/supervision.py
index f6a52e60c..3d99f38bc 100644
--- a/lhotse/supervision.py
+++ b/lhotse/supervision.py
@@ -651,7 +651,6 @@ def with_alignment_from_ctm(
         with open(ctm_file) as f:
             for line in f:
                 reco_id, channel, start, duration, symbol, *score = line.strip().split()
-                score = float(score[0]) if score else None
                 ctm_words.append(
                     (
                         reco_id,
@@ -659,7 +658,7 @@ def with_alignment_from_ctm(
                         float(start),
                         float(duration),
                         symbol,
-                        score,
+                        float(score[0]) if score else None,
                     )
                 )
         ctm_words = sorted(ctm_words, key=lambda x: (x[0], x[2]))

From 5b51cc4a2e7d784a9f455acbe0e8cbff0255c4e6 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Mon, 7 Aug 2023 17:47:59 -0400
Subject: [PATCH 23/32] reword trim_to_alignments doc; honor merge_policy for MonoCut custom fields

---
 lhotse/cut/base.py |  4 ++--
 lhotse/cut/mono.py | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index f7dede029..2075dd9a7 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -518,8 +518,8 @@ def trim_to_alignments(
         Splits the current :class:`.Cut` into its constituent alignment items (:class:`.AlignmentItem`).
         These cuts have identical start times and durations as the alignment item. Additionally,
         the `max_pause` option can be used to merge alignment items that are separated by a pause
-        shorter than `max_pause`. If `max_segment_duration` is specified, the merging will be
-        performed only if the resulting segment is shorter than `max_segment_duration`.
+        shorter than `max_pause`. If `max_segment_duration` is specified, we will keep merging
+        consecutive segments until the duration of the merged segment exceeds `max_segment_duration`.
 
         For the case of a multi-channel cut with multiple alignments, we can either trim
         while respecting the supervision channels (in which case output cut has the same channels
diff --git a/lhotse/cut/mono.py b/lhotse/cut/mono.py
index 38960767c..73026c377 100644
--- a/lhotse/cut/mono.py
+++ b/lhotse/cut/mono.py
@@ -216,6 +216,12 @@ def merge_supervisions(
             It will be called roughly like:
             ``custom_merge_fn(custom_key, [s.custom[custom_key] for s in sups])``
         """
+        merge_func_ = partial(
+            merge_items_with_delimiter,
+            delimiter="#",
+            return_first=(merge_policy == "keep_first"),
+        )
+
         # "m" stands for merged in variable names below
 
         if custom_merge_fn is not None:
@@ -223,7 +229,7 @@ def merge_supervisions(
             merge_custom = custom_merge_fn
         else:
             # Merge the string representations of custom fields.
-            merge_custom = lambda k, vs: merge_items_with_delimiter(map(str, vs))
+            merge_custom = lambda k, vs: merge_func_(map(str, vs))
 
         sups = sorted(self.supervisions, key=lambda s: s.start)
 
@@ -251,12 +257,6 @@ def merge_supervisions(
                 f"recognition models (cut id: {self.id})."
             )
 
-        merge_func_ = partial(
-            merge_items_with_delimiter,
-            delimiter="#",
-            return_first=(merge_policy == "keep_first"),
-        )
-
         msup = SupervisionSegment(
             id=merge_func_(s.id for s in sups),
             recording_id=sups[0].recording_id,

From a3474475af14ee210816e1a5e418788245836e94 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Fri, 11 Aug 2023 11:48:57 -0400
Subject: [PATCH 24/32] apply change to multi custom merge func

---
 lhotse/cut/multi.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/lhotse/cut/multi.py b/lhotse/cut/multi.py
index a370ca27d..20303f6aa 100644
--- a/lhotse/cut/multi.py
+++ b/lhotse/cut/multi.py
@@ -238,13 +238,20 @@ def merge_supervisions(
             It will be called roughly like:
             ``custom_merge_fn(custom_key, [s.custom[custom_key] for s in sups])``
         """
+        merge_func_ = partial(
+            merge_items_with_delimiter,
+            delimiter="#",
+            return_first=(merge_policy == "keep_first"),
+        )
+
         # "m" stands for merged in variable names below
+
         if custom_merge_fn is not None:
             # Merge custom fields with the user-provided function.
             merge_custom = custom_merge_fn
         else:
             # Merge the string representations of custom fields.
-            merge_custom = lambda k, vs: merge_items_with_delimiter(map(str, vs))
+            merge_custom = lambda k, vs: merge_func_(map(str, vs))
 
         sups = sorted(self.supervisions, key=lambda s: s.start)
         if len(sups) <= 1:
@@ -267,12 +274,6 @@ def merge_supervisions(
                 )
             }
 
-        merge_func_ = partial(
-            merge_items_with_delimiter,
-            delimiter="#",
-            return_first=(merge_policy == "keep_first"),
-        )
-
         msups = []
         text_overlap_warning = False
         for channel, csups in sups_by_channel.items():

From b9c170423b37784a3cdc35195e1b86ad2191ddf3 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Fri, 11 Aug 2023 11:52:50 -0400
Subject: [PATCH 25/32] remove old code

---
 lhotse/cut/base.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index 2075dd9a7..5965f97d4 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -727,7 +727,6 @@ def cut_into_windows(
         duration: Seconds,
         hop: Optional[Seconds] = None,
         keep_excessive_supervisions: bool = True,
-        use_alignment_if_exists: Optional[str] = None,
     ) -> "CutSet":  # noqa: F821
         """
         Return a list of shorter cuts, made by traversing this cut in windows of
@@ -740,25 +739,10 @@ def cut_into_windows(
         :param hop: Shift between the windows in the new cuts in seconds.
         :param keep_excessive_supervisions: bool. When a cut is truncated in the
             middle of a supervision segment, should the supervision be kept.
-        :param use_alignment_if_exists: Optional str. If provided, the corresponding alignments will
-            be used to cut the supervisions according to the time. This could mean that resulting
-            cut durations are slightly different than the requested ``duration``, since we will
-            try to align the supervisions to the alignment boundaries.
         :return: a list of cuts made from shorter duration windows.
         """
         from .set import CutSet
 
-        if use_alignment_if_exists is not None:
-            # Only check the first supervision (checking all would be too slow)
-            assert (
-                use_alignment_if_exists in self.supervisions[0].alignment
-            ), f"Supervision does not have alignment of type {use_alignment_if_exists}"
-            return self.cut_into_windows_with_alignment(
-                alignment_type=use_alignment_if_exists,
-                duration=duration,
-                hop=hop,
-            )
-
         if not hop:
             hop = duration
         new_cuts = []

From 22d18d60ceffffc7c2a209d0cc3db34793fd464c Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Fri, 11 Aug 2023 13:23:52 -0400
Subject: [PATCH 26/32] fix failing tests

---
 lhotse/cut/set.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index e3a134564..70f439fe0 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -3376,13 +3376,11 @@ def _cut_into_windows_single(
     duration,
     hop,
     keep_excessive_supervisions,
-    use_alignment_if_exists,
 ) -> CutSet:
     return cuts.cut_into_windows(
         duration=duration,
         hop=hop,
         keep_excessive_supervisions=keep_excessive_supervisions,
-        use_alignment_if_exists=use_alignment_if_exists,
     ).to_eager()
 
 

From 958a8ecfe9e065c37924e979a3f20fb04bcb2f07 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Fri, 11 Aug 2023 13:35:47 -0400
Subject: [PATCH 27/32] add tests for trim to alignments with max segment
 duration

---
 test/cut/test_cut_trim_to_supervisions.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/test/cut/test_cut_trim_to_supervisions.py b/test/cut/test_cut_trim_to_supervisions.py
index 5bb93c573..f4cbe4dd2 100644
--- a/test/cut/test_cut_trim_to_supervisions.py
+++ b/test/cut/test_cut_trim_to_supervisions.py
@@ -502,9 +502,16 @@ def test_multi_cut_trim_to_supervisions_do_not_keep_all_channels_raises(multi_cu
         )
 
 
-@pytest.mark.parametrize(["max_pause", "expected_cuts"], [(0.0, 5), (0.2, 4)])
-def test_cut_trim_to_alignments(mono_cut, max_pause, expected_cuts):
-    cuts = mono_cut.trim_to_alignments("word", max_pause=max_pause)
+@pytest.mark.parametrize(
+    ["max_pause", "max_segment_duration", "expected_cuts"],
+    [(0.0, None, 5), (0.1, 5, 4), (0.1, 2, 5), (0.2, None, 4)],
+)
+def test_cut_trim_to_alignments(
+    mono_cut, max_pause, max_segment_duration, expected_cuts
+):
+    # Per the cases above, the tighter duration cap (2 vs 5) yields more cuts.
+    kwargs = dict(max_pause=max_pause, max_segment_duration=max_segment_duration)
+    cuts = mono_cut.trim_to_alignments("word", **kwargs)
     assert len(cuts) == expected_cuts
 
 

From 90f2fe082527f97256a65864bc171e8041b9ce42 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Fri, 11 Aug 2023 15:08:46 -0400
Subject: [PATCH 28/32] add tests for merge supervisions

---
 lhotse/cut/mixed.py                     | 33 +++++++-----
 test/cut/test_cut_merge_supervisions.py | 71 +++++++++++++++++--------
 2 files changed, 69 insertions(+), 35 deletions(-)

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
index 6eaeda792..f26ce6431 100644
--- a/lhotse/cut/mixed.py
+++ b/lhotse/cut/mixed.py
@@ -1341,29 +1341,38 @@ def map_supervisions(
         return new_mixed_cut
 
     def merge_supervisions(
-        self, custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None
+        self,
+        merge_policy: str = "delimiter",
+        custom_merge_fn: Optional[Callable[[str, Iterable[Any]], Any]] = None,
     ) -> "MixedCut":
         """
         Return a copy of the cut that has all of its supervisions merged into
         a single segment.
 
         The new start is the start of the earliest superivion, and the new duration
-        is a minimum spanning duration for all the supervisions.
-
-        The text fields are concatenated with a whitespace, and all other string fields
-        (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
-        This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
+        is a minimum spanning duration for all the supervisions. The text fields are
+        concatenated with a whitespace.
 
         .. note:: If you're using individual tracks of a mixed cut, note that this transform
              drops all the supervisions in individual tracks and assigns the merged supervision
              in the first :class:`.DataCut` found in ``self.tracks``.
 
+        :param merge_policy: one of "keep_first" or "delimiter". If "keep_first", we
+            keep only the first segment's field value, otherwise all string fields
+            (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
+            This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
         :param custom_merge_fn: a function that will be called to merge custom fields values.
             We expect ``custom_merge_fn`` to handle all possible custom keys.
             When not provided, we will treat all custom values as strings.
             It will be called roughly like:
             ``custom_merge_fn(custom_key, [s.custom[custom_key] for s in sups])``
         """
+        merge_func_ = partial(
+            merge_items_with_delimiter,
+            delimiter="#",
+            return_first=(merge_policy == "keep_first"),
+        )
+
         # "m" stands for merged in variable names below
 
         if custom_merge_fn is not None:
@@ -1371,7 +1380,7 @@ def merge_supervisions(
             merge_custom = custom_merge_fn
         else:
             # Merge the string representations of custom fields.
-            merge_custom = lambda k, vs: merge_items_with_delimiter(map(str, vs))
+            merge_custom = lambda k, vs: merge_func_(map(str, vs))
 
         sups = sorted(self.supervisions, key=lambda s: s.start)
 
@@ -1400,18 +1409,18 @@ def merge_supervisions(
             )
 
         msup = SupervisionSegment(
-            id=merge_items_with_delimiter(s.id for s in sups),
+            id=merge_func_(s.id for s in sups),
             # Make merged recording_id is a mix of recording_ids.
-            recording_id=merge_items_with_delimiter(s.recording_id for s in sups),
+            recording_id=merge_func_(s.recording_id for s in sups),
             start=mstart,
             duration=mduration,
             # Hardcode -1 to indicate no specific channel, as the supervisions might have
             # come from different channels in their original recordings.
             channel=-1,
             text=" ".join(s.text for s in sups if s.text),
-            speaker=merge_items_with_delimiter(s.speaker for s in sups if s.speaker),
-            language=merge_items_with_delimiter(s.language for s in sups if s.language),
-            gender=merge_items_with_delimiter(s.gender for s in sups if s.gender),
+            speaker=merge_func_(s.speaker for s in sups if s.speaker),
+            language=merge_func_(s.language for s in sups if s.language),
+            gender=merge_func_(s.gender for s in sups if s.gender),
             custom={
                 k: merge_custom(
                     k,
diff --git a/test/cut/test_cut_merge_supervisions.py b/test/cut/test_cut_merge_supervisions.py
index 651cb335f..0f6102cdf 100644
--- a/test/cut/test_cut_merge_supervisions.py
+++ b/test/cut/test_cut_merge_supervisions.py
@@ -10,7 +10,8 @@
 )
 
 
-def test_mono_cut_merge_supervisions():
+@pytest.mark.parametrize("merge_policy", ["delimiter", "keep_first"])
+def test_mono_cut_merge_supervisions(merge_policy):
     cut = dummy_cut(
         0,
         duration=10,
@@ -21,14 +22,13 @@ def test_mono_cut_merge_supervisions():
     )
     assert len(cut.supervisions) == 2
 
-    mcut = cut.merge_supervisions()
+    mcut = cut.merge_supervisions(merge_policy=merge_policy)
 
     # original not modified
     assert len(cut.supervisions) == 2
     assert len(mcut.supervisions) == 1
 
     s = mcut.supervisions[0]
-    assert s.id == "cat#dummy-segment-0000#dummy-segment-0001"
     assert s.recording_id == "dummy-recording-0000"  # not changed
     assert s.recording_id == cut.supervisions[0].recording_id
     assert s.start == 1
@@ -36,11 +36,19 @@ def test_mono_cut_merge_supervisions():
     assert s.duration == 7
     assert s.channel == 0
     assert s.text == "irrelevant irrelevant"
-    assert s.language == "cat#irrelevant#irrelevant"
-    assert s.speaker == "cat#irrelevant#irrelevant"
-    assert s.gender == "cat#irrelevant#irrelevant"
     assert s.custom is not None
-    assert s.custom["custom_field"] == "cat#irrelevant#irrelevant"
+    if merge_policy == "delimiter":
+        assert s.id == "cat#dummy-segment-0000#dummy-segment-0001"
+        assert s.language == "cat#irrelevant#irrelevant"
+        assert s.speaker == "cat#irrelevant#irrelevant"
+        assert s.gender == "cat#irrelevant#irrelevant"
+        assert s.custom["custom_field"] == "cat#irrelevant#irrelevant"
+    else:
+        assert s.id == "dummy-segment-0000"
+        assert s.language == "irrelevant"
+        assert s.speaker == "irrelevant"
+        assert s.gender == "irrelevant"
+        assert s.custom["custom_field"] == "irrelevant"
 
 
 def test_mono_cut_merge_supervisions_identity():
@@ -93,32 +101,41 @@ def test_padding_cut_merge_supervisions():
     assert cut == mcut
 
 
-def test_mixed_cut_merge_supervisions():
+@pytest.mark.parametrize("merge_policy", ["delimiter", "keep_first"])
+def test_mixed_cut_merge_supervisions(merge_policy):
     cut0 = dummy_cut(0, supervisions=[dummy_supervision(0)])
     cut1 = dummy_cut(1, supervisions=[dummy_supervision(1)])
     # overlapping supervisions -- note that we don't do anything smart for them.
     mixed = cut0.mix(cut1, offset_other_by=0.5)
     assert len(mixed.supervisions) == 2
 
-    mcut = mixed.merge_supervisions()
+    mcut = mixed.merge_supervisions(merge_policy=merge_policy)
 
     # original not modified
     assert len(mixed.supervisions) == 2
     assert len(mcut.supervisions) == 1
 
     s = mcut.supervisions[0]
-    assert s.id == "cat#dummy-segment-0000#dummy-segment-0001"
-    assert s.recording_id == "cat#dummy-recording-0000#dummy-recording-0001"
+    assert s.custom is not None
+    if merge_policy == "delimiter":
+        assert s.id == "cat#dummy-segment-0000#dummy-segment-0001"
+        assert s.recording_id == "cat#dummy-recording-0000#dummy-recording-0001"
+        assert s.language == "cat#irrelevant#irrelevant"
+        assert s.speaker == "cat#irrelevant#irrelevant"
+        assert s.gender == "cat#irrelevant#irrelevant"
+        assert s.custom["custom_field"] == "cat#irrelevant#irrelevant"
+    else:
+        assert s.id == "dummy-segment-0000"
+        assert s.recording_id == "dummy-recording-0000"
+        assert s.language == "irrelevant"
+        assert s.speaker == "irrelevant"
+        assert s.gender == "irrelevant"
+        assert s.custom["custom_field"] == "irrelevant"
     assert s.start == 0
     assert s.end == 1.5
     assert s.duration == 1.5
     assert s.channel == -1
     assert s.text == "irrelevant irrelevant"
-    assert s.language == "cat#irrelevant#irrelevant"
-    assert s.speaker == "cat#irrelevant#irrelevant"
-    assert s.gender == "cat#irrelevant#irrelevant"
-    assert s.custom is not None
-    assert s.custom["custom_field"] == "cat#irrelevant#irrelevant"
 
 
 def test_mixed_cut_merge_supervisions_identity():
@@ -128,7 +145,8 @@ def test_mixed_cut_merge_supervisions_identity():
     assert cut == mcut
 
 
-def test_multi_cut_merge_supervisions_simple():
+@pytest.mark.parametrize("merge_policy", ["delimiter", "keep_first"])
+def test_multi_cut_merge_supervisions_simple(merge_policy):
     cut = dummy_multi_cut(
         0,
         duration=10,
@@ -139,14 +157,13 @@ def test_multi_cut_merge_supervisions_simple():
     )
     assert len(cut.supervisions) == 2
 
-    mcut = cut.merge_supervisions()
+    mcut = cut.merge_supervisions(merge_policy=merge_policy)
 
     # original not modified
     assert len(cut.supervisions) == 2
     assert len(mcut.supervisions) == 1
 
     s = mcut.supervisions[0]
-    assert s.id == "cat#dummy-segment-0000#dummy-segment-0001"
     assert s.recording_id == "dummy-recording-0000"  # not changed
     assert s.recording_id == cut.supervisions[0].recording_id
     assert s.start == 1
@@ -154,11 +171,19 @@ def test_multi_cut_merge_supervisions_simple():
     assert s.duration == 7
     assert s.channel == [0]
     assert s.text == "irrelevant irrelevant"
-    assert s.language == "cat#irrelevant#irrelevant"
-    assert s.speaker == "cat#irrelevant#irrelevant"
-    assert s.gender == "cat#irrelevant#irrelevant"
     assert s.custom is not None
-    assert s.custom["custom_field"] == "cat#irrelevant#irrelevant"
+    if merge_policy == "delimiter":
+        assert s.id == "cat#dummy-segment-0000#dummy-segment-0001"
+        assert s.language == "cat#irrelevant#irrelevant"
+        assert s.speaker == "cat#irrelevant#irrelevant"
+        assert s.gender == "cat#irrelevant#irrelevant"
+        assert s.custom["custom_field"] == "cat#irrelevant#irrelevant"
+    else:
+        assert s.id == "dummy-segment-0000"
+        assert s.language == "irrelevant"
+        assert s.speaker == "irrelevant"
+        assert s.gender == "irrelevant"
+        assert s.custom["custom_field"] == "irrelevant"
 
 
 @pytest.mark.parametrize("merge_channels", [True, False])

From 5ecc856436520875345e2e9b9b74e3dec1933d67 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Wed, 23 Aug 2023 10:02:05 -0400
Subject: [PATCH 29/32] fix bug in eval2000

---
 lhotse/recipes/eval2000.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lhotse/recipes/eval2000.py b/lhotse/recipes/eval2000.py
index eed73ffe2..1cbd233bf 100644
--- a/lhotse/recipes/eval2000.py
+++ b/lhotse/recipes/eval2000.py
@@ -68,7 +68,7 @@ def prepare_eval2000(
         output_dir = Path(output_dir)
         output_dir.mkdir(parents=True, exist_ok=True)
         recordings.to_file(output_dir / "eval2000_recordings_all.jsonl.gz")
-        supervision_set.to_file(output_dir / "eval2000_supervisions_unnorm.jsonl.gz")
+        supervisions.to_file(output_dir / "eval2000_supervisions_unnorm.jsonl.gz")
     return {"recordings": recordings, "supervisions": supervisions}
 
 

From 293082fcbad5d1dd656222d24b4ca45f17bc0b7b Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Wed, 23 Aug 2023 10:04:33 -0400
Subject: [PATCH 30/32] eval2000: drop redundant intermediate variable names

---
 lhotse/recipes/eval2000.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lhotse/recipes/eval2000.py b/lhotse/recipes/eval2000.py
index 1cbd233bf..29aaf56fc 100644
--- a/lhotse/recipes/eval2000.py
+++ b/lhotse/recipes/eval2000.py
@@ -60,9 +60,9 @@ def prepare_eval2000(
         )
         for group in groups
     )
-    segment_supervision = make_segments(transcript_dir_path)
-    supervision_set = SupervisionSet.from_segments(segment_supervision)
-    recordings, supervisions = fix_manifests(recordings, supervision_set)
+    segments = make_segments(transcript_dir_path)
+    supervisions = SupervisionSet.from_segments(segments)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
     if output_dir is not None:
         output_dir = Path(output_dir)

From c4fe1c9df4f7dcd0e701cd7e442d92186d04d136 Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Wed, 23 Aug 2023 11:54:09 -0400
Subject: [PATCH 31/32] add fix_manifests for all recipes

---
 lhotse/recipes/adept.py            |  2 ++
 lhotse/recipes/aidatatang_200zh.py |  2 ++
 lhotse/recipes/aishell.py          |  2 ++
 lhotse/recipes/aishell2.py         |  4 ++-
 lhotse/recipes/aishell3.py         |  3 +-
 lhotse/recipes/aishell4.py         |  2 ++
 lhotse/recipes/ali_meeting.py      |  5 +--
 lhotse/recipes/aspire.py           |  3 +-
 lhotse/recipes/atcosim.py          |  6 ++++
 lhotse/recipes/audio_mnist.py      |  2 ++
 lhotse/recipes/bengaliai_speech.py |  5 +++
 lhotse/recipes/broadcast_news.py   |  2 ++
 lhotse/recipes/bvcc.py             | 21 ++++++++++++
 lhotse/recipes/cmu_arctic.py       |  6 ++--
 lhotse/recipes/cmu_indic.py        |  6 ++--
 lhotse/recipes/cmu_kids.py         | 18 +++++-----
 lhotse/recipes/commonvoice.py      |  7 ++++
 lhotse/recipes/csj.py              |  5 +++
 lhotse/recipes/cslu_kids.py        | 16 +++++----
 lhotse/recipes/daily_talk.py       |  2 ++
 lhotse/recipes/dihard3.py          |  5 +--
 lhotse/recipes/earnings21.py       |  3 +-
 lhotse/recipes/earnings22.py       |  3 +-
 lhotse/recipes/gale_arabic.py      |  8 ++---
 lhotse/recipes/gale_mandarin.py    | 10 +++---
 lhotse/recipes/heroico.py          |  3 +-
 lhotse/recipes/hifitts.py          |  3 ++
 lhotse/recipes/himia.py            |  4 ++-
 lhotse/recipes/iwslt22_ta.py       | 27 ++++++---------
 lhotse/recipes/kespeech.py         | 19 +++++++----
 lhotse/recipes/l2_arctic.py        |  3 ++
 lhotse/recipes/librilight.py       |  7 +++-
 lhotse/recipes/librimix.py         |  9 ++++-
 lhotse/recipes/librispeech.py      |  5 ++-
 lhotse/recipes/ljspeech.py         |  3 +-
 lhotse/recipes/magicdata.py        |  4 ++-
 lhotse/recipes/mgb2.py             |  3 +-
 lhotse/recipes/mls.py              |  3 +-
 lhotse/recipes/mobvoihotwords.py   |  3 +-
 lhotse/recipes/mtedx.py            | 10 ++----
 lhotse/recipes/nsc.py              | 13 ++++---
 lhotse/recipes/peoples_speech.py   |  3 +-
 lhotse/recipes/primewords.py       |  3 +-
 lhotse/recipes/speechcommands.py   | 20 ++++++++++-
 lhotse/recipes/spgispeech.py       |  4 +++
 lhotse/recipes/stcmds.py           |  4 ++-
 lhotse/recipes/tal_asr.py          |  4 ++-
 lhotse/recipes/tal_csasr.py        |  4 ++-
 lhotse/recipes/thchs_30.py         |  4 ++-
 lhotse/recipes/timit.py            |  4 ++-
 lhotse/recipes/uwb_atcc.py         | 54 +++++++++++++++++-------------
 lhotse/recipes/vctk.py             |  6 ++--
 lhotse/recipes/voxceleb.py         |  9 ++++-
 lhotse/recipes/voxpopuli.py        |  7 +---
 lhotse/recipes/xbmu_amdo31.py      | 17 ++++++----
 lhotse/recipes/yesno.py            |  3 +-
 56 files changed, 273 insertions(+), 140 deletions(-)

diff --git a/lhotse/recipes/adept.py b/lhotse/recipes/adept.py
index 2c998fac9..a309ccb25 100644
--- a/lhotse/recipes/adept.py
+++ b/lhotse/recipes/adept.py
@@ -34,6 +34,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download
 
 ADEPT_URL = "https://zenodo.org/record/5117102/files/ADEPT.zip"
@@ -140,6 +141,7 @@ def prepare_adept(
         )
 
     supervisions = SupervisionSet.from_segments(supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/aidatatang_200zh.py b/lhotse/recipes/aidatatang_200zh.py
index ce8404e0c..5bc2b4a83 100644
--- a/lhotse/recipes/aidatatang_200zh.py
+++ b/lhotse/recipes/aidatatang_200zh.py
@@ -20,6 +20,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
@@ -135,6 +136,7 @@ def prepare_aidatatang_200zh(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/aishell.py b/lhotse/recipes/aishell.py
index d6d72f3cc..d73a85474 100644
--- a/lhotse/recipes/aishell.py
+++ b/lhotse/recipes/aishell.py
@@ -16,6 +16,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
@@ -140,6 +141,7 @@ def prepare_aishell(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/aishell2.py b/lhotse/recipes/aishell2.py
index 1428d3641..3a6be874f 100644
--- a/lhotse/recipes/aishell2.py
+++ b/lhotse/recipes/aishell2.py
@@ -11,6 +11,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
 
@@ -73,7 +74,7 @@ def text_normalize(line: str) -> str:
     IC0975W0451 明年二月底小成
     ID0114W0368 我感觉就是在不断拉抽屉
     ID0115W0198 我公司员工不存在持有和泰创投股份的情况
-    
+
     """
     new_line = []
     line = list(line)
@@ -161,6 +162,7 @@ def prepare_aishell2(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/aishell3.py b/lhotse/recipes/aishell3.py
index 6cee56b4e..ad2e217ea 100644
--- a/lhotse/recipes/aishell3.py
+++ b/lhotse/recipes/aishell3.py
@@ -25,6 +25,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
@@ -159,7 +160,7 @@ def prepare_aishell3(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
-
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/aishell4.py b/lhotse/recipes/aishell4.py
index fff86ee4a..ee26fa706 100644
--- a/lhotse/recipes/aishell4.py
+++ b/lhotse/recipes/aishell4.py
@@ -35,6 +35,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
 
@@ -174,6 +175,7 @@ def prepare_aishell4(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/ali_meeting.py b/lhotse/recipes/ali_meeting.py
index 50540bda0..eb2c3b61b 100644
--- a/lhotse/recipes/ali_meeting.py
+++ b/lhotse/recipes/ali_meeting.py
@@ -25,8 +25,9 @@
 
 from tqdm import tqdm
 
-from lhotse import fix_manifests, validate_recordings_and_supervisions
+from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import normalize_text_alimeeting
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
@@ -204,11 +205,11 @@ def prepare_ali_meeting(
                         )
                         supervisions.append(segment)
 
+        # Fix manifests
         recording_set, supervision_set = fix_manifests(
             RecordingSet.from_recordings(recordings),
             SupervisionSet.from_segments(supervisions),
         )
-        # Fix manifests
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/aspire.py b/lhotse/recipes/aspire.py
index 71e79508a..0a672bf24 100644
--- a/lhotse/recipes/aspire.py
+++ b/lhotse/recipes/aspire.py
@@ -34,8 +34,9 @@
 from pathlib import Path
 from typing import Dict, NamedTuple, Optional, Union
 
-from lhotse import fix_manifests, validate_recordings_and_supervisions
+from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import AudioSource, Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, Seconds
 
diff --git a/lhotse/recipes/atcosim.py b/lhotse/recipes/atcosim.py
index ddec6f0fb..e29a90060 100644
--- a/lhotse/recipes/atcosim.py
+++ b/lhotse/recipes/atcosim.py
@@ -18,6 +18,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import (
     Pathlike,
@@ -245,4 +246,9 @@ def prepare_atcosim(
 
     recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
     supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
+
+    logging.warning(
+        "Manifests are lazily materialized. You may want to call `lhotse.qa.fix_manifests()`"
+        " to ensure that all supervisions fall within the corresponding recordings."
+    )
     return recordings, supervisions
diff --git a/lhotse/recipes/audio_mnist.py b/lhotse/recipes/audio_mnist.py
index 6b52597cb..646652a36 100644
--- a/lhotse/recipes/audio_mnist.py
+++ b/lhotse/recipes/audio_mnist.py
@@ -27,6 +27,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.serialization import load_json
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download
@@ -132,6 +133,7 @@ def prepare_audio_mnist(
         )
 
     supervisions = SupervisionSet.from_segments(supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/bengaliai_speech.py b/lhotse/recipes/bengaliai_speech.py
index f48c34176..e8531f7ba 100644
--- a/lhotse/recipes/bengaliai_speech.py
+++ b/lhotse/recipes/bengaliai_speech.py
@@ -30,6 +30,7 @@
     set_ffmpeg_torchaudio_info_enabled,
 )
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.recipes.utils import manifests_exist
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -189,6 +190,10 @@ def prepare_bengaliai_speech(
             num_jobs=num_jobs,
         )
 
+        # Fix manifests
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
         if output_dir is not None:
             supervision_set.to_file(
                 output_dir / f"bengaliai_speech_supervisions_{part}.jsonl.gz"
diff --git a/lhotse/recipes/broadcast_news.py b/lhotse/recipes/broadcast_news.py
index efaa6a525..5ba4da0f2 100644
--- a/lhotse/recipes/broadcast_news.py
+++ b/lhotse/recipes/broadcast_news.py
@@ -19,6 +19,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, check_and_rglob, recursion_limit
 
@@ -65,6 +66,7 @@ def prepare_broadcast_news(
         chain.from_iterable(sups["segments"] for sups in supervisions_list)
     )
 
+    recordings, segment_supervisions = fix_manifests(recordings, segment_supervisions)
     validate_recordings_and_supervisions(recordings, segment_supervisions)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/bvcc.py b/lhotse/recipes/bvcc.py
index 3319c6749..eda134d52 100644
--- a/lhotse/recipes/bvcc.py
+++ b/lhotse/recipes/bvcc.py
@@ -9,6 +9,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike
 
 
@@ -76,6 +77,11 @@ def prepare_bvcc(
         )
     )
     main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
+
+    # Fix manifests
+    main1_dev_recs, main1_dev_sup = fix_manifests(main1_dev_recs, main1_dev_sup)
+    validate_recordings_and_supervisions(main1_dev_recs, main1_dev_sup)
+
     manifests["main1_dev"] = {
         "recordings": main1_dev_recs,
         "supervisions": main1_dev_sup,
@@ -90,6 +96,11 @@ def prepare_bvcc(
         )
     )
     main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
+
+    # Fix manifests
+    main1_train_recs, main1_train_sup = fix_manifests(main1_train_recs, main1_train_sup)
+    validate_recordings_and_supervisions(main1_train_recs, main1_train_sup)
+
     manifests["main1_train"] = {
         "recordings": main1_train_recs,
         "supervisions": main1_train_sup,
@@ -134,6 +145,11 @@ def prepare_bvcc(
         )
     )
     ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
+
+    # Fix manifests
+    ood1_dev_recs, ood1_dev_sup = fix_manifests(ood1_dev_recs, ood1_dev_sup)
+    validate_recordings_and_supervisions(ood1_dev_recs, ood1_dev_sup)
+
     manifests["ood1_dev"] = {
         "recordings": ood1_dev_recs,
         "supervisions": ood1_dev_sup,
@@ -148,6 +164,11 @@ def prepare_bvcc(
         )
     )
     ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
+
+    # Fix manifests
+    ood1_train_recs, ood1_train_sup = fix_manifests(ood1_train_recs, ood1_train_sup)
+    validate_recordings_and_supervisions(ood1_train_recs, ood1_train_sup)
+
     manifests["ood1_train"] = {
         "recordings": ood1_train_recs,
         "supervisions": ood1_train_sup,
diff --git a/lhotse/recipes/cmu_arctic.py b/lhotse/recipes/cmu_arctic.py
index 9eb388d8e..d4a4d3adf 100644
--- a/lhotse/recipes/cmu_arctic.py
+++ b/lhotse/recipes/cmu_arctic.py
@@ -35,7 +35,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import remove_missing_recordings_and_supervisions
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
 BASE_URL = "http://festvox.org/cmu_arctic/packed/"
@@ -167,9 +167,7 @@ def prepare_cmu_arctic(
     supervisions = SupervisionSet.from_segments(supervisions)
 
     # There seem to be 20 recordings missing; remove the before validation
-    recordings, supervisions = remove_missing_recordings_and_supervisions(
-        recordings, supervisions
-    )
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/cmu_indic.py b/lhotse/recipes/cmu_indic.py
index f3e427b99..fc651c4f3 100644
--- a/lhotse/recipes/cmu_indic.py
+++ b/lhotse/recipes/cmu_indic.py
@@ -30,7 +30,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import remove_missing_recordings_and_supervisions
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
 BASE_URL = "http://festvox.org/h2r_indic/"
@@ -194,9 +194,7 @@ def prepare_cmu_indic(
     supervisions = SupervisionSet.from_segments(supervisions)
 
     # There seem to be 20 recordings missing; remove the before validation
-    recordings, supervisions = remove_missing_recordings_and_supervisions(
-        recordings, supervisions
-    )
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/cmu_kids.py b/lhotse/recipes/cmu_kids.py
index 31e8f3f83..1fa2e79e5 100644
--- a/lhotse/recipes/cmu_kids.py
+++ b/lhotse/recipes/cmu_kids.py
@@ -3,22 +3,22 @@
 
 Summary of corpus from LDC webpage:
 
-This database is comprised of sentences read aloud by children. It was originally designed 
-in order to create a training set of children's speech for the SPHINX II automatic speech 
+This database is comprised of sentences read aloud by children. It was originally designed
+in order to create a training set of children's speech for the SPHINX II automatic speech
 recognizer for its use in the LISTEN project at Carnegie Mellon University.
 
-The children range in age from six to eleven (see details below) and were in first through 
-third grades (the 11-year-old was in 6th grade) at the time of recording. There were 24 male 
+The children range in age from six to eleven (see details below) and were in first through
+third grades (the 11-year-old was in 6th grade) at the time of recording. There were 24 male
 and 52 female speakers. There are 5,180 utterances in all.
 
 The speakers come from two separate populations:
 
- 1. SIM95: They were recorded in the summer of 1995 and were enrolled in either the Chatham 
-    College Summer Camp or the Mount Lebanon Extended Day Summer Fun program in Pittsburgh. 
+ 1. SIM95: They were recorded in the summer of 1995 and were enrolled in either the Chatham
+    College Summer Camp or the Mount Lebanon Extended Day Summer Fun program in Pittsburgh.
     They were recorded on-site. There are 44 speakers and 3,333 utterances in this set. They
     "good" reading examples.
- 2. FP: These are examples of errorful reading and dialectic variants. The readers come from 
-    Fort Pitt School in Pittsburgh and were recorded in April 1996. There are 32 speakers and 
+ 2. FP: These are examples of errorful reading and dialectic variants. The readers come from
+    Fort Pitt School in Pittsburgh and were recorded in April 1996. There are 32 speakers and
     1,847 utterances in this set.
 
 The user should be aware that the speakers' dialect partly reflects what is locally called "Pittsburghese."
@@ -36,6 +36,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
 
@@ -129,6 +130,7 @@ def prepare_cmu_kids(
     recordings = RecordingSet.from_recordings(recordings)
     supervisions = SupervisionSet.from_segments(supervisions)
 
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     manifests = {
diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
index f66033d3b..5a1040645 100644
--- a/lhotse/recipes/commonvoice.py
+++ b/lhotse/recipes/commonvoice.py
@@ -30,6 +30,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
 
@@ -305,6 +306,12 @@ def prepare_commonvoice(
                 num_jobs=num_jobs,
             )
 
+            # Fix manifests
+            recording_set, supervision_set = fix_manifests(
+                recording_set, supervision_set
+            )
+            validate_recordings_and_supervisions(recording_set, supervision_set)
+
             supervision_set.to_file(
                 output_dir / f"cv-{lang}_supervisions_{part}.jsonl.gz"
             )
diff --git a/lhotse/recipes/csj.py b/lhotse/recipes/csj.py
index 58434de6d..2a860dbbd 100644
--- a/lhotse/recipes/csj.py
+++ b/lhotse/recipes/csj.py
@@ -116,6 +116,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -889,6 +890,10 @@ def prepare_manifests(
 
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
+
+            recording_set, supervision_set = fix_manifests(
+                recording_set, supervision_set
+            )
             validate_recordings_and_supervisions(recording_set, supervision_set)
 
             if manifest_dir:
diff --git a/lhotse/recipes/cslu_kids.py b/lhotse/recipes/cslu_kids.py
index cc3e487c2..6cf0142b3 100644
--- a/lhotse/recipes/cslu_kids.py
+++ b/lhotse/recipes/cslu_kids.py
@@ -3,19 +3,19 @@
 
 Summary of corpus from LDC webpage:
 
-Collection of spontaneous and prompted speech from 1100 children between Kindergarten 
-and Grade 10 in the Forest Grove School District in Oregon. All children -- approximately 
-100 children at each grade level -- read approximately 60 items from a total list of 319 
-phonetically-balanced but simple words, sentences or digit strings. Each utterance of 
-spontaneous speech begins with a recitation of the alphabet and contains a monologue of 
-about one minute in duration. This release consists of 1017 files containing approximately 
+Collection of spontaneous and prompted speech from 1100 children between Kindergarten
+and Grade 10 in the Forest Grove School District in Oregon. All children -- approximately
+100 children at each grade level -- read approximately 60 items from a total list of 319
+phonetically-balanced but simple words, sentences or digit strings. Each utterance of
+spontaneous speech begins with a recitation of the alphabet and contains a monologue of
+about one minute in duration. This release consists of 1017 files containing approximately
 8-10 minutes of speech per speaker. Corresponding word-level transcriptions are also included.
 
 Prompted speech is verified and divided into following categories:
 
 1 Good: Only the target word is said.
 2 Maybe: Target word is present, but there's other junk in the file.
-3 Bad: Target word is not said. 
+3 Bad: Target word is not said.
 4 Puff: Same as good, but w/ an air puff.
 
 This data is not available for free - your institution needs to have an LDC subscription.
@@ -29,6 +29,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, check_and_rglob
 
@@ -128,6 +129,7 @@ def prepare_cslu_kids(
     recordings = RecordingSet.from_recordings(recordings)
     supervisions = SupervisionSet.from_segments(supervisions)
 
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     manifests = {
diff --git a/lhotse/recipes/daily_talk.py b/lhotse/recipes/daily_talk.py
index 076543ea4..53948fb9c 100644
--- a/lhotse/recipes/daily_talk.py
+++ b/lhotse/recipes/daily_talk.py
@@ -17,6 +17,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.serialization import load_json
 from lhotse.utils import Pathlike, is_module_available
 
@@ -104,6 +105,7 @@ def prepare_daily_talk(
                 )
             )
     supervisions = SupervisionSet.from_segments(supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(
         recordings=recordings, supervisions=supervisions
     )
diff --git a/lhotse/recipes/dihard3.py b/lhotse/recipes/dihard3.py
index 680afe065..22ff40139 100644
--- a/lhotse/recipes/dihard3.py
+++ b/lhotse/recipes/dihard3.py
@@ -4,7 +4,7 @@
     The DIHARD III corpus consists of multi-domain data prepared to evaluate
     "hard" speaker diarization. It was used for evaluation in the Third DIHARD
     Challenge, organized by NIST and LDC in Winter 2020. It consists of monologues,
-    map task dialogues, broadcast interviews, sociolinguistic interviews, meeting 
+    map task dialogues, broadcast interviews, sociolinguistic interviews, meeting
     speech, speech in restaurants, clinical recordings, and YouTube videos.
     More details can be found at:
     https://dihardchallenge.github.io/dihard3/docs/third_dihard_eval_plan_v1.2.pdf
@@ -17,7 +17,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, check_and_rglob
@@ -80,6 +80,7 @@ def prepare_dihard3(
                 )
             )
 
+        recordings, supervisions = fix_manifests(recordings, supervisions)
         validate_recordings_and_supervisions(recordings, supervisions)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/earnings21.py b/lhotse/recipes/earnings21.py
index 1ae483f22..cee899937 100644
--- a/lhotse/recipes/earnings21.py
+++ b/lhotse/recipes/earnings21.py
@@ -28,7 +28,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download
@@ -166,6 +166,7 @@ def prepare_earnings21(
         supervision_segments.append(s)
     supervision_set = SupervisionSet.from_segments(supervision_segments)
 
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
     validate_recordings_and_supervisions(recording_set, supervision_set)
     if output_dir is not None:
         supervision_set.to_file(output_dir / "earnings21_supervisions_all.jsonl.gz")
diff --git a/lhotse/recipes/earnings22.py b/lhotse/recipes/earnings22.py
index 0aafb058b..1da7e5bfb 100644
--- a/lhotse/recipes/earnings22.py
+++ b/lhotse/recipes/earnings22.py
@@ -28,7 +28,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -153,6 +153,7 @@ def prepare_earnings22(
         supervision_segments.append(s)
     supervision_set = SupervisionSet.from_segments(supervision_segments)
 
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
     validate_recordings_and_supervisions(recording_set, supervision_set)
     if output_dir is not None:
         supervision_set.to_file(output_dir / "earnings22_supervisions_all.jsonl.gz")
diff --git a/lhotse/recipes/gale_arabic.py b/lhotse/recipes/gale_arabic.py
index 9ffa4dcf6..353d9ffb8 100644
--- a/lhotse/recipes/gale_arabic.py
+++ b/lhotse/recipes/gale_arabic.py
@@ -41,8 +41,8 @@
 The `S` corpora contain speech data and the `T` corpora contain the corresponding
 transcriptions. This recipe prepares any subset of these corpora provided as
 arguments, but pairs of speech and transcript corpora must be present. E.g.
-to only prepare phase 3 news speech, the arguments 
-`audio_dirs = ["/export/data/LDC2016S07","/export/data/LDC2017S02"]` and 
+to only prepare phase 3 news speech, the arguments
+`audio_dirs = ["/export/data/LDC2016S07","/export/data/LDC2017S02"]` and
 `transcript_dirs = ["/export/data/LDC2016T17","/export/data/LDC2017T04"]` must
 be provided to the `prepare_gale_arabic` method.
 
@@ -57,7 +57,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
-from lhotse.qa import trim_supervisions_to_recordings
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, check_and_rglob, is_module_available
 
@@ -136,7 +136,7 @@ def prepare_gale_arabic(
     supervisions = SupervisionSet.from_segments(parse_transcripts(transcript_paths))
 
     # Some supervisions exceed recording boundaries, so here we trim them
-    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     manifests = defaultdict(dict)
diff --git a/lhotse/recipes/gale_mandarin.py b/lhotse/recipes/gale_mandarin.py
index 4fc711630..dfa6e8782 100644
--- a/lhotse/recipes/gale_mandarin.py
+++ b/lhotse/recipes/gale_mandarin.py
@@ -4,13 +4,13 @@
 Audio: LDC2013S08, LDC2013S04, LDC2014S09, LDC2015S06, LDC2015S13, LDC2016S03
 Text: LDC2013T20, LDC2013T08, LDC2014T28, LDC2015T09, LDC2015T25, LDC2016T12
 
-# Training:  Testing: 
+# Training:  Testing:
 
 The `S` corpora contain speech data and the `T` corpora contain the corresponding
 transcriptions. This recipe prepares any subset of these corpora provided as
 arguments, but pairs of speech and transcript corpora must be present. E.g.
-to only prepare phase 3 news speech, the arguments 
-`audio_dirs = ["/export/data/LDC2013S08","/export/data/LDC2014S09"]` and 
+to only prepare phase 3 news speech, the arguments
+`audio_dirs = ["/export/data/LDC2013S08","/export/data/LDC2014S09"]` and
 `transcript_dirs = ["/export/data/LDC2013T20","/export/data/LDC2014T28"]` must
 be provided to the `prepare_gale_mandarin` method.
 
@@ -26,7 +26,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
-from lhotse.qa import trim_supervisions_to_recordings
+from lhotse.qa import fix_manifests
 from lhotse.recipes.nsc import check_dependencies
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, check_and_rglob, is_module_available
@@ -117,7 +117,7 @@ def prepare_gale_mandarin(
     ).filter(lambda s: s.recording_id in audio_paths)
 
     # Some supervisions exceed recording boundaries, so here we trim them
-    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     TEST = [
diff --git a/lhotse/recipes/heroico.py b/lhotse/recipes/heroico.py
index 3fc61c892..65f7d9174 100644
--- a/lhotse/recipes/heroico.py
+++ b/lhotse/recipes/heroico.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import AudioSource, Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -277,6 +277,7 @@ def prepare_heroico(
             for idx in audio.recordings
         )
 
+        audio, supervision = fix_manifests(audio, supervision)
         validate_recordings_and_supervisions(audio, supervision)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/hifitts.py b/lhotse/recipes/hifitts.py
index 397f71d29..14a088a67 100644
--- a/lhotse/recipes/hifitts.py
+++ b/lhotse/recipes/hifitts.py
@@ -34,6 +34,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.serialization import load_jsonl
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -203,6 +204,8 @@ def prepare_single_partition(
         )
     recordings = RecordingSet.from_recordings(recordings)
     supervisions = SupervisionSet.from_segments(supervisions)
+
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
     return recordings, supervisions
 
diff --git a/lhotse/recipes/himia.py b/lhotse/recipes/himia.py
index 9e879151c..59b070fb2 100644
--- a/lhotse/recipes/himia.py
+++ b/lhotse/recipes/himia.py
@@ -20,7 +20,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -225,6 +225,7 @@ def _prepare_train_dev_test(
             supervisions.append(segment)
     recording_set = RecordingSet.from_recordings(recordings)
     supervision_set = SupervisionSet.from_segments(supervisions)
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
     validate_recordings_and_supervisions(recording_set, supervision_set)
     return recording_set, supervision_set
 
@@ -278,6 +279,7 @@ def _prepare_cw_test(corpus_path: Path) -> Tuple[RecordingSet, SupervisionSet]:
 
     recording_set = RecordingSet.from_recordings(recordings)
     supervision_set = SupervisionSet.from_segments(supervisions)
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
     validate_recordings_and_supervisions(recording_set, supervision_set)
     return recording_set, supervision_set
 
diff --git a/lhotse/recipes/iwslt22_ta.py b/lhotse/recipes/iwslt22_ta.py
index b7ff3dd0e..4ade14010 100644
--- a/lhotse/recipes/iwslt22_ta.py
+++ b/lhotse/recipes/iwslt22_ta.py
@@ -1,14 +1,14 @@
 # Copyright    2023  Johns Hopkins        (authors: Amir Hussein, Matthew Wiesner)
 
 """
-The IWSLT Tunisian dataset is a 3-way parallel dataset consisting of approximately 160 hours 
-and 200,000 lines of aligned audio, Tunisian transcripts, and English translations. This dataset 
-comprises conversational telephone speech recorded at a sampling rate of 8kHz. The train, dev, 
-and test1 splits of the iwslt2022 shared task correspond to catalog number LDC2022E01. Please 
-note that access to this data requires an LDC subscription from your institution.To obtain this 
-dataset, you should download the predefined splits by running the following command: 
-git clone https://github.com/kevinduh/iwslt22-dialect.git. For more detailed information about 
-the shared task, please refer to the task paper available at this link: 
+The IWSLT Tunisian dataset is a 3-way parallel dataset consisting of approximately 160 hours
+and 200,000 lines of aligned audio, Tunisian transcripts, and English translations. This dataset
+comprises conversational telephone speech recorded at a sampling rate of 8kHz. The train, dev,
+and test1 splits of the iwslt2022 shared task correspond to catalog number LDC2022E01. Please
+note that access to this data requires an LDC subscription from your institution. To obtain this
+dataset, you should download the predefined splits by running the following command:
+git clone https://github.com/kevinduh/iwslt22-dialect.git. For more detailed information about
+the shared task, please refer to the task paper available at this link:
 https://aclanthology.org/2022.iwslt-1.10/.
 """
 
@@ -30,10 +30,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import (
-    remove_missing_recordings_and_supervisions,
-    trim_supervisions_to_recordings,
-)
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike
 
 # English annotation rules:
@@ -163,11 +160,7 @@ def prepare_iwslt22_ta(
         supervisions = deduplicate_supervisions(supervisions)
         supervisions = SupervisionSet.from_segments(supervisions)
         recordings = RecordingSet.from_recordings(recordings.values())
-        recordings, supervisions = remove_missing_recordings_and_supervisions(
-            recordings,
-            supervisions,
-        )
-        supervisions = trim_supervisions_to_recordings(recordings, supervisions)
+        recordings, supervisions = fix_manifests(recordings, supervisions)
         validate_recordings_and_supervisions(recordings, supervisions)
         for split in ("train", "dev", "test1"):
             sups_ = supervisions.filter(lambda s: s.recording_id in split_files[split])
diff --git a/lhotse/recipes/kespeech.py b/lhotse/recipes/kespeech.py
index fa838c248..1c5e76156 100644
--- a/lhotse/recipes/kespeech.py
+++ b/lhotse/recipes/kespeech.py
@@ -1,10 +1,10 @@
 """
-The KeSpeech is an open source speech dataset, KeSpeech, which involves 1,542 hours of speech 
-signals recorded by 27,237 speakers in 34 cities in China, and the pronunciation includes 
-standard Mandarin and its 8 subdialects. The new dataset possesses several properties. 
-The dataset provides multiple labels including content transcription, speaker identity and 
-subdialect, hence supporting a variety of speech processing tasks, such as speech recognition, 
-speaker recognition, and subdialect identification, as well as other advanced techniques 
+KeSpeech is an open source speech dataset which involves 1,542 hours of speech
+signals recorded by 27,237 speakers in 34 cities in China, and the pronunciation includes
+standard Mandarin and its 8 subdialects. The new dataset possesses several properties.
+The dataset provides multiple labels including content transcription, speaker identity and
+subdialect, hence supporting a variety of speech processing tasks, such as speech recognition,
+speaker recognition, and subdialect identification, as well as other advanced techniques
 like multi-task learning and conditional learning.
 
 Full paper: https://openreview.net/forum?id=b3Zoeq2sCLq
@@ -19,7 +19,7 @@
 from tqdm.auto import tqdm
 
 from lhotse.audio import AudioSource, Recording, RecordingSet, info
-from lhotse.qa import validate_recordings_and_supervisions
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.recipes.utils import manifests_exist
 from lhotse.serialization import load_jsonl
 from lhotse.supervision import SupervisionSegment, SupervisionSet
@@ -97,6 +97,11 @@ def prepare_kespeech(
 
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
+
+            # Fix manifests
+            recording_set, supervision_set = fix_manifests(
+                recording_set, supervision_set
+            )
             validate_recordings_and_supervisions(recording_set, supervision_set)
 
             if output_dir is not None:
diff --git a/lhotse/recipes/l2_arctic.py b/lhotse/recipes/l2_arctic.py
index 9967ad0ef..aec00d8ca 100644
--- a/lhotse/recipes/l2_arctic.py
+++ b/lhotse/recipes/l2_arctic.py
@@ -26,6 +26,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike
 
 SPEAKER_DESCRIPTION = """
@@ -164,6 +165,8 @@ def prepare_l2_arctic(
         )
     supervisions = SupervisionSet.from_segments(supervisions)
 
+    # Fix the manifests to make sure they are valid
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     splits = {
diff --git a/lhotse/recipes/librilight.py b/lhotse/recipes/librilight.py
index 6d08413a8..3dd0dff31 100644
--- a/lhotse/recipes/librilight.py
+++ b/lhotse/recipes/librilight.py
@@ -4,7 +4,7 @@
 Libri-light is a benchmark for the training of automatic speech recognition (ASR)
 systems with limited or no supervision.
 
-It contains a large dataset of 60K hours of unlabelled speech from audiobooks in 
+It contains a large dataset of 60K hours of unlabelled speech from audiobooks in
 English and a small labelled dataset (10h, 1h, and 10 min) plus metrics,
 trainable baseline models, and pretrained models that use these datasets.
 
@@ -23,6 +23,7 @@
 from tqdm.auto import tqdm
 
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.recipes.utils import manifests_exist
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -98,6 +99,10 @@ def _prepare_subset(
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
 
+        # Fix manifests
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
     return recording_set, supervision_set
 
 
diff --git a/lhotse/recipes/librimix.py b/lhotse/recipes/librimix.py
index 60b5a2a20..abb34e6d6 100644
--- a/lhotse/recipes/librimix.py
+++ b/lhotse/recipes/librimix.py
@@ -5,7 +5,7 @@
 from typing import Dict, Optional, Union
 from zipfile import ZipFile
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import AudioSource, Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, Seconds, resumable_download
@@ -71,6 +71,11 @@ def prepare_librimix(
         if row["length"] / sampling_rate > min_segment_seconds
     )
     supervision_sources = make_corresponding_supervisions(audio_sources)
+
+    # Fix manifests and validate them
+    audio_sources, supervision_sources = fix_manifests(
+        audio_sources, supervision_sources
+    )
     validate_recordings_and_supervisions(audio_sources, supervision_sources)
     if output_dir is not None:
         audio_sources.to_file(output_dir / "librimix_recordings_sources.jsonl.gz")
@@ -100,6 +105,7 @@ def prepare_librimix(
             if row["length"] / sampling_rate > min_segment_seconds
         )
         supervision_mix = make_corresponding_supervisions(audio_mix)
+        audio_mix, supervision_mix = fix_manifests(audio_mix, supervision_mix)
         validate_recordings_and_supervisions(audio_mix, supervision_mix)
         if output_dir is not None:
             audio_mix.to_file(output_dir / "librimix_recordings_mix.jsonl.gz")
@@ -126,6 +132,7 @@ def prepare_librimix(
             if row["length"] / sampling_rate > min_segment_seconds
         )
         supervision_noise = make_corresponding_supervisions(audio_noise)
+        audio_noise, supervision_noise = fix_manifests(audio_noise, supervision_noise)
         validate_recordings_and_supervisions(audio_noise, supervision_noise)
         if output_dir is not None:
             audio_noise.to_file(output_dir / "librimix_recordings_noise.jsonl.gz")
diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
index 1145b9a9b..0b654514a 100644
--- a/lhotse/recipes/librispeech.py
+++ b/lhotse/recipes/librispeech.py
@@ -9,7 +9,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
@@ -204,6 +204,9 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
+            recording_set, supervision_set = fix_manifests(
+                recording_set, supervision_set
+            )
             validate_recordings_and_supervisions(recording_set, supervision_set)
 
             if output_dir is not None:
diff --git a/lhotse/recipes/ljspeech.py b/lhotse/recipes/ljspeech.py
index 4464a31fb..ab777b6c3 100644
--- a/lhotse/recipes/ljspeech.py
+++ b/lhotse/recipes/ljspeech.py
@@ -16,7 +16,7 @@
 from pathlib import Path
 from typing import Dict, Optional, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.features import Fbank
 from lhotse.features.base import TorchaudioFeatureExtractor
@@ -95,6 +95,7 @@ def prepare_ljspeech(
     recording_set = RecordingSet.from_recordings(recordings)
     supervision_set = SupervisionSet.from_segments(supervisions)
 
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
     validate_recordings_and_supervisions(recording_set, supervision_set)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/magicdata.py b/lhotse/recipes/magicdata.py
index 4f9e4ffca..2fa595d45 100644
--- a/lhotse/recipes/magicdata.py
+++ b/lhotse/recipes/magicdata.py
@@ -16,7 +16,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -185,6 +185,8 @@ def prepare_magicdata(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/mgb2.py b/lhotse/recipes/mgb2.py
index 5dd509c3b..d6bddde53 100644
--- a/lhotse/recipes/mgb2.py
+++ b/lhotse/recipes/mgb2.py
@@ -168,7 +168,8 @@ def prepare_mgb2(
 
             if text_cleaning is True:
                 supervisions = supervisions.transform_text(cleaning)
-            recordings, supervisions = fix_manifests(recordings, supervisions)
+
+        recordings, supervisions = fix_manifests(recordings, supervisions)
         validate_recordings_and_supervisions(recordings, supervisions)
 
         # saving recordings and supervisions
diff --git a/lhotse/recipes/mls.py b/lhotse/recipes/mls.py
index 952e9f911..05c821e99 100644
--- a/lhotse/recipes/mls.py
+++ b/lhotse/recipes/mls.py
@@ -12,7 +12,8 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import *
+from lhotse import RecordingSet, SupervisionSegment, SupervisionSet
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.utils import Pathlike
 
 
diff --git a/lhotse/recipes/mobvoihotwords.py b/lhotse/recipes/mobvoihotwords.py
index c45233628..e3879c07f 100644
--- a/lhotse/recipes/mobvoihotwords.py
+++ b/lhotse/recipes/mobvoihotwords.py
@@ -19,7 +19,7 @@
 from pathlib import Path
 from typing import Dict, Optional, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import SupervisionSegment, SupervisionSet
@@ -137,6 +137,7 @@ def prepare_mobvoihotwords(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/mtedx.py b/lhotse/recipes/mtedx.py
index 7df2dfeab..486a41fda 100644
--- a/lhotse/recipes/mtedx.py
+++ b/lhotse/recipes/mtedx.py
@@ -39,10 +39,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import (
-    remove_missing_recordings_and_supervisions,
-    trim_supervisions_to_recordings,
-)
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
 
 # Keep Markings such as vowel signs, all letters, and decimal numbers
@@ -226,10 +223,7 @@ def prepare_single_mtedx_language(
                 logging.warning(f"No supervisions found in {text_dir}")
             supervisions = SupervisionSet.from_segments(supervisions)
 
-            recordings, supervisions = remove_missing_recordings_and_supervisions(
-                recordings, supervisions
-            )
-            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
+            recordings, supervisions = fix_manifests(recordings, supervisions)
             validate_recordings_and_supervisions(recordings, supervisions)
 
             manifests[split] = {
diff --git a/lhotse/recipes/nsc.py b/lhotse/recipes/nsc.py
index c7b1320fb..5f002c428 100644
--- a/lhotse/recipes/nsc.py
+++ b/lhotse/recipes/nsc.py
@@ -39,6 +39,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.parallel import parallel_map
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike
 
 logger = logging.getLogger(__name__)
@@ -156,17 +157,15 @@ def prepare_nsc(
     else:
         raise ValueError(f"Unknown dataset part: {dataset_part}")
 
-    validate_recordings_and_supervisions(**manifests)
+    # Fix the manifests to make sure they are valid
+    recordings, supervisions = fix_manifests(**manifests)
+    validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:
         output_dir = Path(output_dir)
         output_dir.mkdir(parents=True, exist_ok=True)
-        manifests["supervisions"].to_file(
-            output_dir / f"nsc_supervisions_{dataset_part}.jsonl.gz"
-        )
-        manifests["recordings"].to_file(
-            output_dir / f"nsc_recordings_{dataset_part}.jsonl.gz"
-        )
+        supervisions.to_file(output_dir / f"nsc_supervisions_{dataset_part}.jsonl.gz")
+        recordings.to_file(output_dir / f"nsc_recordings_{dataset_part}.jsonl.gz")
 
-    return manifests
+    return {"recordings": recordings, "supervisions": supervisions}
 
diff --git a/lhotse/recipes/peoples_speech.py b/lhotse/recipes/peoples_speech.py
index a25b40c54..799442fcd 100644
--- a/lhotse/recipes/peoples_speech.py
+++ b/lhotse/recipes/peoples_speech.py
@@ -20,7 +20,7 @@
 from tqdm.auto import tqdm
 
 from lhotse.audio import AudioSource, Recording, RecordingSet, info
-from lhotse.qa import validate_recordings_and_supervisions
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.recipes.utils import manifests_exist
 from lhotse.serialization import load_jsonl
 from lhotse.supervision import SupervisionSegment, SupervisionSet
@@ -107,6 +107,7 @@ def _prepare_subset(
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
 
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(
             recordings=recording_set, supervisions=supervision_set
         )
diff --git a/lhotse/recipes/primewords.py b/lhotse/recipes/primewords.py
index af316d89b..752cad2df 100644
--- a/lhotse/recipes/primewords.py
+++ b/lhotse/recipes/primewords.py
@@ -13,7 +13,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -119,6 +119,7 @@ def prepare_primewords(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/speechcommands.py b/lhotse/recipes/speechcommands.py
index 7c5c5185b..10b3d75ad 100644
--- a/lhotse/recipes/speechcommands.py
+++ b/lhotse/recipes/speechcommands.py
@@ -19,7 +19,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import SupervisionSegment, SupervisionSet
@@ -182,6 +182,12 @@ def _prepare_train(
     train_recording_set = RecordingSet.from_recordings(train_recordings)
     train_supervision_set = SupervisionSet.from_segments(train_supervisions)
 
+    # Fix manifests
+    train_recording_set, train_supervision_set = fix_manifests(
+        train_recording_set, train_supervision_set
+    )
+    validate_recordings_and_supervisions(train_recording_set, train_supervision_set)
+
     return train_recording_set, train_supervision_set
 
 
@@ -238,6 +244,12 @@ def _prepare_valid(
     valid_recording_set = RecordingSet.from_recordings(valid_recordings)
     valid_supervision_set = SupervisionSet.from_segments(valid_supervisions)
 
+    # Fix manifests
+    valid_recording_set, valid_supervision_set = fix_manifests(
+        valid_recording_set, valid_supervision_set
+    )
+    validate_recordings_and_supervisions(valid_recording_set, valid_supervision_set)
+
     return valid_recording_set, valid_supervision_set
 
 
@@ -307,6 +319,12 @@ def _prepare_test(
     test_recording_set = RecordingSet.from_recordings(test_recordings)
     test_supervision_set = SupervisionSet.from_segments(test_supervisions)
 
+    # Fix manifests
+    test_recording_set, test_supervision_set = fix_manifests(
+        test_recording_set, test_supervision_set
+    )
+    validate_recordings_and_supervisions(test_recording_set, test_supervision_set)
+
     return test_recording_set, test_supervision_set
 
 
diff --git a/lhotse/recipes/spgispeech.py b/lhotse/recipes/spgispeech.py
index c3cd901c8..7c679a172 100644
--- a/lhotse/recipes/spgispeech.py
+++ b/lhotse/recipes/spgispeech.py
@@ -167,4 +167,8 @@ def audio_read_worker(p: Path) -> Recording:
             "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
         }
 
+    logging.warning(
+        "Manifests are lazily materialized. You may want to call `lhotse.qa.fix_manifests()`"
+        " to ensure that all supervisions fall within the corresponding recordings."
+    )
     return manifests
diff --git a/lhotse/recipes/stcmds.py b/lhotse/recipes/stcmds.py
index 6881b7faa..3d1903729 100644
--- a/lhotse/recipes/stcmds.py
+++ b/lhotse/recipes/stcmds.py
@@ -14,7 +14,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -129,6 +129,8 @@ def prepare_stcmds(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/tal_asr.py b/lhotse/recipes/tal_asr.py
index b1adb2bf0..b4ceed750 100644
--- a/lhotse/recipes/tal_asr.py
+++ b/lhotse/recipes/tal_asr.py
@@ -10,7 +10,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -91,6 +91,8 @@ def prepare_tal_asr(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/tal_csasr.py b/lhotse/recipes/tal_csasr.py
index f4f5b774f..092c4337a 100644
--- a/lhotse/recipes/tal_csasr.py
+++ b/lhotse/recipes/tal_csasr.py
@@ -10,7 +10,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -117,6 +117,8 @@ def prepare_tal_csasr(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/thchs_30.py b/lhotse/recipes/thchs_30.py
index 4fd7df43a..c797245d9 100644
--- a/lhotse/recipes/thchs_30.py
+++ b/lhotse/recipes/thchs_30.py
@@ -15,7 +15,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -138,6 +138,8 @@ def prepare_thchs_30(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/timit.py b/lhotse/recipes/timit.py
index f965705de..aa79e5332 100644
--- a/lhotse/recipes/timit.py
+++ b/lhotse/recipes/timit.py
@@ -13,7 +13,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download
@@ -143,6 +143,8 @@ def prepare_timit(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/uwb_atcc.py b/lhotse/recipes/uwb_atcc.py
index e270fd8f7..0fb233d16 100644
--- a/lhotse/recipes/uwb_atcc.py
+++ b/lhotse/recipes/uwb_atcc.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import (
@@ -342,28 +342,6 @@ def strip_accents(s):
     ("STAND BYE", "STANDBY"),
 )
 
-BRACKET_PADDING_PATTERN1 = re.compile(r"([\w\.\+])(\[|\()")
-BRACKET_PADDING_PATTERN2 = re.compile(r"(\]|\))([\w\+])")
-COMMENT_PATTERN = re.compile(r"\[comment_\|].*?\[\|_comment]")
-BACKGROUND_SPEECH_PATTERN = re.compile(
-    r"\[background_speech_\|](.*?)\[\|_background_speech]"
-)
-NOISE_PATTERN = re.compile(r"\[noise_\|](.*?)\[\|_noise]")
-SPEAKER_PATTERN = re.compile(r"\[speaker_\|](.*?)\[\|_speaker]")
-DECIMAL_NUMBER_PATTERN = re.compile(r"\.([0-9])")
-NUMBER_DECIMAL_PATTERN = re.compile(r"([0-9])\.")
-PHONETIC_INTERRUPTED_PATTERN1 = re.compile(r"([A-Z]+\+)")
-PHONETIC_INTERRUPTED_PATTERN2 = re.compile(r"(\+[A-Z]+)")
-INTERRUPTED_PATTERN1 = re.compile(r"(\w+\+)")
-INTERRUPTED_PATTERN2 = re.compile(r"(\+\w+)")
-ABBREVIATION_PATTERN = re.compile(r"\(((\w*|\s*|\+)*)\(((\w*|\s*)*)\)\)")
-SPLIT_NUMERIC_ALPHA = re.compile(r"([0-9])([A-Za-z])")
-SPLIT_ALPHA_NUMERIC = re.compile(r"([A-Za-z])([0-9])")
-NO_ENG_PATTERN = re.compile(r"\[NO_ENG_\|](.*?)\[\|_NO_ENG]")
-CZECH_PATTERN = re.compile(r"\[CZECH_\|](.*?)\[\|_CZECH]")
-UNINTELLIGIBLE_PATTERN = re.compile(r"\[UNINTELLIGIBLE_\|](.*?)\[\|_UNINTELLIGIBLE]")
-WHITESPACE_PATTERN = re.compile(r"  +")
-
 
 def text_normalize(
     text: str,
@@ -375,11 +353,37 @@ def text_normalize(
     partial_sym: str,  # When None, will output partial words
     unknown_sym: str,
 ):
+
     assert is_module_available(
         "num2words"
     ), "Please run 'pip install num2words' for number to word normalization."
     from num2words import num2words
 
+    # regex patterns
+    BRACKET_PADDING_PATTERN1 = re.compile(r"([\w\.\+])(\[|\()")
+    BRACKET_PADDING_PATTERN2 = re.compile(r"(\]|\))([\w\+])")
+    COMMENT_PATTERN = re.compile(r"\[comment_\|].*?\[\|_comment]")
+    BACKGROUND_SPEECH_PATTERN = re.compile(
+        r"\[background_speech_\|](.*?)\[\|_background_speech]"
+    )
+    NOISE_PATTERN = re.compile(r"\[noise_\|](.*?)\[\|_noise]")
+    SPEAKER_PATTERN = re.compile(r"\[speaker_\|](.*?)\[\|_speaker]")
+    DECIMAL_NUMBER_PATTERN = re.compile(r"\.([0-9])")
+    NUMBER_DECIMAL_PATTERN = re.compile(r"([0-9])\.")
+    PHONETIC_INTERRUPTED_PATTERN1 = re.compile(r"([A-Z]+\+)")
+    PHONETIC_INTERRUPTED_PATTERN2 = re.compile(r"(\+[A-Z]+)")
+    INTERRUPTED_PATTERN1 = re.compile(r"(\w+\+)")
+    INTERRUPTED_PATTERN2 = re.compile(r"(\+\w+)")
+    ABBREVIATION_PATTERN = re.compile(r"\(((\w*|\s*|\+)*)\(((\w*|\s*)*)\)\)")
+    SPLIT_NUMERIC_ALPHA = re.compile(r"([0-9])([A-Za-z])")
+    SPLIT_ALPHA_NUMERIC = re.compile(r"([A-Za-z])([0-9])")
+    NO_ENG_PATTERN = re.compile(r"\[NO_ENG_\|](.*?)\[\|_NO_ENG]")
+    CZECH_PATTERN = re.compile(r"\[CZECH_\|](.*?)\[\|_CZECH]")
+    UNINTELLIGIBLE_PATTERN = re.compile(
+        r"\[UNINTELLIGIBLE_\|](.*?)\[\|_UNINTELLIGIBLE]"
+    )
+    WHITESPACE_PATTERN = re.compile(r"  +")
+
     text = BRACKET_PADDING_PATTERN1.sub(r"\1 \2", text)
     text = BRACKET_PADDING_PATTERN2.sub(r"\1 \2", text)
     text = text.replace("](", "] (")
@@ -516,6 +520,9 @@ def prepare_uwb_atcc(
     recordings = []
     supervisions = []
 
+    # regex pattern for multiple whitespaces
+    WHITESPACE_PATTERN = re.compile(r"  +")
+
     from tqdm.auto import tqdm
 
     for t in tqdm(trs_files, desc="Preparing"):
@@ -625,6 +632,7 @@ def prepare_uwb_atcc(
     recording_set = RecordingSet.from_recordings(recordings)
     supervision_set = SupervisionSet.from_segments(supervisions)
 
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
     validate_recordings_and_supervisions(recording_set, supervision_set)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/vctk.py b/lhotse/recipes/vctk.py
index 5799a44ef..84bb32db9 100644
--- a/lhotse/recipes/vctk.py
+++ b/lhotse/recipes/vctk.py
@@ -92,7 +92,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import remove_missing_recordings_and_supervisions
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download
 
 EDINBURGH_VCTK_URL = (
@@ -223,9 +223,7 @@ def prepare_vctk(
 
     # note(pzelasko): There were 172 recordings without supervisions when I ran it.
     #                 I am just removing them.
-    recordings, supervisions = remove_missing_recordings_and_supervisions(
-        recordings, supervisions
-    )
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:
diff --git a/lhotse/recipes/voxceleb.py b/lhotse/recipes/voxceleb.py
index 53c2d9202..ef442d324 100644
--- a/lhotse/recipes/voxceleb.py
+++ b/lhotse/recipes/voxceleb.py
@@ -50,7 +50,7 @@
     SupervisionSet,
 )
 from lhotse.manipulation import combine
-from lhotse.qa import validate_recordings_and_supervisions
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.utils import Pathlike, resumable_download
 
 VOXCELEB1_PARTS_URL = [
@@ -264,7 +264,14 @@ def prepare_voxceleb(
             continue
         recordings = manifests[split]["recordings"]
         supervisions = manifests[split]["supervisions"]
+
+        # Fix manifests and validate
+        recordings, supervisions = fix_manifests(recordings, supervisions)
         validate_recordings_and_supervisions(recordings, supervisions)
+
+        # Keep the fixed manifests; they are written to disk below if requested
+        manifests[split]["recordings"] = recordings
+        manifests[split]["supervisions"] = supervisions
         if output_dir is not None:
             recordings.to_file(output_dir / f"voxceleb_recordings_{split}.jsonl.gz")
             supervisions.to_file(output_dir / f"voxceleb_supervisions_{split}.jsonl.gz")
diff --git a/lhotse/recipes/voxpopuli.py b/lhotse/recipes/voxpopuli.py
index 2cc4d37e4..8d49d9055 100644
--- a/lhotse/recipes/voxpopuli.py
+++ b/lhotse/recipes/voxpopuli.py
@@ -37,12 +37,7 @@
 from torch.hub import download_url_to_file
 from tqdm import tqdm
 
-from lhotse import (
-    RecordingSet,
-    SupervisionSegment,
-    SupervisionSet,
-    validate_recordings_and_supervisions,
-)
+from lhotse import RecordingSet, SupervisionSegment, SupervisionSet
 from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.utils import Pathlike, safe_extract
 
diff --git a/lhotse/recipes/xbmu_amdo31.py b/lhotse/recipes/xbmu_amdo31.py
index e690490bd..0c5005ec5 100644
--- a/lhotse/recipes/xbmu_amdo31.py
+++ b/lhotse/recipes/xbmu_amdo31.py
@@ -3,14 +3,14 @@
 XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University.
 publicly available on https://huggingface.co/datasets/syzym/xbmu_amdo31
 
-XBMU-AMDO31 dataset is a speech recognition corpus of Amdo Tibetan dialect. 
-The open source corpus contains 31 hours of speech data and resources related 
-to build speech recognition systems,including transcribed texts and a Tibetan 
+XBMU-AMDO31 dataset is a speech recognition corpus of Amdo Tibetan dialect.
+The open source corpus contains 31 hours of speech data and resources related
+to build speech recognition systems, including transcribed texts and a Tibetan
 pronunciation lexicon.
-(The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused 
+(The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused
 for the Amdo dialect because of the uniformity of the Tibetan language)
-The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR). 
-It was recorded by 66 native speakers of Amdo Tibetan, and the recorded audio was processed and manually inspected. 
+The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR).
+It was recorded by 66 native speakers of Amdo Tibetan, and the recorded audio was processed and manually inspected.
 The dataset has three splits: train, evaluation (dev) and test.Each speaker had approximately 450 sentences,
 with a small number of individuals having fewer than 200 sen.
 
@@ -33,7 +33,7 @@
 
 from tqdm.auto import tqdm
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, safe_extract
@@ -148,6 +148,9 @@ def prepare_xbmu_amdo31(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+
+        # Fix manifests and validate
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:
diff --git a/lhotse/recipes/yesno.py b/lhotse/recipes/yesno.py
index 3e030e0a3..ee64266c3 100644
--- a/lhotse/recipes/yesno.py
+++ b/lhotse/recipes/yesno.py
@@ -30,7 +30,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
-from lhotse import validate_recordings_and_supervisions
+from lhotse import fix_manifests, validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
@@ -152,6 +152,7 @@ def prepare_yesno(
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
 
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

From 97d44437ae098e3b8efe9eb2a0bbd8c152e798cc Mon Sep 17 00:00:00 2001
From: Desh Raj <r.desh26@gmail.com>
Date: Thu, 14 Sep 2023 14:11:09 -0400
Subject: [PATCH 32/32] fix unzipping

---
 lhotse/recipes/libricss.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lhotse/recipes/libricss.py b/lhotse/recipes/libricss.py
index e7724fadc..cf2ff5cc3 100644
--- a/lhotse/recipes/libricss.py
+++ b/lhotse/recipes/libricss.py
@@ -10,10 +10,13 @@
 import json
 import logging
 import subprocess
+import zipfile
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, Union
 
+from tqdm import tqdm
+
 from lhotse import (
     CutSet,
     RecordingSet,
@@ -120,7 +123,9 @@ def download_libricss(target_dir: Pathlike, force_download: bool = False) -> Pat
     # Extract the zipped file
     if not corpus_dir.exists() or force_download:
         logging.info(f"Extracting {corpus_zip} to {target_dir}")
-        corpus_zip.unzip(target_dir)
+        with zipfile.ZipFile(corpus_zip, "r") as corpus_zip:
+            for member in tqdm(corpus_zip.infolist(), desc="Extracting"):
+                corpus_zip.extract(member, target_dir)
 
     return target_dir