Add recipe for the yes_no dataset. (#380)
* Add recipe for the yes_no dataset.

* Minor fixes.
csukuangfj committed Aug 21, 2021
1 parent 9b12055 commit 2a1410b
Showing 20 changed files with 221 additions and 30 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
@@ -101,6 +101,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_tedlium`
* - VCTK
- :func:`lhotse.recipes.prepare_vctk`
* - YesNo
- :func:`lhotse.recipes.prepare_yesno`


Adding new corpora
4 changes: 2 additions & 2 deletions lhotse/audio.py
@@ -133,7 +133,7 @@ class Recording:
Note that :class:`~lhotse.audio.Recording` can represent both a single utterance (e.g., in LibriSpeech)
and a 1-hour session with multiple channels and speakers (e.g., in AMI).
In the latter case, it is paritioned into data suitable for model training using :class:`~lhotse.cut.Cut`.
In the latter case, it is partitioned into data suitable for model training using :class:`~lhotse.cut.Cut`.
.. hint::
Lhotse reads audio recordings using `pysoundfile`_ and `audioread`_, similarly to librosa,
@@ -828,7 +828,7 @@ def add_to_mix(
f"To perform mix, energy must be non-zero and non-negative (got {added_audio_energy}). "
)
target_energy = self.reference_energy * (10.0 ** (-snr / 10))
# When mixing time-domain singals, we are working with root-power (field) quantities,
# When mixing time-domain signals, we are working with root-power (field) quantities,
# whereas the energy ratio applies to power quantities. To compute the gain correctly,
# we need to take a square root of the energy ratio.
gain = sqrt(target_energy / added_audio_energy)
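
For readers checking the corrected comment, the gain math can be verified standalone; a minimal sketch in plain Python (values are illustrative, not Lhotse code):

from math import sqrt

reference_energy = 4.0    # energy of the reference signal
added_audio_energy = 1.0  # energy of the signal being mixed in
snr = 10.0                # desired signal-to-noise ratio, in dB

# The SNR defines a ratio of energies (power quantities)...
target_energy = reference_energy * (10.0 ** (-snr / 10))
# ...but time-domain samples are root-power (field) quantities, so the
# amplitude gain is the square root of the energy ratio.
gain = sqrt(target_energy / added_audio_energy)
print(round(gain, 3))  # 0.632 - scaling the added signal by this yields a 10 dB SNR
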
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
@@ -26,3 +26,4 @@
from .switchboard import *
from .tedlium import *
from .vctk import *
from .yesno import *
19 changes: 19 additions & 0 deletions lhotse/bin/modes/recipes/yesno.py
@@ -0,0 +1,19 @@
import click
from lhotse.bin.modes import download, prepare
from lhotse.recipes.yesno import download_yesno, prepare_yesno
from lhotse.utils import Pathlike


@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path())
def yesno(target_dir: Pathlike):
"""yes_no dataset download."""
download_yesno(target_dir)


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
def yesno(corpus_dir: Pathlike, output_dir: Pathlike):
"""yes_no data preparation."""
prepare_yesno(corpus_dir, output_dir=output_dir)
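
These commands are thin wrappers over the recipe functions, so the same flow can be scripted directly in Python; a minimal sketch (the paths are placeholders):

import logging
from lhotse.recipes.yesno import download_yesno, prepare_yesno

logging.basicConfig(level=logging.INFO)
# Downloads waves_yesno.tar.gz and extracts it to data/waves_yesno
# (skipped when the .completed marker already exists).
download_yesno("data")
# Builds the train/test manifests and writes them as JSON under data/manifests.
manifests = prepare_yesno("data/waves_yesno", output_dir="data/manifests")
print(manifests["train"]["recordings"])
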
2 changes: 1 addition & 1 deletion lhotse/cut.py
@@ -2774,7 +2774,7 @@ def compute_and_store_features(
you will end up with incorrect supervision information when using this API.
E.g. for speed perturbation, use ``CutSet.perturb_speed()`` instead.
:param storage_type: a ``FeaturesWriter`` subclass type.
It determines how the featurs are stored to disk,
It determines how the features are stored to disk,
e.g. separate file per array, HDF5 files with multiple arrays, etc.
:param executor: when provided, will be used to parallelize the feature extraction process.
By default, we will instantiate a ProcessPoolExecutor.
2 changes: 1 addition & 1 deletion lhotse/dataset/collation.py
@@ -23,7 +23,7 @@ class TokenCollater:
Returns:
tokens_batch: IntTensor of shape (B, L)
B: batch dimensoion, number of input sentences
B: batch dimension, number of input sentences
L: length of the longest sentence
tokens_lens: IntTensor of shape (B,)
Length of each sentence after adding <eos> and <bos>
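
As a usage sketch of the collater whose docstring is fixed here (the constructor arguments are assumptions based on the surrounding code, not shown in this hunk):

from lhotse import CutSet
from lhotse.dataset.collation import TokenCollater

cuts = CutSet.from_jsonl("cuts.jsonl")  # hypothetical manifest
collater = TokenCollater(cuts)  # builds its token inventory from the supervision texts
tokens_batch, tokens_lens = collater(cuts.subset(first=4))
# tokens_batch: (B, L) IntTensor padded to the longest sentence;
# tokens_lens: (B,) lengths including the added <bos>/<eos>.
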
4 changes: 2 additions & 2 deletions lhotse/dataset/sampling/base.py
@@ -104,7 +104,7 @@ def set_epoch(self, epoch: int) -> None:

def filter(self, predicate: Callable[[Cut], bool]) -> None:
"""
Add a constraint on invidual cuts that has to be satisfied to consider them.
Add a constraint on individual cuts that has to be satisfied to consider them.
Can be useful when handling large, lazy manifests where it is not feasible to
pre-filter them before instantiating the sampler.
@@ -340,7 +340,7 @@ def get_report(self) -> str:
"""Returns a string describing the statistics of the sampling process so far."""
if self.total_batches == 0 or self.total_cuts == 0:
return (
"Sampling statistics unvavailable: the SamplerDiagnostics received no cuts or batches. "
"Sampling statistics unavailable: the SamplerDiagnostics received no cuts or batches. "
"If this is unexpected, and you're using a custom sampler, ensure that the sampler "
"is registering the batches in SamplerDiagnostics."
)
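
The same filter docstring fix recurs in the bucketing and zip samplers below; a usage sketch of the API itself (the sampler construction details are assumptions):

from lhotse import CutSet
from lhotse.dataset import SingleCutSampler

cuts = CutSet.from_jsonl("cuts.jsonl")  # hypothetical manifest
sampler = SingleCutSampler(cuts, max_duration=300.0)
# Consider only cuts between 1 and 20 seconds; others are skipped lazily,
# which avoids materializing a filtered copy of a large manifest.
sampler.filter(lambda cut: 1.0 <= cut.duration <= 20.0)
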
2 changes: 1 addition & 1 deletion lhotse/dataset/sampling/bucketing.py
@@ -178,7 +178,7 @@ def set_epoch(self, epoch: int) -> None:

def filter(self, predicate: Callable[[Cut], bool]) -> None:
"""
Add a constraint on invidual cuts that has to be satisfied to consider them.
Add a constraint on individual cuts that has to be satisfied to consider them.
Can be useful when handling large, lazy manifests where it is not feasible to
pre-filter them before instantiating the sampler.
6 changes: 4 additions & 2 deletions lhotse/dataset/sampling/single_cut.py
@@ -168,8 +168,10 @@ def _next_batch(self) -> CutSet:
# No. We'll warn the user that the constrains might be too tight,
# and return the cut anyway.
warnings.warn(
"The first cut drawn in batch collection violates the max_frames or max_cuts "
"constraints - we'll return it anyway. Consider increasing max_frames/max_cuts."
"The first cut drawn in batch collection violates "
"the max_frames, max_cuts, or max_duration constraints - "
"we'll return it anyway. "
"Consider increasing max_frames/max_cuts/max_duration."
)
cuts.append(next_cut)

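
A quick way to see the reworded warning fire (a sketch; reuses the cuts manifest from the earlier example):

import warnings
from lhotse.dataset import SingleCutSampler

# A budget tighter than any single cut: the first draw already violates
# max_duration, so the sampler warns but still returns that cut.
sampler = SingleCutSampler(cuts, max_duration=0.01)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    batch = next(iter(sampler))
print(caught[0].message)
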
2 changes: 1 addition & 1 deletion lhotse/dataset/sampling/zip.py
@@ -95,7 +95,7 @@ def set_epoch(self, epoch: int) -> None:

def filter(self, predicate: Callable[[Cut], bool]) -> None:
"""
Add a constraint on invidual cuts that has to be satisfied to consider them.
Add a constraint on individual cuts that has to be satisfied to consider them.
Can be useful when handling large, lazy manifests where it is not feasible to
pre-filter them before instantiating the sampler.
2 changes: 1 addition & 1 deletion lhotse/dataset/signal_transforms.py
@@ -132,7 +132,7 @@ def __init__(
p=0.5,
):
"""
SpecAugment's contructor.
SpecAugment's constructor.
:param time_warp_factor: parameter for the time warping; larger values mean more warping.
Set to ``None``, or less than ``1``, to disable.
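
For context, a sketch of how the transform is typically applied (the feature-tensor shape is an assumption, not part of this hunk; time_warp_factor and p are the parameters visible above):

import torch
from lhotse.dataset import SpecAugment

augment = SpecAugment(time_warp_factor=80, p=0.5)
feats = torch.randn(4, 500, 80)  # hypothetical (batch, time, freq) log-mel features
augmented = augment(feats)       # same shape, with warping/masking applied
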
6 changes: 3 additions & 3 deletions lhotse/dataset/speech_recognition.py
@@ -11,7 +11,7 @@

class K2SpeechRecognitionDataset(torch.utils.data.Dataset):
"""
The PyTorch Dataset for the speech recognition task using K2 library.
The PyTorch Dataset for the speech recognition task using k2 library.
This dataset expects to be queried with lists of cut IDs,
for which it loads features and automatically collates/batches them.
@@ -65,7 +65,7 @@ def __init__(
input_strategy: BatchIO = PrecomputedFeatures(),
):
"""
K2 ASR IterableDataset constructor.
k2 ASR IterableDataset constructor.
:param return_cuts: When ``True``, will additionally return a "cut" field in each batch with the Cut
objects used to create that batch.
@@ -87,7 +87,7 @@ def __init__(

def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
"""
Return a new batch, with the batch size automatically determined using the contraints
Return a new batch, with the batch size automatically determined using the constraints
of max_frames and max_cuts.
"""
validate_for_asr(cuts)
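
The typical wiring for this dataset, given that it is indexed by whole CutSets rather than integers (a sketch under the constructor defaults shown above):

from torch.utils.data import DataLoader
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler

dataset = K2SpeechRecognitionDataset(return_cuts=True)
sampler = SingleCutSampler(cuts, max_duration=100.0)  # cuts: a CutSet with features
# batch_size=None: the sampler already yields whole batches (CutSets),
# so PyTorch's default collation must be disabled.
loader = DataLoader(dataset, sampler=sampler, batch_size=None)
batch = next(iter(loader))
# batch["inputs"] holds the collated features; batch["supervisions"] the targets.
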
2 changes: 1 addition & 1 deletion lhotse/dataset/unsupervised.py
@@ -41,7 +41,7 @@ class UnsupervisedWaveformDataset(UnsupervisedDataset):
"""
A variant of UnsupervisedDataset that provides waveform samples instead of features.
The output is a tensor of shape (C, T), with C being the number of channels and T the number of audio samples.
In this implemenation, there will always be a single channel.
In this implementation, there will always be a single channel.
Returns:
4 changes: 2 additions & 2 deletions lhotse/kaldi.py
@@ -15,7 +15,7 @@ def get_duration(
) -> float:
"""
Read a audio file, it supports pipeline style wave path and real waveform.
:param path: Path to an audio file or a Kaldi-style pipe.
:return: float duration of the recording, in seconds.
"""
@@ -61,7 +61,7 @@ def load_kaldi_data_dir(
durations = {}
for recording_id, path_or_cmd in recordings.items():
duration = get_duration(path_or_cmd)
durations[recording_id] = duration
durations[recording_id] = duration

recording_set = RecordingSet.from_recordings(
Recording(
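
The docstring fixed here covers both plain files and Kaldi-style command pipes; a sketch of each (paths are placeholders):

from lhotse.kaldi import get_duration

# A regular audio file:
d1 = get_duration("data/waves_yesno/0_0_1_0_1_0_0_1.wav")
# A Kaldi-style pipe - the trailing '|' marks a command whose stdout is the waveform:
d2 = get_duration("sox data/foo.sph -t wav - |")
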
2 changes: 1 addition & 1 deletion lhotse/manifest.py
@@ -23,7 +23,7 @@
FeatureSet:
- Represents a set of extracted features associated with recordings.
(could also have dynamic versions).
- Caution: we shouldn't assume that we are extracting features for the entire recording, or that the frame rates are always consant (perturbing the frame rates might be useful).
- Caution: we shouldn't assume that we are extracting features for the entire recording, or that the frame rates are always constant (perturbing the frame rates might be useful).
- I don't want to obscure the relationship between segments and the original recordings by introducing another arbitrary level of id (like the 'cut' we discussed on the call, but see later). What I am thinking is that the features would be accessible by recording-id, optional channel-info and time, maybe? E.g. "what do you have for channel 0 of recording 'foo' between t=16 and t=22.2" ?
- Again, we should make the metadata available separately from the actual data.
- Please see the `lilcom` project on my github, which I have now finalized with the aim being to support compression of feature files in a general numpy-compatible way. This will be useful here (but maybe shouldn't be visible from the interface).
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
@@ -28,3 +28,4 @@
from .tedlium import download_tedlium, prepare_tedlium
from .timit import download_timit, prepare_timit
from .vctk import download_vctk, prepare_vctk
from .yesno import download_yesno, prepare_yesno
2 changes: 1 addition & 1 deletion lhotse/recipes/aishell.py
@@ -59,7 +59,7 @@ def prepare_aishell(
Returns the manifests which consist of the Recordings and Supervisions
:param corpus_dir: Pathlike, the path of the data dir.
:param output_dir: Pathlike, the path where to write the manifests.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
"""
corpus_dir = Path(corpus_dir)
assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
166 changes: 166 additions & 0 deletions lhotse/recipes/yesno.py
@@ -0,0 +1,166 @@
"""
About the yes no dataset:
This dataset was created for the Kaldi project (see kaldi.sf.net), by a
contributor who prefers to remain anonymous. The main point of the dataset
is to provide an easy and fast way to test out the Kaldi scripts for free.
The archive "waves_yesno.tar.gz" contains 60 .wav files, sampled at 8 kHz.
All were recorded by the same male speaker, in Hebrew. In each file, the
individual says 8 words; each word is either the Hebrew for "yes" or "no",
so each file is a random sequence of 8 yes-es or noes. There is no separate
transcription provided; the sequence is encoded in the filename, with 1 for
yes and 0 for no, for instance:
# tar -xvzf waves_yesno.tar.gz
waves_yesno/1_0_1_1_1_0_1_0.wav
waves_yesno/0_1_1_0_0_1_1_0.wav
...
The dataset can be downloaded from the following address:
https://www.openslr.org/1/
"""

import logging
import shutil
import tarfile
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, urlretrieve_progress

_DEFAULT_URL = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"


def download_yesno(
target_dir: Pathlike = ".",
force_download: Optional[bool] = False,
url: Optional[str] = _DEFAULT_URL,
):
"""Download and untar the dataset.
:param target_dir: Pathlike, the path of the dir to store the dataset.
The extracted files are saved to target_dir/waves_yesno/*.wav
:param force_download: Bool, if True, download the tar file no matter
whether it exists or not.
:param url: str, the url to download the dataset.
"""
target_dir = Path(target_dir)
target_dir.mkdir(parents=True, exist_ok=True)
extracted_dir = target_dir / "waves_yesno"

tar_path = target_dir / "waves_yesno.tar.gz"

completed_detector = extracted_dir / ".completed"
if completed_detector.is_file():
logging.info(f"Skipping - {completed_detector} exists.")
return

if force_download or not tar_path.is_file():
urlretrieve_progress(
f"{url}", filename=tar_path, desc=f"Downloading waves_yesno.tar.gz"
)

shutil.rmtree(extracted_dir, ignore_errors=True)

with tarfile.open(tar_path) as tar:
tar.extractall(path=target_dir)

completed_detector.touch()


def _prepare_dataset(
dataset: List[Pathlike],
) -> Tuple[List[Recording], List[SupervisionSegment]]:
"""Build a list of Recording and SupervisionSegment from a list
of sound filenames.
:param dataset: List[Pathlike], a list of sound filenames
:return: a tuple containing a list of Recording and a list
of SupervisionSegment
"""
word_map = {"0": "NO", "1": "YES"}

recordings = []
supervisions = []
for audio_path in dataset:
words = audio_path.stem.split("_")
assert len(words) == 8
assert set(words).union({"0", "1"}) == {"0", "1"}, f"words is: {words}"

words = [word_map[w] for w in words]
text = " ".join(words)

recording = Recording.from_file(audio_path)
recordings.append(recording)

segment = SupervisionSegment(
id=audio_path.stem,
recording_id=audio_path.stem,
start=0.0,
duration=recording.duration,
channel=0,
language="Hebrew",
text=text,
)
supervisions.append(segment)

return recordings, supervisions


def prepare_yesno(
corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Returns the manifests which consist of the Recordings and Supervisions.
When all the manifests are available in the ``output_dir``, it will simply
read and return them.
:param corpus_dir: Pathlike, the path of the data dir. It's expected to
contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
are 8 x's and each x is either 1 or 0.
:param output_dir: Pathlike, the path where to write the manifests.
:return: a Dict whose key is either "train" or "test", and the value is
Dicts with the keys 'recordings' and 'supervisions'.
"""
corpus_dir = Path(corpus_dir)
assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

if output_dir is not None:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

wave_files = list(corpus_dir.glob("*.wav"))
assert len(wave_files) == 60

wave_files.sort()
train_set = wave_files[::2]
test_set = wave_files[1::2]

assert len(train_set) == 30
assert len(test_set) == 30

manifests = defaultdict(dict)
for name, dataset in zip(["train", "test"], [train_set, test_set]):
recordings, supervisions = _prepare_dataset(dataset)

recording_set = RecordingSet.from_recordings(recordings)
supervision_set = SupervisionSet.from_segments(supervisions)

validate_recordings_and_supervisions(recording_set, supervision_set)

if output_dir is not None:
supervision_set.to_json(output_dir / f"supervisions_{name}.json")
recording_set.to_json(output_dir / f"recordings_{name}.json")

manifests[name] = {
"recordings": recording_set,
"supervisions": supervision_set,
}

return manifests
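
Note that the transcript is fully determined by the file name; the decoding done in _prepare_dataset boils down to:

word_map = {"0": "NO", "1": "YES"}
stem = "0_0_1_0_1_0_0_1"  # as in waves_yesno/0_0_1_0_1_0_0_1.wav
text = " ".join(word_map[w] for w in stem.split("_"))
print(text)  # NO NO YES NO YES NO NO YES
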
4 changes: 2 additions & 2 deletions lhotse/serialization.py
@@ -109,7 +109,7 @@ class SequentialJsonlWriter:
This writer can be useful for continuing to write files that were previously
stopped -- it will open the existing file and scan it for item IDs to skip
writing them later. It can also be quried for existing IDs so that the user
writing them later. It can also be queried for existing IDs so that the user
code may skip preparing the corresponding manifets.
Example:
@@ -202,7 +202,7 @@ def open_writer(cls, path: Pathlike, overwrite: bool = True) -> SequentialJsonlWriter:
This writer can be useful for continuing to write files that were previously
stopped -- it will open the existing file and scan it for item IDs to skip
writing them later. It can also be quried for existing IDs so that the user
writing them later. It can also be queried for existing IDs so that the user
code may skip preparing the corresponding manifets.
Example:
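
The resumable-writing behavior described in the corrected docstrings looks roughly like this in use (a sketch; RecordingSet is one of several manifest types exposing open_writer):

from lhotse import RecordingSet

# With overwrite=False, the writer scans the existing file for item IDs
# and silently skips anything already written in a previous run.
with RecordingSet.open_writer("recordings.jsonl", overwrite=False) as writer:
    for recording in recordings:  # recordings: an iterable of Recording objects
        writer.write(recording)
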
