Add recipe for the yes_no dataset. (#380)
* Add recipe for the yes_no dataset.

* Minor fixes.
csukuangfj committed Aug 21, 2021
1 parent 9b12055 commit 2a1410b
Showing 20 changed files with 221 additions and 30 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
@@ -101,6 +101,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_tedlium`
* - VCTK
- :func:`lhotse.recipes.prepare_vctk`
* - YesNo
- :func:`lhotse.recipes.prepare_yesno`


Adding new corpora
4 changes: 2 additions & 2 deletions lhotse/audio.py
@@ -133,7 +133,7 @@ class Recording:
Note that :class:`~lhotse.audio.Recording` can represent both a single utterance (e.g., in LibriSpeech)
and a 1-hour session with multiple channels and speakers (e.g., in AMI).
In the latter case, it is paritioned into data suitable for model training using :class:`~lhotse.cut.Cut`.
In the latter case, it is partitioned into data suitable for model training using :class:`~lhotse.cut.Cut`.
.. hint::
Lhotse reads audio recordings using `pysoundfile`_ and `audioread`_, similarly to librosa,
@@ -828,7 +828,7 @@ def add_to_mix(
f"To perform mix, energy must be non-zero and non-negative (got {added_audio_energy}). "
)
target_energy = self.reference_energy * (10.0 ** (-snr / 10))
# When mixing time-domain singals, we are working with root-power (field) quantities,
# When mixing time-domain signals, we are working with root-power (field) quantities,
# whereas the energy ratio applies to power quantities. To compute the gain correctly,
# we need to take a square root of the energy ratio.
gain = sqrt(target_energy / added_audio_energy)
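
For readers checking the corrected comment, the gain math can be verified standalone; a minimal sketch in plain Python (values are illustrative, not Lhotse code):

from math import sqrt

reference_energy = 4.0    # energy of the reference signal
added_audio_energy = 1.0  # energy of the signal being mixed in
snr = 10.0                # desired signal-to-noise ratio, in dB

# The SNR defines a ratio of energies (power quantities)...
target_energy = reference_energy * (10.0 ** (-snr / 10))
# ...but time-domain samples are root-power (field) quantities, so the
# amplitude gain is the square root of the energy ratio.
gain = sqrt(target_energy / added_audio_energy)
print(round(gain, 3))  # 0.632 - scaling the added signal by this yields a 10 dB SNR
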
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
@@ -26,3 +26,4 @@
from .switchboard import *
from .tedlium import *
from .vctk import *
from .yesno import *
19 changes: 19 additions & 0 deletions lhotse/bin/modes/recipes/yesno.py
@@ -0,0 +1,19 @@
import click
from lhotse.bin.modes import download, prepare
from lhotse.recipes.yesno import download_yesno, prepare_yesno
from lhotse.utils import Pathlike


@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path())
def yesno(target_dir: Pathlike):
"""yes_no dataset download."""
download_yesno(target_dir)


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
def yesno(corpus_dir: Pathlike, output_dir: Pathlike):
"""yes_no data preparation."""
prepare_yesno(corpus_dir, output_dir=output_dir)
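
These commands are thin wrappers over the recipe functions, so the same flow can be scripted directly in Python; a minimal sketch (the paths are placeholders):

import logging
from lhotse.recipes.yesno import download_yesno, prepare_yesno

logging.basicConfig(level=logging.INFO)
# Downloads waves_yesno.tar.gz and extracts it to data/waves_yesno
# (skipped when the .completed marker already exists).
download_yesno("data")
# Builds the train/test manifests and writes them as JSON under data/manifests.
manifests = prepare_yesno("data/waves_yesno", output_dir="data/manifests")
print(manifests["train"]["recordings"])
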
2 changes: 1 addition & 1 deletion lhotse/cut.py
@@ -2774,7 +2774,7 @@ def compute_and_store_features(
you will end up with incorrect supervision information when using this API.
E.g. for speed perturbation, use ``CutSet.perturb_speed()`` instead.
:param storage_type: a ``FeaturesWriter`` subclass type.
It determines how the featurs are stored to disk,
It determines how the features are stored to disk,
e.g. separate file per array, HDF5 files with multiple arrays, etc.
:param executor: when provided, will be used to parallelize the feature extraction process.
By default, we will instantiate a ProcessPoolExecutor.
2 changes: 1 addition & 1 deletion lhotse/dataset/collation.py
@@ -23,7 +23,7 @@ class TokenCollater:
Returns:
tokens_batch: IntTensor of shape (B, L)
B: batch dimensoion, number of input sentences
B: batch dimension, number of input sentences
L: length of the longest sentence
tokens_lens: IntTensor of shape (B,)
Length of each sentence after adding <eos> and <bos>
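
As a usage sketch of the collater whose docstring is fixed here (the constructor arguments are assumptions based on the surrounding code, not shown in this hunk):

from lhotse import CutSet
from lhotse.dataset.collation import TokenCollater

cuts = CutSet.from_jsonl("cuts.jsonl")  # hypothetical manifest
collater = TokenCollater(cuts)  # builds its token inventory from the supervision texts
tokens_batch, tokens_lens = collater(cuts.subset(first=4))
# tokens_batch: (B, L) IntTensor padded to the longest sentence;
# tokens_lens: (B,) lengths including the added <bos>/<eos>.
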
4 changes: 2 additions & 2 deletions lhotse/dataset/sampling/base.py
@@ -104,7 +104,7 @@ def set_epoch(self, epoch: int) -> None:

def filter(self, predicate: Callable[[Cut], bool]) -> None:
"""
Add a constraint on invidual cuts that has to be satisfied to consider them.
Add a constraint on individual cuts that has to be satisfied to consider them.
Can be useful when handling large, lazy manifests where it is not feasible to
pre-filter them before instantiating the sampler.
@@ -340,7 +340,7 @@ def get_report(self) -> str:
"""Returns a string describing the statistics of the sampling process so far."""
if self.total_batches == 0 or self.total_cuts == 0:
return (
"Sampling statistics unvavailable: the SamplerDiagnostics received no cuts or batches. "
"Sampling statistics unavailable: the SamplerDiagnostics received no cuts or batches. "
"If this is unexpected, and you're using a custom sampler, ensure that the sampler "
"is registering the batches in SamplerDiagnostics."
)
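
The same filter docstring fix recurs in the bucketing and zip samplers below; a usage sketch of the API itself (the sampler construction details are assumptions):

from lhotse import CutSet
from lhotse.dataset import SingleCutSampler

cuts = CutSet.from_jsonl("cuts.jsonl")  # hypothetical manifest
sampler = SingleCutSampler(cuts, max_duration=300.0)
# Consider only cuts between 1 and 20 seconds; others are skipped lazily,
# which avoids materializing a filtered copy of a large manifest.
sampler.filter(lambda cut: 1.0 <= cut.duration <= 20.0)
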
2 changes: 1 addition & 1 deletion lhotse/dataset/sampling/bucketing.py
@@ -178,7 +178,7 @@ def set_epoch(self, epoch: int) -> None:

def filter(self, predicate: Callable[[Cut], bool]) -> None:
"""
Add a constraint on invidual cuts that has to be satisfied to consider them.
Add a constraint on individual cuts that has to be satisfied to consider them.
Can be useful when handling large, lazy manifests where it is not feasible to
pre-filter them before instantiating the sampler.
6 changes: 4 additions & 2 deletions lhotse/dataset/sampling/single_cut.py
@@ -168,8 +168,10 @@ def _next_batch(self) -> CutSet:
# No. We'll warn the user that the constrains might be too tight,
# and return the cut anyway.
warnings.warn(
"The first cut drawn in batch collection violates the max_frames or max_cuts "
"constraints - we'll return it anyway. Consider increasing max_frames/max_cuts."
"The first cut drawn in batch collection violates "
"the max_frames, max_cuts, or max_duration constraints - "
"we'll return it anyway. "
"Consider increasing max_frames/max_cuts/max_duration."
)
cuts.append(next_cut)

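
A quick way to see the reworded warning fire (a sketch; reuses the cuts manifest from the earlier example):

import warnings
from lhotse.dataset import SingleCutSampler

# A budget tighter than any single cut: the first draw already violates
# max_duration, so the sampler warns but still returns that cut.
sampler = SingleCutSampler(cuts, max_duration=0.01)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    batch = next(iter(sampler))
print(caught[0].message)
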
2 changes: 1 addition & 1 deletion lhotse/dataset/sampling/zip.py
@@ -95,7 +95,7 @@ def set_epoch(self, epoch: int) -> None:

def filter(self, predicate: Callable[[Cut], bool]) -> None:
"""
Add a constraint on invidual cuts that has to be satisfied to consider them.
Add a constraint on individual cuts that has to be satisfied to consider them.
Can be useful when handling large, lazy manifests where it is not feasible to
pre-filter them before instantiating the sampler.
2 changes: 1 addition & 1 deletion lhotse/dataset/signal_transforms.py
@@ -132,7 +132,7 @@ def __init__(
p=0.5,
):
"""
SpecAugment's contructor.
SpecAugment's constructor.
:param time_warp_factor: parameter for the time warping; larger values mean more warping.
Set to ``None``, or less than ``1``, to disable.
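
For context, a sketch of how the transform is typically applied (the feature-tensor shape is an assumption, not part of this hunk; time_warp_factor and p are the parameters visible above):

import torch
from lhotse.dataset import SpecAugment

augment = SpecAugment(time_warp_factor=80, p=0.5)
feats = torch.randn(4, 500, 80)  # hypothetical (batch, time, freq) log-mel features
augmented = augment(feats)       # same shape, with warping/masking applied
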
6 changes: 3 additions & 3 deletions lhotse/dataset/speech_recognition.py
@@ -11,7 +11,7 @@

class K2SpeechRecognitionDataset(torch.utils.data.Dataset):
"""
The PyTorch Dataset for the speech recognition task using K2 library.
The PyTorch Dataset for the speech recognition task using k2 library.
This dataset expects to be queried with lists of cut IDs,
for which it loads features and automatically collates/batches them.
@@ -65,7 +65,7 @@ def __init__(
input_strategy: BatchIO = PrecomputedFeatures(),
):
"""
K2 ASR IterableDataset constructor.
k2 ASR IterableDataset constructor.
:param return_cuts: When ``True``, will additionally return a "cut" field in each batch with the Cut
objects used to create that batch.
@@ -87,7 +87,7 @@ def __init__(

def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
"""
Return a new batch, with the batch size automatically determined using the contraints
Return a new batch, with the batch size automatically determined using the constraints
of max_frames and max_cuts.
"""
validate_for_asr(cuts)
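
The typical wiring for this dataset, given that it is indexed by whole CutSets rather than integers (a sketch under the constructor defaults shown above):

from torch.utils.data import DataLoader
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler

dataset = K2SpeechRecognitionDataset(return_cuts=True)
sampler = SingleCutSampler(cuts, max_duration=100.0)  # cuts: a CutSet with features
# batch_size=None: the sampler already yields whole batches (CutSets),
# so PyTorch's default collation must be disabled.
loader = DataLoader(dataset, sampler=sampler, batch_size=None)
batch = next(iter(loader))
# batch["inputs"] holds the collated features; batch["supervisions"] the targets.
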
2 changes: 1 addition & 1 deletion lhotse/dataset/unsupervised.py
@@ -41,7 +41,7 @@ class UnsupervisedWaveformDataset(UnsupervisedDataset):
"""
A variant of UnsupervisedDataset that provides waveform samples instead of features.
The output is a tensor of shape (C, T), with C being the number of channels and T the number of audio samples.
In this implemenation, there will always be a single channel.
In this implementation, there will always be a single channel.
Returns:
4 changes: 2 additions & 2 deletions lhotse/kaldi.py
@@ -15,7 +15,7 @@ def get_duration(
) -> float:
"""
Read a audio file, it supports pipeline style wave path and real waveform.
:param path: Path to an audio file or a Kaldi-style pipe.
:return: float duration of the recording, in seconds.
"""
@@ -61,7 +61,7 @@ def load_kaldi_data_dir(
durations = {}
for recording_id, path_or_cmd in recordings.items():
duration = get_duration(path_or_cmd)
durations[recording_id] = duration
durations[recording_id] = duration

recording_set = RecordingSet.from_recordings(
Recording(
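
The docstring fixed here covers both plain files and Kaldi-style command pipes; a sketch of each (paths are placeholders):

from lhotse.kaldi import get_duration

# A regular audio file:
d1 = get_duration("data/waves_yesno/0_0_1_0_1_0_0_1.wav")
# A Kaldi-style pipe - the trailing '|' marks a command whose stdout is the waveform:
d2 = get_duration("sox data/foo.sph -t wav - |")
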
2 changes: 1 addition & 1 deletion lhotse/manifest.py
@@ -23,7 +23,7 @@
FeatureSet:
- Represents a set of extracted features associated with recordings.
(could also have dynamic versions).
- Caution: we shouldn't assume that we are extracting features for the entire recording, or that the frame rates are always consant (perturbing the frame rates might be useful).
- Caution: we shouldn't assume that we are extracting features for the entire recording, or that the frame rates are always constant (perturbing the frame rates might be useful).
- I don't want to obscure the relationship between segments and the original recordings by introducing another arbitrary level of id (like the 'cut' we discussed on the call, but see later). What I am thinking is that the features would be accessible by recording-id, optional channel-info and time, maybe? E.g. "what do you have for channel 0 of recording 'foo' between t=16 and t=22.2" ?
- Again, we should make the metadata available separately from the actual data.
- Please see the `lilcom` project on my github, which I have now finalized with the aim being to support compression of feature files in a general numpy-compatible way. This will be useful here (but maybe shouldn't be visible from the interface).
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
@@ -28,3 +28,4 @@
from .tedlium import download_tedlium, prepare_tedlium
from .timit import download_timit, prepare_timit
from .vctk import download_vctk, prepare_vctk
from .yesno import download_yesno, prepare_yesno
2 changes: 1 addition & 1 deletion lhotse/recipes/aishell.py
@@ -59,7 +59,7 @@ def prepare_aishell(
Returns the manifests which consist of the Recordings and Supervisions
:param corpus_dir: Pathlike, the path of the data dir.
:param output_dir: Pathlike, the path where to write the manifests.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
"""
corpus_dir = Path(corpus_dir)
assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
166 changes: 166 additions & 0 deletions lhotse/recipes/yesno.py
@@ -0,0 +1,166 @@
"""
About the yes no dataset:
This dataset was created for the Kaldi project (see kaldi.sf.net), by a
contributor who prefers to remain anonymous. The main point of the dataset
is to provide an easy and fast way to test out the Kaldi scripts for free.
The archive "waves_yesno.tar.gz" contains 60 .wav files, sampled at 8 kHz.
All were recorded by the same male speaker, in Hebrew. In each file, the
individual says 8 words; each word is either the Hebrew for "yes" or "no",
so each file is a random sequence of 8 yes-es or noes. There is no separate
transcription provided; the sequence is encoded in the filename, with 1 for
yes and 0 for no, for instance:
# tar -xvzf waves_yesno.tar.gz
waves_yesno/1_0_1_1_1_0_1_0.wav
waves_yesno/0_1_1_0_0_1_1_0.wav
...
The dataset can be downloaded from the following address:
https://www.openslr.org/1/
"""

import logging
import shutil
import tarfile
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, urlretrieve_progress

_DEFAULT_URL = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"


def download_yesno(
target_dir: Pathlike = ".",
force_download: Optional[bool] = False,
url: Optional[str] = _DEFAULT_URL,
):
"""Download and untar the dataset.
:param target_dir: Pathlike, the path of the dir to store the dataset.
The extracted files are saved to target_dir/waves_yesno/*.wav
:param force_download: Bool, if True, download the tar file no matter
whether it exists or not.
:param url: str, the url to download the dataset.
"""
target_dir = Path(target_dir)
target_dir.mkdir(parents=True, exist_ok=True)
extracted_dir = target_dir / "waves_yesno"

tar_path = target_dir / "waves_yesno.tar.gz"

completed_detector = extracted_dir / ".completed"
if completed_detector.is_file():
logging.info(f"Skipping - {completed_detector} exists.")
return

if force_download or not tar_path.is_file():
urlretrieve_progress(
f"{url}", filename=tar_path, desc=f"Downloading waves_yesno.tar.gz"
)

shutil.rmtree(extracted_dir, ignore_errors=True)

with tarfile.open(tar_path) as tar:
tar.extractall(path=target_dir)

completed_detector.touch()


def _prepare_dataset(
dataset: List[Pathlike],
) -> Tuple[List[Recording], List[SupervisionSegment]]:
"""Build a list of Recording and SupervisionSegment from a list
of sound filenames.
:param dataset: List[Pathlike], a list of sound filenames
:return: a tuple containing a list of Recording and a list
of SupervisionSegment
"""
word_map = {"0": "NO", "1": "YES"}

recordings = []
supervisions = []
for audio_path in dataset:
words = audio_path.stem.split("_")
assert len(words) == 8
assert set(words).union({"0", "1"}) == {"0", "1"}, f"words is: {words}"

words = [word_map[w] for w in words]
text = " ".join(words)

recording = Recording.from_file(audio_path)
recordings.append(recording)

segment = SupervisionSegment(
id=audio_path.stem,
recording_id=audio_path.stem,
start=0.0,
duration=recording.duration,
channel=0,
language="Hebrew",
text=text,
)
supervisions.append(segment)

return recordings, supervisions


def prepare_yesno(
corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Returns the manifests which consist of the Recordings and Supervisions.
When all the manifests are available in the ``output_dir``, it will simply
read and return them.
:param corpus_dir: Pathlike, the path of the data dir. It's expected to
contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
are 8 x's and each x is either 1 or 0.
:param output_dir: Pathlike, the path where to write the manifests.
:return: a Dict whose key is either "train" or "test", and the value is
Dicts with the keys 'recordings' and 'supervisions'.
"""
corpus_dir = Path(corpus_dir)
assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

if output_dir is not None:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

wave_files = list(corpus_dir.glob("*.wav"))
assert len(wave_files) == 60

wave_files.sort()
train_set = wave_files[::2]
test_set = wave_files[1::2]

assert len(train_set) == 30
assert len(test_set) == 30

manifests = defaultdict(dict)
for name, dataset in zip(["train", "test"], [train_set, test_set]):
recordings, supervisions = _prepare_dataset(dataset)

recording_set = RecordingSet.from_recordings(recordings)
supervision_set = SupervisionSet.from_segments(supervisions)

validate_recordings_and_supervisions(recording_set, supervision_set)

if output_dir is not None:
supervision_set.to_json(output_dir / f"supervisions_{name}.json")
recording_set.to_json(output_dir / f"recordings_{name}.json")

manifests[name] = {
"recordings": recording_set,
"supervisions": supervision_set,
}

return manifests
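
Note that the transcript is fully determined by the file name; the decoding done in _prepare_dataset boils down to:

word_map = {"0": "NO", "1": "YES"}
stem = "0_0_1_0_1_0_0_1"  # as in waves_yesno/0_0_1_0_1_0_0_1.wav
text = " ".join(word_map[w] for w in stem.split("_"))
print(text)  # NO NO YES NO YES NO NO YES
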
4 changes: 2 additions & 2 deletions lhotse/serialization.py
@@ -109,7 +109,7 @@ class SequentialJsonlWriter:
This writer can be useful for continuing to write files that were previously
stopped -- it will open the existing file and scan it for item IDs to skip
writing them later. It can also be quried for existing IDs so that the user
writing them later. It can also be queried for existing IDs so that the user
code may skip preparing the corresponding manifets.
Example:
@@ -202,7 +202,7 @@ def open_writer(cls, path: Pathlike, overwrite: bool = True) -> SequentialJsonlWriter:
This writer can be useful for continuing to write files that were previously
stopped -- it will open the existing file and scan it for item IDs to skip
writing them later. It can also be quried for existing IDs so that the user
writing them later. It can also be queried for existing IDs so that the user
code may skip preparing the corresponding manifets.
Example:
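
The resumable-writing behavior described in the corrected docstrings looks roughly like this in use (a sketch; RecordingSet is one of several manifest types exposing open_writer):

from lhotse import RecordingSet

# With overwrite=False, the writer scans the existing file for item IDs
# and silently skips anything already written in a previous run.
with RecordingSet.open_writer("recordings.jsonl", overwrite=False) as writer:
    for recording in recordings:  # recordings: an iterable of Recording objects
        writer.write(recording)
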
