Skip to content

Commit

Permalink
update_recipes
Browse files Browse the repository at this point in the history
- commonvoice : use "spawn" context in `ProcessPoolExecutor`
- eval2000 : bugfix (@click.option must begin with `--*`)
- librispeech : add `--to-lowercase` for librispeech dataprep (by default, the transcripts are uppercase)
  • Loading branch information
KarelVesely84 committed Nov 6, 2023
1 parent 8a4b3b5 commit e67ea29
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 2 deletions.
2 changes: 1 addition & 1 deletion lhotse/bin/modes/recipes/eval2000.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
@click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False))
@click.argument("output-dir", type=click.Path())
@click.option(
"transcript-dir",
"--transcript-dir",
type=click.Path(exists=True, file_okay=False),
default=None,
required=False,
Expand Down
8 changes: 8 additions & 0 deletions lhotse/bin/modes/recipes/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,18 @@
default=1,
help="How many threads to use (can give good speed-ups with slow disks).",
)
@click.option(
"--to-lowercase",
type=bool,
default=False,
help="Conversion of transcripts to lower-case (originally in uppercase).",
)
def librispeech(
corpus_dir: Pathlike,
output_dir: Pathlike,
alignments_dir: Pathlike,
dataset_parts: Sequence[str],
to_lowercase: bool,
num_jobs: int,
):
"""(Mini) Librispeech ASR data preparation."""
Expand All @@ -50,6 +57,7 @@ def librispeech(
alignments_dir=alignments_dir,
num_jobs=num_jobs,
dataset_parts=dataset_parts,
to_lowercase=to_lowercase,
)


Expand Down
7 changes: 6 additions & 1 deletion lhotse/recipes/commonvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from collections import defaultdict
from concurrent.futures.process import ProcessPoolExecutor
from contextlib import contextmanager
from multiprocessing import get_context as mp_get_context
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union

Expand Down Expand Up @@ -198,7 +199,11 @@ def _prepare_part(
tsv_path = lang_path / f"{part}.tsv"

with disable_ffmpeg_torchaudio_info():
with ProcessPoolExecutor(num_jobs) as ex:
with ProcessPoolExecutor(
max_workers=num_jobs,
mp_context=mp_get_context("spawn"),
) as ex:

futures = []
recordings = []
supervisions = []
Expand Down
9 changes: 9 additions & 0 deletions lhotse/recipes/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def prepare_librispeech(
alignments_dir: Optional[Pathlike] = None,
dataset_parts: Union[str, Sequence[str]] = "auto",
output_dir: Optional[Pathlike] = None,
to_lowercase: bool = False,
num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Expand All @@ -126,6 +127,8 @@ def prepare_librispeech(
:param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
By default we will infer which parts are available in ``corpus_dir``.
:param output_dir: Pathlike, the path where to write the manifests.
:param to_lowercase: Bool, if True, the transcripts are converted to lower-case.
:param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
"""
corpus_dir = Path(corpus_dir)
Expand Down Expand Up @@ -204,6 +207,12 @@ def prepare_librispeech(
recording_set = RecordingSet.from_recordings(recordings)
supervision_set = SupervisionSet.from_segments(supervisions)

if to_lowercase:
to_lower = lambda text: text.lower()
supervision_set = SupervisionSet.from_segments(
[s.transform_text(to_lower) for s in supervision_set]
)

recording_set, supervision_set = fix_manifests(
recording_set, supervision_set
)
Expand Down

0 comments on commit e67ea29

Please sign in to comment.