diff --git a/lhotse/bin/modes/recipes/eval2000.py b/lhotse/bin/modes/recipes/eval2000.py
index 6150a88bc..e3d278ce6 100644
--- a/lhotse/bin/modes/recipes/eval2000.py
+++ b/lhotse/bin/modes/recipes/eval2000.py
@@ -11,7 +11,7 @@
 @click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False))
 @click.argument("output-dir", type=click.Path())
 @click.option(
-    "transcript-dir",
+    "--transcript-dir",
     type=click.Path(exists=True, file_okay=False),
     default=None,
     required=False,
diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py
index 0f922c104..7d464dc45 100644
--- a/lhotse/bin/modes/recipes/librispeech.py
+++ b/lhotse/bin/modes/recipes/librispeech.py
@@ -34,11 +34,19 @@
     default=1,
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
+@click.option(
+    "--normalize-text",
+    type=click.Choice(["none", "lower"], case_sensitive=False),
+    default="none",
+    help="Conversion of transcripts to lower-case (originally in upper-case).",
+    show_default=True,
+)
 def librispeech(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
     alignments_dir: Pathlike,
     dataset_parts: Sequence[str],
+    normalize_text: str,
     num_jobs: int,
 ):
     """(Mini) Librispeech ASR data preparation."""
@@ -50,6 +58,7 @@ def librispeech(
         alignments_dir=alignments_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
+        normalize_text=normalize_text,
     )
 
 
diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
index 5a1040645..fd84329da 100644
--- a/lhotse/recipes/commonvoice.py
+++ b/lhotse/recipes/commonvoice.py
@@ -18,6 +18,7 @@
 from collections import defaultdict
 from concurrent.futures.process import ProcessPoolExecutor
 from contextlib import contextmanager
+from multiprocessing import get_context as mp_get_context
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
@@ -198,7 +199,11 @@ def _prepare_part(
     tsv_path = lang_path / f"{part}.tsv"
 
     with disable_ffmpeg_torchaudio_info():
-        with ProcessPoolExecutor(num_jobs) as ex:
+        with ProcessPoolExecutor(
+            max_workers=num_jobs,
+            mp_context=mp_get_context("spawn"),
+        ) as ex:
+
             futures = []
             recordings = []
             supervisions = []
diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
index 0b654514a..faa48f2a9 100644
--- a/lhotse/recipes/librispeech.py
+++ b/lhotse/recipes/librispeech.py
@@ -114,6 +114,7 @@ def prepare_librispeech(
     alignments_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = "auto",
     output_dir: Optional[Pathlike] = None,
+    normalize_text: str = "none",
     num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
@@ -126,6 +127,9 @@ def prepare_librispeech(
     :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
         By default we will infer which parts are available in ``corpus_dir``.
     :param output_dir: Pathlike, the path where to write the manifests.
+    :param normalize_text: str, "none" or "lower",
+        for "lower" the transcripts are converted to lower-case.
+    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
     """
     corpus_dir = Path(corpus_dir)
@@ -204,6 +208,13 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
+            # Normalize text to lowercase
+            if normalize_text == "lower":
+                to_lower = lambda text: text.lower()
+                supervision_set = SupervisionSet.from_segments(
+                    [s.transform_text(to_lower) for s in supervision_set]
+                )
+
             recording_set, supervision_set = fix_manifests(
                 recording_set, supervision_set
             )