diff --git a/lhotse/bin/modes/recipes/eval2000.py b/lhotse/bin/modes/recipes/eval2000.py index 6150a88bc..e3d278ce6 100644 --- a/lhotse/bin/modes/recipes/eval2000.py +++ b/lhotse/bin/modes/recipes/eval2000.py @@ -11,7 +11,7 @@ @click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False)) @click.argument("output-dir", type=click.Path()) @click.option( - "transcript-dir", + "--transcript-dir", type=click.Path(exists=True, file_okay=False), default=None, required=False, diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py index 0f922c104..3674f2011 100644 --- a/lhotse/bin/modes/recipes/librispeech.py +++ b/lhotse/bin/modes/recipes/librispeech.py @@ -34,11 +34,18 @@ default=1, help="How many threads to use (can give good speed-ups with slow disks).", ) +@click.option( + "--to-lowercase", + type=bool, + default=False, + help="Conversion of transcripts to lower-case (originally in uppercase).", +) def librispeech( corpus_dir: Pathlike, output_dir: Pathlike, alignments_dir: Pathlike, dataset_parts: Sequence[str], + to_lowercase: bool, num_jobs: int, ): """(Mini) Librispeech ASR data preparation.""" @@ -50,6 +57,7 @@ def librispeech( alignments_dir=alignments_dir, num_jobs=num_jobs, dataset_parts=dataset_parts, + to_lowercase=to_lowercase, ) diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py index 5a1040645..fd84329da 100644 --- a/lhotse/recipes/commonvoice.py +++ b/lhotse/recipes/commonvoice.py @@ -18,6 +18,7 @@ from collections import defaultdict from concurrent.futures.process import ProcessPoolExecutor from contextlib import contextmanager +from multiprocessing import get_context as mp_get_context from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union @@ -198,7 +199,11 @@ def _prepare_part( tsv_path = lang_path / f"{part}.tsv" with disable_ffmpeg_torchaudio_info(): - with ProcessPoolExecutor(num_jobs) as ex: + with ProcessPoolExecutor( + 
max_workers=num_jobs, + mp_context=mp_get_context("spawn"), + ) as ex: + futures = [] recordings = [] supervisions = [] diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py index 0b654514a..e1bf49d0c 100644 --- a/lhotse/recipes/librispeech.py +++ b/lhotse/recipes/librispeech.py @@ -114,6 +114,7 @@ def prepare_librispeech( alignments_dir: Optional[Pathlike] = None, dataset_parts: Union[str, Sequence[str]] = "auto", output_dir: Optional[Pathlike] = None, + to_lowercase: bool = False, num_jobs: int = 1, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ @@ -126,6 +127,8 @@ def prepare_librispeech( :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'. By default we will infer which parts are available in ``corpus_dir``. :param output_dir: Pathlike, the path where to write the manifests. + :param to_lowercase: Bool, if True, the transcripts are converted to lower-case. + :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) @@ -204,6 +207,12 @@ def prepare_librispeech( recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) + if to_lowercase: + to_lower = lambda text: text.lower() + supervision_set = SupervisionSet.from_segments( + [s.transform_text(to_lower) for s in supervision_set] + ) + recording_set, supervision_set = fix_manifests( recording_set, supervision_set )