From e67ea2973d2fe943db7882f2eb864162565c64b0 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 6 Nov 2023 16:58:28 +0100 Subject: [PATCH 1/2] update_recipes - commonvoice : use "spawn" context in `ProcessPoolExecutor` - eval2000 : bugfix (@click.option must begin with `--*`) - librispeech : add `--to-lowercase` for librispeech dataprep (default are uppercase transcripts) --- lhotse/bin/modes/recipes/eval2000.py | 2 +- lhotse/bin/modes/recipes/librispeech.py | 8 ++++++++ lhotse/recipes/commonvoice.py | 7 ++++++- lhotse/recipes/librispeech.py | 9 +++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/lhotse/bin/modes/recipes/eval2000.py b/lhotse/bin/modes/recipes/eval2000.py index 6150a88bc..e3d278ce6 100644 --- a/lhotse/bin/modes/recipes/eval2000.py +++ b/lhotse/bin/modes/recipes/eval2000.py @@ -11,7 +11,7 @@ @click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False)) @click.argument("output-dir", type=click.Path()) @click.option( - "transcript-dir", + "--transcript-dir", type=click.Path(exists=True, file_okay=False), default=None, required=False, diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py index 0f922c104..3674f2011 100644 --- a/lhotse/bin/modes/recipes/librispeech.py +++ b/lhotse/bin/modes/recipes/librispeech.py @@ -34,11 +34,18 @@ default=1, help="How many threads to use (can give good speed-ups with slow disks).", ) +@click.option( + "--to-lowercase", + type=bool, + default=False, + help="Conversion of transcripts to lower-vase (originally in uppercase).", +) def librispeech( corpus_dir: Pathlike, output_dir: Pathlike, alignments_dir: Pathlike, dataset_parts: Sequence[str], + to_lowercase: bool, num_jobs: int, ): """(Mini) Librispeech ASR data preparation.""" @@ -50,6 +57,7 @@ def librispeech( alignments_dir=alignments_dir, num_jobs=num_jobs, dataset_parts=dataset_parts, + to_lowercase=to_lowercase, ) diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py index 5a1040645..fd84329da 100644 --- a/lhotse/recipes/commonvoice.py +++ b/lhotse/recipes/commonvoice.py @@ -18,6 +18,7 @@ from collections import defaultdict from concurrent.futures.process import ProcessPoolExecutor from contextlib import contextmanager +from multiprocessing import get_context as mp_get_context from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union @@ -198,7 +199,11 @@ def _prepare_part( tsv_path = lang_path / f"{part}.tsv" with disable_ffmpeg_torchaudio_info(): - with ProcessPoolExecutor(num_jobs) as ex: + with ProcessPoolExecutor( + max_workers=num_jobs, + mp_context=mp_get_context("spawn"), + ) as ex: + futures = [] recordings = [] supervisions = [] diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py index 0b654514a..e1bf49d0c 100644 --- a/lhotse/recipes/librispeech.py +++ b/lhotse/recipes/librispeech.py @@ -114,6 +114,7 @@ def prepare_librispeech( alignments_dir: Optional[Pathlike] = None, dataset_parts: Union[str, Sequence[str]] = "auto", output_dir: Optional[Pathlike] = None, + to_lowercase: bool = False, num_jobs: int = 1, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ @@ -126,6 +127,8 @@ def prepare_librispeech( :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'. By default we will infer which parts are available in ``corpus_dir``. :param output_dir: Pathlike, the path where to write the manifests. + :param to_lowercase: Bool, if True, the transcripts are converted to lower-case. + :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) @@ -204,6 +207,12 @@ def prepare_librispeech( recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) + if to_lowercase: + to_lower = lambda text: text.lower() + supervision_set = SupervisionSet.from_segments( + [s.transform_text(to_lower) for s in supervision_set] + ) + recording_set, supervision_set = fix_manifests( recording_set, supervision_set ) From ae2518b989cfef0abba591220575e93d20d04cda Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Tue, 7 Nov 2023 11:17:09 +0100 Subject: [PATCH 2/2] librispeech, redoing text-norm as `--normalize-text=['none','lower']` as desh2608 suggested --- lhotse/bin/modes/recipes/librispeech.py | 13 +++++++------ lhotse/recipes/librispeech.py | 8 +++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py index 3674f2011..7d464dc45 100644 --- a/lhotse/bin/modes/recipes/librispeech.py +++ b/lhotse/bin/modes/recipes/librispeech.py @@ -35,17 +35,18 @@ help="How many threads to use (can give good speed-ups with slow disks).", ) @click.option( - "--to-lowercase", - type=bool, - default=False, - help="Conversion of transcripts to lower-vase (originally in uppercase).", + "--normalize-text", + type=click.Choice(["none", "lower"], case_sensitive=False), + default="none", + help="Conversion of transcripts to lower-case (originally in upper-case).", + show_default=True, ) def librispeech( corpus_dir: Pathlike, output_dir: Pathlike, alignments_dir: Pathlike, dataset_parts: Sequence[str], - to_lowercase: bool, + normalize_text: str, num_jobs: int, ): """(Mini) Librispeech ASR data preparation.""" @@ -57,7 +58,7 @@ def librispeech( alignments_dir=alignments_dir, num_jobs=num_jobs, dataset_parts=dataset_parts, - to_lowercase=to_lowercase, + normalize_text=normalize_text, ) diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py index e1bf49d0c..faa48f2a9 100644 --- a/lhotse/recipes/librispeech.py +++ b/lhotse/recipes/librispeech.py @@ -114,7 +114,7 @@ def prepare_librispeech( alignments_dir: Optional[Pathlike] = None, dataset_parts: Union[str, Sequence[str]] = "auto", output_dir: Optional[Pathlike] = None, - to_lowercase: bool = False, + normalize_text: str = "none", num_jobs: int = 1, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ @@ -127,7 +127,8 @@ def prepare_librispeech( :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'. By default we will infer which parts are available in ``corpus_dir``. :param output_dir: Pathlike, the path where to write the manifests. - :param to_lowercase: Bool, if True, the transcripts are converted to lower-case. + :param normalize_text: str, "none" or "lower", + for "lower" the transcripts are converted to lower-case. :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ @@ -207,7 +208,8 @@ def prepare_librispeech( recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) - if to_lowercase: + # Normalize text to lowercase + if normalize_text == "lower": to_lower = lambda text: text.lower() supervision_set = SupervisionSet.from_segments( [s.transform_text(to_lower) for s in supervision_set]