update_recipes

- commonvoice: use the "spawn" multiprocessing context in `ProcessPoolExecutor` - eval2000: bugfix (a `@click.option` name must begin with `--`) - librispeech: add a `--to-lowercase` option to the librispeech data preparation (transcripts are uppercase by default)
lhotse-speech · Nov 6, 2023 · e67ea29 · e67ea29
1 parent 8a4b3b5
commit e67ea29
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 2 deletions.
diff --git a/lhotse/bin/modes/recipes/eval2000.py b/lhotse/bin/modes/recipes/eval2000.py
@@ -11,7 +11,7 @@
 @click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False))
 @click.argument("output-dir", type=click.Path())
 @click.option(
-    "transcript-dir",
+    "--transcript-dir",
     type=click.Path(exists=True, file_okay=False),
     default=None,
     required=False,

diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py
@@ -34,11 +34,18 @@
     default=1,
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
+@click.option(
+    "--to-lowercase",
+    type=bool,
+    default=False,
+    help="Conversion of transcripts to lower-case (originally in uppercase).",
+)
 def librispeech(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
     alignments_dir: Pathlike,
     dataset_parts: Sequence[str],
+    to_lowercase: bool,
     num_jobs: int,
 ):
     """(Mini) Librispeech ASR data preparation."""
@@ -50,6 +57,7 @@ def librispeech(
         alignments_dir=alignments_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
+        to_lowercase=to_lowercase,
     )
 
 

diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
@@ -18,6 +18,7 @@
 from collections import defaultdict
 from concurrent.futures.process import ProcessPoolExecutor
 from contextlib import contextmanager
+from multiprocessing import get_context as mp_get_context
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
@@ -198,7 +199,11 @@ def _prepare_part(
     tsv_path = lang_path / f"{part}.tsv"
 
     with disable_ffmpeg_torchaudio_info():
-        with ProcessPoolExecutor(num_jobs) as ex:
+        with ProcessPoolExecutor(
+            max_workers=num_jobs,
+            mp_context=mp_get_context("spawn"),
+        ) as ex:
+
             futures = []
             recordings = []
             supervisions = []

diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
@@ -114,6 +114,7 @@ def prepare_librispeech(
     alignments_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = "auto",
     output_dir: Optional[Pathlike] = None,
+    to_lowercase: bool = False,
     num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
@@ -126,6 +127,8 @@ def prepare_librispeech(
     :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
         By default we will infer which parts are available in ``corpus_dir``.
     :param output_dir: Pathlike, the path where to write the manifests.
+    :param to_lowercase: Bool, if True, the transcripts are converted to lower-case.
+    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
     """
     corpus_dir = Path(corpus_dir)
@@ -204,6 +207,12 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
+            if to_lowercase:
+                to_lower = lambda text: text.lower()
+                supervision_set = SupervisionSet.from_segments(
+                    [s.transform_text(to_lower) for s in supervision_set]
+                )
+
             recording_set, supervision_set = fix_manifests(
                 recording_set, supervision_set
             )