lhotse-speech · pzelasko · Nov 10, 2023 · Nov 6, 2023 · Nov 7, 2023
diff --git a/lhotse/bin/modes/recipes/eval2000.py b/lhotse/bin/modes/recipes/eval2000.py
@@ -11,7 +11,7 @@
 @click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False))
 @click.argument("output-dir", type=click.Path())
 @click.option(
-    "transcript-dir",
+    "--transcript-dir",
     type=click.Path(exists=True, file_okay=False),
     default=None,
     required=False,

diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py
@@ -34,11 +34,19 @@
     default=1,
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
+@click.option(
+    "--normalize-text",
+    type=click.Choice(["none", "lower"], case_sensitive=False),
+    default="none",
+    help="Conversion of transcripts to lower-case (originally in upper-case).",
+    show_default=True,
+)
 def librispeech(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
     alignments_dir: Pathlike,
     dataset_parts: Sequence[str],
+    normalize_text: str,
     num_jobs: int,
 ):
     """(Mini) Librispeech ASR data preparation."""
@@ -50,6 +58,7 @@ def librispeech(
         alignments_dir=alignments_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
+        normalize_text=normalize_text,
     )
 
 

diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
@@ -18,6 +18,7 @@
 from collections import defaultdict
 from concurrent.futures.process import ProcessPoolExecutor
 from contextlib import contextmanager
+from multiprocessing import get_context as mp_get_context
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
@@ -198,7 +199,11 @@ def _prepare_part(
     tsv_path = lang_path / f"{part}.tsv"
 
     with disable_ffmpeg_torchaudio_info():
-        with ProcessPoolExecutor(num_jobs) as ex:
+        with ProcessPoolExecutor(
+            max_workers=num_jobs,
+            mp_context=mp_get_context("spawn"),
+        ) as ex:
+
             futures = []
             recordings = []
             supervisions = []

diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
@@ -114,6 +114,7 @@ def prepare_librispeech(
     alignments_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = "auto",
     output_dir: Optional[Pathlike] = None,
+    normalize_text: str = "none",
     num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
@@ -126,6 +127,9 @@ def prepare_librispeech(
     :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
         By default we will infer which parts are available in ``corpus_dir``.
     :param output_dir: Pathlike, the path where to write the manifests.
+    :param normalize_text: str, either "none" (transcripts are left unchanged)
+        or "lower" (transcripts are converted to lower-case).
+    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
     """
     corpus_dir = Path(corpus_dir)
@@ -204,6 +208,13 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
+            # Normalize text to lowercase
+            if normalize_text == "lower":
+                to_lower = lambda text: text.lower()
+                supervision_set = SupervisionSet.from_segments(
+                    [s.transform_text(to_lower) for s in supervision_set]
+                )
+
             recording_set, supervision_set = fix_manifests(
                 recording_set, supervision_set
             )