From e67ea2973d2fe943db7882f2eb864162565c64b0 Mon Sep 17 00:00:00 2001
From: Karel Vesely <vesis84@gmail.com>
Date: Mon, 6 Nov 2023 16:58:28 +0100
Subject: [PATCH 1/2] update_recipes

- commonvoice : use "spawn" context in `ProcessPoolExecutor`
- eval2000 : bugfix (@click.option must begin with `--*`)
- librispeech : add `--to-lowercase` option for librispeech dataprep (transcripts are uppercase by default)
---
 lhotse/bin/modes/recipes/eval2000.py    | 2 +-
 lhotse/bin/modes/recipes/librispeech.py | 8 ++++++++
 lhotse/recipes/commonvoice.py           | 7 ++++++-
 lhotse/recipes/librispeech.py           | 9 +++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/lhotse/bin/modes/recipes/eval2000.py b/lhotse/bin/modes/recipes/eval2000.py
index 6150a88bc..e3d278ce6 100644
--- a/lhotse/bin/modes/recipes/eval2000.py
+++ b/lhotse/bin/modes/recipes/eval2000.py
@@ -11,7 +11,7 @@
 @click.argument("corpus-dir", type=click.Path(exists=True, file_okay=False))
 @click.argument("output-dir", type=click.Path())
 @click.option(
-    "transcript-dir",
+    "--transcript-dir",
     type=click.Path(exists=True, file_okay=False),
     default=None,
     required=False,
diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py
index 0f922c104..3674f2011 100644
--- a/lhotse/bin/modes/recipes/librispeech.py
+++ b/lhotse/bin/modes/recipes/librispeech.py
@@ -34,11 +34,18 @@
     default=1,
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
+@click.option(
+    "--to-lowercase",
+    type=bool,
+    default=False,
+    help="Conversion of transcripts to lower-vase (originally in uppercase).",
+)
 def librispeech(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
     alignments_dir: Pathlike,
     dataset_parts: Sequence[str],
+    to_lowercase: bool,
     num_jobs: int,
 ):
     """(Mini) Librispeech ASR data preparation."""
@@ -50,6 +57,7 @@ def librispeech(
         alignments_dir=alignments_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
+        to_lowercase=to_lowercase,
     )
 
 
diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
index 5a1040645..fd84329da 100644
--- a/lhotse/recipes/commonvoice.py
+++ b/lhotse/recipes/commonvoice.py
@@ -18,6 +18,7 @@
 from collections import defaultdict
 from concurrent.futures.process import ProcessPoolExecutor
 from contextlib import contextmanager
+from multiprocessing import get_context as mp_get_context
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
@@ -198,7 +199,11 @@ def _prepare_part(
     tsv_path = lang_path / f"{part}.tsv"
 
     with disable_ffmpeg_torchaudio_info():
-        with ProcessPoolExecutor(num_jobs) as ex:
+        with ProcessPoolExecutor(
+            max_workers=num_jobs,
+            mp_context=mp_get_context("spawn"),
+        ) as ex:
+
             futures = []
             recordings = []
             supervisions = []
diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
index 0b654514a..e1bf49d0c 100644
--- a/lhotse/recipes/librispeech.py
+++ b/lhotse/recipes/librispeech.py
@@ -114,6 +114,7 @@ def prepare_librispeech(
     alignments_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = "auto",
     output_dir: Optional[Pathlike] = None,
+    to_lowercase: bool = False,
     num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
@@ -126,6 +127,8 @@ def prepare_librispeech(
     :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
         By default we will infer which parts are available in ``corpus_dir``.
     :param output_dir: Pathlike, the path where to write the manifests.
+    :param to_lowercase: Bool, if True, the transcripts are converted to lower-case.
+    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
     """
     corpus_dir = Path(corpus_dir)
@@ -204,6 +207,12 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
+            if to_lowercase:
+                to_lower = lambda text: text.lower()
+                supervision_set = SupervisionSet.from_segments(
+                    [s.transform_text(to_lower) for s in supervision_set]
+                )
+
             recording_set, supervision_set = fix_manifests(
                 recording_set, supervision_set
             )

From ae2518b989cfef0abba591220575e93d20d04cda Mon Sep 17 00:00:00 2001
From: Karel Vesely <vesis84@gmail.com>
Date: Tue, 7 Nov 2023 11:17:09 +0100
Subject: [PATCH 2/2] librispeech: redo text-norm as
 `--normalize-text=['none','lower']`, as suggested by desh2608

---
 lhotse/bin/modes/recipes/librispeech.py | 13 +++++++------
 lhotse/recipes/librispeech.py           |  8 +++++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py
index 3674f2011..7d464dc45 100644
--- a/lhotse/bin/modes/recipes/librispeech.py
+++ b/lhotse/bin/modes/recipes/librispeech.py
@@ -35,17 +35,18 @@
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
 @click.option(
-    "--to-lowercase",
-    type=bool,
-    default=False,
-    help="Conversion of transcripts to lower-vase (originally in uppercase).",
+    "--normalize-text",
+    type=click.Choice(["none", "lower"], case_sensitive=False),
+    default="none",
+    help="Conversion of transcripts to lower-case (originally in upper-case).",
+    show_default=True,
 )
 def librispeech(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
     alignments_dir: Pathlike,
     dataset_parts: Sequence[str],
-    to_lowercase: bool,
+    normalize_text: str,
     num_jobs: int,
 ):
     """(Mini) Librispeech ASR data preparation."""
@@ -57,7 +58,7 @@ def librispeech(
         alignments_dir=alignments_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
-        to_lowercase=to_lowercase,
+        normalize_text=normalize_text,
     )
 
 
diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
index e1bf49d0c..faa48f2a9 100644
--- a/lhotse/recipes/librispeech.py
+++ b/lhotse/recipes/librispeech.py
@@ -114,7 +114,7 @@ def prepare_librispeech(
     alignments_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = "auto",
     output_dir: Optional[Pathlike] = None,
-    to_lowercase: bool = False,
+    normalize_text: str = "none",
     num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
@@ -127,7 +127,8 @@ def prepare_librispeech(
     :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
         By default we will infer which parts are available in ``corpus_dir``.
     :param output_dir: Pathlike, the path where to write the manifests.
-    :param to_lowercase: Bool, if True, the transcripts are converted to lower-case.
+    :param normalize_text: str, "none" or "lower",
+        for "lower" the transcripts are converted to lower-case.
     :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
     """
@@ -207,7 +208,8 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
-            if to_lowercase:
+            # Normalize text to lowercase
+            if normalize_text == "lower":
                 to_lower = lambda text: text.lower()
                 supervision_set = SupervisionSet.from_segments(
                     [s.transform_text(to_lower) for s in supervision_set]