lhotse-speech · pzelasko · Oct 8, 2023 · Oct 8, 2023 · Oct 8, 2023
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -121,6 +121,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_hifitts`
   * - HI-MIA (including HI-MIA-CW)
     - :func:`lhotse.recipes.prepare_himia`
+  * - ICMC-ASR
+    - :func:`lhotse.recipes.prepare_icmcasr`
   * - ICSI
     - :func:`lhotse.recipes.prepare_icsi`
   * - IWSLT22_Ta

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -40,6 +40,7 @@
 from .heroico import *
 from .hifitts import *
 from .himia import *
+from .icmcasr import *
 from .icsi import *
 from .iwslt22_ta import *
 from .kespeech import *

diff --git a/lhotse/bin/modes/recipes/icmcasr.py b/lhotse/bin/modes/recipes/icmcasr.py
@@ -0,0 +1,30 @@
+from typing import Dict, List, Optional, Tuple, Union
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.icmcasr import prepare_icmcasr
+from lhotse.utils import Pathlike
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+def icmcasr(
+    corpus_dir: Pathlike,
+    output_dir: Optional[Pathlike] = None,
+    num_jobs: int = 1,
+):
+    """ICMC-ASR data preparation."""
+    prepare_icmcasr(
+        corpus_dir=corpus_dir,
+        output_dir=output_dir,
+        num_jobs=num_jobs,
+    )
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
@@ -37,6 +37,7 @@
 from .heroico import download_heroico, prepare_heroico
 from .hifitts import download_hifitts, prepare_hifitts
 from .himia import download_himia, prepare_himia
+from .icmcasr import prepare_icmcasr
 from .icsi import download_icsi, prepare_icsi
 from .iwslt22_ta import prepare_iwslt22_ta
 from .kespeech import prepare_kespeech

diff --git a/lhotse/recipes/icmcasr.py b/lhotse/recipes/icmcasr.py
@@ -0,0 +1,171 @@
+"""
+The ICMC-ASR Grand Challenge dataset is collected in a hybrid electric vehicle with speakers sitting in different positions, including the driver seat and passenger seats. The total number of speakers is over 160 and all of them are native Chinese speakers speaking Mandarin without strong accents. To comprehensively capture speech signals of the entire cockpit, two types of recording devices are used: far-field and near-field recording devices. 8 distributed microphones are placed at four seats in the car, which are the driver's seat (DS01C01, DX01C01), the passenger seat (DS02C01, DX02C01), the rear right seat (DS03C01, DX03C01) and the rear left seat (DS04C01, DX04C01). Additionally, 2 linear microphone arrays, each consisting of 2 microphones, are placed on the display screen (DL01C01, DL02C02) and at the center of the inner sunroof (DL02C01, DL02C02), respectively. All 12 channels of far-field data are time-synchronized and included in the released dataset as far-field data. For transcription purposes, each speaker wears a high-fidelity headphone to record near-field audio, denoted by the seat where the speaker is situated. Specifically, DA01, DA02, DA03, and DA04 represent the driver seat, passenger seat, rear right seat and rear left seat, respectively. The near-field data only have single-channel audio recordings. Additionally, a sizable real noise dataset is provided, following the recording setup of the far-field data but without any speaker talking, to facilitate research on in-car scenario data simulation technology.
+
+Participants can obtain the datasets at https://icmcasr.org - please download the datasets manually.
+"""
+
+import logging
+import os
+from collections import defaultdict
+from concurrent.futures.thread import ThreadPoolExecutor
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+from tqdm.auto import tqdm
+
+from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
+from lhotse.recipes.utils import manifests_exist
+from lhotse.supervision import SupervisionSegment, SupervisionSet
+from lhotse.utils import Pathlike
+
+ICMCASR = ("train",)  # TODO: Support all subsets when released
+POSITION = ("DA01", "DA02", "DA03", "DA04")
+
+
+def _parse_utterance(
+    corpus_dir: Pathlike,
+    section_path: Pathlike,
+) -> Optional[Tuple[Recording, SupervisionSegment]]:
+    recordings = []
+    segments = []
+    for position in POSITION:
+        text_path = (section_path / (position + ".TextGrid")).resolve()
+        if not text_path.is_file():
+            continue
+
+        audio_path = (section_path / (position + ".wav")).resolve()
+        recording_id = (
+            str(section_path / position)
+            .replace(str(corpus_dir) + "/", "")
+            .replace("/", "-")
+        )
+
+        recordings.append(
+            Recording.from_file(path=audio_path, recording_id=recording_id)
+        )
+
+        with open(text_path) as f:
+            datalines = f.read().splitlines()
+
+        seq = 0
+        for dataline in datalines:
+            if "name" in dataline:
+                speaker = dataline.split('"')[1].strip()
+            elif "xmin =" in dataline:
+                start = float(dataline.split("=")[1].strip())
+            elif "xmax =" in dataline:
+                end = float(dataline.split("=")[1].strip())
+            elif "text" in dataline:
+                text = dataline.split('"')[1].strip()
+                if len(text) > 0:
+                    if float(recordings[-1].duration) < end:
+                        duration = float(recordings[-1].duration) - start
+                    else:
+                        duration = end - start
+                    segment_id = recording_id + "-" + str(seq)
+                    segments.append(
+                        SupervisionSegment(
+                            id=segment_id,
+                            recording_id=recording_id,
+                            start=start,
+                            duration=duration,
+                            channel=0,
+                            language="Chinese",
+                            speaker=speaker,
+                            text=text,
+                        )
+                    )
+                    seq += 1
+
+    return recordings, segments
+
+
+def _prepare_subset(
+    subset: str,
+    corpus_dir: Pathlike,
+    num_jobs: int = 1,
+) -> Tuple[RecordingSet, SupervisionSet]:
+    """
+    Returns the RecordingSet and SupervisionSet given a dataset part.
+    :param subset: str, the name of the subset.
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :return: the RecordingSet and SupervisionSet for the given subset.
+    """
+    corpus_dir = Path(corpus_dir)
+    part_path = corpus_dir / subset
+    sections = os.listdir(part_path)
+
+    with ThreadPoolExecutor(num_jobs) as ex:
+        futures = []
+        recording_set = []
+        supervision_set = []
+        for section in tqdm(sections, desc="Distributing tasks"):
+            section_path = part_path / section
+            futures.append(ex.submit(_parse_utterance, corpus_dir, section_path))
+
+        for future in tqdm(futures, desc="Processing"):
+            result = future.result()
+            if result is None:
+                continue
+            recordings, segments = result
+            recording_set.extend(recordings)
+            supervision_set.extend(segments)
+
+        recording_set = RecordingSet.from_recordings(recording_set)
+        supervision_set = SupervisionSet.from_segments(supervision_set)
+
+        # Fix manifests
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
+    return recording_set, supervision_set
+
+
+def prepare_icmcasr(
+    corpus_dir: Pathlike,
+    output_dir: Optional[Pathlike] = None,
+    num_jobs: int = 1,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Returns the manifests, which consist of the Recordings and Supervisions.
+    :param corpus_dir: Path to the ICMC-ASR dataset.
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
+    """
+    corpus_dir = Path(corpus_dir)
+
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+
+    logging.info("Preparing ICMC-ASR...")
+
+    subsets = ICMCASR
+
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    manifests = defaultdict(dict)
+
+    for part in tqdm(subsets, desc="Dataset parts"):
+        logging.info(f"Processing ICMC-ASR subset: {part}")
+        if manifests_exist(
+            part=part,
+            output_dir=output_dir,
+            prefix="icmcasr",
+            suffix="jsonl.gz",
+        ):
+            logging.info(f"ICMC-ASR subset: {part} already prepared - skipping.")
+            continue
+
+        recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs)
+
+        if output_dir is not None:
+            supervision_set.to_file(
+                output_dir / f"icmcasr_supervisions_{part}.jsonl.gz"
+            )
+            recording_set.to_file(output_dir / f"icmcasr_recordings_{part}.jsonl.gz")
+
+        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}
+
+    return manifests
diff --git a/lhotse/recipes/librilight.py b/lhotse/recipes/librilight.py
@@ -8,7 +8,7 @@
 English and a small labelled dataset (10h, 1h, and 10 min) plus metrics,
 trainable baseline models, and pretrained models that use these datasets.
 
-It is covered in more detail at https://arxiv.org/abs/1912.07875.
+It is covered in more detail at https://arxiv.org/abs/1912.07875
 
 This data is very huge - please download manually at LIBRILIGHT_URL.
 """