From 48f3539bf8651f8a61852e125d6122d458bce45c Mon Sep 17 00:00:00 2001 From: yfy62 Date: Sun, 8 Oct 2023 17:29:17 +0800 Subject: [PATCH 1/2] Add ICMC-ASR corpus --- docs/corpus.rst | 2 + lhotse/bin/modes/recipes/__init__.py | 1 + lhotse/bin/modes/recipes/icmcasr.py | 30 +++++ lhotse/recipes/__init__.py | 1 + lhotse/recipes/icmcasr.py | 170 +++++++++++++++++++++++++++ lhotse/recipes/librilight.py | 2 +- 6 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 lhotse/bin/modes/recipes/icmcasr.py create mode 100644 lhotse/recipes/icmcasr.py diff --git a/docs/corpus.rst b/docs/corpus.rst index e7cdbe0b2..8873a403d 100644 --- a/docs/corpus.rst +++ b/docs/corpus.rst @@ -121,6 +121,8 @@ a CLI tool that create the manifests given a corpus directory. - :func:`lhotse.recipes.prepare_hifitts` * - HI-MIA (including HI-MIA-CW) - :func:`lhotse.recipes.prepare_himia` + * - ICMC-ASR + - :func:`lhotse.recipes.prepare_icmcasr` * - ICSI - :func:`lhotse.recipes.prepare_icsi` * - IWSLT22_Ta diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py index 32a3dc93e..e55da601e 100644 --- a/lhotse/bin/modes/recipes/__init__.py +++ b/lhotse/bin/modes/recipes/__init__.py @@ -40,6 +40,7 @@ from .heroico import * from .hifitts import * from .himia import * +from .icmcasr import * from .icsi import * from .iwslt22_ta import * from .kespeech import * diff --git a/lhotse/bin/modes/recipes/icmcasr.py b/lhotse/bin/modes/recipes/icmcasr.py new file mode 100644 index 000000000..1a11c06af --- /dev/null +++ b/lhotse/bin/modes/recipes/icmcasr.py @@ -0,0 +1,30 @@ +from typing import Dict, List, Optional, Tuple, Union + +import click + +from lhotse.bin.modes import download, prepare +from lhotse.recipes.icmcasr import prepare_icmcasr +from lhotse.utils import Pathlike + + +@prepare.command(context_settings=dict(show_default=True)) +@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True)) +@click.argument("output_dir", type=click.Path()) 
+@click.option( + "-j", + "--num-jobs", + type=int, + default=1, + help="How many threads to use (can give good speed-ups with slow disks).", +) +def icmcasr( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, + num_jobs: int = 1, +): + """ICMC-ASR data preparation.""" + prepare_icmcasr( + corpus_dir=corpus_dir, + output_dir=output_dir, + num_jobs=num_jobs, + ) diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py index 9ae74710c..a69c3f194 100644 --- a/lhotse/recipes/__init__.py +++ b/lhotse/recipes/__init__.py @@ -37,6 +37,7 @@ from .heroico import download_heroico, prepare_heroico from .hifitts import download_hifitts, prepare_hifitts from .himia import download_himia, prepare_himia +from .icmcasr import prepare_icmcasr from .icsi import download_icsi, prepare_icsi from .iwslt22_ta import prepare_iwslt22_ta from .kespeech import prepare_kespeech diff --git a/lhotse/recipes/icmcasr.py b/lhotse/recipes/icmcasr.py new file mode 100644 index 000000000..2666408ff --- /dev/null +++ b/lhotse/recipes/icmcasr.py @@ -0,0 +1,170 @@ +""" +The ICMC-ASR Grand Challenge dataset is collected in a hybrid electric vehicle with speakers sitting in different positions, including the driver seat and passenger seats. The total number of speakers is over 160 and all of them are native Chinese speakers speaking Mandarin without strong accents. To comprehensively capture speech signals of the entire cockpit, two types of recording devices are used: far-field and near-field recording devices. 8 distributed microphones are placed at four seats in the car, which are the driver's seat (DS01C01, DX01C01), the passenger seat (DS02C01, DX02C01), the rear right seat (DS03C01, DX03C01) and the rear left seat (DS04C01, DX04C01). Additionally, 2 linear microphone arrays, each consisting of 2 microphones, are placed on the display screen (DL01C01, DL02C02) and at the center of the inner sunroof (DL02C01, DL02C02), respectively. 
All 12 channels of far-field data are time-synchronized and included in the released dataset as far-field data. For transcription purposes, each speaker wears a high-fidelity headphone to record near-field audio, denoted by the seat where the speaker is situated. Specifically, DA01, DA02, DA03, and DA04 represent the driver seat, passenger seat, rear right seat and rear left seat, respectively. The near-field data only have single-channel audio recordings. Additionally, a sizable real noise dataset is provided, following the recording setup of the far-field data but without speakers talking, to facilitate research of in-car scenario data simulation technology. + +Participants can obtain the datasets at https://icmcasr.org - please download the datasets manually. +""" + +import logging +import os +from collections import defaultdict +from concurrent.futures.thread import ThreadPoolExecutor +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple, Union + +from lhotse.audio import Recording, RecordingSet +from lhotse.qa import fix_manifests, validate_recordings_and_supervisions +from lhotse.recipes.utils import manifests_exist +from lhotse.supervision import SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike +from tqdm.auto import tqdm + +ICMCASR = ("train",) # TODO: Support all subsets when released +POSITION = ("DA01", "DA02", "DA03", "DA04") + + +def _parse_utterance( + corpus_dir: Pathlike, + section_path: Pathlike, +) -> Optional[Tuple[Recording, SupervisionSegment]]: # NOTE(review): the body returns two lists (recordings, segments) and never None - annotation looks stale + recordings = [] + segments = [] + for position in POSITION: + text_path = (section_path / (position + ".TextGrid")).resolve() + if not text_path.is_file(): + continue + + audio_path = (section_path / (position + ".wav")).resolve() + recording_id = ( + str(section_path / position) + .replace(str(corpus_dir) + "/", "") # NOTE(review): assumes "/" as the path separator (POSIX only) + .replace("/", "-") + ) + + recordings.append( + Recording.from_file(path=audio_path, recording_id=recording_id) + ) + + with 
open(text_path) as f: # NOTE(review): no explicit encoding; transcripts are tagged language="Chinese" - presumably UTF-8, confirm and pass encoding= + datalines = f.read().splitlines() + + seq = 0 + for dataline in datalines: + if "name" in dataline: + speaker = dataline.split('"')[1].strip() + elif "xmin =" in dataline: + start = float(dataline.split("=")[1].strip()) + elif "xmax =" in dataline: + end = float(dataline.split("=")[1].strip()) + elif "text" in dataline: + text = dataline.split('"')[1].strip() + if len(text) > 0: + if float(recordings[-1].duration) < end: # clamp segments that run past the end of the audio + duration = float(recordings[-1].duration) - start # NOTE(review): negative if start is also past the end of the audio + else: + duration = end - start + segment_id = recording_id + "-" + str(seq) + segments.append( + SupervisionSegment( + id=segment_id, + recording_id=recording_id, + start=start, + duration=duration, + channel=0, + language="Chinese", + speaker=speaker, + text=text, + ) + ) + seq += 1 + + return recordings, segments + + +def _prepare_subset( + subset: str, + corpus_dir: Pathlike, + num_jobs: int = 1, +) -> Tuple[RecordingSet, SupervisionSet]: + """ + Returns the RecordingSet and SupervisionSet given a dataset part. + :param subset: str, the name of the subset. + :param corpus_dir: Pathlike, the path of the data dir. + :return: the RecordingSet and SupervisionSet for the given subset. 
+ """ + corpus_dir = Path(corpus_dir) + part_path = corpus_dir / subset + sections = os.listdir(part_path) + + with ThreadPoolExecutor(num_jobs) as ex: + futures = [] + recording_set = [] + supervision_set = [] + for section in tqdm(sections, desc="Distributing tasks"): + section_path = part_path / section + futures.append(ex.submit(_parse_utterance, corpus_dir, section_path)) + + for future in tqdm(futures, desc="Processing"): + result = future.result() + if result is None: + continue + recordings, segments = result + recording_set.extend(recordings) + supervision_set.extend(segments) + + recording_set = RecordingSet.from_recordings(recording_set) + supervision_set = SupervisionSet.from_segments(supervision_set) + + # Fix manifests + recording_set, supervision_set = fix_manifests(recording_set, supervision_set) + validate_recordings_and_supervisions(recording_set, supervision_set) + + return recording_set, supervision_set + + +def prepare_icmcasr( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, + num_jobs: int = 1, +) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: + """ + Returns the manifests which consist of the Recordings and Supervisions + :param corpus_dir: Path to the ICMC-ASR dataset. + :param output_dir: Pathlike, the path where to write the manifests. + :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'. 
+ """ + corpus_dir = Path(corpus_dir) + + assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" + + logging.info("Preparing ICMC-ASR...") + + subsets = ICMCASR + + if output_dir is not None: + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + manifests = defaultdict(dict) + + for part in tqdm(subsets, desc="Dataset parts"): + logging.info(f"Processing ICMC-ASR subset: {part}") + if manifests_exist( + part=part, + output_dir=output_dir, + prefix="icmcasr", + suffix="jsonl.gz", + ): + logging.info(f"ICMC-ASR subset: {part} already prepared - skipping.") + continue + + recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs) + + if output_dir is not None: + supervision_set.to_file( + output_dir / f"icmcasr_supervisions_{part}.jsonl.gz" + ) + recording_set.to_file(output_dir / f"icmcasr_recordings_{part}.jsonl.gz") + + manifests[part] = {"recordings": recording_set, "supervisions": supervision_set} + + return manifests diff --git a/lhotse/recipes/librilight.py b/lhotse/recipes/librilight.py index 3dd0dff31..aefc999ec 100644 --- a/lhotse/recipes/librilight.py +++ b/lhotse/recipes/librilight.py @@ -8,7 +8,7 @@ English and a small labelled dataset (10h, 1h, and 10 min) plus metrics, trainable baseline models, and pretrained models that use these datasets. -It is covered in more detail at https://arxiv.org/abs/1912.07875. +It is covered in more detail at https://arxiv.org/abs/1912.07875 This data is very huge - please download manually at LIBRILIGHT_URL. 
""" From 8308a5324761b94bc0d423bfe9273c1a81024feb Mon Sep 17 00:00:00 2001 From: yfy62 Date: Sun, 8 Oct 2023 17:33:50 +0800 Subject: [PATCH 2/2] Fix isort --- lhotse/recipes/icmcasr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lhotse/recipes/icmcasr.py b/lhotse/recipes/icmcasr.py index 2666408ff..1cd2eea26 100644 --- a/lhotse/recipes/icmcasr.py +++ b/lhotse/recipes/icmcasr.py @@ -11,12 +11,13 @@ from pathlib import Path from typing import Dict, List, Optional, Sequence, Tuple, Union +from tqdm.auto import tqdm + from lhotse.audio import Recording, RecordingSet from lhotse.qa import fix_manifests, validate_recordings_and_supervisions from lhotse.recipes.utils import manifests_exist from lhotse.supervision import SupervisionSegment, SupervisionSet from lhotse.utils import Pathlike -from tqdm.auto import tqdm ICMCASR = ("train",) # TODO: Support all subsets when released POSITION = ("DA01", "DA02", "DA03", "DA04")