From 9491bdbc035000bfcfad468c85f4953d3027c0ec Mon Sep 17 00:00:00 2001 From: root Date: Fri, 24 Nov 2023 11:44:00 +0000 Subject: [PATCH 1/3] save sdm files into mdm file to do gss --- lhotse/bin/modes/recipes/icmcasr.py | 5 ++++- lhotse/recipes/icmcasr.py | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/lhotse/bin/modes/recipes/icmcasr.py b/lhotse/bin/modes/recipes/icmcasr.py index a7c137c20..a2b5594c6 100644 --- a/lhotse/bin/modes/recipes/icmcasr.py +++ b/lhotse/bin/modes/recipes/icmcasr.py @@ -18,7 +18,10 @@ help="How many threads to use (can give good speed-ups with slow disks).", ) @click.option( - "--mic", type=click.Choice(["ihm", "sdm"]), default="ihm", help="Microphone type." + "--mic", + type=click.Choice(["ihm", "sdm", "mdm"]), + default="ihm", + help="Microphone type.", ) def icmcasr( corpus_dir: Pathlike, diff --git a/lhotse/recipes/icmcasr.py b/lhotse/recipes/icmcasr.py index 1f211e68b..0d1c70234 100644 --- a/lhotse/recipes/icmcasr.py +++ b/lhotse/recipes/icmcasr.py @@ -6,6 +6,7 @@ import logging import os +import subprocess from collections import defaultdict from concurrent.futures.thread import ThreadPoolExecutor from pathlib import Path @@ -21,7 +22,8 @@ ICMCASR = ("train", "dev") # TODO: Support all subsets when released POSITION = ("DA01", "DA02", "DA03", "DA04") -SDM_POSITION = ("DX01C01", "DX02C01", "DX03C01", "DX04C01", "DX05C01", "DX06C01") +# ignore "DX05C01", "DX06C01" +SDM_POSITION = ("DX01C01", "DX02C01", "DX03C01", "DX04C01") def _parse_utterance( @@ -60,6 +62,22 @@ def _parse_utterance( + f"-{position}" for sdm_position in SDM_POSITION ] + elif mic == "mdm": + wav_path_stereo = section_path / "DXmixC01.wav" + if not wav_path_stereo.is_file(): + audio_paths = [ + (section_path / (sdm_position + ".wav")).resolve() + for sdm_position in SDM_POSITION + ] + cmd = f"sox -M -c 1 {audio_paths[0]} -c 1 {audio_paths[1]} -c 1 {audio_paths[2]} -c 1 {audio_paths[3]} {wav_path_stereo.resolve()}" + subprocess.run(cmd, shell=True, check=True) + audio_paths = [wav_path_stereo.resolve()] + recording_ids = [ + str(section_path / "DXmixC01") + .replace(str(corpus_dir) + "/", "") + .replace("/", "-") + + f"-{position}" + ] else: raise ValueError(f"Unsupported mic type: {mic}") @@ -87,7 +105,7 @@ def _parse_utterance( recording_id=recording_id, start=start, duration=round(end - start, 4), - channel=0, + channel=0 if mic in ["sdm", "ihm"] else list(range(4)), language="Chinese", speaker=speaker, text=normalize_text_alimeeting(text), From 4eb2fdc0f952f808cb4a50e32a6e723cab75edf6 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Nov 2023 06:16:50 +0000 Subject: [PATCH 2/3] remove mdm files save --- lhotse/recipes/icmcasr.py | 56 +++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/lhotse/recipes/icmcasr.py b/lhotse/recipes/icmcasr.py index 0d1c70234..adb89c97a 100644 --- a/lhotse/recipes/icmcasr.py +++ b/lhotse/recipes/icmcasr.py @@ -14,7 +14,8 @@ from tqdm.auto import tqdm -from lhotse.audio import Recording, RecordingSet +from lhotse.audio import AudioSource, Recording, RecordingSet +from lhotse.audio.backend import info from lhotse.qa import fix_manifests, validate_recordings_and_supervisions from lhotse.recipes.utils import manifests_exist, normalize_text_alimeeting from lhotse.supervision import SupervisionSegment, SupervisionSet @@ -63,15 +64,7 @@ def _parse_utterance( for sdm_position in SDM_POSITION ] elif mic == "mdm": - wav_path_stereo = section_path / "DXmixC01.wav" - if not wav_path_stereo.is_file(): - audio_paths = [ - (section_path / (sdm_position + ".wav")).resolve() - for sdm_position in SDM_POSITION - ] - cmd = f"sox -M -c 1 {audio_paths[0]} -c 1 {audio_paths[1]} -c 1 {audio_paths[2]} -c 1 {audio_paths[3]} {wav_path_stereo.resolve()}" - subprocess.run(cmd, shell=True, check=True) - audio_paths = [wav_path_stereo.resolve()] + audio_paths = ["fake_audio_path_for_mdm"] recording_ids = [ str(section_path / "DXmixC01") .replace(str(corpus_dir) + "/", "") @@ -82,14 +75,43 @@ def _parse_utterance( raise ValueError(f"Unsupported mic type: {mic}") for audio_path, recording_id in zip(audio_paths, recording_ids): + if mic == "mdm": + channel_paths = [ + (section_path / (position + ".wav")).resolve() + for position in SDM_POSITION + ] + audio_info = info( + channel_paths[0], + force_opus_sampling_rate=None, + force_read_audio=False, + ) + recordings.append( + Recording( + id=recording_id, + sources=[ + AudioSource( + type="file", + channels=[idx], + source=str(audio_path), + ) + for idx, audio_path in enumerate(channel_paths) + ], + sampling_rate=16000, + num_samples=audio_info.frames, + duration=audio_info.duration, + ) + ) # check if audio_path exists, if not, then skip - if not audio_path.is_file(): - # give some warning - logging.warning(f"Audio file {audio_path} does not exist - skipping.") - continue - recordings.append( - Recording.from_file(path=audio_path, recording_id=recording_id) - ) + else: + if not audio_path.is_file(): + # give some warning + logging.warning( + f"Audio file {audio_path} does not exist - skipping." + ) + continue + recordings.append( + Recording.from_file(path=audio_path, recording_id=recording_id) + ) tg = textgrid.TextGrid.fromFile(str(text_path)) assert len(tg.tiers) == 1, f"Expected 1 tier, found {len(tg.tiers)} tiers." From 39d48e86ff0a51c833bc47d54964c4ab0313d01f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 28 Nov 2023 12:00:53 +0000 Subject: [PATCH 3/3] add comments for ignored signals --- lhotse/recipes/icmcasr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lhotse/recipes/icmcasr.py b/lhotse/recipes/icmcasr.py index adb89c97a..20f25112f 100644 --- a/lhotse/recipes/icmcasr.py +++ b/lhotse/recipes/icmcasr.py @@ -6,7 +6,6 @@ import logging import os -import subprocess from collections import defaultdict from concurrent.futures.thread import ThreadPoolExecutor from pathlib import Path @@ -23,7 +22,9 @@ ICMCASR = ("train", "dev") # TODO: Support all subsets when released POSITION = ("DA01", "DA02", "DA03", "DA04") -# ignore "DX05C01", "DX06C01" +# ignore "DX05C01", "DX06C01", +# which are 2-channel reference signals for AEC. +# see https://github.com/MrSupW/ICMC-ASR_Baseline/tree/main SDM_POSITION = ("DX01C01", "DX02C01", "DX03C01", "DX04C01")