Skip to content

Commit

Permalink
Add mdm preparation for icmcasr recipe (#1221)
Browse files (browse the repository at this point in the history)
  • Loading branch information
yuekaizhang committed Nov 28, 2023
1 parent db2072e commit 89ca0e6
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 11 deletions.
5 changes: 4 additions & 1 deletion lhotse/bin/modes/recipes/icmcasr.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
help="How many threads to use (can give good speed-ups with slow disks).",
)
@click.option(
"--mic", type=click.Choice(["ihm", "sdm"]), default="ihm", help="Microphone type."
"--mic",
type=click.Choice(["ihm", "sdm", "mdm"]),
default="ihm",
help="Microphone type.",
)
def icmcasr(
corpus_dir: Pathlike,
Expand Down
61 changes: 51 additions & 10 deletions lhotse/recipes/icmcasr.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,19 @@

from tqdm.auto import tqdm

from lhotse.audio import Recording, RecordingSet
from lhotse.audio import AudioSource, Recording, RecordingSet
from lhotse.audio.backend import info
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.recipes.utils import manifests_exist, normalize_text_alimeeting
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available

ICMCASR = ("train", "dev") # TODO: Support all subsets when released
POSITION = ("DA01", "DA02", "DA03", "DA04")
SDM_POSITION = ("DX01C01", "DX02C01", "DX03C01", "DX04C01", "DX05C01", "DX06C01")
# Exclude "DX05C01" and "DX06C01", which are the 2-channel reference
# signals used for acoustic echo cancellation (AEC).
# See https://github.com/MrSupW/ICMC-ASR_Baseline/tree/main
SDM_POSITION = ("DX01C01", "DX02C01", "DX03C01", "DX04C01")


def _parse_utterance(
Expand Down Expand Up @@ -60,18 +64,55 @@ def _parse_utterance(
+ f"-{position}"
for sdm_position in SDM_POSITION
]
elif mic == "mdm":
audio_paths = ["fake_audio_path_for_mdm"]
recording_ids = [
str(section_path / "DXmixC01")
.replace(str(corpus_dir) + "/", "")
.replace("/", "-")
+ f"-{position}"
]
else:
raise ValueError(f"Unsupported mic type: {mic}")

for audio_path, recording_id in zip(audio_paths, recording_ids):
if mic == "mdm":
channel_paths = [
(section_path / (position + ".wav")).resolve()
for position in SDM_POSITION
]
audio_info = info(
channel_paths[0],
force_opus_sampling_rate=None,
force_read_audio=False,
)
recordings.append(
Recording(
id=recording_id,
sources=[
AudioSource(
type="file",
channels=[idx],
source=str(audio_path),
)
for idx, audio_path in enumerate(channel_paths)
],
sampling_rate=16000,
num_samples=audio_info.frames,
duration=audio_info.duration,
)
)
# check if audio_path exists, if not, then skip
if not audio_path.is_file():
# give some warning
logging.warning(f"Audio file {audio_path} does not exist - skipping.")
continue
recordings.append(
Recording.from_file(path=audio_path, recording_id=recording_id)
)
else:
if not audio_path.is_file():
# give some warning
logging.warning(
f"Audio file {audio_path} does not exist - skipping."
)
continue
recordings.append(
Recording.from_file(path=audio_path, recording_id=recording_id)
)

tg = textgrid.TextGrid.fromFile(str(text_path))
assert len(tg.tiers) == 1, f"Expected 1 tier, found {len(tg.tiers)} tiers."
Expand All @@ -87,7 +128,7 @@ def _parse_utterance(
recording_id=recording_id,
start=start,
duration=round(end - start, 4),
channel=0,
channel=0 if mic in ["sdm", "ihm"] else list(range(4)),
language="Chinese",
speaker=speaker,
text=normalize_text_alimeeting(text),
Expand Down

0 comments on commit 89ca0e6

Please sign in to comment.