From 00abc09b6cb38a52a5a4b4eb6d4c6bb533f6484f Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Sat, 3 Feb 2024 21:09:43 +0800 Subject: [PATCH] Add VAD to Supervisions in LibriLight Recipe (#1280) * Add vad info to supervisions * Use add_duration * Update librilight.py --------- Co-authored-by: yifanyeung --- lhotse/recipes/librilight.py | 44 ++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/lhotse/recipes/librilight.py b/lhotse/recipes/librilight.py index aefc999ec..1b5254b85 100644 --- a/lhotse/recipes/librilight.py +++ b/lhotse/recipes/librilight.py @@ -13,6 +13,7 @@ This data is very huge - please download manually at LIBRILIGHT_URL. """ +import json import logging import os from collections import defaultdict @@ -26,7 +27,7 @@ from lhotse.qa import fix_manifests, validate_recordings_and_supervisions from lhotse.recipes.utils import manifests_exist from lhotse.supervision import SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike +from lhotse.utils import Pathlike, add_durations LIBRILIGHT = ("small", "medium", "large") @@ -42,28 +43,43 @@ def _parse_utterance( audio_path: Pathlike, ) -> Optional[Tuple[Recording, SupervisionSegment]]: file_name = str(audio_path).replace(".flac", "").replace(str(corpus_dir) + "/", "") - speaker = str(audio_path).split("/")[-3] audio_path = audio_path.resolve() if not audio_path.is_file(): logging.warning(f"No such file: {audio_path}") return None + audio_info_path = str(audio_path).replace("flac", "json") + with open(audio_info_path) as f: + audio_infos = json.load(f) + speaker = audio_infos["speaker"] + vad_infos = audio_infos["voice_activity"] + recording = Recording.from_file( path=audio_path, recording_id=file_name, ) - segment = SupervisionSegment( - id=file_name, - recording_id=file_name, - start=0.0, - duration=recording.duration, - channel=0, - language="English", - speaker=speaker, - ) - return recording, segment + segments = [] + segment_seq = 0 + sampling_rate = 16000 + for vad_info in vad_infos: + segments.append( + SupervisionSegment( + id=file_name + "_" + str(segment_seq), + recording_id=file_name, + start=vad_info[0], + duration=add_durations( + vad_info[1], -vad_info[0], sampling_rate=sampling_rate + ), + channel=0, + language="English", + speaker=speaker, + ) + ) + segment_seq += 1 + + return recording, segments def _prepare_subset( @@ -92,9 +108,9 @@ def _prepare_subset( result = future.result() if result is None: continue - recording, segment = result + recording, segments = result recordings.append(recording) - supervisions.append(segment) + supervisions.extend(segments) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions)