From 00abc09b6cb38a52a5a4b4eb6d4c6bb533f6484f Mon Sep 17 00:00:00 2001
From: Yifan Yang <64255737+yfyeung@users.noreply.github.com>
Date: Sat, 3 Feb 2024 21:09:43 +0800
Subject: [PATCH] Add VAD to Supervisions in LibriLight Recipe (#1280)

* Add VAD info to supervisions

* Use add_durations

* Update librilight.py

---------

Co-authored-by: yifanyeung <yifanyeung@yifanyeung.local>
---
 lhotse/recipes/librilight.py | 44 ++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/lhotse/recipes/librilight.py b/lhotse/recipes/librilight.py
index aefc999ec..1b5254b85 100644
--- a/lhotse/recipes/librilight.py
+++ b/lhotse/recipes/librilight.py
@@ -13,6 +13,7 @@
 This data is very huge - please download manually at LIBRILIGHT_URL.
 """
 
+import json
 import logging
 import os
 from collections import defaultdict
@@ -26,7 +27,7 @@
 from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.recipes.utils import manifests_exist
 from lhotse.supervision import SupervisionSegment, SupervisionSet
-from lhotse.utils import Pathlike
+from lhotse.utils import Pathlike, add_durations
 
 LIBRILIGHT = ("small", "medium", "large")
 
@@ -42,28 +43,43 @@ def _parse_utterance(
     audio_path: Pathlike,
 ) -> Optional[Tuple[Recording, SupervisionSegment]]:
     file_name = str(audio_path).replace(".flac", "").replace(str(corpus_dir) + "/", "")
-    speaker = str(audio_path).split("/")[-3]
     audio_path = audio_path.resolve()
 
     if not audio_path.is_file():
         logging.warning(f"No such file: {audio_path}")
         return None
 
+    audio_info_path = str(audio_path).replace("flac", "json")
+    with open(audio_info_path) as f:
+        audio_infos = json.load(f)
+        speaker = audio_infos["speaker"]
+        vad_infos = audio_infos["voice_activity"]
+
     recording = Recording.from_file(
         path=audio_path,
         recording_id=file_name,
     )
-    segment = SupervisionSegment(
-        id=file_name,
-        recording_id=file_name,
-        start=0.0,
-        duration=recording.duration,
-        channel=0,
-        language="English",
-        speaker=speaker,
-    )
 
-    return recording, segment
+    segments = []
+    segment_seq = 0
+    sampling_rate = 16000
+    for vad_info in vad_infos:
+        segments.append(
+            SupervisionSegment(
+                id=file_name + "_" + str(segment_seq),
+                recording_id=file_name,
+                start=vad_info[0],
+                duration=add_durations(
+                    vad_info[1], -vad_info[0], sampling_rate=sampling_rate
+                ),
+                channel=0,
+                language="English",
+                speaker=speaker,
+            )
+        )
+        segment_seq += 1
+
+    return recording, segments
 
 
 def _prepare_subset(
@@ -92,9 +108,9 @@ def _prepare_subset(
             result = future.result()
             if result is None:
                 continue
-            recording, segment = result
+            recording, segments = result
             recordings.append(recording)
-            supervisions.append(segment)
+            supervisions.extend(segments)
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)