Merge pull request #142 from linto-ai/bugfix/use_silero_v3.1

Add other VAD methods: silero v3.1 (or some other versions) and auditok
linto-ai · Nov 30, 2023 · fadbfb7 · fadbfb7
2 parents 7233803 + 689e07e
commit fadbfb7
Show file tree

Hide file tree

Showing 13 changed files with 258 additions and 44 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -18,7 +18,10 @@ COPY setup.py /usr/src/app/setup.py
 COPY whisper_timestamped /usr/src/app/whisper_timestamped
 
 # Install
-RUN cd /usr/src/app/ && pip3 install ".[dev]" && pip3 install ".[vad]"
+RUN cd /usr/src/app/ && pip3 install ".[dev]"
+RUN cd /usr/src/app/ && pip3 install ".[vad_silero]"
+RUN cd /usr/src/app/ && pip3 install ".[vad_auditok]"
+RUN cd /usr/src/app/ && pip3 install ".[test]"
 
 # Cleanup
 RUN rm -R /usr/src/app/requirements.txt /usr/src/app/setup.py /usr/src/app/whisper_timestamped

diff --git a/README.md b/README.md
@@ -16,6 +16,9 @@ Multilingual Automatic Speech Recognition with word-level timestamps and confide
    * [Plotting word alignment](#plotting-word-alignment)
    * [Example output](#example-output)
    * [Options that may improve results](#options-that-may-improve-results)
+      * [Accurate Whisper transcription](#accurate-whisper-transcription)
+      * [Running Voice Activity Detection (VAD) before sending to Whisper](#running-voice-activity-detection-vad-before-sending-to-whisper)
+      * [Detecting disfluencies](#detecting-disfluencies)
 * [Acknowlegment](#acknowlegment)
 * [Citations](#citations)
 
@@ -32,7 +35,9 @@ The approach is based on Dynamic Time Warping (DTW) applied to cross-attention w
 
 `whisper-timestamped` is an extension of the [`openai-whisper`](https://pypi.org/project/whisper-openai/) Python package and is meant to be compatible with any version of `openai-whisper`.
 It provides more efficient/accurate word timestamps, along with those additional features:
-* Voice Activity Detection (VAD) can be run before applying Whisper model, to avoid hallucinations due to errors in the training data (for instance, predicting "Thanks you for watching!" on pure silence).
+* Voice Activity Detection (VAD) can be run before applying Whisper model,
+  to avoid hallucinations due to errors in the training data (for instance, predicting "Thanks you for watching!" on pure silence).
+  Several VAD methods are available: silero (default), silero:v3.1, auditok
 * When the language is not specified, the language probabilities are provided among the outputs.
 
 ### Notes on other approaches
@@ -55,7 +60,7 @@ Requirements:
 
 You can install `whisper-timestamped` either by using pip:
 ```bash
-pip3 install git+https://github.com/linto-ai/whisper-timestamped
+pip3 install whisper-timestamped
 ```
 
 or by cloning this repository and running installation:
@@ -327,6 +332,27 @@ results = whisper_timestamped.transcribe(model, audio, vad=True, ...)
 whisper_timestamped --vad True ...
 ```
 
+By default, the VAD method used is [silero](https://github.com/snakers4/silero-vad).
+But other methods are available, such as earlier versions of silero, or [auditok](https://github.com/amsehili/auditok).
+Those methods were introduced because the latest versions of silero VAD can produce a lot of false alarms on some audio files (speech detected on silence).
+* In Python:
+```python
+results = whisper_timestamped.transcribe(model, audio, vad="silero:v3.1", ...)
+results = whisper_timestamped.transcribe(model, audio, vad="auditok", ...)
+```
+* On the command line:
+```bash
+whisper_timestamped --vad silero:v3.1 ...
+whisper_timestamped --vad auditok ...
+```
+
+In order to view the VAD results, you can use the `--plot` option of the `whisper_timestamped` CLI,
+or the `plot_word_alignment` option of the `whisper_timestamped.transcribe()` Python function.
+It will show the VAD results on the input audio signal as follows (x-axis is time in seconds):
+| **vad="silero:v4.0"** | **vad="silero:v3.1"** | **vad="auditok"** |
+| :---: | :---: | :---: |
+| ![Example VAD](figs/VAD_silero_v4.0.png) | ![Example VAD](figs/VAD_silero_v3.1.png)  | ![Example VAD](figs/VAD_auditok.png) |
+
 #### Detecting disfluencies
 
 Whisper models tend to remove speech disfluencies (filler words, hesitations, repetitions, etc.). Without precautions, the disfluencies that are not transcribed will affect the timestamp of the following word: the timestamp of the beginning of the word will actually be the timestamp of the beginning of the disfluencies. `whisper-timestamped` can have some heuristics to avoid this.

diff --git a/figs/VAD_auditok.png b/figs/VAD_auditok.png
diff --git a/figs/VAD_silero_v3.1.png b/figs/VAD_silero_v3.1.png
diff --git a/figs/VAD_silero_v4.0.png b/figs/VAD_silero_v4.0.png
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
     name="whisper-timestamped",
     py_modules=["whisper_timestamped"],
     version=version,
-    description="Add to OpenAI Whisper the capability to give word timestamps",
+    description="OpenAI Whisper ASR with accurate word timestamps, language detection confidence, several options of VAD, and more.",
     python_requires=">=3.7",
     author="Jeronymous",
     url="https://github.com/linto-ai/whisper-timestamped",
@@ -37,7 +37,9 @@
     },
     include_package_data=True,
     extras_require={
-        'dev': ['matplotlib', 'jsonschema', 'transformers'],
-        'vad': ['onnxruntime', 'torchaudio'],
+        'dev': ['matplotlib', 'transformers'],
+        'vad_silero': ['onnxruntime', 'torchaudio'],
+        'vad_auditok': ['auditok'],
+        'test': ['jsonschema'],
     },
 )
diff --git a/tests/expected/verbose/vad_auditok_words.wav.stdout b/tests/expected/verbose/vad_auditok_words.wav.stdout
@@ -0,0 +1,8 @@
+[00:00.750 --> 00:01.470] settlement,
+[00:02.950 --> 00:03.670] Kentucky,
+[00:05.770 --> 00:06.290] causing
+[00:07.900 --> 00:08.950] damage,
+[00:10.900 --> 00:11.700] President,
+[00:14.200 --> 00:14.780] expansion,
+[00:17.120 --> 00:17.760] hospital,
+[00:20.730 --> 00:21.330] devastated.
diff --git a/tests/expected/verbose/vad_silero3.0_words.wav.stdout b/tests/expected/verbose/vad_silero3.0_words.wav.stdout
@@ -0,0 +1,8 @@
+[00:00.760 --> 00:01.480] settlement,
+[00:02.890 --> 00:03.670] Kentucky,
+[00:05.710 --> 00:06.270] causing
+[00:07.850 --> 00:08.930] damage,
+[00:10.940 --> 00:11.700] president,
+[00:14.200 --> 00:14.780] expansion,
+[00:17.120 --> 00:17.780] hospital,
+[00:20.140 --> 00:21.380] devastated.
diff --git a/tests/expected/verbose/vad_silero3.1_words.wav.stdout b/tests/expected/verbose/vad_silero3.1_words.wav.stdout
@@ -0,0 +1,8 @@
+[00:00.760 --> 00:01.480] settlement,
+[00:02.920 --> 00:03.660] Kentucky,
+[00:05.760 --> 00:06.260] causing
+[00:07.850 --> 00:08.940] damage,
+[00:10.840 --> 00:11.700] president,
+[00:14.190 --> 00:14.770] expansion,
+[00:17.130 --> 00:17.750] hospital,
+[00:21.200 --> 00:21.380] devastated.
diff --git a/tests/expected/verbose/vad_words.wav.stdout b/tests/expected/verbose/vad_words.wav.stdout
@@ -1,8 +1,8 @@
-[00:00.140 --> 00:01.320] Settlement.
-[00:03.020 --> 00:03.600] Kentucky.
-[00:05.170 --> 00:06.130] Causing.
-[00:08.040 --> 00:08.940] Damage.
-[00:10.890 --> 00:11.510] President.
-[00:13.730 --> 00:14.790] Expansion.
-[00:16.980 --> 00:17.600] Hospital.
-[00:20.410 --> 00:21.430] Devastated.
+[00:00.760 --> 00:01.460] settlement,
+[00:02.900 --> 00:03.680] Kentucky,
+[00:05.710 --> 00:06.270] causing
+[00:07.890 --> 00:08.940] damage,
+[00:10.930 --> 00:11.690] president,
+[00:14.070 --> 00:14.770] expansion,
+[00:17.140 --> 00:17.780] hospital,
+[00:20.730 --> 00:21.370] devastated.
diff --git a/tests/run_tests.py b/tests/run_tests.py
@@ -38,7 +38,10 @@
             "-c", "--catch",
             "-b", "--buffer",
             "-k",
-        ] and (i==0 or args[i-1] not in ["-k"]) and (arg.startswith("-") or (i>0 and args[i-1].startswith("-"))):
+        ] \
+        and not arg.startswith("Test") \
+        and (i==0 or args[i-1] not in ["-k"]) \
+        and (arg.startswith("-") or (i>0 and args[i-1].startswith("-"))):
             test_transcribe.CMD_OPTIONS.append(arg)
             sys.argv.remove(arg)
 

diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
@@ -489,15 +489,40 @@ def test_monolingual_small(self):
 
 class TestTranscribeWithVad(TestHelperCli):
 
-    def test_vad(self):
+    def test_vad_default(self):
         self._test_cli_(
-                ["--model", "large", "--accurate", "--language", "en", "--vad", "True", "--verbose", "True"],
+                ["--model", "tiny", "--accurate", "--language", "en", "--vad", "True", "--verbose", "True"],
                 "verbose",
                 files=["words.wav"],
                 prefix="vad",
                 extensions=None,
             )
 
+    def test_vad_custom_silero(self):
+        self._test_cli_(
+                ["--model", "tiny", "--accurate", "--language", "en", "--vad", "silero:v3.1", "--verbose", "True"],
+                "verbose",
+                files=["words.wav"],
+                prefix="vad_silero3.1",
+                extensions=None,
+            )
+        self._test_cli_(
+                ["--model", "tiny", "--accurate", "--language", "en", "--vad", "silero:v3.0", "--verbose", "True"],
+                "verbose",
+                files=["words.wav"],
+                prefix="vad_silero3.0",
+                extensions=None,
+            )
+
+    def test_vad_custom_auditok(self):
+        self._test_cli_(
+                ["--model", "tiny", "--language", "en", "--vad", "auditok", "--verbose", "True"],
+                "verbose",
+                files=["words.wav"],
+                prefix="vad_auditok",
+                extensions=None,
+            )
+
 
 class TestTranscribeUnspacedLanguage(TestHelperCli):