# Testing WebRTC's Voice Activity Detector (VAD)
by Jiho (2018.8.3)

https://github.com/wiseman/py-webrtcvad — a Python interface to the WebRTC Voice Activity Detector (VAD)

From 

https://www.quora.com/What-are-current-state-of-the-art-algorithms-for-voice-activity-detection

I found out that WebRTC's VAD was developed by Google and open-sourced for public use. I also saw some people using it in the Kaggle TensorFlow Speech competition, so I decided to test some samples with this library.

I am also testing noise reduction (https://github.com/dodiku/noise_reduction), since noise is not easily removed by VAD alone. You can see some test results at https://dodiku.github.io/noise_reduction/

In [1]:
import IPython.display as ipd
import numpy as np
import wave
import vad
import librosa

In [2]:
from noise_reduction.noise import reduce_noise_power, reduce_noise_centroid_s
from noise_reduction.noise import reduce_noise_centroid_mb, reduce_noise_mfcc_down, reduce_noise_mfcc_up, reduce_noise_median
from noise_reduction.noise import trim_silence, output_file

In [3]:
vad_obj = vad.get_vad_object(2)

loaded VAD object. setting mode to 2


## Oriana1 - easy sample

In [4]:
ipd.Audio("example/oriana1_resample.wav")

In [5]:
# Run VAD on the sample with 100 ms of padding; the return value is not captured here.
# NOTE(review): the next line plays "oriana1_resample_denoise.wav", which is presumably
# written by vad.denoise when it is given a filename — confirm in vad.py.
vad.denoise(vad_obj, filename="example/oriana1_resample.wav", padding_duration_ms=100)
ipd.Audio("example/oriana1_resample_denoise.wav")

## Mom1 - long noise/silence in the front

In [6]:
ipd.Audio("example/mom1_resample.wav")

#### just do VAD

In [7]:
# Run VAD with 10 ms frames and 100 ms of padding; the result is unpacked as
# (denoised audio, sample rate).
denoised, sr = vad.denoise(vad_obj, filename="example/mom1_resample.wav", padding_duration_ms=100, frame_length_ms=10)
# Convert with vad.byte_to_float before playback
# (presumably raw PCM bytes -> float samples; confirm in vad.py).
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

VAD alone doesn't work very well on this sample.

### try noise reduction first

In [8]:
# sr=None keeps the file's native sampling rate instead of resampling to librosa's default.
y, sr = librosa.load("example/mom1_resample.wav", sr=None)
# Play the unprocessed signal for comparison with the results below.
ipd.Audio(data=y,rate=sr)

In [9]:
def apply_noise_reduction_then_vad(audio, sr, method):
    """Reduce noise with the chosen method, trim silence, then apply VAD.

    Args:
        audio: Audio samples handed unchanged to the selected
            ``reduce_noise_*`` helper (in this notebook, the array
            returned by ``librosa.load``).
        sr: Sampling rate of ``audio``.
        method: Name of the noise-reduction method: "POWER", "CENTROID_S",
            "CENTROID_MB", "MFCC_UP", "MFCC_DOWN" or "MEDIAN".

    Returns:
        An ``IPython.display.Audio`` object built from the VAD output.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    if method == "POWER":
        y_reduced = reduce_noise_power(audio, sr)
    elif method == "CENTROID_S":
        y_reduced = reduce_noise_centroid_s(audio, sr)
    elif method == "CENTROID_MB":
        y_reduced = reduce_noise_centroid_mb(audio, sr)
    elif method == "MFCC_UP":
        y_reduced = reduce_noise_mfcc_up(audio, sr)
    elif method == "MFCC_DOWN":
        y_reduced = reduce_noise_mfcc_down(audio, sr)
    elif method == "MEDIAN":
        y_reduced = reduce_noise_median(audio, sr)
    else:
        # An unknown name used to fall through and crash further down with an
        # UnboundLocalError on ``y_reduced``; fail early with a clear message.
        raise ValueError(
            "unknown noise reduction method: %r (expected one of POWER, "
            "CENTROID_S, CENTROID_MB, MFCC_UP, MFCC_DOWN, MEDIAN)" % (method,)
        )

    # trim_silence returns a pair; the second element is not needed here.
    y_reduced, _ = trim_silence(y_reduced)

    # Keep the rate returned by vad.denoise in its own name rather than
    # rebinding the ``sr`` parameter.
    denoised, out_sr = vad.denoise(vad_obj, audio=vad.float_to_byte(y_reduced), sample_rate=sr,
                                   padding_duration_ms=100, frame_length_ms=10)
    return ipd.Audio(data=vad.byte_to_float(denoised), rate=out_sr)

#### reduced_power (V)

In [10]:
apply_noise_reduction_then_vad(y, sr, "POWER")

much better

#### centroid s

In [11]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S")

#### reduced_centroid_mb

In [12]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB")

#### mfcc up

In [13]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP")

not effective

#### mfcc down

In [14]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN")

removes too much

#### median

In [15]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN")

okay

## MOM2 - silence in the middle

In [16]:
ipd.Audio("example/mom2_resample.wav")

In [17]:
denoised, sr = vad.denoise(vad_obj, filename="example/mom2_resample.wav", padding_duration_ms=100, frame_length_ms=10)
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

In [18]:
y, sr = librosa.load("example/mom2_resample.wav", sr=None)

In [19]:
apply_noise_reduction_then_vad(y, sr, "POWER")

In [20]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S")

In [21]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB")

In [22]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP") #V

In [23]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN")

In [24]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN")

## Child1 - some noise in the front

In [25]:
ipd.Audio("example/child1_resample.wav")

In [26]:
denoised, sr = vad.denoise(vad_obj, filename="example/child1_resample.wav", padding_duration_ms=100, frame_length_ms=10)
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

this is good

In [27]:
y, sr = librosa.load("example/child1_resample.wav", sr=None)

In [28]:
apply_noise_reduction_then_vad(y, sr, "POWER") # V

In [29]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S")

In [30]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB")

In [31]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP")

In [32]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN")

In [33]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN") #V

## Child 2  (no noise) - mostly good

In [34]:
ipd.Audio("example/child2_resample.wav")

In [35]:
denoised, sr = vad.denoise(vad_obj, filename="example/child2_resample.wav", padding_duration_ms=100, frame_length_ms=10)
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

In [36]:
y, sr = librosa.load("example/child2_resample.wav", sr=None)

In [37]:
apply_noise_reduction_then_vad(y, sr, "POWER") # V

In [38]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S") #V

In [39]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB") #V

In [40]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP") #V

In [41]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN") #V

In [42]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN") #V

## Child 3 - medium-loud, short noise at the front

In [43]:
ipd.Audio("example/child3_resample.wav")

In [44]:
denoised, sr = vad.denoise(vad_obj, filename="example/child3_resample.wav", padding_duration_ms=100, frame_length_ms=10)
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

In [45]:
y, sr = librosa.load("example/child3_resample.wav", sr=None)

In [46]:
apply_noise_reduction_then_vad(y, sr, "POWER") #V

In [47]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S") #V

In [48]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB")

In [49]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP")

In [50]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN")

In [51]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN")

## Child 4 - almost no silence or noise, but let's check

In [52]:
ipd.Audio("example/child4_resample.wav")

In [53]:
denoised, sr = vad.denoise(vad_obj, filename="example/child4_resample.wav", padding_duration_ms=100, frame_length_ms=10)
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

In [54]:
y, sr = librosa.load("example/child4_resample.wav", sr=None)

In [55]:
apply_noise_reduction_then_vad(y, sr, "POWER") #V

In [56]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S")

In [57]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB")

In [58]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP") #V

In [59]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN") #V

In [60]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN") #V

## Child 5 - large cracking noise in the front

In [61]:
ipd.Audio("example/child5_resample.wav")

In [62]:
denoised, sr = vad.denoise(vad_obj, filename="example/child5_resample.wav", padding_duration_ms=100, frame_length_ms=10)
ipd.Audio(data=vad.byte_to_float(denoised), rate=sr)

In [63]:
y, sr = librosa.load("example/child5_resample.wav", sr=None)

In [64]:
apply_noise_reduction_then_vad(y, sr, "POWER")

In [65]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_S")

In [66]:
apply_noise_reduction_then_vad(y, sr, "CENTROID_MB")

In [67]:
apply_noise_reduction_then_vad(y, sr, "MFCC_UP")

In [68]:
apply_noise_reduction_then_vad(y, sr, "MFCC_DOWN")

In [69]:
apply_noise_reduction_then_vad(y, sr, "MEDIAN")