In [11]:
import os

os.environ["KAGGLE_CONFIG_DIR"] = ""

In [13]:
# !kaggle datasets download -d mfekadu/english-multispeaker-corpus-for-voice-cloning
# !unzip english-multispeaker-corpus-for-voice-cloning.zip -d ../../data/

In [1]:
import warnings
import json
import os
from glob import glob
from typing import Callable
from collections import defaultdict

warnings.filterwarnings("ignore")

In [2]:
from models.vad import SpeechbrainVAD, EnergyVAD, SileroVAD, ZffVAD
from utils.wav_utils import resample_file, get_duration
from utils.metrics import DetectionMetric

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [3]:
class Config:
    data_path: str = "../data/VCTK-Corpus/VCTK-Corpus/wav48/*/*.wav"
    num_samples: int = 300
    target_methods: list[str] = ["silero", "speechbrain", "silero_merged"]
    pred_methods: list[str] = ["zff", "energy", "full_speech"]
    boundaries_save_path: str = "./boundaries_v3.json"

In [4]:
audio_files = glob(Config.data_path, recursive=True)
audio_files = list(filter(lambda f: "16k" not in f, audio_files))
dataset = audio_files[: Config.num_samples]

In [7]:
len(audio_files)

36725

In [8]:
if os.path.exists(Config.boundaries_save_path):
    with open(Config.boundaries_save_path, "r") as file:
        BOUNDARIES_DATA = json.load(file)
else:
    BOUNDARIES_DATA = {}

In [10]:
speechbrain_vad = SpeechbrainVAD()
silero_vad = SileroVAD()
energy_vad = EnergyVAD()
zero_ff_vad = ZffVAD()

In [11]:
for audio_file in dataset:
    # print(f"Processing {audio_file}...")
    resampled_path = f"{audio_file[:-4]}_16k.wav"
    if not os.path.exists(resampled_path):
        print(f"Resampling {audio_file}")
        resample_file(
            audio_path=audio_file,
            save_path=resampled_path,
            target_sr=16000,
            to_mono=True,
        )

    if audio_file not in BOUNDARIES_DATA:
        duration = get_duration(audio_file)
        BOUNDARIES_DATA[audio_file] = {
            "silero": silero_vad.get_boundaries(audio_file),
            "silero_merged": silero_vad.get_boundaries(audio_file, merge_th=300),
            "energy": energy_vad.get_boundaries(audio_file, close_th=500),
            "speechbrain": speechbrain_vad.get_boundaries(
                resampled_path, apply_energy_vad=False
            ),
            "zff": zero_ff_vad.get_boundaries(audio_file, close_th=500),
            "full_speech": [{"start": 0.0, "end": get_duration(audio_file)}],
        }

with open(Config.boundaries_save_path, "w") as f:
    json.dump(BOUNDARIES_DATA, f, indent=4)

In [14]:
metrics: list[DetectionMetric] = [
    DetectionMetric.create(metric_name="error_rate"),
    DetectionMetric.create(metric_name="precision"),
    DetectionMetric.create(metric_name="recall"),
    DetectionMetric.create(metric_name="f1"),
]

get_pred_methods: Callable[[dict[str, list[dict]]], list[str]] = lambda cache: [
    k for k in cache.keys() if k not in Config.target_methods
]
first_char_upper: Callable[[str], dict[str]] = lambda text: text[0].upper() + text[1:]

In [15]:
metrics_buffer = {
    "silero": {pred_method: defaultdict(list) for pred_method in Config.pred_methods},
    "silero_merged": {
        pred_method: defaultdict(list) for pred_method in Config.pred_methods
    },
    "speechbrain": {
        pred_method: defaultdict(list) for pred_method in Config.pred_methods
    },
}


def count_metrics(
    audio_id: str, method: str, cache: dict[str, list[dict]], target: str
):
    for metric in metrics:
        metrics_buffer[target][method][metric.name].append(
            metric(
                targets=cache[target],
                predictions=cache[method],
            )
        )


for key, boundaries in BOUNDARIES_DATA.items():
    for target_method in Config.target_methods:
        for pred_method in get_pred_methods(boundaries):
            count_metrics(
                audio_id=key,
                method=pred_method,
                cache=boundaries,
                target=target_method,
            )

In [16]:
for eval_model, metrics_cache in metrics_buffer.items():
    for pred_method, data in metrics_cache.items():
        print(
            f"Evaluating {first_char_upper(pred_method)} with {first_char_upper(eval_model)} as target:"
        )
        for name, values in data.items():
            print(f"Avg detection {name} - {sum(values) / len(values)}")
        print("=" * 50)

Evaluating Zff with Silero as target:
Avg detection error_rate - 1.01489310096228
Avg detection precision - 0.5117091431858285
Avg detection recall - 0.9972186924822481
Avg detection f1 - 0.6715370925197742
Evaluating Energy with Silero as target:
Avg detection error_rate - 0.4501670459100451
Avg detection precision - 0.9982126380105378
Avg detection recall - 0.5509874124086719
Avg detection f1 - 0.6913236909503266
Evaluating Full_speech with Silero as target:
Avg detection error_rate - 1.0247548450690047
Avg detection precision - 0.5091629903888466
Avg detection recall - 0.999999971935969
Avg detection f1 - 0.670298617415011
Evaluating Zff with Silero_merged as target:
Avg detection error_rate - 1.0069586242569626
Avg detection precision - 0.5143488038808729
Avg detection recall - 0.9972186924822481
Avg detection f1 - 0.6736337725465645
Evaluating Energy with Silero_merged as target:
Avg detection error_rate - 0.4507566130201017
Avg detection precision - 0.9994801083927601
Avg detecti