In [11]:
import os

os.environ["KAGGLE_CONFIG_DIR"] = ""

In [13]:
# !kaggle datasets download -d mfekadu/english-multispeaker-corpus-for-voice-cloning
# !unzip english-multispeaker-corpus-for-voice-cloning.zip -d ../../data/

In [2]:
import warnings
import json
import os
from glob import glob
from typing import Callable
from collections import defaultdict

warnings.filterwarnings("ignore")

In [3]:
from models.vad import SpeechbrainVAD, EnergyVAD, SileroVAD, ZffVAD
from utils.wav_utils import resample_file
from utils.metrics import DetectionMetric

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [4]:
class Config:
    data_path: str = "../data/VCTK-Corpus/VCTK-Corpus/wav48/*/*.wav"
    num_samples: int = 300
    target_methods: list[str] = ["silero", "speechbrain", "silero_merged"]
    pred_methods: list[str] = ["zff", "energy"]
    boundaries_save_path: str = "./boundaries_v3.json"

In [5]:
audio_files = glob(Config.data_path, recursive=True)
audio_files = list(filter(lambda f: "16k" not in f, audio_files))
dataset = audio_files[: Config.num_samples]

In [6]:
len(audio_files)

36725

In [7]:
if os.path.exists(Config.boundaries_save_path):
    with open(Config.boundaries_save_path, "r") as file:
        BOUNDARIES_DATA = json.load(file)
else:
    BOUNDARIES_DATA = {}

In [9]:
speechbrain_vad = SpeechbrainVAD()
silero_vad = SileroVAD()
energy_vad = EnergyVAD()
zero_ff_vad = ZffVAD()

In [10]:
for audio_file in dataset:
    # print(f"Processing {audio_file}...")
    resampled_path = f"{audio_file[:-4]}_16k.wav"
    if not os.path.exists(resampled_path):
        print(f"Resampling {audio_file}")
        resample_file(
            audio_path=audio_file,
            save_path=resampled_path,
            target_sr=16000,
            to_mono=True,
        )

    if audio_file not in BOUNDARIES_DATA:
        BOUNDARIES_DATA[audio_file] = {
            "silero": silero_vad.get_boundaries(audio_file),
            "silero_merged": silero_vad.get_boundaries(audio_file, merge_th=150),
            "energy": energy_vad.get_boundaries(audio_file),
            "speechbrain": speechbrain_vad.get_boundaries(resampled_path, apply_energy_vad=False),
            "zff": zero_ff_vad.get_boundaries(audio_file),
        }

with open(Config.boundaries_save_path, "w") as f:
    json.dump(BOUNDARIES_DATA, f, indent=4)

In [11]:
metrics: list[DetectionMetric] = [
    DetectionMetric.create(metric_name="error_rate"),
    DetectionMetric.create(metric_name="precision"),
    DetectionMetric.create(metric_name="recall"),
    DetectionMetric.create(metric_name="f1"),
]

get_pred_methods: Callable[[dict[str, list[dict]]], list[str]] = lambda cache: [
    k for k in cache.keys() if k not in Config.target_methods
]
first_char_upper: Callable[[str], dict[str]] = lambda text: text[0].upper() + text[1:]

In [12]:
metrics_buffer = {
    "silero": {pred_method: defaultdict(list) for pred_method in Config.pred_methods},
    "silero_merged": {pred_method: defaultdict(list) for pred_method in Config.pred_methods},
    "speechbrain": {
        pred_method: defaultdict(list) for pred_method in Config.pred_methods
    },
}


def count_metrics(method: str, cache: dict[str, list[dict]], target: str):
    for metric in metrics:
        metrics_buffer[target][method][metric.name].append(
            metric(
                targets=cache[target],
                predictions=cache[method],
            )
        )


for boundaries in BOUNDARIES_DATA.values():
    for target_method in Config.target_methods:
        for pred_method in get_pred_methods(boundaries):
            count_metrics(
                method=pred_method,
                cache=boundaries,
                target=target_method,
            )

In [13]:
for eval_model, metrics_cache in metrics_buffer.items():
    for pred_method, data in metrics_cache.items():
        print(
            f"Evaluating {first_char_upper(pred_method)} with {first_char_upper(eval_model)} as target:"
        )
        for name, values in data.items():
            print(f"Avg detection {name} - {sum(values) / len(values)}")
        print("=" * 50)

Evaluating Zff with Silero as target:
Avg detection error_rate - 1.0185666552005004
Avg detection precision - 0.5145642466180951
Avg detection recall - 0.9805756836031511
Avg detection f1 - 0.6677925112483006
Evaluating Energy with Silero as target:
Avg detection error_rate - 1.0109586713232093
Avg detection precision - 0.5160130766884796
Avg detection recall - 1.0
Avg detection f1 - 0.6751424127142461
Evaluating Zff with Silero_merged as target:
Avg detection error_rate - 1.017128601575384
Avg detection precision - 0.5150881108152754
Avg detection recall - 0.9806151569941385
Avg detection f1 - 0.6681993008778345
Evaluating Energy with Silero_merged as target:
Avg detection error_rate - 1.0095528293528677
Avg detection precision - 0.5165132103946282
Avg detection recall - 1.0
Avg detection f1 - 0.6755315521880686
Evaluating Zff with Speechbrain as target:
Avg detection error_rate - 0.9116974284188533
Avg detection precision - 0.5521955806851793
Avg detection recall - 0.9818346527261936