In [11]:
import os

os.environ["KAGGLE_CONFIG_DIR"] = ""

In [13]:
# !kaggle datasets download -d mfekadu/english-multispeaker-corpus-for-voice-cloning
# !unzip english-multispeaker-corpus-for-voice-cloning.zip -d ../../data/

In [1]:
import warnings
import json
from glob import glob
from typing import Callable
from collections import defaultdict

warnings.filterwarnings("ignore")

In [11]:
from models.vad import SpeechbrainVAD, EnergyVAD, SileroVAD, ZffVAD
from utils.wav_utils import resample_file
from utils.metrics import DetectionMetric

In [19]:
class Config:
    data_path: str = "../data/VCTK-Corpus/VCTK-Corpus/wav48/*/*.wav"
    num_samples: int = 200
    target_methods: list[str] = ["silero", "speechbrain"]
    pred_methods: list[str] = ["zff", "energy"]
    boundaries_save_path: str = "./boundaries.json"

In [4]:
audio_files = glob(Config.data_path, recursive=True)
dataset = audio_files[: Config.num_samples]

In [5]:
len(audio_files)

36970

In [6]:
if os.path.exists(Config.boundaries_save_path):
    with open("data.json", "r") as file:
        BOUNDARIES_DATA = json.load(file)
else:
    BOUNDARIES_DATA = {}

In [8]:
speechbrain_vad = SpeechbrainVAD()
silero_vad = SileroVAD()
energy_vad = EnergyVAD()
zero_ff_vad = ZffVAD()

In [12]:
for audio_file in dataset:
    # print(f"Processing {audio_file}...")
    resampled_path = f"{audio_file[:-4]}_16k.wav"
    if not os.path.exists(resampled_path):
        resample_file(
            audio_path=audio_file,
            save_path=resampled_path,
            target_sr=16000,
            to_mono=True,
        )

    if audio_file not in BOUNDARIES_DATA:
        BOUNDARIES_DATA[audio_file] = {
            "silero": silero_vad.get_boundaries(audio_file),
            "energy": energy_vad.get_boundaries(audio_file),
            "speechbrain": speechbrain_vad.get_boundaries(resampled_path),
            "zff": zero_ff_vad.get_boundaries(audio_file),
        }

with open(Config.boundaries_save_path, "w") as f:
    json.dump(BOUNDARIES_DATA, f, indent=4)

In [14]:
metrics: list[DetectionMetric] = [
    DetectionMetric.create(metric_name="error_rate"),
    DetectionMetric.create(metric_name="precision"),
    DetectionMetric.create(metric_name="recall"),
    DetectionMetric.create(metric_name="f1"),
]

get_pred_methods: Callable[[dict[str, list[dict]]], list[str]] = lambda cache: [
    k for k in cache.keys() if k not in Config.target_methods
]
first_char_upper: Callable[[str], dict[str]] = lambda text: text[0].upper() + text[1:]

In [20]:
metrics_buffer = {
    "silero": {pred_method: defaultdict(list) for pred_method in Config.pred_methods},
    "speechbrain": {
        pred_method: defaultdict(list) for pred_method in Config.pred_methods
    },
}


def count_metrics(method: str, cache: dict[str, list[dict]], target: str):
    for metric in metrics:
        metrics_buffer[target][method][metric.name].append(
            metric(
                targets=cache[target],
                predictions=cache[method],
            )
        )


for boundaries in BOUNDARIES_DATA.values():
    for target_method in Config.target_methods:
        for pred_method in get_pred_methods(boundaries):
            count_metrics(
                method=pred_method,
                cache=boundaries,
                target=target_method,
            )

In [22]:
for eval_model, metrics_cache in metrics_buffer.items():
    for pred_method, data in metrics_cache.items():
        print(
            f"Evaluating {first_char_upper(pred_method)} with {first_char_upper(eval_model)} as target:"
        )
        for name, values in data.items():
            print(f"Avg detection {name} - {sum(values) / len(values)}")
        print("=" * 50)

Evaluating zff with Silero as target:
Avg detection error_rate - 0.9454136760644996
Avg detection precision - 0.5303511087860955
Avg detection recall - 0.662877746822391
Avg detection f1 - 0.5840491927439041
Evaluating energy with Silero as target:
Avg detection error_rate - 0.9680405654825069
Avg detection precision - 0.3420743006175927
Avg detection recall - 0.4265007308420403
Avg detection f1 - 0.37628666586045745
Evaluating zff with Speechbrain as target:
Avg detection error_rate - 1.3074775284703604
Avg detection precision - 0.45758006784163874
Avg detection recall - 0.9858362540409786
Avg detection f1 - 0.6166886466969371
Evaluating energy with Speechbrain as target:
Avg detection error_rate - 1.179970975687464
Avg detection precision - 0.2998422130504583
Avg detection recall - 0.64
Avg detection f1 - 0.4034032769639063
