In [11]:
import os

os.environ["KAGGLE_CONFIG_DIR"] = ""

In [13]:
# !kaggle datasets download -d mfekadu/english-multispeaker-corpus-for-voice-cloning
# !unzip english-multispeaker-corpus-for-voice-cloning.zip -d ../../data/

In [4]:
import warnings
import json
import os
from glob import glob
from typing import Callable
from collections import defaultdict

warnings.filterwarnings("ignore")

In [5]:
from models.vad import SpeechbrainVAD, EnergyVAD, SileroVAD, ZffVAD, KMeansVAD
from utils.wav_utils import resample_file, get_duration
from utils.metrics import DetectionMetric

In [6]:
class Config:
    # data_path: str = "../data/VCTK-Corpus/VCTK-Corpus/wav48/*/*.wav"
    # num_samples: int = 300
    data_path: str = "../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/*.wav"
    target_methods: list[str] = ["silero", "speechbrain", "silero_merged"]
    pred_methods: list[str] = ["zff", "energy", "full_speech", "k_means"]
    boundaries_save_path: str = "./boundaries_v5.json"
    kmeans_version: str = "v2"

In [7]:
audio_files = glob(Config.data_path, recursive=True)
audio_files = list(filter(lambda f: "16k" not in f, audio_files))
dataset = audio_files
# dataset = audio_files[: Config.num_samples]

In [8]:
len(audio_files)

231

In [9]:
if os.path.exists(Config.boundaries_save_path):
    with open(Config.boundaries_save_path, "r") as file:
        BOUNDARIES_DATA = json.load(file)
else:
    BOUNDARIES_DATA = {}

In [11]:
speechbrain_vad = SpeechbrainVAD()
silero_vad = SileroVAD()
energy_vad = EnergyVAD()
zero_ff_vad = ZffVAD()

In [12]:
k_means_vad = KMeansVAD(
    features_list=[
        "mfccs",
        "mfcc_delta",
        "mfcc_delta2",
        "spectral_centroid",
        "zcr",
        "spectral_flux",
        "ste",
        # "pitch"
    ]
)
k_means_vad.fit(audio_files=dataset, csv_path=f"kmeans_vad_{Config.kmeans_version}.csv")
k_means_vad.save(version=Config.kmeans_version)

Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_159.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_165.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_171.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_039.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_011.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_005.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_212.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_004.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_010.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_038.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_158.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_366.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_358.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_172.wav
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_166

In [13]:
# # can be loaded only if all files from dataset were fitted
# k_means_vad = KMeansVAD.load(
#     kmeans_path="kmeans_vad_v2.pkl",
#     scaler_path="kmeans_scaler_v2.pkl",
#     cfg_path="kmeans_cfg_v2.json"
# )

In [13]:
for audio_file in dataset:
    print(f"Processing {audio_file}...")
    resampled_path = f"{audio_file[:-4]}_16k.wav"
    if not os.path.exists(resampled_path):
        print(f"Resampling {audio_file}")
        resample_file(
            audio_path=audio_file,
            save_path=resampled_path,
            target_sr=16000,
            to_mono=True,
        )

    preds = {
        "energy": energy_vad.get_boundaries(audio_file, close_th=500),
        "k_means": k_means_vad.get_boundaries(audio_file),
    }
    targets = {}
    if not BOUNDARIES_DATA.get(audio_file, {}).get("silero"):
        targets["silero"] = silero_vad.get_boundaries(audio_file)
        targets["silero_merged"] = silero_vad.get_boundaries(audio_file, merge_th=300)

    if not BOUNDARIES_DATA.get(audio_file, {}).get("speechbrain"):
        targets["speechbrain"] = speechbrain_vad.get_boundaries(
            resampled_path, apply_energy_vad=False
        )

    if not BOUNDARIES_DATA.get(audio_file, {}).get("full_speech"):
        targets["full_speech"] = [{"start": 0.0, "end": get_duration(audio_file)}]

    upd_data = BOUNDARIES_DATA.get(audio_file, {})
    upd_data.update(targets)
    upd_data.update(preds)
    BOUNDARIES_DATA[audio_file] = upd_data

with open(Config.boundaries_save_path, "w") as f:
    json.dump(BOUNDARIES_DATA, f, indent=4)

Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_159.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_165.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_171.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_039.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_011.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_005.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_212.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_004.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_010.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_038.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_158.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_366.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_358.wav...
Processing ../data/VCTK-Corpus/VCTK-Corpus/wav48/p225/p225_172.wav...
Processing ../data/V

In [14]:
metrics: list[DetectionMetric] = [
    DetectionMetric.create(metric_name="error_rate"),
    DetectionMetric.create(metric_name="precision"),
    DetectionMetric.create(metric_name="recall"),
    DetectionMetric.create(metric_name="f1"),
]

get_pred_methods: Callable[[dict[str, list[dict]]], list[str]] = lambda cache: [
    k for k in cache.keys() if k not in Config.target_methods
]
first_char_upper: Callable[[str], dict[str]] = lambda text: text[0].upper() + text[1:]

In [15]:
metrics_buffer = {
    "silero": {pred_method: defaultdict(list) for pred_method in Config.pred_methods},
    "silero_merged": {
        pred_method: defaultdict(list) for pred_method in Config.pred_methods
    },
    "speechbrain": {
        pred_method: defaultdict(list) for pred_method in Config.pred_methods
    },
}


def count_metrics(
    audio_id: str, method: str, cache: dict[str, list[dict]], target: str
):
    for metric in metrics:
        metrics_buffer[target][method][metric.name].append(
            metric(
                targets=cache[target],
                predictions=cache[method],
            )
        )


for key, boundaries in BOUNDARIES_DATA.items():
    for target_method in Config.target_methods:
        for pred_method in get_pred_methods(boundaries):
            count_metrics(
                audio_id=key,
                method=pred_method,
                cache=boundaries,
                target=target_method,
            )

In [16]:
for eval_model, metrics_cache in metrics_buffer.items():
    for pred_method, data in metrics_cache.items():
        print(
            f"Evaluating {first_char_upper(pred_method)} with {first_char_upper(eval_model)} as target:"
        )
        for name, values in data.items():
            print(f"Avg detection {name} - {sum(values) / len(values)}")
        print("=" * 50)

Evaluating Zff with Silero as target:
Evaluating Energy with Silero as target:
Avg detection error_rate - 0.4179757915088645
Avg detection precision - 0.9983792734128865
Avg detection recall - 0.5833149716390971
Avg detection f1 - 0.7149232892848006
Evaluating Full_speech with Silero as target:
Avg detection error_rate - 0.9796511204270454
Avg detection precision - 0.5309108836389607
Avg detection recall - 0.9999981191068512
Avg detection f1 - 0.6861506352376565
Evaluating K_means with Silero as target:
Avg detection error_rate - 0.08468887609791242
Avg detection precision - 0.9662739813374178
Avg detection recall - 0.952163618883239
Avg detection f1 - 0.9580005075872263
Evaluating Zff with Silero_merged as target:
Evaluating Energy with Silero_merged as target:
Avg detection error_rate - 0.42160625057925144
Avg detection precision - 0.9996939284725305
Avg detection recall - 0.5785604877461726
Avg detection f1 - 0.7112845121732623
Evaluating Full_speech with Silero_merged as target:
Av