In [12]:
# !pip install pyannote.audio

In [9]:
import os
import IPython.display as ipd
import librosa
from glob import glob
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal, linalg
import sklearn
from ipywidgets import interact
import urllib

import torch
from pprint import pprint

from sklearn import preprocessing

from sklearn import cluster
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import numpy as np
import soundfile as sf


import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import time
import scipy
from scipy.signal import wiener
import math

In [None]:
def apply_filters(audio, sr):
    '''
    apply high-pass filter to filter out non-speech noise < 60Hz
    '''
    hp_cutoff = 60
    Wn_hp = hp_cutoff / (sr / 2)
    
    
    b, a = signal.butter(4, Wn_hp, btype='highpass')
    audio = signal.filtfilt(b, a, audio)

    return audio
    
def loudness_normalization(audio):
    
    return librosa.util.normalize(audio)

def apply_wiener_filter(y, sr=16000, win_length=7, hop_length=512):
    '''Noise filter'''
    filtered_audio = wiener(y, mysize=win_length)
    return filtered_audio

def extract_features(audio, sr, chunk_start, chunk_end, context_size=5):
    '''
    Extracting features in current chunk and in context window of n chunks
    '''
    current_chunk = audio[chunk_start:chunk_end]
    
    context_samples = int(context_size * (chunk_end - chunk_start))
    left_start = max(0, chunk_start - context_samples)
    right_end = min(len(audio), chunk_end + context_samples)
    
    left_context = audio[left_start:chunk_start]
    right_context = audio[chunk_end:right_end]
    
    full_context = np.concatenate([left_context, current_chunk, right_context])
    
    n_fft = min(512, len(current_chunk))
    hop_length = n_fft // 4
    
    mfccs = librosa.feature.mfcc(
        y=current_chunk, 
        sr=sr, 
        n_mfcc=13,
        n_fft=n_fft,
        hop_length=hop_length
    )
    
    mfccs_with_context = librosa.feature.mfcc(
        y=full_context, 
        sr=sr, 
        n_mfcc=13,
        n_fft=n_fft,
        hop_length=hop_length
    )
    # delta features need context window, they track how the mfccs change over time 
    mfcc_delta = librosa.feature.delta(mfccs_with_context)
    mfcc_delta2 = librosa.feature.delta(mfccs_with_context, order=2)
    
    if mfccs.shape[1] == 0:
        mfccs = np.zeros((13, 1))
    if mfcc_delta.shape[1] == 0:
        mfcc_delta = np.zeros((13, 1))
    if mfcc_delta2.shape[1] == 0:
        mfcc_delta2 = np.zeros((13, 1))
    
    spec = np.abs(librosa.stft(current_chunk, n_fft=n_fft, hop_length=hop_length))


    if spec.shape[1] == 0:
        spectral_centroid = 0
        spectral_flux = 0
    else:
        spectral_centroid = librosa.feature.spectral_centroid(
            S=spec, sr=sr).mean()
        spectral_flux = np.mean(np.diff(spec, axis=1)**2) if spec.shape[1] > 1 else 0
    
    zcr = librosa.feature.zero_crossing_rate(
        current_chunk, 
        frame_length=n_fft, 
        hop_length=hop_length
    ).mean()
    ste = np.mean(current_chunk**2)
    
    features = np.concatenate([
        np.mean(mfccs, axis=1),         
        np.mean(mfcc_delta, axis=1),   
        np.mean(mfcc_delta2, axis=1),   
        [spectral_centroid, zcr, spectral_flux, ste]  
    ])
    
    return features

def process_audio_file(audio_path, sr=16000, 
                      chunk_duration=0.025, hop_duration=0.01, context_size=5):
        
    audio, sr = librosa.load(audio_path, sr=sr, mono=True)
    
    # filter out < 60Hz no speech for sure
    
    audio = apply_filters(audio, sr)
    audio = loudness_normalization(audio)
    # noise reduce
    audio = apply_wiener_filter(audio)
    
    
    chunk_samples = int(chunk_duration * sr)
    hop_samples = int(hop_duration * sr)
    num_chunks = (len(audio) - chunk_samples) // hop_samples + 1
    
    feature_dim = 13 * 3 + 4 
    all_features = np.zeros((num_chunks, feature_dim))
    
    for i in range(num_chunks):
        chunk_start = i * hop_samples
        chunk_end = chunk_start + chunk_samples
        
        all_features[i] = extract_features(
            audio, sr, chunk_start, chunk_end, context_size
        )
                    
    
    feature_names = (
        [f"mfcc_{i+1}" for i in range(13)] +
        [f"mfcc_delta_{i+1}" for i in range(13)] +
        [f"mfcc_delta2_{i+1}" for i in range(13)] +
        ['spectral_centroid', 'zcr', 'spectral_flux', 'ste']
    )
    
    df = pd.DataFrame(all_features, columns=feature_names)
    df["start_time"] = np.arange(num_chunks) * hop_duration
    df["end_time"] = df["start_time"] + chunk_duration
    df["file_path"] = audio_path
    
    return df

def process_directory(input_dir, output_csv, save_interval=1000):
    all_data = []
    
    target_dirs = [f"p{i}" for i in range(230, 241)]
    total_files = sum(len(files) for d in target_dirs for _, _, files in os.walk(os.path.join(input_dir, d)) if os.path.exists(os.path.join(input_dir, d)))
    
    with tqdm(total=total_files, desc="Processing audio files") as pbar:
        for speaker_dir in target_dirs:
            speaker_path = os.path.join(input_dir, speaker_dir)
            
            if not os.path.exists(speaker_path):
                continue
            
            for root, _, files in os.walk(speaker_path):
                for file in files:
                    if file.endswith(".wav"):
                        audio_path = os.path.join(root, file)
                        
                        df = process_audio_file(audio_path)
                        all_data.append(df)
                        pbar.update(1)
    
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df.to_csv(output_csv, index=False)
    
    return combined_df

In [None]:
def prepare_features(df):
    scaler = StandardScaler()
    feature_cols = [col for col in df.columns if any(
        col.startswith(prefix) for prefix in ['mfcc_', 'spectral_', 'zcr', 'ste']
    )]
    
    X = df[feature_cols].values
    # scale for k means
    X = scaler.fit_transform(X)
    return X
     
def apply_kmeans(X, n_clusters=2):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    # we take the last column, which is ste (noise ste < speech ste)
    cluster_energies = [X[labels == i, -1].mean() for i in range(n_clusters)]
    if cluster_energies[0] > cluster_energies[1]:
        labels = 1 - labels
        
    return labels
    
def correct_vad_predictions(df, threshold_ms=100):
    '''
    If a switch between speech and silence lasts for less than 100ms, 
    it is replaced with the surrounding majority class.
    '''
    df_copy = df.copy()
    min_frames = threshold_ms // 10 
    predictions = df_copy['vad_prediction'].values
    
    changes = np.diff(predictions)
    change_points = np.where(changes != 0)[0] + 1
    
    segments = np.concatenate(([0], change_points, [len(predictions)]))
    
    for i in range(len(segments) - 1):
        segment_length = segments[i + 1] - segments[i]
        
        if segment_length < min_frames:
            value_before = predictions[segments[i] - 1] if segments[i] > 0 else predictions[segments[i]]
            value_after = predictions[segments[i + 1]] if segments[i + 1] < len(predictions) else predictions[segments[i + 1] - 1]
            
            if value_before == value_after:
                predictions[segments[i]:segments[i + 1]] = value_before
    
    df_copy['vad_prediction'] = predictions
    
    return df_copy

In [45]:
from pyannote.core import Annotation, Segment
from pyannote.metrics.detection import DetectionErrorRate

def load_silero_vad(audio_path):
    wav, sr = librosa.load(audio_path, sr=16000)
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
    (get_speech_timestamps, _, _, _, _) = utils
    speech_timestamps = get_speech_timestamps(wav, model, return_seconds=True)
    return speech_timestamps


def floor_1_decimal(val):
    return math.floor(val * 10) / 10

def ceil_1_decimal(val):
    return math.ceil(val * 10) / 10

def custom_vad_segments(df, frame_duration=0.01):
    """Get speech segments from custom VAD predictions"""
    predictions = df['vad_prediction'].values
    segments = []
    changes = np.diff(predictions.astype(int))
    change_points = np.where(changes != 0)[0] + 1
    
    if predictions[0] == 1:
        change_points = np.concatenate(([0], change_points))
    if predictions[-1] == 1:
        change_points = np.concatenate((change_points, [len(predictions)]))
    
    for i in range(0, len(change_points), 2):
        if i + 1 < len(change_points):
            start = change_points[i] * frame_duration
            end = change_points[i + 1] * frame_duration
            segments.append({'start': floor_1_decimal(start), 'end': ceil_1_decimal(end)})
    return segments

def calculate_der(reference_segments, hypothesis_segments):
    reference = Annotation()
    hypothesis = Annotation()
    
    for segment in reference_segments:
        reference[Segment(segment['start'], segment['end'])] = 'SPEECH'
    
    for segment in hypothesis_segments:
        hypothesis[Segment(segment['start'], segment['end'])] = 'SPEECH'
    
    metric = DetectionErrorRate()
    _ = metric(reference, hypothesis)
    return abs(metric)

def main(df, audio_directory):
    total_error_rate = 0
    file_count = 0
    
    for file_path in df['file_path'].unique():
        try:
            file_df = df[df['file_path'] == file_path]
            file_path = os.path.join(file_path.split("/")[-2],file_path.split("/")[-1])
            audio_path = os.path.join(audio_directory, file_path)
            
            
            silero_vad_regions = load_silero_vad(audio_path)
            custom_vad_segments_result = custom_vad_segments(file_df)
            
            # print("Silero VAD Results:")
            # for region in silero_vad_regions:
            #     print(f"Start: {region['start']:.3f}, End: {region['end']:.3f}")
            
            # print("\nCustom VAD Results:")
            # for segment in custom_vad_segments_result:
            #     print(f"Start: {segment['start']:.3f}, End: {segment['end']:.3f}")
            
        
            error_rate = calculate_der(silero_vad_regions, custom_vad_segments_result)
            
            total_error_rate += error_rate
            file_count += 1
        except Exception:
            print("no fiel ")
            continue
    
    if file_count > 0:
        average_error_rate = (total_error_rate / file_count) * 100
        print(f"\nAverage Detection Error Rate across all files: {average_error_rate:.1f}%")
    
    return average_error_rate


#### Get features from audio files

In [None]:
# input_directory = "/kaggle/input/english-multispeaker-corpus-for-voice-cloning/VCTK-Corpus/VCTK-Corpus/wav48"
# output_csv_path = "output_features.csv"
# df = process_directory(input_directory, output_csv_path)

In [4]:
import pandas as pd 
df = pd.read_csv("/Users/vadymvilhurin/Documents/KPI/output_features.csv")

#### Apply K-means and post-processing

In [10]:
features = prepare_features(df)
predictions = apply_kmeans(features)
df["vad_prediction"] = predictions
df = correct_vad_predictions(df)

#### Calculate DER

In [46]:
audio_directory = '/Users/vadymvilhurin/Downloads/archive/VCTK-Corpus/VCTK-Corpus/wav48'
main(df, audio_directory)

Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch

no fiel 


Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master
Using cache found in /Users/vadymvilhurin/.cache/torch


Average Detection Error Rate across all files: 8.8%


Using cache found in /Users/vadymvilhurin/.cache/torch/hub/snakers4_silero-vad_master


8.820284282147856