In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# !pip install pyloudnorm

In [None]:
# !pip install librosa


Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.0-cp312-cp312-win_amd64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.44.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Col

In [8]:
import os
import IPython.display as ipd
import librosa
from glob import glob
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal, linalg
import sklearn
from ipywidgets import interact
import urllib

import torch
from pprint import pprint

from sklearn import preprocessing
from sklearn import cluster
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import numpy as np
import pyloudnorm as pyln
import soundfile as sf


import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [9]:
def load_vad_model():
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
    speech_timestamps = utils[0]
    read_audio = utils[2]
    
    return speech_timestamps, model, read_audio

In [10]:
import pyloudnorm as pyln
import numpy as np

def loudness_normalization(audio, target_lufs=-23.0):
    meter = pyln.Meter(16000)
    loudness = meter.integrated_loudness(audio)
    gain = target_lufs - loudness
    
    normalized_audio = audio * (10**(gain / 20))
    
    return normalized_audio


In [11]:
def apply_filters(audio, sr):
    hp_cutoff = 80 
    b, a = signal.butter(4, hp_cutoff / (sr / 2), btype='highpass')
    audio = signal.filtfilt(b, a, audio)

    lp_cutoff = 7000
    b, a = signal.butter(4, lp_cutoff / (sr / 2), btype='lowpass')
    audio = signal.filtfilt(b, a, audio)

    return audio

In [12]:
def apply_silero_vad(audio, speech_timestamps, model):
    audio_tensor = torch.tensor(audio, dtype=torch.float32)
    
    speech_segments = get_speech_timestamps(
        audio_tensor,
        model,
        return_seconds=False
    )
    return speech_segments

In [13]:
def extract_features(audio, sr):
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

    spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)

    zcr = librosa.feature.zero_crossing_rate(y=audio)

    spectral_flux = librosa.onset.onset_strength(y=audio, sr=sr)
    spectral_flux = spectral_flux.reshape(1, -1)

    ste = librosa.feature.rms(y=audio)

    # spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)

    # spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)

    # spectral_flatness = librosa.feature.spectral_flatness(y=audio)
    
    # features_list = [mfccs, spectral_centroid, zcr, spectral_flux, ste, spectral_rolloff, spectral_bandwidth, spectral_flatness]
    features_list = [mfccs, spectral_centroid, zcr, spectral_flux, ste]

    num_frames = min(f.shape[1] for f in features_list)
    
    features = np.vstack([f[:, :num_frames] for f in features_list]).T

    return features

In [14]:
def normalize_features(features, method="standard"):
    if method == "standard":
        scaler = StandardScaler()
    elif method == "minmax":
        scaler = MinMaxScaler()

    return scaler.fit_transform(features)

In [15]:
def process_audio_file(audio_path, speech_timestamps, model, sr=16000, chunk_duration=0.025):
    audio, sr = librosa.load(audio_path, sr=sr, mono=True)

    audio = apply_filters(audio, sr)
    audio = loudness_normalization(audio)
    
    speech_segments = apply_silero_vad(audio, get_speech_timestamps, model)

    features = extract_features(audio, sr)
    
    labels = []
    for i in range(features.shape[0]):
        chunk_start = int(i * chunk_duration * sr)
        chunk_end = chunk_start + int(chunk_duration * sr)

        is_speech = any(s['start'] <= chunk_start and s['end'] >= chunk_end for s in speech_segments)
        labels.append(1 if is_speech else 0)

    if len(labels) != features.shape[0]:
        if len(labels) > features.shape[0]:
            labels = labels[:features.shape[0]]
        else:
            labels.extend([0] * (features.shape[0] - len(labels)))

    features = normalize_features(features, method="standard")

    num_features = features.shape[1]
    feature_names = [f"feature_{i}" for i in range(num_features)]

    df = pd.DataFrame(features, columns=feature_names)
    df["vad_label"] = labels
    df["start_time"] = [(i * chunk_duration) for i in range(len(labels))]
    df["end_time"] = [((i + 1) * chunk_duration) for i in range(len(labels))]
    df["file_path"] = audio_path

    return df


In [18]:
def process_directory(input_dir, output_csv, get_speech_timestamps, model):

    all_data = []  
    processed_count = 0
    error_count = 0
    
    for root, dirs, files in os.walk(input_dir):
        for file in tqdm(files):
            if file.endswith(".wav"): 
                audio_path = os.path.join(root, file)
                
                try:
                    df = process_audio_file(audio_path, get_speech_timestamps, model)
                    all_data.append(df)
                    processed_count += 1
                    
                    if processed_count % 100 == 0:
                        intermediate_df = pd.concat(all_data, ignore_index=True)
                        intermediate_df.to_csv(f"{output_csv}.intermediate_{processed_count}", index=False)
                            
                except Exception as e:
                    error_count += 1
                    print(f"Error processing {audio_path}: {str(e)}")
                    continue

    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df.to_csv(output_csv, index=False)


input_directory = "/kaggle/input/english-multispeaker-corpus-for-voice-cloning/VCTK-Corpus/VCTK-Corpus/wav48/p225"
output_csv_path = "output_features.csv"

torch.set_default_tensor_type(torch.FloatTensor)

get_speech_timestamps, model, read_audio = load_vad_model()

process_directory(input_directory, output_csv_path, get_speech_timestamps, model)

Using cache found in C:\Users\katja/.cache\torch\hub\snakers4_silero-vad_master


In [19]:
def align_cluster_labels(cluster_labels, vad_labels):
    if set(np.unique(cluster_labels)).issubset({0, 1, -1}):
        non_noise_mask = cluster_labels != -1
        filtered_clusters = cluster_labels[non_noise_mask]
        filtered_vad = vad_labels[non_noise_mask]
        
        if len(filtered_clusters) == 0:
            return np.zeros_like(cluster_labels)
            
        contingency = pd.crosstab(filtered_clusters, filtered_vad)
        
        if 0 in contingency.index and 1 in contingency.index:
            if contingency.loc[0, 1] > contingency.loc[1, 1]:
               
                mapping = {0: 1, 1: 0, -1: 0} 
            else:
                mapping = {0: 0, 1: 1, -1: 0}  
        else:
            only_label = list(contingency.index)[0]
            if contingency.loc[only_label, 1] > contingency.loc[only_label, 0]:
                mapping = {only_label: 1, -1: 0}
            else:
                mapping = {only_label: 0, -1: 0}
    
    labels = np.array([mapping.get(l, 0) for l in cluster_labels])
    return labels

def evaluate_clustering(vad_labels, cluster_labels):
    aligned_labels = align_cluster_labels(cluster_labels, vad_labels)
    
    accuracy = accuracy_score(vad_labels, aligned_labels)
    precision = precision_score(vad_labels, aligned_labels, zero_division=0)
    recall = recall_score(vad_labels, aligned_labels, zero_division=0)
    f1 = f1_score(vad_labels, aligned_labels, zero_division=0)
    conf_matrix = confusion_matrix(vad_labels, aligned_labels)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'aligned_labels': aligned_labels
    }

def main():
    csv_file = "/kaggle/working/output_features.csv"
    df = pd.read_csv(csv_file)
    
    feature_columns = [col for col in df.columns if col not in ['vad_label', 'start_time', 'end_time', 'file_path']]
    
    features = df[feature_columns].values
    vad_labels = df['vad_label'].values
    
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans_labels = kmeans.fit_predict(features_scaled)
    
    results = evaluate_clustering(vad_labels, kmeans_labels)
    
    print("Evaluation Results for KMeans Clustering:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"F1 Score: {results['f1_score']:.4f}")
    

main()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/output_features.csv'