In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("balanced_audio_dataset.csv")

In [4]:
df.head()

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,energy,zcr,spectral_centroid,label
0,-477.9542,256.91644,67.11546,8.739448,2.758284,0.770825,5.305133,6.250667,0.596355,-0.712356,1.323257,2.500952,4.866691,0.023121,0.025192,341.103629,0
1,-270.60608,42.22273,-48.578365,19.939613,5.550137,17.706486,-11.928199,25.873552,-16.771667,1.831336,-8.294287,1.411134,5.032603,0.021516,0.220474,2328.221006,0
2,-131.91475,148.79492,13.958692,68.57376,0.65359,-12.121082,-0.728478,3.060145,6.074419,6.081366,-3.391458,-1.216702,1.177939,0.159142,0.048676,932.352704,1
3,-480.88986,262.51343,66.892426,8.498765,2.202479,1.525443,4.45303,4.999896,-0.053064,-1.211203,1.451995,2.073047,5.178939,0.024008,0.023468,333.384016,0
4,-413.6997,37.239326,2.051036,16.639393,-2.405408,19.12704,-3.345035,4.515248,7.415859,4.109774,-1.621489,2.398941,-0.977538,0.006376,0.237839,2733.993469,0


In [5]:
df=df.dropna()
X = df.drop("label", axis=1)
y = df["label"]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size=0.2, random_state=42
)

In [7]:
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
X_train_Scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

In [8]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 10000)
model.fit(X_train_Scaled, y_train)

y_pred_test = model.predict(X_test_scaled)
y_pred_train = model.predict(X_train_Scaled)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuaracy for training Dataset :- ", accuracy_score(y_pred_train, y_train))
print("Accuracy for Test Data :- ", accuracy_score(y_pred_test, y_test))

print("Classification Report")
print(classification_report(y_pred_test, y_test))


Accuaracy for training Dataset :-  0.9901630685944411
Accuracy for Test Data :-  0.991418008978083
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3808
           1       0.99      1.00      0.99      3766

    accuracy                           0.99      7574
   macro avg       0.99      0.99      0.99      7574
weighted avg       0.99      0.99      0.99      7574



In [10]:
import numpy as np
import librosa
from collections import deque

def advanced_vad(file_path, model, scaler):

    y, sr = librosa.load(file_path, sr=22050)
    window_duration = 0.5
    overlap_ratio = 0.5

    window_samples = int(sr * window_duration)
    hop_samples = int(window_samples * (1 - overlap_ratio))

    frame_length = 2048
    hop_length = 512

    enter_threshold = 0.7
    exit_threshold = 0.3

    smoothing_buffer = deque(maxlen=3)

    current_state = "Noise"

    print("\nAdvanced VAD Processing...\n")

    window_count = 0

    for start in range(0, len(y) - window_samples, hop_samples):

        window_count += 1

        chunk = y[start:start + window_samples]

        mfcc = librosa.feature.mfcc(
            y=chunk, sr=sr, n_mfcc=13,
            n_fft=frame_length,
            hop_length=hop_length
        )

        energy = librosa.feature.rms(
            y=chunk,
            frame_length=frame_length,
            hop_length=hop_length
        )

        zcr = librosa.feature.zero_crossing_rate(
            chunk,
            frame_length=frame_length,
            hop_length=hop_length
        )

        spectral_centroid = librosa.feature.spectral_centroid(
            y=chunk, sr=sr,
            n_fft=frame_length,
            hop_length=hop_length
        )

        features = np.vstack([
            mfcc,
            energy,
            zcr,
            spectral_centroid
        ]).T

        features_scaled = scaler.transform(features)

        probs = model.predict_proba(features_scaled)[:, 1]
        speech_ratio = np.mean(probs)
        smoothing_buffer.append(speech_ratio)
        smoothed_ratio = np.mean(smoothing_buffer)

        if current_state == "Noise":
            if smoothed_ratio > enter_threshold:
                current_state = "Speech"

        elif current_state == "Speech":
            if smoothed_ratio < exit_threshold:
                current_state = "Noise"

        print(f"Window {window_count}: {current_state} | Prob: {round(smoothed_ratio,3)}")


In [13]:
advanced_vad("noise.mp4", model, scale)

  y, sr = librosa.load(file_path, sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



Advanced VAD Processing...





Window 1: Noise | Prob: 0.0
Window 2: Noise | Prob: 0.001
Window 3: Noise | Prob: 0.001
Window 4: Noise | Prob: 0.002
Window 5: Noise | Prob: 0.003
Window 6: Noise | Prob: 0.005
Window 7: Noise | Prob: 0.004
Window 8: Noise | Prob: 0.005
Window 9: Noise | Prob: 0.004
Window 10: Noise | Prob: 0.005
Window 11: Noise | Prob: 0.004
Window 12: Noise | Prob: 0.003
Window 13: Noise | Prob: 0.002
Window 14: Noise | Prob: 0.004
Window 15: Noise | Prob: 0.007
Window 16: Noise | Prob: 0.01
Window 17: Noise | Prob: 0.009
Window 18: Noise | Prob: 0.008
Window 19: Noise | Prob: 0.019
Window 20: Noise | Prob: 0.023
Window 21: Noise | Prob: 0.1
Window 22: Noise | Prob: 0.296




Window 23: Noise | Prob: 0.556
Window 24: Noise | Prob: 0.63
Window 25: Noise | Prob: 0.478
Window 26: Noise | Prob: 0.226
Window 27: Noise | Prob: 0.102
Window 28: Noise | Prob: 0.06
Window 29: Noise | Prob: 0.049
Window 30: Noise | Prob: 0.019
Window 31: Noise | Prob: 0.031
Window 32: Noise | Prob: 0.065
Window 33: Noise | Prob: 0.085
Window 34: Noise | Prob: 0.15
Window 35: Noise | Prob: 0.376
Window 36: Noise | Prob: 0.688
Window 37: Speech | Prob: 0.928
Window 38: Speech | Prob: 0.878
Window 39: Speech | Prob: 0.575
Window 40: Noise | Prob: 0.242
Window 41: Noise | Prob: 0.042
Window 42: Noise | Prob: 0.014
Window 43: Noise | Prob: 0.102
Window 44: Noise | Prob: 0.278
Window 45: Noise | Prob: 0.42
Window 46: Noise | Prob: 0.379


