Step 1: Initial Training Using an SVM Model (Non-Random Split 80-20)

In [1]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfcc_features.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train SVM model
print("Training SVM model...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


Loading training data...




Loading testing data...




Training SVM model...
SVM Model Evaluation:
Accuracy: 0.96
F1 Score: 0.90


Step 2: Adding a Validation Split (80-10-10 Random Split)

In [1]:
import os
import numpy as np
import librosa
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Paths to the dataset directories
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfcc_features.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data from the train folder
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load all test data from the test folder
print("Loading test data for splitting into validation and test sets...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech test data
X_test_all = np.vstack((X_test_speech, X_test_non_speech))
y_test_all = np.hstack((y_test_speech, y_test_non_speech))

# Split the 20% test data into 10% validation and 10% test
X_val, X_test, y_val, y_test = train_test_split(X_test_all, y_test_all, test_size=0.5, random_state=42, stratify=y_test_all)

# Normalize the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train the SVM model on the training set
print("Training SVM model...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = svm_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
print("Validation Set Evaluation:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"F1 Score: {val_f1:.2f}")

# Final evaluation on the test set
y_test_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
print("Test Set Evaluation:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"F1 Score: {test_f1:.2f}")


Loading training data...




Loading test data for splitting into validation and test sets...




Training SVM model...
Validation Set Evaluation:
Accuracy: 0.97
F1 Score: 0.93
Test Set Evaluation:
Accuracy: 0.95
F1 Score: 0.86


Step 3: Filtering the Dataset

In [1]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train_filtered"
test_dir = r"E:\Python_proj\ML_Project\musan\test_filtered"

# Function to extract MFCC features
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfcc_features.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train SVM model
print("Training SVM model...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Classification:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


Loading training data...
Loading testing data...




Training SVM model...
SVM Model Classification:
Accuracy: 0.93
F1 Score: 0.81


Step 4: Comparing with Hidden Markov Model (HMM)

In [1]:
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from hmmlearn.hmm import GaussianHMM
from librosa.feature import mfcc
from librosa import load

def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = load(file_path, sr=None)
        mfcc_features = mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfcc_features.T, axis=0)  # Return averaged MFCCs
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Paths to training and testing directories
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Load training data
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech training data
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech testing data
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Train HMM for speech
speech_hmm = GaussianHMM(n_components=5, covariance_type='full', random_state=42, n_iter=100)
speech_hmm.fit(X_train_speech)

# Train HMM for non-speech
non_speech_hmm = GaussianHMM(n_components=5, covariance_type='full', random_state=42, n_iter=100)
non_speech_hmm.fit(X_train_non_speech)

y_pred = []
for i, x in enumerate(X_test):
    # Compute log-likelihoods for speech and non-speech HMMs
    speech_score = speech_hmm.score([x])
    non_speech_score = non_speech_hmm.score([x])
    
    # Assign label based on higher score
    if speech_score > non_speech_score:
        y_pred.append(1)  # Speech
    else:
        y_pred.append(0)  # Non-speech

# Convert predictions to numpy array
y_pred = np.array(y_pred)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("HMM Model Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")




HMM Model Performance:
Accuracy: 0.70
F1 Score: 0.58


Step 5: Applying t-Test Before and After Training

In [13]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_rel

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfcc_features.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Ensure equal lengths for t-test
min_length = min(len(X_train), len(X_test))
X_train_sample = X_train[:min_length, 0]  # Take the first feature for comparison
X_test_sample = X_test[:min_length, 0]   # Take the first feature for comparison

# Conduct t-test on raw data before training
print("Performing t-test on raw data before training...")
t_stat, p_value = ttest_rel(X_train_sample, X_test_sample)
print(f"T-test before training - t-statistic: {t_stat:.2f}, p-value: {p_value:.2e}")

# Train SVM model
print("Training SVM model...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

# Ensure equal lengths for t-test after training
y_test_sample = y_test[:min_length]
y_pred_sample = y_pred[:min_length]

# Conduct t-test on predicted labels after training
print("Performing t-test on predicted labels after training...")
t_stat_post, p_value_post = ttest_rel(y_test_sample, y_pred_sample)
print(f"T-test after training - t-statistic: {t_stat_post:.2f}, p-value: {p_value_post:.2e}")


Loading training data...




Loading testing data...




Performing t-test on raw data before training...
T-test before training - t-statistic: -2.79, p-value: 5.48e-03
Training SVM model...
SVM Model Evaluation:
Accuracy: 0.96
F1 Score: 0.90
Performing t-test on predicted labels after training...
T-test after training - t-statistic: 0.73, p-value: 4.68e-01


Step 6: Applying PCA

In [14]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfcc_features.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply PCA for dimensionality reduction
print("Applying PCA...")
pca = PCA(n_components=10)  # Reduce to 10 principal components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train SVM model
print("Training SVM model with PCA...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_pca, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation with PCA:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


Loading training data...




Loading testing data...




Applying PCA...
Training SVM model with PCA...
SVM Model Evaluation with PCA:
Accuracy: 0.95
F1 Score: 0.88


Step 7: Adding Zero Crossing Rate (ZCR) to MFCC Features

In [15]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features and Zero Crossing Rate
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Extract MFCC features
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc_features.T, axis=0)

        # Extract Zero Crossing Rate (ZCR)
        zcr = librosa.feature.zero_crossing_rate(y)
        zcr_mean = np.mean(zcr)

        # Combine MFCC and ZCR
        features = np.append(mfcc_mean, zcr_mean)
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                feature = extract_features(file_path)
                if feature is not None:
                    features.append(feature)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train SVM model
print("Training SVM model with MFCC and ZCR...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation with MFCC and ZCR:")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


Loading training data...




Loading testing data...




Training SVM model with MFCC and ZCR...
SVM Model Evaluation with MFCC and ZCR:
Accuracy: 0.96
F1 Score: 0.91


Step 8: Modifying Window and Hop Length (20 ms and 10 ms, respectively)

In [2]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features with updated window size and hop length
def extract_features(file_path, n_mfcc=13, n_fft=320, hop_length=160):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Extract MFCC features with standard 20ms window size and 10ms hop length
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
        return np.mean(mfcc_features.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                feature = extract_features(file_path)
                if feature is not None:
                    features.append(feature)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data (80/20 non-random split)
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train SVM model
print("Training SVM model with updated window size and hop length...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation (Standard Windowing Parameters):")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


Loading training data...


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


Loading testing data...
Training SVM model with updated window size and hop length...
SVM Model Evaluation (Standard Windowing Parameters):
Accuracy: 0.97
F1 Score: 0.91


Step 9: Combined Optimizations for Final Evaluation (Non-Random Split 80/20) with t-Test and ZCR

In [3]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import ttest_rel

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features and Zero Crossing Rate
def extract_features(file_path, n_mfcc=13, n_fft=320, hop_length=160):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Extract MFCC features with updated window length (20ms) and 50% overlap (hop length)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
        mfcc_mean = np.mean(mfcc_features.T, axis=0)

        # Extract Zero Crossing Rate (ZCR)
        zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop_length)
        zcr_mean = np.mean(zcr)

        # Combine MFCC and ZCR
        features = np.append(mfcc_mean, zcr_mean)
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                feature = extract_features(file_path)
                if feature is not None:
                    features.append(feature)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data (80-20 split non-random)
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load testing data
print("Loading testing data...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for testing
X_test = np.vstack((X_test_speech, X_test_non_speech))
y_test = np.hstack((y_test_speech, y_test_non_speech))

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Ensure equal lengths for t-test
min_length = min(len(X_train), len(X_test))
X_train_sample = X_train[:min_length, 0]  # Take the first feature for comparison
X_test_sample = X_test[:min_length, 0]   # Take the first feature for comparison

# Conduct t-test on raw data before training
print("Performing t-test on raw data before training...")
t_stat, p_value = ttest_rel(X_train_sample, X_test_sample)
print(f"T-test before training - t-statistic: {t_stat:.2f}, p-value: {p_value:.2e}")

# Train SVM model
print("Training SVM model with MFCC and ZCR...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation with MFCC and ZCR (Non-Random Split):")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

# Ensure equal lengths for t-test after training
y_test_sample = y_test[:min_length]
y_pred_sample = y_pred[:min_length]

# Conduct t-test on predicted labels after training
print("Performing t-test on predicted labels after training...")
t_stat_post, p_value_post = ttest_rel(y_test_sample, y_pred_sample)
print(f"T-test after training - t-statistic: {t_stat_post:.2f}, p-value: {p_value_post:.2e}")


Loading training data...


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


Loading testing data...
Performing t-test on raw data before training...
T-test before training - t-statistic: -3.07, p-value: 2.32e-03
Training SVM model with MFCC and ZCR...
SVM Model Evaluation with MFCC and ZCR (Non-Random Split):
Accuracy: 0.97
F1 Score: 0.91
Performing t-test on predicted labels after training...
T-test after training - t-statistic: 1.07, p-value: 2.86e-01


Step 10: Combined Optimizations for Final Evaluation (Random Split 80/10/10) with t-Test and ZCR

In [4]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_rel

# Paths to the prepared dataset
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features and Zero Crossing Rate
def extract_features(file_path, n_mfcc=13, n_fft=320, hop_length=160):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Extract MFCC features with updated window length (20ms) and 50% overlap (hop length)
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
        mfcc_mean = np.mean(mfcc_features.T, axis=0)

        # Extract Zero Crossing Rate (ZCR)
        zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop_length)
        zcr_mean = np.mean(zcr)

        # Combine MFCC and ZCR
        features = np.append(mfcc_mean, zcr_mean)
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                feature = extract_features(file_path)
                if feature is not None:
                    features.append(feature)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load and combine data
print("Loading data...")
X_speech, y_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_non_speech, y_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

X = np.vstack((X_speech, X_non_speech))
y = np.hstack((y_speech, y_non_speech))

# Split data into training, validation, and testing (80-10-10 random split)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Ensure equal lengths for t-test
min_length = min(len(X_train), len(X_test))
X_train_sample = X_train[:min_length, 0]  # Take the first feature for comparison
X_test_sample = X_test[:min_length, 0]   # Take the first feature for comparison

# Conduct t-test on raw data before training
print("Performing t-test on raw data before training...")
t_stat, p_value = ttest_rel(X_train_sample, X_test_sample)
print(f"T-test before training - t-statistic: {t_stat:.2f}, p-value: {p_value:.2e}")

# Train SVM model
print("Training SVM model with MFCC and ZCR...")
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("SVM Model Evaluation with MFCC and ZCR (Random Split):")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

# Ensure equal lengths for t-test after training
y_test_sample = y_test[:min_length]
y_pred_sample = y_pred[:min_length]

# Conduct t-test on predicted labels after training
print("Performing t-test on predicted labels after training...")
t_stat_post, p_value_post = ttest_rel(y_test_sample, y_pred_sample)
print(f"T-test after training - t-statistic: {t_stat_post:.2f}, p-value: {p_value_post:.2e}")


Loading data...


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


Performing t-test on raw data before training...
T-test before training - t-statistic: 2.01, p-value: 4.65e-02
Training SVM model with MFCC and ZCR...
SVM Model Evaluation with MFCC and ZCR (Random Split):
Accuracy: 0.99
F1 Score: 0.97
Performing t-test on predicted labels after training...
T-test after training - t-statistic: 0.00, p-value: 1.00e+00


Step 11: K-fold Cross-Validation (5-fold) in Step 10

In [5]:
import os
import numpy as np
import librosa
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from scipy.stats import ttest_rel

# Paths to the dataset directories
train_dir = r"E:\Python_proj\ML_Project\musan\train"
test_dir = r"E:\Python_proj\ML_Project\musan\test"

# Function to extract MFCC features and Zero Crossing Rate
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Extract MFCC features
        mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc_features.T, axis=0)

        # Extract Zero Crossing Rate (ZCR)
        zcr = librosa.feature.zero_crossing_rate(y)
        zcr_mean = np.mean(zcr)

        # Combine MFCC and ZCR
        features = np.append(mfcc_mean, zcr_mean)
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to load data and extract features from a directory
def load_data(directory, label):
    features, labels = [], []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                feature = extract_features(file_path)
                if feature is not None:
                    features.append(feature)
                    labels.append(label)
    return np.array(features), np.array(labels)

# Load training data from the train folder
print("Loading training data...")
X_train_speech, y_train_speech = load_data(os.path.join(train_dir, 'speech'), label=1)
X_train_non_speech, y_train_non_speech = load_data(os.path.join(train_dir, 'non_speech'), label=0)

# Combine speech and non-speech data for training
X_train = np.vstack((X_train_speech, X_train_non_speech))
y_train = np.hstack((y_train_speech, y_train_non_speech))

# Load all test data from the test folder
print("Loading test data for splitting into validation and test sets...")
X_test_speech, y_test_speech = load_data(os.path.join(test_dir, 'speech'), label=1)
X_test_non_speech, y_test_non_speech = load_data(os.path.join(test_dir, 'non_speech'), label=0)

# Combine speech and non-speech test data
X_test_all = np.vstack((X_test_speech, X_test_non_speech))
y_test_all = np.hstack((y_test_speech, y_test_non_speech))

# Split the 20% test data into 10% validation and 10% test
X_val, X_test, y_val, y_test = train_test_split(X_test_all, y_test_all, test_size=0.5, random_state=42, stratify=y_test_all)

# Normalize the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Ensure equal lengths for t-test
min_length = min(len(X_train), len(X_test))
X_train_sample = X_train[:min_length, 0]  # Take the first feature for comparison
X_test_sample = X_test[:min_length, 0]   # Take the first feature for comparison

# Conduct t-test on raw data before training
print("Performing t-test on raw data before training...")
t_stat, p_value = ttest_rel(X_train_sample, X_test_sample)
print(f"T-test before training - t-statistic: {t_stat:.2f}, p-value: {p_value:.2e}")

# Train SVM model using 5-fold cross-validation
print("Performing 5-fold cross-validation on the training data...")
svm_model = SVC(kernel='rbf', random_state=42)
accuracy_scorer = make_scorer(accuracy_score)
f1_scorer = make_scorer(f1_score)

accuracy_scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring=accuracy_scorer)
f1_scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring=f1_scorer)

print(f"Cross-Validation Accuracy Scores: {accuracy_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(accuracy_scores):.2f}")
print(f"Cross-Validation F1 Scores: {f1_scores}")
print(f"Mean Cross-Validation F1 Score: {np.mean(f1_scores):.2f}")

# Train the SVM model on the full training set
print("Training SVM model on the full training set...")
svm_model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = svm_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
print("Validation Set Evaluation:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"F1 Score: {val_f1:.2f}")

# Final evaluation on the test set
y_test_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
print("Test Set Evaluation:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"F1 Score: {test_f1:.2f}")

# Ensure equal lengths for t-test after training
y_test_sample = y_test[:min_length]
y_pred_sample = y_test_pred[:min_length]

# Conduct t-test on predicted labels after training
print("Performing t-test on predicted labels after training...")
t_stat_post, p_value_post = ttest_rel(y_test_sample, y_pred_sample)
print(f"T-test after training - t-statistic: {t_stat_post:.2f}, p-value: {p_value_post:.2e}")


Loading training data...




Loading test data for splitting into validation and test sets...




Performing t-test on raw data before training...
T-test before training - t-statistic: -5.02, p-value: 1.12e-06
Performing 5-fold cross-validation on the training data...
Cross-Validation Accuracy Scores: [0.86068111 0.88235294 0.99068323 0.93167702 0.97204969]
Mean Cross-Validation Accuracy: 0.93
Cross-Validation F1 Scores: [0.62184874 0.63461538 0.97777778 0.84931507 0.93706294]
Mean Cross-Validation F1 Score: 0.80
Training SVM model on the full training set...
Validation Set Evaluation:
Accuracy: 0.97
F1 Score: 0.93
Test Set Evaluation:
Accuracy: 0.96
F1 Score: 0.89
Performing t-test on predicted labels after training...
T-test after training - t-statistic: 1.00, p-value: 3.19e-01
