<a href="https://colab.research.google.com/github/i-ganza007/Multimodal-Data-Preprocessing/blob/main/notebooks/VoicePrint_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [162]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import joblib
import librosa
import warnings

In [163]:
# Silence every warning for the rest of the session: no category or module
# filter is given, so this also hides any warnings raised by the libraries
# used below.
warnings.filterwarnings('ignore')

# Loading and preprocessing the data
def load_and_preprocess_data(csv_file):
    """Load the audio-feature CSV and build the training matrix.

    Only rows whose ``speaker`` is ``eddy`` or ``lievin`` and whose
    ``command`` is ``yes approve`` or ``confirm transaction`` are kept.
    Every column other than ``file``, ``speaker``, ``command`` and
    ``duration`` is treated as a feature.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(X, y, feature_cols)`` where ``X`` is a 2-D float array with
        NaN and +/-inf replaced by 0.0, ``y`` holds the speaker label of each
        row, and ``feature_cols`` lists the feature column names in the
        order they appear in ``X``.

    Raises:
        KeyError: If the CSV has no ``speaker`` or ``command`` column.
        ValueError: If no row survives the speaker/command filter, or a
            feature column cannot be converted to float.
    """
    df = pd.read_csv(csv_file)

    # Filter for Eddy and Lievin with commands "yes approve" or "confirm transaction"
    valid_speakers = ['eddy', 'lievin']
    valid_commands = ['yes approve', 'confirm transaction']
    df = df[df['speaker'].isin(valid_speakers) & df['command'].isin(valid_commands)]

    # Fail here with a clear message instead of letting an empty matrix reach
    # the scaler/classifier and fail there with a less helpful one.
    if df.empty:
        raise ValueError(
            f"No rows for speakers {valid_speakers} with commands {valid_commands} "
            "were found in the CSV."
        )

    # Everything that is not metadata is a feature.
    non_feature_cols = {'file', 'speaker', 'command', 'duration'}
    feature_cols = [col for col in df.columns if col not in non_feature_cols]

    # Force a float matrix: np.nan_to_num returns non-float arrays (object,
    # int) unchanged, which would silently skip the NaN/inf cleaning below.
    X = df[feature_cols].to_numpy(dtype=float)
    y = df['speaker'].values

    # Handling potential NaN or infinite values
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

    print(f"Warning: Dataset is small ({len(X)} samples). Consider adding more data for better model performance.")

    return X, y, feature_cols


In [164]:
def train_voiceprint_model(X, y):
    """Fit the scaler and RBF-SVM speaker classifier and print 2-fold CV metrics.

    The returned model and scaler are fitted on the full dataset. The
    cross-validation scores come from a separate scaler+SVM pipeline fed the
    raw features, so the scaler is refitted inside every fold and the
    held-out half never influences the preprocessing it is scored with.

    Args:
        X: 2-D array of raw (unscaled) feature vectors, one row per sample.
        y: Speaker label for each row of ``X``.

    Returns:
        Tuple ``(model, scaler)``: the fitted ``SVC`` and the fitted
        ``StandardScaler`` that must be applied to features before
        calling the model.
    """
    # Function-scope imports keep this block self-contained; both come from
    # scikit-learn, which this file already depends on.
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import make_pipeline

    svc_params = dict(kernel='rbf', probability=True, random_state=42)

    # Scaling the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Training an SVM classifier
    model = SVC(**svc_params)
    model.fit(X_scaled, y)

    # Compute cross-validation metrics (2-fold). All metrics are scored from
    # one pass over the folds instead of refitting once per metric.
    metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    print("\nCross-Validation Metrics (2-fold):")
    cv_pipeline = make_pipeline(StandardScaler(), SVC(**svc_params))
    cv_results = cross_validate(cv_pipeline, X, y, cv=2, scoring=metrics)
    for metric in metrics:
        scores = cv_results[f"test_{metric}"]
        print(f"{metric.capitalize():<15}: {scores.mean():.4f} (±{scores.std():.4f})")

    print("Model trained on full dataset (no train-test split due to small size).")

    return model, scaler

In [165]:
def save_model_and_scaler(model, scaler, model_path='voiceprint_model.pkl', scaler_path='scaler.pkl'):
    """Persist the classifier and its scaler with joblib and report the paths.

    Args:
        model: Fitted classifier to serialize.
        scaler: Fitted scaler to serialize.
        model_path: Destination file for ``model``.
        scaler_path: Destination file for ``scaler``.
    """
    # Both artifacts are written before either confirmation is printed.
    for artifact, destination in ((model, model_path), (scaler, scaler_path)):
        joblib.dump(artifact, destination)

    print(f"Model saved to {model_path}")
    print(f"Scaler saved to {scaler_path}")

In [166]:
def predict_speaker(model, scaler, features):
    """Classify one feature vector and return the winner with all probabilities.

    Args:
        model: Classifier exposing ``predict_proba`` and ``classes_``.
        scaler: Transformer exposing ``transform``.
        features: A single, unscaled feature vector.

    Returns:
        Tuple ``(predicted_speaker, probabilities)`` where ``probabilities``
        is the per-class probability row, ordered like ``model.classes_``.
    """
    # The scaler works on a batch, so wrap the vector as a one-row batch.
    scaled_row = scaler.transform([features])
    class_probs = model.predict_proba(scaled_row)[0]

    # The most probable class wins.
    best_class = model.classes_[np.argmax(class_probs)]
    return best_class, class_probs

In [167]:
def _mean_and_std(values):
    """Return ``[mean, std]`` of *values*, reduced over every element."""
    return [np.mean(values), np.std(values)]


def extract_audio_features(audio_file, feature_cols, sr=22050):
    """Extract the voiceprint feature vector from one audio file.

    Features are appended in this fixed order: MFCC mean (13), MFCC std (13),
    MFCC delta mean (13), MFCC delta-delta mean (13), then mean/std pairs for
    spectral rolloff, centroid, bandwidth, contrast, flatness, RMS and
    zero-crossing rate, then f0 mean/std/min/max, then the LPC coefficients.
    The order presumably mirrors the training CSV's columns -- only the
    total count is checked here, so confirm against the CSV.

    Args:
        audio_file: Path to the audio file to analyse.
        feature_cols: Feature names the model was trained on; only its
            length is used, to validate the extracted vector.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        A 1-D NumPy array with NaN and +/-inf replaced by 0.0, or ``None``
        when the clip is not between 1.5 and 3.5 seconds long or any step
        fails. Failures, including a feature-count mismatch, are printed
        rather than raised.
    """
    try:
        # Load audio file
        y, sr = librosa.load(audio_file, sr=sr)

        # Check duration (proxy for valid command, based on CSV durations ~2-3 seconds)
        duration = librosa.get_duration(y=y, sr=sr)
        if not (1.5 <= duration <= 3.5):
            print(f"Error: Audio duration {duration:.2f}s is outside expected range (1.5-3.5s).")
            return None

        features = []

        # MFCCs (13 coefficients): per-coefficient mean and std over time,
        # followed by the means of the first- and second-order deltas.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        features.extend(np.mean(mfcc, axis=1))
        features.extend(np.std(mfcc, axis=1))
        features.extend(np.mean(librosa.feature.delta(mfcc), axis=1))
        features.extend(np.mean(librosa.feature.delta(mfcc, order=2), axis=1))

        # Descriptors summarised as one global mean/std pair each. The tuple
        # order defines the feature order, so do not reorder it.
        frame_descriptors = (
            librosa.feature.spectral_rolloff(y=y, sr=sr),
            librosa.feature.spectral_centroid(y=y, sr=sr),
            librosa.feature.spectral_bandwidth(y=y, sr=sr),
            librosa.feature.spectral_contrast(y=y, sr=sr),
            librosa.feature.spectral_flatness(y=y),
            librosa.feature.rms(y=y),
            librosa.feature.zero_crossing_rate(y),
        )
        for descriptor in frame_descriptors:
            features.extend(_mean_and_std(descriptor))

        # Fundamental frequency (f0) using pyin. The sample rate must be
        # passed explicitly: pyin otherwise assumes its own default, which
        # gives wrong frequencies whenever the audio was loaded at another
        # rate.
        f0, voiced_flag, _ = librosa.pyin(y, fmin=50, fmax=500, sr=sr)
        voiced_f0 = f0[voiced_flag]
        if voiced_f0.size:
            f0_stats = [np.mean(voiced_f0), np.std(voiced_f0),
                        np.min(voiced_f0), np.max(voiced_f0)]
        else:
            # No voiced frames: zero mean/std, min/max fall back to the
            # pyin search bounds.
            f0_stats = [0.0, 0.0, 50.0, 500.0]
        features.extend(f0_stats)

        # LPC coefficients (order 11; 12 coefficients expected as per CSV)
        features.extend(librosa.lpc(y, order=11))

        # Ensure feature vector matches the expected number of features
        features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
        if len(features) != len(feature_cols):
            raise ValueError(f"Extracted {len(features)} features, but model expects {len(feature_cols)} features.")

        return features
    except Exception as e:
        # Deliberately broad: callers treat None as "reject this audio", so
        # any load or extraction failure is reported and converted to None.
        print(f"Error extracting features from {audio_file}: {e}")
        return None

In [168]:
def test_new_audio(audio_file, model, scaler, feature_cols, confidence_threshold=0.6):
    """Run speaker verification on one audio file and report the outcome.

    Note: despite the ``test_`` prefix this is an application function, not
    a unit test.

    Args:
        audio_file: Path to the audio file to verify.
        model: Fitted classifier, passed to ``predict_speaker``.
        scaler: Fitted scaler, passed to ``predict_speaker``.
        feature_cols: Feature names, passed to ``extract_audio_features``.
        confidence_threshold: Minimum top-class probability for acceptance.

    Returns:
        Tuple ``(predicted_speaker, probabilities, status)``. On rejection
        the first two items are ``None`` and ``status`` explains why; on
        acceptance ``probabilities`` maps each class label to its probability.
    """
    # Step 1: features. None means the audio was unusable.
    feature_vector = extract_audio_features(audio_file, feature_cols)
    if feature_vector is None:
        return None, None, "Rejected: Invalid audio file or feature extraction failed."

    # Step 2: classify and take the winning probability.
    speaker, class_probs = predict_speaker(model, scaler, feature_vector)
    top_probability = np.max(class_probs)

    # Step 3: reject when the winning probability is below the threshold.
    if top_probability < confidence_threshold:
        rejection = f"Rejected: Prediction confidence ({top_probability:.3f}) below threshold ({confidence_threshold})."
        return None, None, rejection

    probabilities_by_speaker = dict(zip(model.classes_, class_probs))
    return speaker, probabilities_by_speaker, "Accepted: Command validated by duration."

In [174]:
def main():
    """Train, save and demonstrate the voiceprint model end to end.

    Loads the feature CSV, trains and saves the model and scaler, prints an
    example prediction for the first training row, then runs the full
    verification flow on a sample audio file and prints its outcome.
    """
    # Path to the CSV file (replace with actual path or use loadFileData in deployment)
    csv_file = 'audio_features(1)(1).csv'

    # One threshold drives both acceptance decisions below, so the example
    # status and the audio test cannot drift apart.
    confidence_threshold = 0.6

    # Load and preprocess data
    X, y, feature_cols = load_and_preprocess_data(csv_file)

    # Train the model and compute metrics
    model, scaler = train_voiceprint_model(X, y)

    # Save the model and scaler
    save_model_and_scaler(model, scaler)

    # Example prediction (using a sample from the dataset for demonstration)
    sample_features = X[0]  # First row as an example
    predicted_speaker, probs = predict_speaker(model, scaler, sample_features)
    max_prob = np.max(probs)
    print("\nExample Prediction:")
    print(f"Predicted Speaker: {predicted_speaker}")
    print(f"Probabilities: {dict(zip(model.classes_, probs))}")
    print(f"Status: {'Accepted' if max_prob >= confidence_threshold else 'Rejected: Low confidence'}")

    # Example testing a new audio file (replace with actual audio file path)
    test_audio_file = '/content/ian_test.wav'  # Replace with actual path
    print(f"\nTesting new audio file: {test_audio_file}")
    predicted_speaker, probabilities, status = test_new_audio(
        test_audio_file, model, scaler, feature_cols,
        confidence_threshold=confidence_threshold,
    )
    print(f"Status: {status}")
    if predicted_speaker is not None:
        print(f"Predicted Speaker: {predicted_speaker}")
        print(f"Probabilities: {probabilities}")

if __name__ == '__main__':
    main()


Cross-Validation Metrics (2-fold):
Accuracy       : 0.7500 (±0.2500)
Precision_macro: 0.6250 (±0.3750)
Recall_macro   : 0.7500 (±0.2500)
F1_macro       : 0.6667 (±0.3333)
Model trained on full dataset (no train-test split due to small size).
Model saved to voiceprint_model.pkl
Scaler saved to scaler.pkl

Example Prediction:
Predicted Speaker: lievin
Probabilities: {'eddy': np.float64(0.10201128376775633), 'lievin': np.float64(0.8979887162322437)}
Status: Accepted

Testing new audio file: /content/ian_test.wav
Status: Rejected: Prediction confidence (0.572) below threshold (0.6).
