In [1]:
pip install librosa numpy pandas scikit-learn joblib


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import librosa
import numpy as np

def load_audio_files(folder_path):
    audio_data = []
    labels = []
    
    for label in ['confident', 'Non-confident']:
        folder = os.path.join(folder_path, label)
        for file in os.listdir(folder):
            if file.endswith('.wav'):
                file_path = os.path.join(folder, file)
                y, sr = librosa.load(file_path, sr=None)
                
                # Check if the audio signal is empty
                if len(y) > 0:
                    audio_data.append(y)
                    labels.append(1 if label == 'confident' else 0)
                else:
                    print(f"Warning: {file_path} is empty and will be skipped.")
    
    return audio_data, np.array(labels)

# Example usage:
folder_path = '/kaggle/input/voice-data'  # Update this path
audio_data, labels = load_audio_files(folder_path)




In [3]:
def balance_dataset(audio_data, labels):
    confident_indices = [i for i, label in enumerate(labels) if label == 1]
    non_confident_indices = [i for i, label in enumerate(labels) if label == 0]

    # Determine the minimum count between the two classes
    min_count = min(len(confident_indices), len(non_confident_indices))

    # Select the samples
    balanced_indices = confident_indices[:min_count] + non_confident_indices[:min_count]
    balanced_labels = labels[balanced_indices]
    balanced_audio_data = [audio_data[i] for i in balanced_indices]
    
    return balanced_audio_data, balanced_labels

# Balance the dataset
balanced_audio_data, balanced_labels = balance_dataset(audio_data, labels)

# Check the lengths
print(f"Number of balanced audio samples: {len(balanced_audio_data)}")
print(f"Number of balanced labels: {len(balanced_labels)}")


Number of balanced audio samples: 946
Number of balanced labels: 946


In [4]:
import numpy as np
import librosa

def extract_features(audio_data, labels):
    features = []
    
    for y in audio_data:
        if len(y) == 0:  # Check for empty audio
            continue
        
        # Extracting MFCCs
        mfccs = librosa.feature.mfcc(y=y, sr=22050, n_mfcc=13)
        mfccs_mean = np.mean(mfccs.T, axis=0)  # Shape: (13,)
        
        # Extracting additional features
        pitches, magnitudes = librosa.piptrack(y=y, sr=22050)
        pitch_mean = np.mean(pitches) if pitches.size > 0 else 0  # Shape: (1,)
        
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=y))  # Shape: (1,)
        energy = np.mean(librosa.feature.rms(y=y))  # Shape: (1,)
        
        # New features
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=22050))  # Shape: (12,)
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=22050))  # Shape: (1,)
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=22050))  # Shape: (1,)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=22050))  # Shape: (7,)
        rmse = np.mean(librosa.feature.rms(y=y))  # Shape: (1,)
        tempo, _ = librosa.beat.beat_track(y=y, sr=22050)  # Shape: (1,)
        
        # Compile all features into a single array
        feature_vector = np.hstack([
            mfccs_mean, pitch_mean, zero_crossing_rate, energy,
            chroma_stft, spectral_centroid, spectral_bandwidth,
            spectral_contrast, rmse, tempo
        ])  # Adjusted shape to accommodate new features
        features.append(feature_vector)
    
    features_array = np.array(features)
    
    # Debugging: Print shapes and lengths
    print(f"Extracted features shape: {features_array.shape}")  # Should be (n_samples, total_features)
    print(f"Number of audio files processed: {len(audio_data)}")
    
    return features_array

# Extract features from the balanced dataset
features = extract_features(balanced_audio_data, balanced_labels)

# Check the lengths again
print(f"Number of extracted features: {len(features)}")
print(f"Number of valid labels: {len(balanced_labels)}")


Extracted features shape: (946, 22)
Number of audio files processed: 946
Number of extracted features: 946
Number of valid labels: 946


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, balanced_labels, test_size=0.2, random_state=42)


In [36]:
# Initialize the Random Forest model with default parameters
model = RandomForestClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7736842105263158
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.79       102
           1       0.75      0.77      0.76        88

    accuracy                           0.77       190
   macro avg       0.77      0.77      0.77       190
weighted avg       0.77      0.77      0.77       190



In [24]:
pip install numpy pandas scikit-learn xgboost lightgbm tensorflow


Note: you may need to restart the kernel to use updated packages.


In [25]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-Confident', 'Confident']))

# Display the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 77.37%

Classification Report:
               precision    recall  f1-score   support

Non-Confident       0.80      0.77      0.79       102
    Confident       0.75      0.77      0.76        88

     accuracy                           0.77       190
    macro avg       0.77      0.77      0.77       190
 weighted avg       0.77      0.77      0.77       190


Confusion Matrix:
[[79 23]
 [20 68]]


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, balanced_labels, test_size=0.2, random_state=42)

# Set up the parameter grid for hypertuning
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40, 50],    # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],            # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],              # Minimum samples required at a leaf node
    'max_features': ['auto', 'sqrt', 'log2']    # Number of features to consider when looking for the best split
}

# Initialize the Random Forest model
model = RandomForestClassifier(random_state=42)

# Set up the randomized search with cross-validation
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, 
                                   n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the Random Forest with the best hyperparameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Hyperparameters: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40}
Accuracy: 0.7894736842105263
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.80       102
           1       0.75      0.82      0.78        88

    accuracy                           0.79       190
   macro avg       0.79      0.79      0.79       190
weighted avg       0.79      0.79      0.79       190



In [30]:
import joblib
# Save the trained model
joblib.dump(model, '/kaggle/working/model.joblib')


['/kaggle/working/model.joblib']

In [39]:
def predict_confidence(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, sr=22050)
    
    # Extract features from the audio signal
    features = extract_features([y], None)  # Pass y as a list to match the expected input format
    
    # Reshape features to fit model input shape (1 sample, 16 features)
    features = features.reshape(1, -1)

    # Make a prediction (0 = Non-Confident, 1 = Confident)
    prediction = model.predict(features)

    # Map the prediction to the confidence level
    label = "Confident" if prediction[0] == 1 else "Non-Confident"
    return label

# Test the function with a new audio file
file_path = '/kaggle/input/voice-data/Non-confident/146-0.wav'
result = predict_confidence(file_path)
print("The model predicts this voice as:", result)


Extracted features shape: (1, 22)
Number of audio files processed: 1
The model predicts this voice as: Non-Confident
