This code block imports necessary libraries and modules for audio feature extraction, CNN model creation, and data preprocessing. It includes:

os: for operating system-related functionalities.

pandas: for loading the metadata CSV file into a DataFrame and querying it.

librosa: for audio feature extraction. 

numpy: for numerical operations.

StratifiedKFold from sklearn.model_selection: for splitting data into train and test sets while maintaining class balance.

Sequential from tensorflow.keras.models, and Conv2D, MaxPooling2D, Flatten, Dense, Dropout from tensorflow.keras.layers: for building the CNN model architecture.

to_categorical from tensorflow.keras.utils: for converting class labels to categorical format.

plot_model from tensorflow.keras.utils: for visualizing the model architecture (optional; it is not imported in the code cell below).

This block sets up the initial environment and imports the tools needed by the subsequent code.

In [None]:
# Imports and Setup
import os
import pandas as pd
import librosa
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical


In [None]:
# --- Dataset locations ------------------------------------------------------
# Directory whose sub-folders contain the audio clips
base_path = "\\UrbanSound8K\\audio" #Insert the base path here

# CSV file with the per-clip metadata (file names and class labels)
metadata_file = "\\UrbanSound8K.csv" #Insert the metadata file path that contains the output answers here

# --- Evaluation / model settings --------------------------------------------
# Cross-validation fold count (fixed, do not change)
n_folds = 10

# Shape of a single CNN input sample: height x width x channels (one channel)
input_shape = (256, 256, 1)

# Load the metadata table and count the distinct class names it contains
metadata = pd.read_csv(metadata_file)
num_classes = metadata['class'].nunique()

# --- Training hyper-parameters ----------------------------------------------
# Mini-batch size and number of training epochs
batch_size, epochs = 64, 50

In [None]:
# Collect the path of every WAV file found one level below base_path,
# together with the name of the sub-folder that contains it.
audio_paths = []
labels = []

# Both directory listings are sorted: os.listdir() returns entries in
# arbitrary order, and the order of audio_paths determines the sample order
# used by the later cells, so sorting keeps runs reproducible across machines.
for folder in sorted(os.listdir(base_path)):
    folder_path = os.path.join(base_path, folder)

    # Ignore plain files that sit directly inside base_path
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        # Only WAV audio files are of interest
        if file.endswith('.wav'):
            audio_paths.append(os.path.join(folder_path, file))

            # NOTE: this records the containing folder's name; the class
            # label used for training is read from the metadata CSV instead.
            labels.append(folder)

# Convert the lists to NumPy arrays so they can be indexed with index arrays
audio_paths = np.array(audio_paths)
labels = np.array(labels)


In [None]:
def extract_features(file_path, metadata, n_mels=256, hop_length=1024, n_fft=4096):
    """
    Extracts a fixed-size, dB-scaled Mel spectrogram and the class ID of an audio file.

    Parameters:
    - file_path: Path to the audio file.
    - metadata: DataFrame with 'slice_file_name' and 'classID' columns.
    - n_mels: Number of Mel frequency bins. Must equal input_shape[0],
      otherwise the final reshape raises a ValueError.
    - hop_length: Number of samples between successive frames.
    - n_fft: Number of samples used for each Fourier transform.

    Returns:
    - mel_spec_db: Mel spectrogram of shape (input_shape[0], input_shape[1], 1),
      or None if the file has no row in the metadata.
    - label: 'classID' of the first metadata row matching the file name,
      or None if the file has no row in the metadata.
    """
    # Metadata rows are keyed by the bare file name, not the full path
    file_name = os.path.basename(file_path)
    row = metadata[metadata['slice_file_name'] == file_name]

    # File not listed in the metadata: signal the caller to skip it
    if row.empty:
        return None, None

    label = row['classID'].values[0]

    # Load audio file using librosa
    audio, sr = librosa.load(file_path, res_type='kaiser_fast')

    # Compute the Mel spectrogram and convert it to decibels relative to its peak
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Pad or truncate the time axis (axis 1) to the model's input width.
    # With ref=np.max the peak is 0 dB and everything else is negative, so the
    # default zero padding would look like maximum-energy frames; pad with the
    # quietest value of the spectrogram instead so padding reads as silence.
    mel_spec_db = librosa.util.fix_length(
        mel_spec_db,
        size=input_shape[1],
        axis=1,
        constant_values=mel_spec_db.min(),
    )

    # Add the trailing channel dimension expected by the CNN
    mel_spec_db = mel_spec_db.reshape((*input_shape[:2], 1))

    return mel_spec_db, label


In [None]:
# Model Creation
def create_cnn_model(num_classes):
    """
    Builds and compiles a small CNN classifier.

    The network stacks three Conv2D(3x3, ReLU) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, followed by Flatten, Dense(128, ReLU), Dropout(0.5)
    and a softmax output layer.

    Parameters:
    - num_classes: Number of units in the softmax output layer.

    Returns:
    - model: Sequential model compiled with the Adam optimizer, categorical
      cross-entropy loss and the accuracy metric.
    """
    model = Sequential()

    # Convolutional feature extractor: filter count doubles at every stage
    for n_filters in (32, 64, 128):
        model.add(Conv2D(n_filters, (3, 3), activation='relu'))
        model.add(MaxPooling2D((2, 2)))

    # Classification head
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [None]:
# Run feature extraction lazily over every audio path, in order
extracted = (extract_features(path, metadata) for path in audio_paths)

# Keep only the files for which extraction produced features (not None)
kept = [(feats, lbl) for feats, lbl in extracted if feats is not None]

# Split the (features, label) pairs into two aligned NumPy arrays
X_features = np.array([feats for feats, _ in kept])
y_labels = np.array([lbl for _, lbl in kept])


In [None]:
from sklearn.model_selection import LeaveOneGroupOut

# Cross-validate on the dataset's predefined folds (the 'fold' column of the
# metadata) instead of re-splitting the pooled clips: slices cut from the same
# original recording are kept inside one predefined fold, so re-splitting
# would put related clips in both train and test and inflate the accuracy.
fold_by_file = dict(zip(metadata['slice_file_name'], metadata['fold']))

# X_features only holds clips that have a metadata row, in audio_paths order,
# so applying the same filter here yields fold ids aligned with X_features.
fold_ids = np.array([
    fold_by_file[name]
    for name in map(os.path.basename, audio_paths)
    if name in fold_by_file
])
if len(fold_ids) != len(X_features):
    raise ValueError(
        f"Fold ids ({len(fold_ids)}) are not aligned with the extracted "
        f"features ({len(X_features)}); re-run the feature extraction cell."
    )

# The number of splits is given by the data; warn if it disagrees with n_folds
if len(np.unique(fold_ids)) != n_folds:
    print(f"Warning: found {len(np.unique(fold_ids))} folds in the data, expected {n_folds}")

splitter = LeaveOneGroupOut()
accuracy_scores = []

# Each iteration holds out one predefined fold for testing
for train_index, test_index in splitter.split(X_features, y_labels, groups=fold_ids):
    held_out_fold = fold_ids[test_index][0]

    # Split features and labels into training and testing sets
    X_train_features, X_test_features = X_features[train_index], X_features[test_index]
    y_train, y_test = y_labels[train_index], y_labels[test_index]

    # One-hot encode the integer class ids for the categorical cross-entropy loss
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    y_test_cat = to_categorical(y_test, num_classes=num_classes)

    # A fresh model per fold so no weights leak between folds
    model = create_cnn_model(num_classes)

    # Train the CNN model on the training data
    model.fit(X_train_features, y_train_cat, batch_size=batch_size, epochs=epochs, verbose=1)

    # Evaluate the model on the held-out fold and collect accuracy
    _, test_accuracy = model.evaluate(X_test_features, y_test_cat, verbose=0)
    accuracy_scores.append(test_accuracy)

    # Print the test accuracy for the current fold
    print(f"Fold {held_out_fold}: Test Accuracy = {test_accuracy}")

# Calculate and print average accuracy and standard deviation across folds
mean_accuracy = np.mean(accuracy_scores)
std_dev = np.std(accuracy_scores)
print(f"Average Accuracy: {mean_accuracy:.4f} ± {std_dev:.4f}")