In [1]:
# Required libraries

import os
# import pickle
# import sys
# import traceback

import IPython as IP

import librosa
import librosa.display

import matplotlib.pyplot as plt
%matplotlib inline

import math
import numpy as np
import pandas as pd

import soundfile as sf

# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import random
from datetime import datetime
from include import helpers

from keras import backend as keras_backend
from keras.models import Sequential, load_model
# from keras.layers import Dense, Dropout, Flatten, LeakyReLU, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.layers import Dense, LeakyReLU, SpatialDropout2D, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.optimizers import Adam
# from keras.utils import to_categorical, plot_model
from keras.utils import to_categorical
# import np_utils
from keras.callbacks import ModelCheckpoint 
from keras.regularizers import l2
import keras.backend as K
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

from include import helpers

# Set your path to the dataset
us8k_path = os.path.abspath("./UrbanSound8K")
audio_path = os.path.join(us8k_path, "audio")
augmented_path = os.path.join(audio_path, "augmented")
metadata_path = os.path.join(us8k_path, "metadata/UrbanSound8K.csv")
metadata_augmented_path = os.path.abspath("data_temp/augmented-data.csv")
models_path = os.path.abspath('./models')
data_path = os.path.abspath('./data_temp')

# Ensure "channel last" data format on Keras
keras_backend.set_image_data_format('channels_last')

# Define a labels array for future use
labels = [
        'Air Conditioner',
        'Car Horn',
        'Children Playing',
        'Dog bark',
        'Drilling',
        'Engine Idling',
        'Gun Shot',
        'Jackhammer',
        'Siren',
        'Street Music'
    ]

In [2]:
print("Loading CSV file {}".format(metadata_path))

# Load metadata as a Pandas dataframe
metadata = pd.read_csv(metadata_path)

# Examine dataframe's head
metadata.head()

Loading CSV file /Users/jaycrappe/Documents/GitHub/urban-audio-classifier/UrbanSound8K/metadata/UrbanSound8K.csv


Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [3]:
# Class distribution
metadata["class"].value_counts()

class
dog_bark            1000
children_playing    1000
air_conditioner     1000
street_music        1000
engine_idling       1000
jackhammer          1000
drilling            1000
siren                929
car_horn             429
gun_shot             374
Name: count, dtype: int64

In [4]:
# Read every file header to collect audio properties
audiodata = []
for index, row in metadata.iterrows():
    cat = str(row["class"])
    fold = "fold" + str(row["fold"])
    name = str(row["slice_file_name"])
    file_name = os.path.join(audio_path, fold, name)
    audio_props = helpers.read_header(file_name)
    duration = row["end"] - row["start"]
    audiodata.append((name, fold, cat, duration) + audio_props)

# Convert into a Pandas dataframe
audiodatadf = pd.DataFrame(
    audiodata,
    columns=[
        "file",
        "fold",
        "class",
        "duration",
        "channels",
        "sample_rate",
        "bit_depth",
    ],
)

In [5]:
row = audiodatadf[audiodatadf["class"] == "gun_shot"].sample(1)
helpers.play_dataset_sample(row, audio_path)

Class: gun_shot
File: /Users/jaycrappe/Documents/GitHub/urban-audio-classifier/UrbanSound8K/audio/fold6/148833-6-0-0.wav
Sample rate: 44100
Bit depth: 16
Duration 1.4775939999999999 seconds


#### 1. Time stretching (changing play time)


In [6]:
rates = [0.81, 1.07]
total = len(metadata) * len(rates)
count = 0
for rate in rates:
    # Generate new stretched audio file
    for index, row in metadata.iterrows():
        curr_fold = str(row["fold"])
        curr_file_path = audio_path + "/fold" + curr_fold + "/" + row["slice_file_name"]

        # Speed sub-dir inside current fold dir
        curr_rate_path = (
            augmented_path + "/fold" + curr_fold + "/speed_" + str(int(rate * 100))
        )

        # Create sub-dir if it does not exist
        if not os.path.exists(curr_rate_path):
            os.makedirs(curr_rate_path)

        output_path = curr_rate_path + "/" + row["slice_file_name"]

        # Skip when file already exists
        if os.path.isfile(output_path):
            count += 1
            continue

        y, sr = librosa.load(curr_file_path)
        y_changed = librosa.effects.time_stretch(y, rate=rate)
        # librosa.output.write_wav(output_path, y_changed, sr)
        sf.write(output_path, y_changed, sr)

        count += 1

        clear_output(wait=True)
        print("Progress: {}/{}".format(count, total))
        print("Last file: ", row["slice_file_name"])

#### 2. Pitch shifting


In [7]:
tone_steps = [-1, -2, 1, 2]
total = len(metadata) * len(tone_steps)
count = 0
for tone_step in tone_steps:
    # Generate new pitched audio
    for index, row in metadata.iterrows():
        curr_fold = str(row["fold"])
        curr_file_path = audio_path + "/fold" + curr_fold + "/" + row["slice_file_name"]

        # Pitch Shift sub-dir inside current fold dir
        curr_ps_path = augmented_path + "/fold" + curr_fold + "/pitch_" + str(tone_step)

        # Create sub-dir if it does not exist
        if not os.path.exists(curr_ps_path):
            os.makedirs(curr_ps_path)

        output_path = curr_ps_path + "/" + row["slice_file_name"]

        # Skip when file already exists
        if os.path.isfile(output_path):
            count += 1
            continue

        y, sr = librosa.load(curr_file_path)
        y_changed = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=tone_step)
        # librosa.output.write_wav(output_path, y_changed, sr)
        sf.write(output_path, y_changed, sr)

        count += 1

        clear_output(wait=True)
        print("Progress: {}/{}".format(count, total))
        print("Last file: ", row["slice_file_name"])

#### 3. Noise


In [8]:
import random


def add_noise(data):
    noise = np.random.rand(len(data))
    noise_amp = random.uniform(0.005, 0.008)
    data_noise = data + (noise_amp * noise)
    return data_noise


total = len(metadata)
count = 0

# Generate new noised audio
for index, row in metadata.iterrows():
    curr_fold = str(row["fold"])
    curr_file_path = audio_path + "/fold" + curr_fold + "/" + row["slice_file_name"]

    # Noised sub-dir inside current fold dir
    curr_noise_path = augmented_path + "/fold" + curr_fold + "/noise"

    # Create sub-dir if it does not exist
    if not os.path.exists(curr_noise_path):
        os.makedirs(curr_noise_path)

    output_path = curr_noise_path + "/" + row["slice_file_name"]

    # Skip when file already exists
    if os.path.isfile(output_path):
        count += 1
        continue

    y, sr = librosa.load(curr_file_path)
    y_changed = add_noise(y)
    sf.write(output_path, y_changed, sr)
    # librosa.output.write_wav(output_path, y_changed, sr)

    count += 1

    clear_output(wait=True)
    print("Progress: {}/{}".format(count, total))
    print("Last file: ", row["slice_file_name"])

#### 4. Create metadata for the new files


In [9]:
def get_files_recursive(path):
    # create a list of file and sub directories names in the given directory
    file_list = os.listdir(path)
    all_files = list()
    # Iterate over all the entries
    for entry in file_list:
        # Create full path
        full_path = os.path.join(path, entry)
        # If entry is a directory then get the list of files in this directory
        if os.path.isdir(full_path):
            all_files = all_files + get_files_recursive(full_path)
        else:
            all_files.append(full_path)

    return all_files

In [10]:
# Get every single file within the tree
files = get_files_recursive(augmented_path)

# Define metadata columns
names = []
classes = []
folds = []
augmentations = []

# Iterate and collect name, fold and class
for file in files:
    pieces = file.split("/")
    file = pieces[len(pieces) - 1]
    fold = pieces[len(pieces) - 3]
    augment = pieces[len(pieces) - 2]
    fold_num = fold[4 : len(fold)]
    class_id = file.split("-")[1]

    # Push records
    names.append(file)
    folds.append(fold_num)
    classes.append(class_id)
    augmentations.append(augment)

# Create a dataframe with the new augmented data
new_meta = pd.DataFrame(
    {"file": names, "fold": folds, "class_id": classes, "augment": augmentations}
)

# Make sure class_id is int
new_meta["class_id"] = new_meta["class_id"].astype(np.int64)

print(len(new_meta), "new entries")

61124 new entries


In [11]:
# Add class names to the new dataframe using merge
classes = pd.DataFrame(
    {
        "class_id": range(0, 10),
        "class": [
            "air_conditioner",
            "car_horn",
            "children_playing",
            "dog_bark",
            "drilling",
            "engine_idling",
            "gun_shot",
            "jackhammer",
            "siren",
            "street_music",
        ],
    }
)

new_meta = pd.merge(new_meta, classes, on="class_id")

#### 5. Integrate metadata in a single file


In [12]:
# Modify original data to fit the new structure
del metadata["fsID"], metadata["start"], metadata["end"], metadata["salience"]
metadata.columns = ["file", "fold", "class_id", "class"]
metadata["augment"] = "none"

# Concat the two dataframes
full_meta = pd.concat([metadata, new_meta])

# Verify lengths
if len(full_meta) == len(metadata) + len(new_meta):
    print("Dataframes merged correctly!")
else:
    print("Error! Lengths do not match.")

print("Initial data:", len(metadata))
print("New data:", len(new_meta))
print("Merged data:", len(full_meta))

Dataframes merged correctly!
Initial data: 8732
New data: 61124
Merged data: 69856


#### 6. Save the new dataset


In [13]:
# Save the new metadata
full_meta.to_csv(metadata_augmented_path, index=False, encoding="utf-8")

#### 1. MFCC extraction


In [14]:
# Load the metadata from the generated CSV
metadata = pd.read_csv(metadata_augmented_path)

# Examine dataframe
print("Metadata length:", len(metadata))
metadata.tail()

Metadata length: 69856


Unnamed: 0,file,fold,class_id,class,augment
69851,88121-8-0-0.wav,10,8,siren,pitch_-2
69852,189982-0-0-42.wav,10,0,air_conditioner,pitch_-2
69853,74364-8-1-7.wav,10,8,siren,pitch_-2
69854,99192-4-0-7.wav,10,4,drilling,pitch_-2
69855,101382-2-0-33.wav,10,2,children_playing,pitch_-2


In [15]:
# Iterate through all audio files and extract MFCC
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(metadata)
n_mfcc = 40

for index, row in metadata.iterrows():
    file_path = os.path.join(
        os.path.abspath(audio_path), "fold" + str(row["fold"]), str(row["file"])
    )
    class_label = row["class"]

    # Extract MFCCs (do not add padding)
    mfccs = helpers.get_mfcc(file_path, 0, n_mfcc)

    # Save current frame count
    num_frames = mfccs.shape[1]

    # Add row (feature / label)
    features.append(mfccs)
    labels.append(class_label)

    # Update frames maximum
    if num_frames > frames_max:
        frames_max = num_frames
    
    IP.display.clear_output(wait=True)
    print("Progress: {}/{}".format(index + 1, total_samples))
    print("Last file: ", file_path)

    counter += 1

print("Finished: {}/{}".format(index, total_samples))

Progress: 69856/69856
Last file:  /Users/jaycrappe/Documents/GitHub/urban-audio-classifier/UrbanSound8K/audio/fold10/101382-2-0-33.wav
Finished: 69855/69856


#### 2. Add padding


In [16]:
padded = []

# Add padding
mfcc_max_padding = frames_max
for i in range(len(features)):
    size = len(features[i][0])
    if size < mfcc_max_padding:
        pad_width = mfcc_max_padding - size
        px = np.pad(
            features[i],
            pad_width=((0, 0), (0, pad_width)),
            mode="constant",
            constant_values=(0,),
        )

    padded.append(px)

#### 3. Save X and y


In [17]:
# Convert features (X) and labels (y) to Numpy arrays

X = np.array(padded)
y = np.array(labels)

# Optionally save the features to disk
np.save("data_temp/X-mfcc-augmented", X)
np.save("data_temp/y-mfcc-augmented", y)

In [18]:
# Verify shapes
print("Raw features length: {}".format(len(features)))
print("Padded features length: {}".format(len(padded)))
print("Feature labels length: {}".format(len(features)))
print("X: {}, y: {}".format(X.shape, y.shape))

Raw features length: 69856
Padded features length: 69856
Feature labels length: 69856
X: (69856, 40, 174), y: (69856,)


#### 4. Log-Mel Spectrogram extraction


In [19]:
# Iterate through all audio files and extract Log-Mel Spectrograms
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(metadata)
n_mels = 40

for index, row in metadata.iterrows():
    file_path = os.path.join(
        os.path.abspath(audio_path), "fold" + str(row["fold"]), str(row["file"])
    )
    class_label = row["class"]

    # Extract Log-Mel Spectrograms (do not add padding)
    mels = helpers.get_mel_spectrogram(file_path, 0, n_mels=n_mels)

    # Save current frame count
    num_frames = mels.shape[1]

    # Add row (feature / label)
    features.append(mels)
    labels.append(class_label)

    # Update frames maximum
    if num_frames > frames_max:
        frames_max = num_frames

    IP.display.clear_output(wait=True)
    # clear_output(wait=True)
    print("Progress: {}/{}".format(index + 1, total_samples))
    print("Last file: ", file_path)

    counter += 1

print("Finished: {}/{}".format(index, total_samples))

Progress: 44781/69856
Last file:  /Users/jaycrappe/Documents/GitHub/urban-audio-classifier/UrbanSound8K/audio/fold6/38121-3-0-0.wav


KeyboardInterrupt: 

In [20]:
frames_max

174

#### 2. Add padding for a consistent shape


In [None]:
padded = []

# Add padding
mels_max_padding = frames_max
for i in range(len(features)):
    size = len(features[i][0])
    if size < mels_max_padding:
        pad_width = mels_max_padding - size
        px = np.pad(
            features[i],
            pad_width=((0, 0), (0, pad_width)),
            mode="constant",
            constant_values=(0,),
        )

    padded.append(px)

#### 3. Save X and y


In [None]:
# Convert features (X) and labels (y) to Numpy arrays

X = np.array(padded)
y = np.array(labels)

# Optionally save the features to disk
np.save("data_temp/X-mel_spec-augmented", X)
np.save("data_temp/y-mel_spec-augmented", y)

In [None]:
# Verify shapes
print("Raw features length: {}".format(len(features)))
print("Padded features length: {}".format(len(padded)))
print("Feature labels length: {}".format(len(features)))
print("X: {}, y: {}".format(X.shape, y.shape))

Raw features length: 69856
Padded features length: 69856
Feature labels length: 69856
X: (69856, 40, 174), y: (69856,)


# Training CNN model with augmented data


### List all Physical Devices available, and set Memory Growth to Maximum


In [None]:
import keras.backend as K
import tensorflow as tf

physical_devices = tf.config.experimental.list_physical_devices("GPU")
print(f"Found {len(physical_devices)} Physical Devices: {physical_devices}")
tf.config.experimental.set_memory_growth(physical_devices[0], True)

Found 1 Physical Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Load Data


In [23]:
# Pre-processed MEL SPEC coefficients
X = np.load("data_temp/X-mel_spec-augmented.npy")
y = np.load("data_temp/y-mel_spec-augmented.npy")

# Metadata
metadata = pd.read_csv(metadata_path)
aug_metadata = pd.read_csv(metadata_augmented_path)

### 1. Data preparation: features + metadata

#### 1.1 Train / Test split


In [24]:
indexes = []
total = len(aug_metadata)
indexes = list(range(0, total))

# Randomize indexes
random.shuffle(indexes)

# Divide the indexes into Train and Test
test_split_pct = 20
split_offset = math.floor(test_split_pct * total / 100)

# Split the metadata
test_split_idx = indexes[0:split_offset]
train_split_idx = indexes[split_offset:total]


# Split metadata
test_meta = aug_metadata.iloc[test_split_idx]
train_meta = aug_metadata.iloc[train_split_idx]

# Remove augmented data from test metadata split
test_meta = test_meta[test_meta["augment"] != "none"]
aug_on_test_idx = test_meta.index.tolist()

# Split the features with the same indexes
X_test = np.take(X, aug_on_test_idx, axis=0)
y_test = np.take(y, aug_on_test_idx, axis=0)
X_train = np.take(X, train_split_idx, axis=0)
y_train = np.take(y, train_split_idx, axis=0)


# Print status
print(
    "Test split: {} \t\t Train split: {}".format(
        len(test_meta) - len(aug_on_test_idx), len(train_meta)
    )
)
print("X test shape: {} \t X train shape: {}".format(X_test.shape, X_train.shape))
print("y test shape: {} \t\t y train shape: {}".format(y_test.shape, y_train.shape))

Test split: 0 		 Train split: 55885
X test shape: (12201, 40, 174) 	 X train shape: (55885, 40, 174)
y test shape: (12201,) 		 y train shape: (55885,)


#### 1.2 One hot encode labels


In [25]:
le = LabelEncoder()
y_test_encoded = to_categorical(le.fit_transform(y_test))
y_train_encoded = to_categorical(le.fit_transform(y_train))

#### 1.3 Reshape data


In [26]:
# How data should be organized
num_rows = 40
num_columns = 174
num_channels = 1

print(X_train.shape, X_test.shape)
# Reshape to fit the network input (channel last!)
X_train = X_train.reshape(X_train.shape[0], num_rows, num_columns, num_channels)
X_test = X_test.reshape(X_test.shape[0], num_rows, num_columns, num_channels)

# Total number of labels to predict (equal to the network output nodes)
num_labels = y_train_encoded.shape[1]

(55885, 40, 174) (12201, 40, 174)


### 2. Basic CNN model

We are using the exact same model than the previous notebooks

#### 2.1 Model definition


In [None]:
def create_model(spatial_dropout_rate_1=0, spatial_dropout_rate_2=0, l2_rate=0):

    # Create a secquential object
    model = Sequential()

    # Conv 1
    model.add(
        Conv2D(
            filters=32,
            kernel_size=(3, 3),
            kernel_regularizer=l2(l2_rate),
            input_shape=(num_rows, num_columns, num_channels),
        )
    )
    model.add(LeakyReLU(negative_slope=0.1))
    model.add(BatchNormalization())

    model.add(SpatialDropout2D(spatial_dropout_rate_1))
    model.add(Conv2D(filters=32, kernel_size=(3, 3), kernel_regularizer=l2(l2_rate)))
    model.add(LeakyReLU(negative_slope=0.1))
    model.add(BatchNormalization())

    # Max Pooling #1
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(SpatialDropout2D(spatial_dropout_rate_1))
    model.add(Conv2D(filters=64, kernel_size=(3, 3), kernel_regularizer=l2(l2_rate)))
    model.add(LeakyReLU(negative_slope=0.1))
    model.add(BatchNormalization())

    model.add(SpatialDropout2D(spatial_dropout_rate_2))
    model.add(Conv2D(filters=64, kernel_size=(3, 3), kernel_regularizer=l2(l2_rate)))
    model.add(LeakyReLU(negative_slope=0.1))
    model.add(BatchNormalization())

    # Reduces each h×w feature map to a single number by taking the average of all h,w values.
    model.add(GlobalAveragePooling2D())

    # Softmax output
    model.add(Dense(num_labels, activation="softmax"))

    return model


# Regularization rates
spatial_dropout_rate_1 = 0.07
spatial_dropout_rate_2 = 0.14
l2_rate = 0.001

model = create_model(spatial_dropout_rate_1, spatial_dropout_rate_2, l2_rate)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#### 2.2 Setup model optimizer and loss function


In [None]:
adam = Adam(learning_rate=1e-4, beta_1=0.99, beta_2=0.999)
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer=adam)

# Display model architecture summary
model.summary()

#### 2.3 Compile model


In [None]:
num_epochs = 53
num_batch_size = 128
# model_file = 'aug-train-nb3.hdf5'
# model_file = "aug-train-nb3.keras"
model_file = 'aug-train-nb3-{}.keras'.format(datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
print(model_file)
model_path = os.path.join(models_path, model_file)


# Save checkpoints
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)
start = datetime.now()
history = model.fit(
    X_train,
    y_train_encoded,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_split=1 / 12.0,
    callbacks=[checkpointer],
    verbose=1,
)

duration = datetime.now() - start
print("Training completed in time: ", duration)

aug-train-nb3-2024-05-01-15-26-31.keras
Epoch 1/53


AttributeError: module 'keras.src.backend' has no attribute 'convert_to_numpy'

#### 2.4 General model evaluation

In [None]:
model_file = 'aug-train-nb3-{}.keras'.format(datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
models_path = os.path.abspath("./models")
model_path = os.path.join(models_path, model_file)

model = load_model(model_path)

In [None]:
helpers.model_evaluation_report(model, X_train, y_train_encoded, X_test, y_test_encoded)

#### 2.5 Train vs Test history plot

In [None]:
helpers.plot_train_history(history)

### 3. Model evaluation

#### * Register model output

In [None]:
# Predict probabilities for test set
y_probs = model.predict(X_test, verbose=0)

# Get predicted labels
yhat_probs = np.argmax(y_probs, axis=1)
y_trues = np.argmax(y_test_encoded, axis=1)

# Add "pred" column
test_meta['pred'] = yhat_probs

#### 3.1 Confusion matrix

In [None]:
# Sets decimal precision (for printing output only)
np.set_printoptions(precision=2)

# Compute confusion matrix data
cm = confusion_matrix(y_trues, yhat_probs)

helpers.plot_confusion_matrix(cm,
                          labels, 
                          normalized=False, 
                          title="Model Performance", 
                          cmap=plt.cm.Blues,
                          size=(12,12))

#### 3.2 Accuracy table

In [None]:
# Find per-class accuracy from the confusion matrix data
accuracies = helpers.acc_per_class(cm)

pd.DataFrame({
    'CLASS': labels,
    'ACCURACY': accuracies
}).sort_values(by="ACCURACY", ascending=False)

In [None]:
# Build classification report
re = classification_report(y_trues, yhat_probs, labels=[0,1,2,3,4,5,6,7,8,9], target_names=labels)

print(re)