In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

audio_dataset_path = 'UrbanSound8K/audio/'
metadata = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')

max_length = 174
def features_extractor(file, max_length=174):
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast') 
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    
    if spectrogram.shape[1] < max_length:
        pad_width = max_length - spectrogram.shape[1]
        spectrogram = np.pad(spectrogram, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        spectrogram = spectrogram[:, :max_length]
    
    return spectrogram

extracted_features = []
for index_num, row in tqdm(metadata.iterrows(), total=metadata.shape[0]):
    file_name = os.path.join(os.path.abspath(audio_dataset_path), 'fold'+str(row["fold"])+'/', str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name, max_length)
    extracted_features.append([data, final_class_labels])

extracted_features_df = pd.DataFrame(extracted_features, columns=['feature', 'class'])
X = np.array(extracted_features_df['feature'].tolist())
y = np.array(extracted_features_df['class'].tolist())

labelencoder = LabelEncoder()
y = to_categorical(labelencoder.fit_transform(y))

X = X[..., np.newaxis]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = Sequential()

model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.3))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification_cnn3.keras', 
                               verbose=1, save_best_only=True)
start = datetime.now()

num_epochs = 100
num_batch_size = 32

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy:", test_accuracy[1])


100%|██████████████████████████████████████████████████████████████████████████████| 8732/8732 [10:56<00:00, 13.30it/s]
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step - accuracy: 0.1373 - loss: 13.9421
Epoch 1: val_loss improved from inf to 2.12780, saving model to saved_models/audio_classification_cnn3.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 274ms/step - accuracy: 0.1374 - loss: 13.9005 - val_accuracy: 0.1809 - val_loss: 2.1278
Epoch 2/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step - accuracy: 0.1639 - loss: 2.1459
Epoch 2: val_loss did not improve from 2.12780
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 268ms/step - accuracy: 0.1639 - loss: 2.1458 - val_accuracy: 0.1431 - val_loss: 2.1758
Epoch 3/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step - accuracy: 0.1731 - loss: 2.1023
Epoch 3: val_loss improved from 2.12780 to 2.12080, saving model to saved_models/audio_classification_cnn3.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [3]:
from tensorflow.keras.models import load_model
import numpy as np
import librosa

class_names = ['air_conditioner', 'car_horn', 'chileren_playing', 'dog_bark', 'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren',
              'street_music', ]

def predict_audio_class(file_path, model_path='saved_models/audio_classification_cnn3.keras', max_length=174):
    model = load_model(model_path)
    
    def features_extractor(file, max_length=174):
        audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        
        if spectrogram.shape[1] < max_length:
            pad_width = max_length - spectrogram.shape[1]
            spectrogram = np.pad(spectrogram, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            spectrogram = spectrogram[:, :max_length]
        
        return spectrogram
    
    features = features_extractor(file_path, max_length)
    
    features = features[np.newaxis, ..., np.newaxis]
    
    predictions = model.predict(features)
    
    predicted_class_index = np.argmax(predictions, axis=1)[0]
    
    predicted_class_name = class_names[predicted_class_index]
    
    return predicted_class_name

file_path = 'UrbanSound8k/100263-2-0-3.wav'
predicted_class_name = predict_audio_class(file_path)
print(f'Predicted class name: {predicted_class_name}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
Predicted class name: chileren_playing
