In [1]:
# Loading the libraries
import librosa
import librosa.display
import numpy as np
import os
import random
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from collections import defaultdict
from sklearn.metrics import confusion_matrix, classification_report

# Seed every random number generator this notebook touches
# (Python's `random`, NumPy's legacy global generator, TensorFlow).
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(42)

# Determinism-related environment flags.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only reaches child processes — confirm this is intended.
os.environ.update({'PYTHONHASHSEED': '42', 'TF_DETERMINISTIC_OPS': '1'})

In [None]:
# Root folder of the dataset; its entries are listed alphabetically below
data = "data/words"
labels = os.listdir(data)
labels.sort()
print(labels)

In [None]:
# Defining a function for loading .wav files and preprocessing them
def preprocess_audio(filename, duration=1.0, n_mels=40):
    """Load an audio file and turn it into a fixed-length log-mel spectrogram.

    The waveform is loaded at its native sampling rate and then zero-padded
    at the end, or cropped, to exactly ``duration`` seconds. Without the crop
    a longer clip would produce a spectrogram with more frames than the
    others, which cannot be stacked into one array for the model.

    Parameters
    ----------
    filename : str
        Path of the audio file to load.
    duration : float, optional
        Target clip length in seconds. Defaults to 1.0.
    n_mels : int, optional
        Number of mel bands. Defaults to 40.

    Returns
    -------
    mel_db : np.ndarray
        Mel spectrogram in dB, referenced to its own maximum (0 dB = loudest
        point).
    y : np.ndarray
        The padded/cropped waveform the spectrogram was computed from.
    sr : number
        Sampling rate reported by ``librosa.load`` for this file.
    """
    y, sr = librosa.load(filename, sr=None)  # sr=None keeps the file's own rate
    target_len = int(round(sr * duration))
    if len(y) < target_len:
        # Short clip: append silence up to the target length
        y = np.pad(y, (0, target_len - len(y)), mode='constant')
    else:
        # Long clip: keep only the first `duration` seconds
        y = y[:target_len]

    # librosa defaults apply here: n_fft=2048, hop_length=512, hann window
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)  # log scale, 0dB=loudest point
    return mel_db, y, sr

In [None]:
# Upper bound on the number of recordings loaded per word
MAX_FILES_PER_WORD = 100

# One sub-folder per word; its position in this sorted list is the class index
words = sorted(os.listdir(data))
print(words)

mel_spectrograms = []            # one log-mel spectrogram per loaded file
indices = []                     # class index of each spectrogram
filepaths = []                   # source path of each spectrogram
speaker_ids = defaultdict(set)   # word -> set of speaker prefixes seen

for index, word in enumerate(words):
    folder = os.path.join(data, word)
    save_folder = os.path.join("melspectrograms", word)
    os.makedirs(save_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order, so sort before taking the
    # first MAX_FILES_PER_WORD; otherwise the selected subset (and therefore
    # the train/val/test split) can differ between machines and runs.
    selected_files = sorted(os.listdir(folder))[:MAX_FILES_PER_WORD]

    for count, file in enumerate(selected_files, start=1):
        file_path = os.path.join(folder, file)
        # The waveform is called `audio` so it never shadows the label array `y`
        mel_spec, audio, sr = preprocess_audio(file_path)
        mel_spectrograms.append(mel_spec)
        indices.append(index)
        filepaths.append(file_path)

        # presumably file names start with a speaker ID followed by "_" — verify
        speaker = file.split("_")[0]
        speaker_ids[word].add(speaker)

        # Report any clip whose processed waveform is not exactly one second
        if len(audio)/sr != 1:
            print(f"{word, count} duration {len(audio)/sr}")

        # Keep a copy of the spectrogram on disk
        save_path = os.path.join(save_folder,  f"{word}_{count}.npy")
        np.save(save_path, mel_spec)

print("\nUnique speaker counts per word:")
for word in words:
    num_speakers = len(speaker_ids[word])
    print(f"{word}: {num_speakers}")

In [None]:
# Stack the per-file spectrograms and their class indices into arrays
x = np.array(mel_spectrograms)
y = np.array(indices)
for stacked in (x, y):
    print(stacked.shape)

In [None]:
# Data splitting
# Re-seed so this cell gives the same result when it is re-run on its own
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(42)

# Step 1: hold out 30 % of the samples, keeping class proportions
x_train, x_temp, y_train, y_temp, filepath_train, filepath_temp = train_test_split(
    x,
    y,
    filepaths,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Step 2: cut the held-out part in half -> validation and test
x_val, x_test, y_val, y_test, filepath_val, filepath_test = train_test_split(
    x_temp,
    y_temp,
    filepath_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

for split_label, split_x in (("Train:", x_train), ("Val:  ", x_val), ("Test: ", x_test)):
    print(split_label, split_x.shape)

# Append a trailing channel axis, as expected by the Conv2D layers
x_train = np.expand_dims(x_train, axis=-1)
x_val = np.expand_dims(x_val, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

In [7]:
# function to visualize mel spectrograms
def visualize_melspectrograms(mels, titles=None, sr=16000):
    """Draw mel spectrograms side by side, one panel per spectrogram.

    Parameters
    ----------
    mels : sequence of 2-D arrays
        Spectrograms (in dB) to draw; one subplot is created for each.
    titles : sequence of str, optional
        Panel titles. Panels beyond the end of ``titles`` are left untitled.
    sr : number, optional
        Sampling rate passed to ``librosa.display.specshow`` to scale the
        axes. Defaults to 16000.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn, so the caller can save it.
    """
    mels = list(mels)
    # Always create at least one axes so an empty input still yields a figure
    n_panels = max(len(mels), 1)
    # 5 inches per panel reproduces the former 15x4 layout for three panels
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
    axes = axes[0]  # squeeze=False gives a 2-D array; take the single row

    for i, (mel, ax) in enumerate(zip(mels, axes)):
        img = librosa.display.specshow(
            mel,
            sr=sr,
            x_axis='time',
            y_axis='mel',
            cmap="magma",
            ax=ax
        )
        if titles is not None and i < len(titles):
            ax.set_title(titles[i], fontsize=12)
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Mel Frequency")
        fig.colorbar(img, ax=ax, format="%+2.0f dB")

    plt.tight_layout()
    plt.show()
    return fig


In [None]:
# loading and visualizing mel spectrogram examples

example_word = 'cat'
# File paths were built as data/<word>/<file>, so the parent folder name is
# the label. Comparing it exactly avoids the false matches of a substring
# test on the whole path (any path merely containing "cat" would match).
examples_filepaths = [
    path for path in filepath_test
    if os.path.basename(os.path.dirname(path)) == example_word
][:3]
# The first example is reused later to illustrate the noisy variants
examples_filepath_noise = examples_filepaths[:1]

# Only the spectrogram is kept; the waveform is deliberately not unpacked
# into `y`, which holds the label array used by the data-splitting cell.
mel_vis_examples = [preprocess_audio(path)[0] for path in examples_filepaths]

mel_spec_cat_clean = visualize_melspectrograms(mel_vis_examples)
# mel_spec_cat_clean.savefig('mel_spec_cat_clean.png')


In [None]:
# Model building

num_classes = len(words)

# Three Conv -> BatchNorm -> MaxPool stages with 32, 64 and 128 filters.
# Only the first convolution carries the input shape.
feature_extractor = []
for stage, n_filters in enumerate((32, 64, 128)):
    first_layer_kwargs = {'input_shape': x_train.shape[1:]} if stage == 0 else {}
    feature_extractor += [
        layers.Conv2D(n_filters, (3,3), activation='relu', **first_layer_kwargs),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2,2)),
    ]

# Dense classifier on top of the flattened feature maps
classifier_head = [
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax'),
]

model = models.Sequential(feature_extractor + classifier_head)

# Labels are integer class indices, hence the sparse loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# model.summary()


In [None]:
# Stop once val_loss has gone 3 epochs without improving, then roll the model
# back to the weights of its best epoch.
stopping_options = dict(monitor='val_loss', patience=3, restore_best_weights=True)
early_stop = EarlyStopping(**stopping_options)

history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=30,
    callbacks=[early_stop],
)


In [None]:
# Learning curves: loss on the left, accuracy (in percent) on the right
metrics = history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16,6))

loss_ax.plot(history.epoch, metrics['loss'], metrics['val_loss'])
loss_ax.legend(['loss', 'val_loss'])
loss_ax.set_ylim([0, max(loss_ax.get_ylim())])
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')

train_acc_pct = 100*np.array(metrics['accuracy'])
val_acc_pct = 100*np.array(metrics['val_accuracy'])
acc_ax.plot(history.epoch, train_acc_pct, val_acc_pct)
acc_ax.legend(['accuracy', 'val_accuracy'])
acc_ax.set_ylim([0, 100])
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy, %')

fig.savefig('model_metrics.png')
plt.show()

In [None]:
# Model evaluation on the untouched (clean) test split

clean_scores = model.evaluate(x_test, y_test)
test_loss, test_acc = clean_scores
print("Test accuracy:", test_acc)


In [None]:
# WHITE NOISE DIFFERENT SNR DB

# SNR in dB -> list of noisy log-mel spectrograms, in filepath_test order
x_test_wn = {5:[], 20:[], 40:[]}

# Noisy spectrograms of the example clip, one per SNR, for plotting
wn_examples_mels = []

for file in filepath_test:
    # Reuse the training-time loader so the waveform gets exactly the same
    # length handling as the clean data; its clean spectrogram is not needed.
    # The waveform is named `audio`, not `y`, so the label array `y` used by
    # the data-splitting cell is not overwritten.
    audio, sr = preprocess_audio(file)[1:]

    signal_power = np.mean(audio**2)

    for snr_db in x_test_wn:
        # SNR_dB = 10*log10(P_signal / P_noise)  =>  P_noise = P_signal / 10^(SNR_dB/10)
        noise_power = signal_power/(10**(snr_db/10))
        white_noise = np.random.normal(0, np.sqrt(noise_power), len(audio))
        audio_wn = audio + white_noise

        mel_spec_wn = librosa.feature.melspectrogram(y=audio_wn, sr=sr, n_mels=40)
        mel_db_wn = librosa.power_to_db(mel_spec_wn, ref=np.max)
        x_test_wn[snr_db].append(mel_db_wn)

        if file in examples_filepath_noise:
            wn_examples_mels.append(mel_db_wn)

# Titles are derived from the dict keys so they cannot drift from the SNR list
wn_fig = visualize_melspectrograms(wn_examples_mels, [f"SNR = {snr_db}dB" for snr_db in x_test_wn])
wn_fig.savefig('wn_mel_spec.png')

# Stack each SNR level and add the trailing channel axis expected by the model
x_test_wn_5 = np.array(x_test_wn[5])[..., np.newaxis]
x_test_wn_20 = np.array(x_test_wn[20])[..., np.newaxis]
x_test_wn_40 = np.array(x_test_wn[40])[..., np.newaxis]

In [None]:
# Testing with white noise

# Evaluate from the noisiest (5 dB) to the cleanest (40 dB) condition
for wn_snr, wn_inputs in ((5, x_test_wn_5), (20, x_test_wn_20), (40, x_test_wn_40)):
    test_loss, test_acc = model.evaluate(wn_inputs, y_test)
    print(f"Test accuracy {wn_snr}db:", test_acc)

In [None]:
# REAL WORLD NOISE, DIFFERENT SNR

# Loading noise
# The noise is added to the speech sample by sample, so both signals must
# share one sampling rate. Loading without `sr` would resample the noise to
# librosa's default of 22050 Hz, and mixing that into 16 kHz speech plays the
# noise at the wrong speed. Load it at the speech rate instead.
NOISE_SR = 16000
noise_y, noise_sr = librosa.load("data/noise/doing_the_dishes.wav", sr=NOISE_SR)

# The start index 778876 was originally applied to the 22050 Hz array; convert
# it so the excerpt begins at the same point in time at NOISE_SR.
noise_start = int(round(778876 * NOISE_SR / 22050))
noise_y = noise_y[noise_start:noise_start + NOISE_SR]  # one second of noise
if len(noise_y) != NOISE_SR:
    raise ValueError("Noise recording is too short to provide a 1 s excerpt at the chosen offset")
noise_power = np.mean(noise_y ** 2)

# Adding background noise to the files
# SNR in dB -> list of noisy log-mel spectrograms, in filepath_test order
x_test_backn = {5:[], 20:[], 40:[]}
backn_examples_mels = []

for file in filepath_test:
    # Reuse the training-time loader so the waveform gets exactly the same
    # length handling as the clean data; its clean spectrogram is not needed.
    # The waveform is named `audio`, not `y`, so the label array `y` used by
    # the data-splitting cell is not overwritten.
    audio, sr = preprocess_audio(file)[1:]
    if sr != NOISE_SR or len(audio) != len(noise_y):
        raise ValueError(
            f"{file}: expected {len(noise_y)} samples at {NOISE_SR} Hz, "
            f"got {len(audio)} samples at {sr} Hz"
        )
    signal_power = np.mean(audio**2)

    for snr_db in x_test_backn:
        # Scale the noise so that P_signal / P_noise matches the requested SNR
        target_noise_power = signal_power/(10**(snr_db/10))
        noise = noise_y * np.sqrt(target_noise_power / noise_power)
        audio_backn = audio + noise

        mel_spec_backn = librosa.feature.melspectrogram(y=audio_backn, sr=sr, n_mels=40)
        mel_db_backn = librosa.power_to_db(mel_spec_backn, ref=np.max)
        x_test_backn[snr_db].append(mel_db_backn)

        if file in examples_filepath_noise:
            backn_examples_mels.append(mel_db_backn)

# Titles are derived from the dict keys so they cannot drift from the SNR list
backn_fig = visualize_melspectrograms(backn_examples_mels, [f"SNR = {snr_db}dB" for snr_db in x_test_backn])
backn_fig.savefig('backn_mel_spec.png')

# Stack each SNR level and add the trailing channel axis expected by the model
x_test_backn_5 = np.array(x_test_backn[5])[..., np.newaxis]
x_test_backn_20 = np.array(x_test_backn[20])[..., np.newaxis]
x_test_backn_40 = np.array(x_test_backn[40])[..., np.newaxis]

In [None]:
# Testing with background noise different snr

# Evaluate from the noisiest (5 dB) to the cleanest (40 dB) condition
for backn_snr, backn_inputs in ((5, x_test_backn_5), (20, x_test_backn_20), (40, x_test_backn_40)):
    test_loss, test_acc = model.evaluate(backn_inputs, y_test)
    print(f"Test accuracy {backn_snr}db:", test_acc)

In [None]:
# evaluating model performance on different words

def evaluate_words(x, y_true, words):
    """Report and plot how well the notebook-level ``model`` recognises each word.

    Prints the overall accuracy and a scikit-learn classification report,
    then shows a bar chart of per-word accuracy and a row-normalised
    confusion matrix.

    Parameters
    ----------
    x : array
        Model inputs to predict on.
    y_true : array of int
        True class index for every sample in ``x``.
    words : sequence of str
        Class names; position ``i`` names class index ``i``.

    Returns
    -------
    (barchart, cm_fig)
        The two matplotlib figures, so the caller can save them.
    """
    class_ids = list(range(len(words)))

    # Predictions: most probable class per sample
    y_pred = np.argmax(model.predict(x, verbose=0), axis=1)

    # Confusion matrix (rows = true class, columns = predicted class)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    row_totals = cm.sum(axis=1)
    # Per-word accuracy = correct / total for that word; 0 for words with no samples
    per_word_acc = np.divide(np.diag(cm), row_totals, out=np.zeros(len(words)), where=row_totals != 0)

    # Overall accuracy is correct predictions over all samples. The mean of
    # the per-word values is a different quantity whenever class sizes differ.
    n_samples = cm.sum()
    overall_acc = np.trace(cm) / n_samples if n_samples else 0.0
    print(f"Overall accuracy: {overall_acc:.3f}")
    # `labels` keeps the report aligned with `words` even if a class is absent
    # from both y_true and y_pred (otherwise target_names would not match).
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=words, digits=3))

    # Bar plot of per-word accuracy, worst word first
    order = np.argsort(per_word_acc)
    barchart = plt.figure(figsize=(10, 5))
    plt.barh(np.array(words)[order], per_word_acc[order]*100, color='steelblue')
    plt.xlabel("Accuracy, %")
    plt.tight_layout()
    plt.show()

    #  Normalized confusion matrix: each row divided by its total; empty rows
    #  stay at zero without triggering a divide-by-zero warning
    cm_norm = np.divide(
        cm,
        row_totals[:, np.newaxis],
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals[:, np.newaxis] != 0,
    )
    cm_fig = plt.figure(figsize=(8, 7))
    plt.imshow(cm_norm, cmap='Blues', aspect='auto')
    plt.title("Normalized Confusion Matrix")
    plt.colorbar()
    plt.xticks(range(len(words)), words, rotation=90)
    plt.yticks(range(len(words)), words)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.show()
    return barchart, cm_fig

# Per-word evaluation on the clean test split; both figures are written to disk
barchart, cm_fig = evaluate_words(x_test, y_test, words)
for figure, out_name in ((barchart, 'barchart.png'), (cm_fig, 'cm_fig.png')):
    figure.savefig(out_name)

