## Importing Libraries

In [1]:
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from datasets import load_dataset

## Loading data

In [2]:
# Load the Speech Commands dataset
dataset = load_dataset("speech_commands", 'v0.02')

speech_commands.py:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

The repository for speech_commands contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/speech_commands.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/229M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/112M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84848 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9982 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4890 [00:00<?, ? examples/s]

In [3]:
# Get the label-word mapping
label_to_word = dataset['train'].features['label'].names
wake_word = "marvin"
wake_word_label = label_to_word.index(wake_word)
non_wake_words = ["yes", "no", "up", "down", "left", "right", "go", "stop"]
non_wake_word_labels = [label_to_word.index(word) for word in non_wake_words]

## MFCC extraction

In [4]:
# MFCC - Mel Frequency Cepstral Coefficients
# extracting MFCC features
def extract_mfcc(audio_array, sr=16000, n_mfcc=13):
    mfcc = librosa.feature.mfcc(y=audio_array, sr=sr, n_mfcc=n_mfcc)
    return mfcc.T  # Transpose to get shape (time steps, features)

# filtering dataset by numeric labels
def filter_dataset_by_label(dataset, wake_word_label, non_wake_word_labels):
    wake_word_data = []
    non_wake_word_data = []
    
    for example in dataset['train']:
        if example['label'] == wake_word_label:
            wake_word_data.append(example)
        elif example['label'] in non_wake_word_labels:
            non_wake_word_data.append(example)
    
    return wake_word_data, non_wake_word_data

wake_word_data, non_wake_word_data = filter_dataset_by_label(dataset, wake_word_label, non_wake_word_labels)

## Noise augmentation

In [5]:
x,y = [],[]

# adding noise to audio
def add_noise(audio, noise_factor=0.005):
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    augmented_audio = augmented_audio.astype(type(audio[0]))
    return augmented_audio

# Adding noise to both data
for example in wake_word_data:
    audio_array = example['audio']['array']
    sr = example['audio']['sampling_rate']
    mfcc_features = extract_mfcc(audio_array, sr)
    if np.random.rand() < 0.5:  # 50-50 chance of adding noise
        audio_array = add_noise(audio_array)
    x.append(mfcc_features)
    y.append(1)  # Wake word - 1

for example in non_wake_word_data:
    audio_array = example['audio']['array']
    sr = example['audio']['sampling_rate']
    mfcc_features = extract_mfcc(audio_array, sr)
    if np.random.rand() < 0.5:  # 50-50 chance of adding noise
        audio_array = add_noise(audio_array)
    x.append(mfcc_features)
    y.append(0)  # Non-wake word -0

# Padding
x_padded = pad_sequences([mfcc for mfcc in x], padding='post', dtype='float32')
y = np.array(y)
x_padded = np.expand_dims(x_padded, axis=-1)

## Training

In [6]:
from tensorflow.keras.layers import Input

# LSTM model
model = Sequential()
model.add(Input(shape=(x_padded.shape[1], x_padded.shape[2])))  
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# class weights - imbalanced data
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = {i: class_weights[i] for i in range(len(class_weights))}

# training
model.fit(x_padded, y, validation_split=0.1, epochs=10, batch_size=128, class_weight=class_weights)
model.save('wake_word_model_v2.h5')

Epoch 1/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.7513 - loss: 0.4925 - val_accuracy: 0.9501 - val_loss: 0.1100
Epoch 2/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9107 - loss: 0.2105 - val_accuracy: 0.9852 - val_loss: 0.0381
Epoch 3/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9353 - loss: 0.1679 - val_accuracy: 0.9671 - val_loss: 0.0689
Epoch 4/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9478 - loss: 0.1314 - val_accuracy: 0.9667 - val_loss: 0.0704
Epoch 5/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9502 - loss: 0.1144 - val_accuracy: 0.9743 - val_loss: 0.0531
Epoch 6/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9455 - loss: 0.1220 - val_accuracy: 0.9747 - val_loss: 0.0503
Epoch 7/10
[1m186/186

# Prediction and evaluation metrics ( before pruning )

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import time

x_train, x_test, y_train, y_test = train_test_split(x_padded, y, test_size=0.2, random_state=42, stratify=y)

start_time = time.time()
y_pred = (model.predict(x_test) > 0.6).astype(int)          # Threshold = 0.6
end_time = time.time()
y_true = y_test  

TN,FP,FN,TP = confusion_matrix(y_true, y_pred).ravel()   # Confusion Matrix

accuracy = (TP + TN) / (TP + TN + FP + FN)                  # Accuracy
far = FP / (FP + TN)                                        # False Acceptance Rate
frr = FN / (TP + FN)                                        # False Rejection Rate
detection_latency = end_time - start_time                   # Detection Latency
print(f'Accuracy: {accuracy:.2f}')
print(f'False Acceptance Rate (FAR): {far:.2f}')
print(f'False Rejection Rate (FRR): {frr:.2f}')
print(f"Detection Latency (avg): {detection_latency:.4f} seconds")

[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy: 0.99
False Acceptance Rate (FAR): 0.01
False Rejection Rate (FRR): 0.02
Detection Latency (avg): 1.0689 seconds


## Pruning

In [8]:
# Pruning
def apply_pruning(model, pruning_percentage=0.4):
    for layer in model.layers:
        if isinstance(layer, LSTM) or isinstance(layer, Dense):
            weights = layer.get_weights()
            pruned_weights = []
            for w in weights:
                # Get threshold to prune small weights
                threshold = np.percentile(np.abs(w), pruning_percentage * 100)
                w[np.abs(w) < threshold] = 0  # Set weights below threshold to zero
                pruned_weights.append(w)
            layer.set_weights(pruned_weights)
    return model

# Apply pruning
pruned_model = apply_pruning(model, pruning_percentage=0.4)
# Save the pruned model
pruned_model.save('pruned_wake_word_model.h5')

# Prediction and evaluation metrics ( after pruning )

In [9]:
start_time = time.time()
y_pred = (model.predict(x_test) > 0.6).astype(int)          # Threshold = 0.6
end_time = time.time()
y_true = y_test  

TN,FP,FN,TP = confusion_matrix(y_true, y_pred).ravel()   # Confusion Matrix

accuracy = (TP + TN) / (TP + TN + FP + FN)                  # Accuracy
far = FP / (FP + TN)                                        # False Acceptance Rate
frr = FN / (TP + FN)                                        # False Rejection Rate
detection_latency = end_time - start_time                   # Detection Latency
print(f'Accuracy: {accuracy:.2f}')
print(f'False Acceptance Rate (FAR): {far:.2f}')
print(f'False Rejection Rate (FRR): {frr:.2f}')
print(f"Detection Latency (avg): {detection_latency:.4f} seconds")

[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy: 0.92
False Acceptance Rate (FAR): 0.08
False Rejection Rate (FRR): 0.06
Detection Latency (avg): 0.5999 seconds
