# Wake word processing

This notebook trains a wake word model using TensorFlow.

## Data Preprocessing

For data preprocessing we use the `python_speech_features` package for simplicity: we only have to extract the MFCC features, and that package is as simple as possible.

In [None]:
!virtualenv env
!source env/bin/activate
!pip install -r requirement

## This process builds the dataset.
The dataset consists of wake word and non wake word audio. Each audio file contains only one class, either wake word or non wake word.
Features are then extracted from the audio using the MFCC feature extraction provided by the `python_speech_features` module. In this block we create a dataset whose MFCC features have varying lengths, so after this process padding must be applied so that all input data has the same shape. For the target (label) we use one-hot encoding, so that the array [0,1] represents a non wake word utterance and [1,0] represents a wake word utterance.

In [None]:
import glob
import numpy as np
from python_speech_features import mfcc
import scipy.io.wavfile as wav

# Both directory strings are concatenated directly with "*.wav", so a
# non-empty value must end with a path separator (e.g. "data/ww/").
ww = "" # wake word audio directory
nww = "" # non wake word audio directory

# Clips that produce more MFCC frames than this are left out of the dataset.
MAX_FRAMES = 1000


def collect_features(directory, label, max_frames=MAX_FRAMES):
    """Extract MFCC features for every ``*.wav`` file found under ``directory``.

    Parameters
    ----------
    directory : str
        Prefix that is concatenated with ``"*.wav"`` to form the glob pattern.
    label : sequence
        One-hot label attached to every clip from this directory.
    max_frames : int
        Clips whose MFCC matrix has more rows than this are skipped.

    Returns
    -------
    (features, labels) : tuple of lists
        ``features[i]`` is the MFCC matrix of the i-th kept clip and
        ``labels[i]`` is ``np.array(label)``.
    """
    features = []
    labels = []
    # glob returns paths in arbitrary order; sorting keeps the dataset order
    # (and therefore the seeded train/test split) reproducible between runs.
    for path in sorted(glob.glob(directory + "*.wav")):
        sr, frame = wav.read(path)
        feat = mfcc(frame, sr)
        if feat.shape[0] > max_frames:
            continue
        features.append(feat)
        labels.append(np.array(label))
    return features, labels


# ww = [1,0] , nww = [0,1]
ww_X, ww_Y = collect_features(ww, [1, 0])
nww_X, nww_Y = collect_features(nww, [0, 1])

X = ww_X + nww_X
Y = np.array(ww_Y + nww_Y)

# Longest kept clip (in MFCC frames); used as the padding target afterwards.
maxshapeX = max((feat.shape[0] for feat in X), default=0)

print(maxshapeX)

## Padding

In [None]:
def pad_along_axis(array: np.ndarray, target_length, axis=0):
    """Zero-pad ``array`` at the end of ``axis`` until it has ``target_length`` entries.

    If ``array`` is already longer than ``target_length`` along ``axis`` it is
    returned as-is (it is never truncated).
    """
    missing = target_length - array.shape[axis]
    if missing < 0:
        return array

    # No padding on any axis except the requested one, which is only
    # extended at its end.
    widths = [(0, 0)] * array.ndim
    widths[axis] = (0, missing)
    return np.pad(array, pad_width=widths, mode='constant', constant_values=0)


# Bring every MFCC matrix up to maxshapeX frames, then stack them into a
# single array so that all samples share one shape.
X = np.array([pad_along_axis(feat, maxshapeX, 0) for feat in X])
for arr in (X, Y):
    print(arr.shape)

# np.save appends the ".npy" extension to these names.
np.save('X', X)
np.save('Y', Y)

In [None]:
from keras.models import Sequential  
from keras.layers import Dense, Activation, BatchNormalization, Flatten, Conv1D, MaxPooling1D
from keras.layers import Dropout  
from keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split


def create_model_cnn(n_timesteps, n_dim, n_classes):
    """Build and compile the 1-D convolutional classifier.

    The network takes inputs of shape ``(n_timesteps, n_dim)`` and ends in a
    softmax layer with ``n_classes`` units. It is compiled with categorical
    cross-entropy loss, the Adam optimizer and an accuracy metric.
    """
    layers = [
        Conv1D(filters=64, kernel_size=3, activation='relu', data_format="channels_last",
               input_shape=(n_timesteps, n_dim)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        Dropout(0.2),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(100, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



# Number of passes over the training split.
epochs = 20

# Reload the arrays from disk and hold out 10% of the samples for testing;
# the fixed random_state keeps the split the same for identical inputs.
X, Y = np.load('X.npy'), np.load('Y.npy')
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=19)

# Model dimensions are read from the data: axis 1 and axis 2 of X_train give
# the input shape, axis 1 of y_train gives the number of classes.
n_timesteps, n_dim = X_train.shape[1], X_train.shape[2]
n_classes = y_train.shape[1]

model_cnn = create_model_cnn(n_timesteps, n_dim, n_classes)
print("CNN")
hist = model_cnn.fit(X_train, y_train, epochs=epochs, batch_size=4, verbose=2)
model_cnn.save('model-cnn.h5')

In [None]:
# Score the trained model on the held-out split and show what evaluate() returns.
print("test with a new data")
test_scores = model_cnn.evaluate(x=X_test, y=y_test)
print(test_scores)

In [None]:
%time
prob = []

for x in X_test :
    x = np.reshape( x, (1,846,13) )
    pred = model_cnn.predict(x)
    idx = np.argmax(pred)
    prob.append(pred[0][idx])
    kelas = "Wakeword" if idx == 0 else "Not wake word"
    print(f"Kelas {kelas} dengan probability {pred[0][idx]}")

## On the fly test
This is the wrapper for the end-to-end (e2e) process.

In [None]:
%time
import numpy as np
from python_speech_features import mfcc
import scipy.io.wavfile as wav
from keras.models import load_model


def pad_along_axis(array: np.ndarray, target_length, axis=0):
    """Return ``array`` extended with trailing zeros along ``axis`` to ``target_length``.

    Arrays that are already longer than ``target_length`` along ``axis`` are
    handed back unchanged; this function never shortens its input.
    """
    deficit = target_length - array.shape[axis]
    if deficit < 0:
        return array

    # (before, after) pad amounts per axis: zero everywhere except the end
    # of the requested axis.
    pad_spec = [(0, deficit) if dim == axis else (0, 0) for dim in range(array.ndim)]
    if axis < 0:
        # Negative axes are addressed by plain list indexing.
        pad_spec = [(0, 0)] * array.ndim
        pad_spec[axis] = (0, deficit)
    return np.pad(array, pad_width=pad_spec, mode='constant', constant_values=0)

def load_h5model(path):
    """Load the saved Keras model stored at ``path`` and return it."""
    loaded = load_model(path)
    return loaded
    

    
model = load_h5model("model-cnn.h5")
audio_path = "" # example audio

# Read the expected input size from the model instead of hard-coding it, so
# this cell keeps working when the model is retrained on data with a
# different padded length. input_shape is (batch, timesteps, features).
_, n_timesteps, n_features = model.input_shape

sr, frame = wav.read(audio_path)
x = mfcc(frame, sr)

if x.shape[1] != n_features:
    raise ValueError(
        f"expected {n_features} features per frame, got {x.shape[1]}"
    )

# Zero-pad short clips and truncate long ones to the model's sequence length.
if x.shape[0] < n_timesteps:
    x = pad_along_axis(x, n_timesteps, 0)
elif x.shape[0] > n_timesteps:
    x = x[:n_timesteps, :]

# Add the leading batch axis expected by predict().
x = np.expand_dims(x, axis=0)
pred = model.predict(x)
idx = np.argmax(pred)
# Index 0 is the wake word class, index 1 the non wake word class.
kelas = "Wakeword" if idx == 0 else "Not wake word"
hasil = {
    'prob' : pred[0][idx],
    'label' : kelas
}
hasil

## Convert model into TensorFlow Lite model using command line

This command line is used to convert the model into a TensorFlow Lite model. To use this model with the tflite config, please refer to [this page](https://github.com/juunnn/wakeword/tree/engine/engine).

In [None]:
# Shell escape: run the tflite_convert command-line tool on the Keras model
# file model-cnn.h5 and write the result to model.tflite.
!tflite_convert --keras_model_file model-cnn.h5 --output_file model.tflite