In [None]:
import os
from scipy.io import wavfile
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import keras
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, LSTM
from tensorflow.keras.layers import Dropout, Dense, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import ModelCheckpoint
import librosa
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

In [None]:
# Input "image" shape expected by the network: (rows, columns, channels).
img_shape = (128, 32, 1)

# Exponent handed to the mel-spectrogram computation (see `power=` below).
power = 0.5

# Sample rate used when loading clips and computing spectrograms.
sr = 16000

# Recognised commands; a command's position in this list is its class index.
commands = ['no', 'stop', 'yes', 'up', 'down', 'left', 'right']

In [None]:
def prepare_data(datatype):
    """Load, label, normalise and shuffle the spectrograms of one data split.

    For every entry of the module-level ``commands`` list the file
    ``./data/<command>-<datatype>.npy`` is loaded and each of its samples is
    labelled with the command's index in ``commands``.  When ``datatype`` is
    ``'train'`` the (spectrogram, label) pairs stored in
    ``./data/speech-commands-generated-data.npy`` are appended to the split.

    Args:
        datatype: Split suffix used in the file names; the notebook calls this
            with ``'train'`` and ``'test'``.

    Returns:
        The result of ``shuffle(X, y, random_state=0)``, where ``X`` is the
        feature array with an extra axis inserted at position 3 and divided
        by its maximum, and ``y`` is the one-hot encoded label array with
        ``len(commands)`` columns.

    Side effects:
        Stores the maximum used for scaling in the global ``_max``; each call
        overwrites the value left by the previous call.
    """
    global _max

    per_command = []
    y = []
    for command_num, command in enumerate(commands):
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files that this project produced itself.
        command_data = np.load('./data/' + command + '-' + datatype + '.npy', allow_pickle=True)
        per_command.append(command_data)
        y.extend([command_num] * command_data.shape[0])
    # Concatenate once instead of re-copying the growing array per command.
    X = np.concatenate(per_command, axis=0)

    # add generated data to training dataset
    if datatype == 'train':
        generated_data = np.load('./data/speech-commands-generated-data.npy', allow_pickle=True)
        if len(generated_data):
            X_generated, y_generated = zip(*generated_data)
            # Features and labels must grow together: replacing X with the
            # generated clips while extending y left the two with different
            # lengths, which shuffle() rejects.
            X = np.concatenate((X, np.stack(X_generated)), axis=0)
            y.extend(y_generated)

    # one hot encode target output
    y = np.array(keras.utils.to_categorical(y, len(commands)))

    # normalize
    X = np.expand_dims(X, axis=3)
    _max = np.amax(X)
    X = X / _max

    return shuffle(X, y, random_state=0)

In [None]:
# Build the shuffled, max-normalised training features and one-hot labels.
X_train, y_train = prepare_data('train')

In [None]:
# Build the test split the same way.
# NOTE(review): prepare_data() overwrites the global `_max` on every call, so
# after this cell `_max` holds the test-set maximum and the test features are
# scaled by their own maximum rather than by the training maximum.
X_test, y_test = prepare_data('test')

In [None]:
# Three stacked 3x3 convolutions, one max-pool, three 64-unit dense layers
# (dropout after the first two) and a softmax over the command classes.
model = Sequential([
    Conv2D(128, (3, 3), activation='relu', strides=(1, 1), padding='same', input_shape=img_shape),
    Conv2D(128, (3, 3), activation='relu', strides=(1, 1), padding='same'),
    Conv2D(128, (3, 3), activation='relu', strides=(1, 1), padding='same'),
    MaxPool2D(2, 2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    # One output unit per entry of `commands`.
    Dense(len(commands), activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
)

In [None]:
# Train for 15 epochs in batches of 256 with validation_split=0.2; the returned
# History object feeds the accuracy/loss plots in the following cells.
history = model.fit(X_train, y_train, verbose=2, epochs=15, batch_size=256, validation_split=0.2)

In [None]:
# Training vs. validation accuracy, one point per epoch.
plt.plot(history.history['categorical_accuracy'], label='train')
plt.plot(history.history['val_categorical_accuracy'], label='validation')
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss, one point per epoch.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Model outputs for the test features; the next cell takes the argmax of each
# row to obtain a predicted class index.
predictions = model.predict(X_test)

In [None]:
# Collapse each model output / one-hot row to the index of its largest entry.
predictionList = [np.argmax(scores) for scores in predictions]
actualList = [np.argmax(one_hot) for one_hot in y_test]

In [None]:
# Count the test samples whose predicted class equals the true class.
total = len(predictionList)
correct = sum(
    1
    for position, predicted in enumerate(predictionList)
    if predicted == actualList[position]
)

In [None]:
# Test accuracy (fraction of correct predictions), rounded to 4 decimals.
# NOTE(review): raises ZeroDivisionError when `total` is 0.
round(correct/total, 4) 

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes, title=None, cmap=plt.cm.Blues):
    """Print the confusion matrix of the given labels and draw it as a heat map.

    Args:
        y_true: Ground-truth labels, passed to ``confusion_matrix``.
        y_pred: Predicted labels, passed to ``confusion_matrix``.
        classes: Tick labels applied to both axes.
        title: Title of the plot.
        cmap: Colormap handed to ``imshow``.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)
    n_rows, n_cols = matrix.shape

    fig, ax = plt.subplots()
    heatmap = ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)

    # One tick per matrix row/column, labelled with the class names.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Slant the x tick labels around their anchor point.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write every count into its cell; cells above half the maximum get white
    # text, the others black.
    half_max = matrix.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        count = matrix[row, col]
        text_colour = "white" if count > half_max else "black"
        ax.text(col, row, format(count, 'd'),
                ha="center", va="center",
                color=text_colour)

    fig.tight_layout()
    return ax

In [None]:
# Confusion matrix of true vs. predicted class indices on the test split,
# with the axes labelled by the command names.
plot_confusion_matrix(actualList, predictionList, classes=commands, title='Confusion Matrix')

In [None]:
# command -> [class index, number of generated clips kept by the latest run]
commands_with_label = {'no':[0, 0], 'stop':[1, 0], 'yes':[2, 0], 'up':[3, 0], 'down':[4, 0], 'left':[5, 0], 'right':[6, 0]}
def prep_generated_data(clips_dir='./data/speech_clips/generated', max_per_command=1500):
    """Turn the generated speech clips into labelled mel spectrograms on disk.

    Every file in ``clips_dir`` is loaded with ``duration=1`` at sample rate
    ``sr`` and converted to a 128-band mel spectrogram.  A spectrogram is kept
    when it has exactly 32 columns, the part of the file name before the first
    ``'_'`` is a key of ``commands_with_label``, and fewer than
    ``max_per_command`` clips of that command have been kept so far.  The kept
    (spectrogram, class index) pairs are saved as an object array of shape
    ``(n_kept, 2)`` to ``./data/speech-commands-generated-data.npy``.

    Args:
        clips_dir: Directory holding the generated audio clips.
        max_per_command: Upper bound on the clips kept per command.

    Returns:
        The shape of the last spectrogram that was computed (whether or not it
        was kept), or ``None`` when ``clips_dir`` contains no files.

    Side effects:
        Writes the ``.npy`` file and stores this run's per-command counts in
        ``commands_with_label[command][1]``.
    """
    kept = []
    # Count per call: counters stored only in the module-level dict stayed at
    # the cap after the first run, so a second call kept nothing.
    kept_per_command = {command: 0 for command in commands_with_label}
    last_shape = None

    file_names = os.listdir(clips_dir)
    # Unseeded shuffle: which clips survive the per-command cap varies by run.
    np.random.shuffle(file_names)
    for file in tqdm(file_names):
        data, _ = librosa.load(os.path.join(clips_dir, file), duration=1, sr=sr)
        spectrogram = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, power=power)
        last_shape = spectrogram.shape
        if spectrogram.shape[1] != 32:
            continue
        command = file.split('_')[0]
        if command not in commands_with_label:
            continue
        if kept_per_command[command] >= max_per_command:
            continue
        kept.append((spectrogram, commands_with_label[command][0]))
        kept_per_command[command] += 1

    # Publish the counts where the original code kept them.
    for command, count in kept_per_command.items():
        commands_with_label[command][1] = count

    # Fill an object array cell by cell: np.array() on (ndarray, int) pairs is
    # a ragged construction that newer NumPy rejects with a ValueError.
    X_train = np.empty((len(kept), 2), dtype=object)
    for row, (spectrogram, label) in enumerate(kept):
        X_train[row, 0] = spectrogram
        X_train[row, 1] = label

    np.save('./data/speech-commands-generated-data', X_train)
    return last_shape

# Generate ./data/speech-commands-generated-data.npy and show the returned
# spectrogram shape.
# NOTE(review): prepare_data('train') loads that file, so on a fresh checkout
# this cell has to be run before the training-data cell earlier in the notebook.
shape = prep_generated_data()
shape