Due to difficulties converting a PyTorch model to a TensorFlow model, this notebook re-implements the same CNN model as before directly in TensorFlow. Writing the model in TensorFlow makes it portable to an Android device.

In [68]:
# imports 
import numpy as np
import os
import librosa

import build.pybind_modules.dsp_module as cu
import build.pybind_modules.matrix_module as myMatrix

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib
from tensorflow.keras.models import Model
from tensorflow.math import confusion_matrix


# Report the TensorFlow build and every device TensorFlow can see.
print('TensorFlow version:', tf.__version__)
physical_devices = tf.config.list_physical_devices()
for device in physical_devices:
    print(device)

TensorFlow version: 2.12.0
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


In [69]:
# Parameters
# Model identifier (not referenced by the cells visible in this notebook section).
MODEL_NAME = 'audio_mnist'
# Training-loop settings; both are passed to model.fit() in the training cell.
EPOCHS = 20
BATCH_SIZE = 64

# Audio / feature-extraction settings. None of these are read by the cells shown here;
# presumably they mirror the preprocessing that produced the CSV features - TODO confirm.
FS = 48000
DOWNSAMPLED_FS = 8000
NFFT = 256
# NOTE(review): -1 looks like a sentinel value; its meaning is not visible here - confirm.
NOVERLAP = -1
NFILT = 40
NUM_CEPS = 13
# Each flattened feature row is reshaped to NN_DATA_ROWS x NN_DATA_COLS before it is
# fed to the network (12 * 48 = 576 values per sample).
NN_DATA_COLS = 48
NN_DATA_ROWS = 12

In [70]:
def relu6(x):
    """ReLU activation whose output is capped at 6.

    Thin wrapper around ``tf.keras.activations.relu`` with ``max_value=6``;
    used as the activation of the convolutional layers in ``NeuralNet``.
    """
    capped = tf.keras.activations.relu(x, max_value=6)
    return capped

def compute_accuracies(predicted_labels, dev_set, dev_labels):
    """Compute overall accuracy and a confusion matrix for integer class labels.

    Args:
        predicted_labels: 1-D array-like of predicted class indices. Must have
            an integer dtype of any width (int32, int64, ...).
        dev_set: Unused; kept so existing callers do not break.
        dev_labels: 1-D array-like of ground-truth class indices, same length
            as ``predicted_labels``.

    Returns:
        ``(accuracy, conf_m)`` where ``accuracy`` is the fraction of matching
        labels and ``conf_m[i, j]`` counts samples whose true label is ``i``
        and whose predicted label is ``j``. If the lengths differ, or the
        inputs are empty, ``(0.0, empty (0, 0) matrix)`` is returned so the
        result always unpacks into two values.

    Raises:
        AssertionError: if ``predicted_labels`` does not have an integer dtype.
    """
    yhats = np.asarray(predicted_labels)
    labels = np.asarray(dev_labels)
    # `dtype == int` only matches the platform default integer width, so it
    # rejected perfectly valid int32 arrays; accept any integer dtype instead.
    assert np.issubdtype(yhats.dtype, np.integer), "Your predicted labels have type {}, but they should have an integer dtype (consider using .astype(int) on your output)".format(yhats.dtype)

    if len(yhats) != len(labels):
        print("Lengths of predicted labels don't match length of actual labels", len(yhats), len(labels))
        return 0., np.zeros((0, 0))
    if len(labels) == 0:
        # Nothing to score; avoid the mean/max of an empty array.
        return 0., np.zeros((0, 0))

    accuracy = np.mean(yhats == labels)

    # Size the matrix by the largest class index seen in either array. Sizing by
    # the number of unique labels breaks as soon as a class is missing from
    # dev_labels or a prediction falls outside the classes present.
    num_classes = int(max(labels.max(), yhats.max())) + 1
    conf_m = np.zeros((num_classes, num_classes))
    # np.add.at accumulates correctly when the same (true, predicted) pair repeats.
    np.add.at(conf_m, (labels, yhats), 1)

    return accuracy, conf_m


class NeuralNet(tf.keras.Model):
    """Small CNN classifier: two conv/max-pool stages followed by three dense layers.

    The final dense layer has no activation, so the model outputs raw logits
    (one per class); pair it with a loss configured with ``from_logits=True``.

    Args:
        out_size: number of output classes (units in the final dense layer).
    """

    def __init__(self, out_size):
        super().__init__()

        self.conv1 = tf.keras.layers.Conv2D(filters=10, kernel_size=(3,3), activation=relu6, padding='same', kernel_initializer='he_uniform')
        # The pooling layer has no weights, so one instance is reused after both convolutions.
        self.maxpool = tf.keras.layers.MaxPooling2D(pool_size=(3,3), padding='same')
        self.conv2 = tf.keras.layers.Conv2D(filters=20, kernel_size=(3,3), activation=relu6, padding='same', kernel_initializer='he_uniform')
        self.dropout_1 = tf.keras.layers.Dropout(rate=0.1)
        self.dropout_2 = tf.keras.layers.Dropout(rate=0.16)
        self.dropout_3 = tf.keras.layers.Dropout(rate=0.12)

        self.flatten = tf.keras.layers.Flatten()
        self.dense_1 = tf.keras.layers.Dense(units=5000, activation='relu', kernel_initializer='he_uniform')
        self.dense_2 = tf.keras.layers.Dense(units=1000, activation='relu', kernel_initializer='he_uniform')
        self.dense_3 = tf.keras.layers.Dense(units=out_size, kernel_initializer='he_uniform')

    def call(self, x, training=None):
        """Run the forward pass and return the class logits.

        Args:
            x: batch of inputs for the first Conv2D layer.
            training: forwarded to the dropout layers. Keras supplies it
                (True inside ``fit``, False for evaluation/prediction), so
                dropout is only active while training. Previously it was
                hard-coded to True, which kept dropout on at inference time
                and made predictions noisy and non-deterministic.

        Returns:
            Tensor of logits with ``out_size`` values per sample.
        """
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.maxpool(x)
        x = self.dropout_1(x, training=training)
        x = self.flatten(x)
        x = self.dense_1(x)
        x = self.dropout_2(x, training=training)
        x = self.dense_2(x)
        x = self.dropout_3(x, training=training)
        x = self.dense_3(x)
        return x

In [71]:
# Load the data (should be trained and uploaded using the other spoken_digit_recognition notebook)

dev_set_labels = np.loadtxt("l_dev_set_labels.csv", delimiter=",", dtype=np.int32)
train_labels = np.loadtxt("train_labels.csv", delimiter=",", dtype=np.int32)
train_set = np.loadtxt("train_set.csv", delimiter=",", dtype=np.float64)
dev_set = np.loadtxt("dev_set.csv", delimiter=",", dtype=np.float64)

# Every sample needs exactly one label; fail early with a clear message otherwise.
assert len(dev_set) == len(dev_set_labels), "dev_set and dev_set_labels have different lengths"
assert len(train_set) == len(train_labels), "train_set and train_labels have different lengths"

print("Len dev_set_labels: {}".format(len(dev_set_labels)))
# Report the length of the dev set itself (this line used to print the label count again).
print("Len dev_set: {}".format(len(dev_set)))
print("Len train_set_labels: {}".format(len(train_labels)))
print("Len train_set: {}".format(len(train_set)))

print("\nTrainset shape: {}".format(train_set.shape))

Len dev_set_labels: 200
Len dev_set: 200
Len train_set_labels: 1000
Len train_set: 1000

Trainset shape: (1000, 576)


In [72]:
# Reshape every flattened feature row into a (NN_DATA_ROWS, NN_DATA_COLS) grid.
# The float64 buffers are allocated first and then filled, so the reshaped sets
# are independent copies of the loaded data.
reshaped_dev_set = np.zeros((len(dev_set), NN_DATA_ROWS, NN_DATA_COLS))
reshaped_train_set = np.zeros((len(train_set), NN_DATA_ROWS, NN_DATA_COLS))

reshaped_dev_set[...] = np.reshape(dev_set, reshaped_dev_set.shape)
reshaped_train_set[...] = np.reshape(train_set, reshaped_train_set.shape)

In [73]:
# Build the Keras functional model around the NeuralNet sub-model.
# The lowercase names are kept for compatibility but now derive from the
# notebook-level constants so the two can no longer drift apart.
batch_size = BATCH_SIZE
epochs = EPOCHS
input_dim = NUM_CEPS - 1 # num ceps - 1
output_dim = 10 # number of digit classes (0-9)
weight_decay = 1e-2
learning_rate = 1e-2

input_layer = Input(shape=(NN_DATA_ROWS, NN_DATA_COLS, 1))
x = NeuralNet(output_dim)(input_layer)

model = Model(inputs=input_layer, outputs=x)

# printing a view of the model
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary(expand_nested=True)

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 12, 48, 1)]       0         
                                                                 
 neural_net_7 (NeuralNet)    (None, 10)                6217930   
|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|
| conv2d_14 (Conv2D)        multiple                  100       |
|                                                               |
| max_pooling2d_7 (MaxPooling  multiple               0         |
| 2D)                                                           |
|                                                               |
| conv2d_15 (Conv2D)        multiple                  1820      |
|                                                               |
| dropout_21 (Dropout)      multiple                  0         |
|                                                          

In [74]:
# Compile with SGD and a logits-based sparse cross-entropy loss, then train
# while validating on the dev set after every epoch.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, weight_decay=weight_decay)
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=sgd_optimizer, loss=logits_loss, metrics="acc")

history = model.fit(
    x=reshaped_train_set,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(reshaped_dev_set, dev_set_labels),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [75]:

# TODO: the dev-set confusion matrix is not computed yet. The draft below cannot run
# as written: `predictions=` has no value and `num_classes` is not defined in the
# cells visible in this part of the notebook.
# confusion = confusion_matrix(labels=dev_set_labels, predictions=, num_classes=num_classes)
# print(confusion)

In [87]:
# Spot-check the trained model on two dev-set samples and show, for each one,
# the three highest-scoring classes next to the true label.
dev_set_1_idx = 20
dev_set_2_idx = 180
probe_indices = (dev_set_1_idx, dev_set_2_idx)

single_test = tf.convert_to_tensor(np.array([reshaped_dev_set[idx] for idx in probe_indices]))
predictions = model(single_test, training=False).numpy()

for sample_scores, sample_idx in zip(predictions, probe_indices):
    # Pair every score with its class index and sort from highest to lowest score.
    ranked = sorted(zip(sample_scores, range(10)), reverse=True)
    print("Top 3 prections by (value, label)")
    print(ranked[:3])
    print("Actual label was: {}\n".format(dev_set_labels[sample_idx]))

Top 3 prections by (value, label)
[(6.1345177, 8), (2.4934447, 6), (2.345057, 2)]
Actual label was: 8

Top 3 prections by (value, label)
[(3.8146093, 9), (3.5829391, 1), (2.2780287, 5)]
Actual label was: 1

