# Import libraries and packages

In [2]:
import keras
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D
from keras.models import Sequential
from keras.optimizers import Adam
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import load_model
from keras.models import Model
from keras import backend as K
from sklearn import metrics
from sklearn.metrics import classification_report
import librosa
import librosa.display
import numpy as np
import pandas as pd
import random
import warnings
import os
import glob
import pickle
import random

# Extract MFCCs from the audio using Librosa
### The chunks of audio are normalized, turned to mono, and converted into 16kHz sample rate. Then 14 MFCCs are computed for frames at regular intervals for each chunk. 45 frames of non-silent audio are needed for each chunk. If there are less than 45, the chunk is just skipped.
### The MFCCs order is randomized, and then turned into One Hot Encoding for the Neural Network.

In [21]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name, sr=16000, mono=True, res_type='kaiser_fast')
    X = librosa.util.normalize(X)
    mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=14)
    return mfccs

#iterates over all the files within subdirectories and calls extract_feature
def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav'):
    features = []
    labels = []
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs = extract_feature(fn)
            #print(mfccs.shape)
            if mfccs.shape[1] >= 45:
                mfccs = np.resize(mfccs,(14,45))
                features.append(mfccs)
                labels.append(fn.split('/')[2].split('-')[1])
    new_labels = [] #we need the labels to be numbers, not letters.
    for i in labels:
        if i == 'w':
            new_labels.append(0)
        if i == 't':
            new_labels.append(1)
        if i == 's':
            new_labels.append(2)
    return np.array(features), np.array(new_labels, dtype = np.int)

parent_dir = 'mdataset'
train_sub_dirs = ['siltrain']
test_sub_dirs = ['siltest']

train_features, train_labels = parse_audio_files(parent_dir,train_sub_dirs)
test_features, test_labels = parse_audio_files(parent_dir,test_sub_dirs)

#Zip labels and mfccs together to randomize the order
train_data = zip(train_features, train_labels)
test_data = zip(test_features, test_labels)
random.shuffle(train_data)
random.shuffle(test_data)
X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

#Reshape for encoding.
X_train = np.array([x.reshape( (14, 45, 1) ) for x in X_train])
X_test = np.array([x.reshape( (14, 45, 1) ) for x in X_test])

#One-Hot encoding for classes
y_train = np.array(keras.utils.to_categorical(y_train, 3))
y_test = np.array(keras.utils.to_categorical(y_test, 3))

#Save MFCCS to avoid recomputing them (costly)
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)
print("data exported.")

data exported.


In [3]:
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

print(X_train.shape)
print(X_test.shape)

(1673, 14, 45, 1)
(697, 14, 45, 1)


# The Neural Network
### A very simple architecture leads to satisfying results. 4 layers in total, the input layer with 5% dropout.

In [10]:
model = Sequential()

model.add(Dense(14, input_shape=(14, 45, 1)))
model.add(Activation('sigmoid'))
model.add(Dropout(0.05)) #prevent overfitting

model.add(Dense(14))
model.add(Activation('relu'))
#model.add(Dropout(0.1))

model.add(Dense(14))
model.add(Activation('relu'))
#model.add(Dropout(0.1))

model.add(Flatten())

model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Training and running
### We can vary the number of epochs and the batch size.

In [11]:
%%time

model.fit(X_train, y_train, batch_size=32, epochs=20) #5 epochs works pretty damn good, 10 more consistent?
score, acc = model.evaluate(X_test, y_test, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)

Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
('Test score:', 1.0272739882790718)
('Test accuracy:', 0.7977044478892595)
             precision    recall  f1-score   support

          0       0.78      0.78      0.78       232
          1       0.75      0.94      0.84       275
          2       0.96      0.61      0.75       190

avg / total       0.82      0.80      0.79       697

CPU times: user 41.7 s, sys: 5.9 s, total: 47.6 s
Wall time: 20.7 s


# Saving the model, and additional testing.

In [32]:
from keras.models import load_model
model.save('84_RSil_Model.h5')  # creates a HDF5 file 'my_model.h5'

In [51]:
model = load_model('84_RSil_Model.h5')

single = np.reshape(X_test[2], (1, 14, 45, 1))

#get_3rd_layer_output = K.function([model.layers[0].input],[model.layers[8].output])
#layer_output = get_3rd_layer_output([single])[0]
#print(layer_output)
#The above does the same as these keras functions:

print(model.predict(single))
print(model.predict_classes(single))

[[5.7190257e-01 4.2809615e-01 1.2795387e-06]]
[0]
