In [34]:
import pandas as pd

# load features
featuresdf = pd.read_pickle('featuresData')
print(featuresdf)

                                             feature       class_label
0  [[-306.77255, -177.59209, -99.13616, -65.97198...          dog_bark
1  [[-457.69534, -451.0248, -450.68613, -445.0000...  children_playing
2  [[-323.20044, -244.39201, -208.50298, -184.233...          car_horn
3  [[-688.7444, -262.64093, -105.28191, -60.13772...   air_conditioner
4  [[-205.19269, -215.90787, -209.7127, -184.8985...      street_music
5  [[-119.95263, -98.58099, -102.46894, -113.9573...          gun_shot
6  [[-212.37454, -203.63791, -200.84283, -208.838...             siren
7  [[-168.26811, -159.26343, -158.1763, -156.5134...     engine_idling
8  [[-298.7493, -288.96646, -294.43912, -300.8143...        jackhammer
9  [[-686.07166, -615.6793, -523.6804, -469.46082...          drilling


In [35]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y)) 

# split the dataset 
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

In [36]:
#Convolutional Neural Network (CNN) model architecture

#We will modify our model to be a Convolutional Neural Network (CNN) again using Keras and a Tensorflow backend.

#Again we will use a sequential model, starting with a simple model architecture, consisting of four Conv2D 
#convolution layers, with our final output layer being a dense layer.

#The convolution layers are designed for feature detection. It works by sliding a filter window over the input 
#and performing a matrix multiplication and storing the result in a feature map. This operation is known as a 
#convolution.

#The filter parameter specifies the number of nodes in each layer. Each layer will increase in size from 16, 32, 
#64 to 128, while the kernel_size parameter specifies the size of the kernel window which in this case is 2 
#resulting in a 2x2 filter matrix.

#The first layer will receive the input shape of (40, 174, 1) where 40 is the number of MFCC's 174 is the number 
#of frames taking padding into account and the 1 signifying that the audio is mono.

#The activation function we will be using for our convolutional layers is ReLU which is the same as our previous 
#model. We will use a smaller Dropout value of 20% on our convolutional layers.

#Each convolutional layer has an associated pooling layer of MaxPooling2D type with the final convolutional layer 
#having a GlobalAveragePooling2D type. The pooling layer is do reduce the dimensionality of the model (by 
#reducing the parameters and subsquent computation requirements) which serves to shorten the training time 
#and reduce overfitting. The Max Pooling type takes the maximum size for each window and the Global Average 
#Pooling type takes the average which is suitable for feeding into our dense output layer.

#Our output layer will have 10 nodes (num_labels) which matches the number of possible classifications. The 
#activation is for our output layer is softmax. Softmax makes the output sum up to 1 so the output can be 
#interpreted as probabilities. The model will then make its prediction based on which option has the highest 
#probability.

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

num_rows = 40
num_columns = 174
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

# Construct model 
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax')) 

In [37]:
#For compiling our model, we will use the same three parameters as the previous model: 
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam') 

In [38]:
#Display model architecture summary 
model.summary()

# Calculate pre-training accuracy 
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 39, 173, 16)       80        
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 19, 86, 16)        0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 19, 86, 16)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 18, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 9, 42, 32)         0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 9, 42, 32)         0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 8, 41, 64)        

In [39]:
#Here we will train the model. As training a CNN can take a sigificant amount of time, we will start with a 
#low number of epochs and a low batch size. If we can see from the output that the model is converging, we will 
#increase both numbers. 

from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

#num_epochs = 12
#num_batch_size = 128

#num_epochs = 72
num_epochs = 10
num_batch_size = 256

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5', 
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/10
Epoch 00001: val_loss improved from inf to 3.85902, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/10
Epoch 00002: val_loss did not improve from 3.85902
Epoch 3/10
Epoch 00003: val_loss did not improve from 3.85902
Epoch 4/10
Epoch 00004: val_loss did not improve from 3.85902
Epoch 5/10
Epoch 00005: val_loss did not improve from 3.85902
Epoch 6/10
Epoch 00006: val_loss did not improve from 3.85902
Epoch 7/10
Epoch 00007: val_loss did not improve from 3.85902
Epoch 8/10
Epoch 00008: val_loss did not improve from 3.85902
Epoch 9/10
Epoch 00009: val_loss did not improve from 3.85902
Epoch 10/10
Epoch 00010: val_loss did not improve from 3.85902
Training completed in time:  0:00:01.536192


In [40]:
#Here we will review the accuracy of the model on both the training and test data sets. 

# Evaluating the model on the training and testing set
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

#The Training and Testing accuracy scores are both high and an increase on our initial model. Training accuracy 
#has increased by ~6% and Testing accuracy has increased by ~4%.
#There is a marginal increase in the difference between the Training and Test scores (~6% compared to ~5% 
#previously) though the difference remains low so the model has not suffered from overfitting.


Training Accuracy:  0.375
Testing Accuracy:  0.0


In [43]:
#Here we will modify our previous method for testing the models predictions on a specified audio .wav file. 

import numpy as np
max_pad_len = 174

def extract_features(file_name):
   
    try:
        
        print("o")
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        print("o")
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        
        print(mfccs.shape)
        
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None 
     
    return mfccs

def print_prediction(file_name):
    prediction_feature = extract_features(file_name) 
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = model.predict_classes(prediction_feature)
    predicted_class = le.inverse_transform(predicted_vector) 
    print("The predicted class is:", predicted_class[0], '\n') 

    predicted_proba_vector = model.predict_proba(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

In [44]:
#As before we will verify the predictions using a subsection of the sample audio files we explored in the 
#first notebook. We expect the bulk of these to be classified correctly. 

# Class: Air Conditioner

filename = './audio/fold3/18594-1-1-0.wav' 
print_prediction(filename) 

o
Error encountered while parsing file:  ./audio/fold3/18594-1-1-0.wav


AttributeError: 'NoneType' object has no attribute 'reshape'