In [1]:
import pandas as pd
import numpy as np
# Load the UrbanSound8K metadata table (one row per audio slice file). The path
# is relative to the notebook's working directory.
data = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")
# Show (rows, columns) as the cell output.
data.shape

(8732, 8)

In [2]:
# Preview the first metadata rows to check the column layout.
data.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [3]:
import os
import struct
from scipy.io import wavfile as wav
import matplotlib.pyplot as plt
import IPython.display as ipd

In [4]:
def path_class(filename):
    """Look up one audio slice in the global ``data`` metadata table.

    Parameters
    ----------
    filename : str
        Value matched against the ``slice_file_name`` column.

    Returns
    -------
    tuple
        ``(path, label)``: ``path`` joins ``'UrbanSound8K/audio'``,
        ``'fold<fold>'`` and ``filename`` with ``os.path.join``; ``label`` is
        the ``class`` value. Both come from the first matching row.

    Raises
    ------
    IndexError
        If no row of ``data`` has that ``slice_file_name``.
    """
    matches = data.loc[data['slice_file_name'] == filename]
    fold_dir = 'fold' + str(matches['fold'].values[0])
    label = matches['class'].values[0]
    return os.path.join('UrbanSound8K/audio', fold_dir, filename), label
  
def wav_fmt_parser(file_name):
    """Read channel count, sample rate and bit depth from a WAV file header.

    The file is located with ``path_class(file_name)``. Only the first 36
    bytes are read, and the fields are taken at fixed offsets, i.e. this
    assumes the ``fmt `` chunk directly follows the 12-byte RIFF header.

    Parameters
    ----------
    file_name : str
        Slice file name as accepted by ``path_class``.

    Returns
    -------
    tuple
        ``(n_channels, s_rate, bit_depth)`` as integers.

    Raises
    ------
    struct.error
        If the file is shorter than the 36 header bytes that are parsed.
    """
    full_path, _ = path_class(file_name)
    # The context manager closes the handle even when parsing fails; without
    # it one descriptor stays open for every file that is inspected.
    with open(full_path, "rb") as wave_file:
        riff_fmt = wave_file.read(36)

    # "<" pins little-endian with standard sizes: RIFF fields are little-endian
    # regardless of the byte order of the machine running the notebook.
    n_channels, s_rate = struct.unpack_from("<HI", riff_fmt, 22)  # bytes 22-27
    # Explicit offset instead of [-2:], which would silently read the wrong
    # bytes if fewer than 36 bytes came back from read().
    bit_depth = struct.unpack_from("<H", riff_fmt, 34)[0]          # bytes 34-35
    return (n_channels, s_rate, bit_depth)
# n_channels => no. of channels
# s_rate => sampling rate
# bit_depth => bit depth

In [5]:
# Parse the WAV header of every slice listed in the metadata.
fmt_columns = ['n_channels', 'sampling_rate', 'bit_depth']
wav_fmt_data = [wav_fmt_parser(slice_name) for slice_name in data['slice_file_name']]
# Attach the parsed tuples as three new columns of `data`.
# NOTE(review): assumes `data` still has its default 0..n-1 index so the rows
# line up with the freshly built frame - confirm if `data` is ever re-indexed.
data[fmt_columns] = pd.DataFrame(wav_fmt_data)
data.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,n_channels,sampling_rate,bit_depth
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,2,44100,16
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,2,44100,16
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,2,44100,16
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,2,44100,16
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,2,44100,16


In [6]:
import librosa
# By default, librosa's load function resamples the audio to 22.05 kHz, mixes it down to a single (mono) channel,
# and normalises the samples so that their values range from -1 to 1.

# Spectrograms are a useful technique for visualising the frequency spectrum of a sound and how it varies over time;
# each column describes the spectrum within one very short window of the signal.
# **** MFCC ****
# The main difference between the two is that a spectrogram uses a linearly spaced frequency scale (each frequency bin
# is an equal number of hertz apart), whereas MFCCs use a quasi-logarithmically spaced frequency scale, which is more
# similar to how the human auditory system processes sounds.

In [7]:
def extract_features(file_name):
    """Compute a 40-coefficient MFCC matrix for one audio file.

    Parameters
    ----------
    file_name : str
        Path handed to ``librosa.load``.

    Returns
    -------
    array or None
        The value returned by ``librosa.feature.mfcc(..., n_mfcc=40)``, or
        ``None`` when loading or feature extraction raised an exception.
    """
    try:
        # librosa.load is called with its default sample-rate / mono handling;
        # only the resampling method is overridden.
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    except Exception as e:
        # Best effort: report the failing file and let the caller decide.
        # The message must use `file_name`; the name `file` does not exist in
        # Python 3, so referencing it here would raise NameError from inside
        # the handler and abort the whole extraction run.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    return mfccs

In [8]:
features = []

# Iterate through each sound file and extract the features.
# Iterating the three needed columns directly avoids building a Series per row,
# which is what iterrows() does.
for fold, slice_name, class_label in zip(data['fold'], data['slice_file_name'], data['class']):

    file_name = os.path.join('UrbanSound8K', 'audio', 'fold' + str(fold), str(slice_name))
    data_feature = extract_features(file_name)

    # extract_features returns None for files it could not process. Keeping a
    # None here would break the padding step, which reads `.shape` on every
    # stored feature, so such files are left out (the failure has already
    # been printed by extract_features).
    if data_feature is None:
        continue

    features.append([data_feature, class_label])

In [16]:
# Convert the list of [feature, label] pairs into a two-column pandas DataFrame
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

In [17]:
# Preview the first rows; each `feature` cell holds one MFCC array.
featuresdf.head()

Unnamed: 0,feature,class_label
0,"[[-306.77255, -177.59209, -99.13616, -65.97198...",dog_bark
1,"[[-457.69534, -451.0248, -450.68613, -445.0000...",children_playing
2,"[[-468.0367, -467.42264, -481.04654, -486.5948...",children_playing
3,"[[-422.42215, -411.9085, -409.46243, -409.0892...",children_playing
4,"[[-438.10162, -434.47787, -443.3284, -442.6643...",children_playing


In [21]:
# create X
# Every MFCC matrix is brought to the same number of columns so the list can be
# turned into one rectangular array. 174 is the width the model input expects
# (`num_columns`).
max_frames = 174
l = []
for mfcc in featuresdf.feature.tolist():
    # Clip first: a matrix wider than `max_frames` would otherwise produce a
    # negative pad width, which np.pad rejects with ValueError.
    mfcc = mfcc[:, :max_frames]
    pad_val = max_frames - mfcc.shape[1]
    # Zero-pad at the end of the second axis only; the first axis is untouched.
    l.append(np.pad(mfcc, ((0, 0), (0, pad_val)), 'constant'))


In [23]:
# Stack the padded per-file matrices into one array (one entry per file on the
# first axis) and show its shape.
X = np.array(l)
X.shape

(8732, 40, 174)

In [24]:
# Class-name labels taken from `featuresdf`, in the same row order as the
# features used to build X.
Y = np.array(featuresdf.class_label.tolist())
Y.shape

(8732,)

In [25]:
from sklearn.preprocessing import LabelEncoder
# `keras.utils.np_utils` no longer exists in newer Keras releases, while
# `to_categorical` is exported from `keras.utils` in both old and new versions.
from keras.utils import to_categorical

# LabelEncoder converts the class-name strings to integers 0 .. n_classes-1.
lb = LabelEncoder()
# to_categorical one-hot encodes those integers: one row per sample, one column
# per class. `lb` is kept so predictions can be mapped back to class names with
# lb.inverse_transform.
y = to_categorical(lb.fit_transform(Y))

Using TensorFlow backend.


In [26]:
# Sanity check: one row per sample, one column per class.
y.shape

(8732, 10)

In [27]:
from sklearn.model_selection import train_test_split 
# Random 80/20 split; the fixed random_state makes the split repeatable.
# NOTE(review): the metadata carries `fsID` and `fold` columns and several
# slices share one fsID. A purely random split can place slices of the same
# recording in both train and test, which may inflate the test score. Consider
# splitting on `fold` instead - confirm against the dataset documentation.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [28]:
# We use a Sequential model with a simple architecture: four Conv2D convolution layers followed by a dense output
# layer. The output layer has 10 nodes (num_labels), one for each possible class.

In [29]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from sklearn import metrics 

In [30]:
# Shape of one network input sample. These must agree with the feature array:
# 40 matches n_mfcc in extract_features, 174 matches the padded width.
num_rows = 40
num_columns = 174
num_channels = 1  # single channel: one MFCC matrix per sample

In [31]:
# Give every sample the (rows, columns, channels) layout that the first Conv2D
# layer declares as its input_shape; the sample count is left as it is.
sample_shape = (num_rows, num_columns, num_channels)
x_train = x_train.reshape((x_train.shape[0],) + sample_shape)
x_test = x_test.reshape((x_test.shape[0],) + sample_shape)

In [33]:
# Number of output classes = number of one-hot columns in y.
num_labels = y.shape[1]
# NOTE(review): `filter_size` is not referenced by any cell visible in this
# notebook; the Conv2D layers pass kernel_size=2 literally.
filter_size = 2

In [34]:
# Construct model 
# Start from an empty Sequential container; the layers are added in the next
# cell. Re-run this cell before re-running the layer cell, otherwise the layers
# are appended a second time to the same model.
model = Sequential()

In [35]:
# Four identical stages: 2x2 convolution with ReLU -> 2x2 max-pooling -> 20%
# dropout, with the filter count doubling from one stage to the next. Only the
# first Conv2D declares the input shape.
for stage, n_filters in enumerate((16, 32, 64, 128)):
    first_layer_kwargs = (
        {'input_shape': (num_rows, num_columns, num_channels)} if stage == 0 else {}
    )
    model.add(Conv2D(filters=n_filters, kernel_size=2, activation='relu', **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

# Global average pooling replaces each feature map by its mean, so the output
# has one value per filter. Unlike Flatten, which only rearranges all elements
# into one long vector, it has no sliding window and needs no padding.
model.add(GlobalAveragePooling2D())

# Output layer: one softmax probability per class.
model.add(Dense(num_labels, activation='softmax'))




In [36]:
# Compile the model: Adam optimizer, categorical cross-entropy loss for the
# one-hot targets, and accuracy reported next to the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [37]:
# Display model architecture summary 
# (per-layer output shapes and parameter counts)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 39, 173, 16)       80        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 19, 86, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 86, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 18, 85, 32)        2080      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 9, 42, 32)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 9, 42, 32)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 8, 41, 64)        

In [38]:
# Baseline before any training: with untrained weights the accuracy is expected
# to sit near chance level.
# verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch.
score = model.evaluate(x_test, y_test, verbose=1)
# score[0] is the loss; score[1] is the accuracy metric requested at compile time.
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % (accuracy,))


Pre-training accuracy: 9.5020%


In [40]:
import os
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 

num_epochs = 72
num_batch_size = 256

# Make sure the checkpoint directory exists before training starts. Not every
# Keras version creates it, and a missing directory makes the first checkpoint
# save fail.
checkpoint_path = 'saved_models/weights.best.basic_cnn.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True overwrites the file only when the monitored validation
# loss improves, so the file always holds the best epoch seen so far.
checkpointer = ModelCheckpoint(filepath=checkpoint_path, 
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): validation_data is the test split, so checkpoint selection is
# driven by test data and a later "testing accuracy" is optimistic. Also, after
# fit() the in-memory model holds the last epoch's weights, not the best
# checkpoint; call model.load_weights(checkpoint_path) to evaluate the best one.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Train on 6985 samples, validate on 1747 samples
Epoch 1/72

Epoch 00001: val_loss improved from inf to 1.98975, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/72

Epoch 00002: val_loss improved from 1.98975 to 1.82309, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 3/72

Epoch 00003: val_loss improved from 1.82309 to 1.62719, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 4/72

Epoch 00004: val_loss improved from 1.62719 to 1.51406, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 5/72

Epoch 00005: val_loss improved from 1.51406 to 1.47293, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 6/72

Epoch 00006: val_loss improved from 1.47293 to 1.38213, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 7/72

Epoch 00007: val_loss improved from 1.38213 to 1.31390, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 8/72

Epoch 00008: val_loss improved from 1.31390 to 1.26073, saving model 


Epoch 00034: val_loss did not improve from 0.61843
Epoch 35/72

Epoch 00035: val_loss improved from 0.61843 to 0.61247, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 36/72

Epoch 00036: val_loss improved from 0.61247 to 0.56385, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 37/72

Epoch 00037: val_loss did not improve from 0.56385
Epoch 38/72

Epoch 00038: val_loss improved from 0.56385 to 0.54652, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 39/72

Epoch 00039: val_loss did not improve from 0.54652
Epoch 40/72

Epoch 00040: val_loss did not improve from 0.54652
Epoch 41/72

Epoch 00041: val_loss improved from 0.54652 to 0.51379, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 42/72

Epoch 00042: val_loss did not improve from 0.51379
Epoch 43/72

Epoch 00043: val_loss improved from 0.51379 to 0.49191, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 44/72

Epoch 00044: val_loss did not improve from 0.4919

In [41]:
# Evaluate the model currently held in memory on both splits and print the
# accuracy entry (index 1) of each result. `score` ends up holding the
# test-set result, as the last split evaluated.
evaluation_sets = (
    ("Training Accuracy: ", x_train, y_train),
    ("Testing Accuracy: ", x_test, y_test),
)
for label, inputs, targets in evaluation_sets:
    score = model.evaluate(inputs, targets, verbose=0)
    print(label, score[1])

Training Accuracy:  0.9411596059799194
Testing Accuracy:  0.8963938355445862
