# Data Preprocessing

Importing Libraries

In [1]:
import os
import numpy as np 
import pandas as pd 
import time as ti 
import tensorflow as tf
from tensorflow.keras import layers, models, activations
from keras.utils import plot_model
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import Audio, Image

pd.options.mode.chained_assignment = None

List the GPUs visible to TensorFlow

In [2]:
# Show the GPUs TensorFlow can see; an empty list means no GPU was detected.
# Uses the stable tf.config API rather than the older `experimental` alias.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Read dataset

In [3]:
# Dataset locations, resolved relative to the current working directory.
# os.path.join builds the paths with the separator of the host platform.
_esc_root = os.path.join(os.getcwd(), 'esc', 'archive')
audio_folder_path = os.path.join(_esc_root, 'audio', 'audio')  # folder holding the wav clips
csv_path = os.path.join(_esc_root, 'esc50.csv')  # metadata table

In [4]:
# Load the metadata table (one row per audio clip) from the CSV file.
dataset = pd.read_csv(filepath_or_buffer=csv_path)

In [5]:
# Preview the first five rows of the raw metadata.
dataset.head(n=5)

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


Keep only the columns needed for training

In [6]:
# Keep only the file name, the numeric class id and the class name.
# .copy() makes this an independent frame, so the columns added in later
# cells are written to it directly instead of to a slice of the raw table.
dataset = dataset[["filename", "target", "category"]].copy()

In [7]:
# Preview the first five rows after the column selection.
dataset.head(n=5)

Unnamed: 0,filename,target,category
0,1-100032-A-0.wav,0,dog
1,1-100038-A-14.wav,14,chirping_birds
2,1-100210-A-36.wav,36,vacuum_cleaner
3,1-100210-B-36.wav,36,vacuum_cleaner
4,1-101296-A-19.wav,19,thunderstorm


Add a path column that gives the path to the respective wav file

In [8]:
# Full location of every clip: "<audio folder>/<filename>".
dataset["path"] = dataset["filename"].radd(audio_folder_path + "/")

In [9]:
# Preview the first five rows, now including the path column.
dataset.head(n=5)

Unnamed: 0,filename,target,category,path
0,1-100032-A-0.wav,0,dog,/Users/karanhadiyal/Desktop/esc/archive/audio/...
1,1-100038-A-14.wav,14,chirping_birds,/Users/karanhadiyal/Desktop/esc/archive/audio/...
2,1-100210-A-36.wav,36,vacuum_cleaner,/Users/karanhadiyal/Desktop/esc/archive/audio/...
3,1-100210-B-36.wav,36,vacuum_cleaner,/Users/karanhadiyal/Desktop/esc/archive/audio/...
4,1-101296-A-19.wav,19,thunderstorm,/Users/karanhadiyal/Desktop/esc/archive/audio/...


# Convert wav file to spectrogram

Function to read wav file and return in form of numpy array

In [14]:
def read_wav(writeFile, target_len=220500):
    """Load a wav file and return it as a fixed-length ``(1, target_len)`` array.

    The file is decoded with ``librosa.load(..., sr=None)``, i.e. at the
    sample rate stored in the file; no resampling is requested. The waveform
    is then zero-padded at the end, or truncated, so that it holds exactly
    ``target_len`` samples.

    Parameters
    ----------
    writeFile : str or path-like
        Path of the wav file to read (parameter name kept for compatibility
        with existing callers).
    target_len : int, optional
        Number of samples in the returned clip. This is a sample *count*,
        not a sample rate. The default of 220500 corresponds to 5 s at
        44.1 kHz -- presumably the format of the dataset clips; confirm
        before using other audio.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(1, target_len)``.
    """
    # The native sample rate is returned as well but is not needed here.
    data, _ = librosa.load(writeFile, sr=None)

    # Allocate the output once in float32 for both the padded and the
    # truncated case. get_spectrum converts its input to float32 anyway, so a
    # float64 buffer would only double the memory held per clip.
    clip = np.zeros((1, target_len), dtype=np.float32)

    # Copy as many samples as fit; whatever is left of the buffer stays zero.
    n_copy = min(data.shape[0], target_len)
    clip[0, :n_copy] = data[:n_copy]
    return clip

Add "read_wav" attribute

In [15]:
# Decode every clip once and store its waveform array next to the metadata.
dataset["read_wav"] = dataset["path"].apply(func=read_wav)

Function to get spectrum from respective read_wav attribute

In [16]:
def get_spectrum(signal, w, flag, channels):
    """Compute a delay-difference "spectrum" of a signal, one column per window.

    The signal is cut into consecutive, non-overlapping windows of ``w``
    samples (trailing samples that do not fill a whole window are dropped).
    For every window and every delay ``d`` in ``channels`` the magnitude of
    the symmetric second difference ``|x[t-d] + x[t+d] - 2*x[t]|`` is
    averaged over ``t`` and divided by 4.

    Parameters
    ----------
    signal : numpy.ndarray
        2-D array; only row 0 is used. It is converted to float32.
    w : int
        Window length in samples.
    flag : {0, 1}
        0 -- plain mean of the absolute second differences.
        1 -- each difference is first divided by
        ``|x[t-d]| + |x[t+d]| + 2*|x[t]| + 1e-12`` (the small constant
        avoids division by zero on silent windows).
    channels : array-like of int
        Candidate delays in samples. Delays larger than ``w / 4`` are dropped.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(number of kept delays, number of windows)``.

    Raises
    ------
    ValueError
        If ``flag`` is neither 0 nor 1 (previously this silently produced an
        all-zero result).
    """
    if flag not in (0, 1):
        raise ValueError(f"flag must be 0 or 1, got {flag!r}")

    signal = signal.astype('float32')

    # Accept lists/tuples as well as arrays, then keep delays up to w / 4.
    channels = np.asarray(channels)
    channels = channels[channels <= w / 4]

    # One row per window; samples beyond the last full window are ignored.
    n_windows = np.size(signal, 1) // w
    frames = np.reshape(signal[0, :n_windows * w], (n_windows, w))

    spectrum = np.zeros((channels.shape[0], n_windows))
    for k, delay in enumerate(channels):
        # Centre positions whose two neighbours at +/- delay lie in the window.
        # NOTE(review): the upper bound w - delay - 1 is exclusive, so the last
        # sample of each window is never read. Kept as in the original to leave
        # the features unchanged -- confirm whether this is intended.
        t = np.arange(delay, w - delay - 1)

        # All windows are processed at once: each term is (n_windows, len(t)).
        left, mid, right = frames[:, t - delay], frames[:, t], frames[:, t + delay]
        difus = np.abs(left + right - 2 * mid)
        if flag == 1:
            difus = difus / (np.abs(left) + np.abs(right) + 2 * np.abs(mid) + 1e-12)

        spectrum[k] = np.mean(difus, axis=1) / 4
    return spectrum

In [17]:
# Delays (in samples) at which get_spectrum evaluates each window.
channels = np.asarray((2, 4, 8, 16, 20, 32, 50, 64, 100, 128, 200, 300))

Add spectrum attribute

In [32]:
# One spectrum per clip: 1000-sample windows, flag 0 (differences are not
# normalised), delays taken from `channels`.
dataset["spectrum"] = dataset["read_wav"].apply(
    get_spectrum, w=1000, flag=0, channels=channels
)

Prepare input data and labels

In [91]:
# Model inputs are the spectra; targets are the numeric class ids.
images, labels = dataset["spectrum"], dataset["target"]

Prepare train and test data

In [92]:
# Stack the per-clip spectra and the class ids into float32 arrays. Reading
# from `dataset` (instead of rebinding the Series held in `images`/`labels`)
# keeps this cell safe to re-run.
images = np.array(dataset["spectrum"].to_list(), dtype='float32')
labels = np.array(dataset["target"].to_list(), dtype='float32')

# Fixed seed -> the same 80/20 split on every run; stratify keeps the class
# proportions identical in the train and test parts.
# NOTE(review): the metadata also has `fold` and `src_file` columns; a random
# split can place different takes of the same source recording in both parts.
# Consider splitting by fold if that matters for the reported accuracy.
train_images, test_images, train_labels, test_labels = train_test_split(
    images, labels, test_size=0.2, random_state=42, stratify=labels
)

# Build and train model

Inspect the input shape and define the CNN

In [93]:
# Shape of a single spectrum: (kept delays, windows).
dataset["spectrum"].loc[0].shape

(11, 220)

In [95]:
# Input geometry comes from the first spectrum: x delays by y windows.
(x, y) = dataset["spectrum"].loc[0].shape
function = activations.selu  # activation shared by every convolution

# Three stages of "1x1 conv -> max-pool -> wider conv" with growing filter
# counts and kernel sizes, followed by global average pooling and a dense head.
# The final Dense(50) has no activation, so the network outputs raw scores.
model = models.Sequential([
    # Stage 1: 40 filters
    layers.Conv2D(filters=40, kernel_size=(1, 1), padding="same", activation=function,
                  input_shape=(x, y, 1)),
    layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
    layers.Conv2D(filters=40, kernel_size=(3, 3), padding="same", activation=function),

    # Stage 2: 50 filters, spatial size halved by the stride-2 pooling
    layers.Conv2D(filters=50, kernel_size=(1, 1), padding="same", activation=function),
    layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding="same"),
    layers.Conv2D(filters=50, kernel_size=(5, 5), padding="same", activation=function),

    # Stage 3: 60 filters, spatial size halved again
    layers.Conv2D(filters=60, kernel_size=(1, 1), padding="same", activation=function),
    layers.MaxPooling2D(pool_size=(4, 4), strides=(2, 2), padding="same"),
    layers.Conv2D(filters=60, kernel_size=(7, 7), padding="same", activation=function),

    # Head: one vector per clip, then 200 -> dropout -> 50 output scores
    layers.GlobalAveragePooling2D(),
    layers.Dense(200),
    layers.Dropout(0.25),
    layers.Dense(50),
])

In [96]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_38 (Conv2D)          (None, 11, 220, 40)       80        
                                                                 
 max_pooling2d_17 (MaxPoolin  (None, 11, 220, 40)      0         
 g2D)                                                            
                                                                 
 conv2d_39 (Conv2D)          (None, 11, 220, 40)       14440     
                                                                 
 conv2d_40 (Conv2D)          (None, 11, 220, 50)       2050      
                                                                 
 max_pooling2d_18 (MaxPoolin  (None, 6, 110, 50)       0         
 g2D)                                                            
                                                                 
 conv2d_41 (Conv2D)          (None, 6, 110, 50)       

In [97]:
# from_logits=True because the last Dense layer applies no activation; the
# sparse loss takes class ids as targets rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [98]:
# Train for 75 epochs; the test split doubles as the per-epoch validation set.
model.fit(
    x=train_images,
    y=train_labels,
    epochs=75,
    validation_data=(test_images, test_labels),
)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


Save the model

In [106]:
# Write the trained model to "model.h5" in the current working directory.
model.save(filepath="model.h5")

Test accuracy and test loss

In [107]:
# Score the trained model on the test split; with the compiled metrics,
# evaluate returns the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(x=test_images, y=test_labels, verbose=1)
print(test_acc)

0.699999988079071
