In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,Model
from keras.layers import Activation, Dense,Conv2D,Dropout,Activation,Flatten,MaxPooling2D
from tensorflow.keras.models import Sequential
import imblearn
from imblearn.over_sampling import RandomOverSampler,SMOTE
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import librosa
from sklearn.model_selection import train_test_split
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
from scipy.io.wavfile import write




In [None]:

# Dataset folders. The values below are descriptive stand-ins for the real directories.
train_path = 'Train dataset path'
test_path = 'test data set path'

# Class names for the audio files, listed alphabetically.
# predictSound / predictRecord index this list with the argmax of the model output.
classes = [
    'blue', 'change_color', 'green', 'high',
    'low', 'noise', 'off', 'on',
    'party', 'purple', 'red', 'sleep',
    'unknown', 'wakeup', 'white', 'yellow',
]


In [None]:
def features_extractorcnn(file_name, n_frames=44):
    """Load an audio file and return its mel spectrogram shaped for the CNN.

    Args:
        file_name: Path of an audio file readable by ``librosa.load``.
        n_frames: Number of time frames in the returned array. Spectrograms
            with fewer frames are zero-padded on the right; longer ones are
            cropped.

    Returns:
        The mel spectrogram with a trailing channel axis, i.e. an array of
        shape ``(mel_bands, n_frames, 1)``. With the librosa defaults used
        here that is ``(128, 44, 1)``.
    """
    audio, sample_rate = librosa.load(file_name)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate)

    # A plain reshape to (128, 44, 1) raises for any clip that does not yield
    # exactly 44 frames, so force the time axis to a fixed length first.
    missing = n_frames - mel_spectrogram.shape[1]
    if missing > 0:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, missing)), mode='constant')
    else:
        mel_spectrogram = mel_spectrogram[:, :n_frames]

    # Trailing axis of size 1 is the channel dimension expected by Conv2D.
    return mel_spectrogram[..., np.newaxis]

In [None]:

# Smoke test on a single clip: display the shape of the extracted feature array.
features_extractorcnn('blue.unknown.2uelmobe.ingestion-5b9bcd5b54-nclkz.wav').shape

(128, 44, 1)

In [None]:

# Extract the features of every file in the training and test folders and keep
# them in arrays for model training and prediction.
# Paths are built with os.path.join rather than a literal '\\' so they also
# resolve on non-Windows systems.
x = np.array([
    features_extractorcnn(join(train_path, f))
    for f in listdir(train_path)
    if isfile(join(train_path, f))
])
x_test = np.array([
    features_extractorcnn(join(test_path, f))
    for f in listdir(test_path)
    if isfile(join(test_path, f))
])



In [None]:
# The class name is the part of the file name before the first '.'.
# NOTE: this lists train_path a second time; the labels line up with the
# features in `x` only while both listings return the files in the same order.
labels = [f.split('.')[0] for f in listdir(train_path) if isfile(join(train_path, f))]

# Fail loudly on a label the model has no output unit for.
unexpected = sorted(set(labels) - set(classes))
if unexpected:
    raise ValueError(f"File names contain labels missing from `classes`: {unexpected}")

# One-hot encode against the fixed `classes` list so that column i always means
# classes[i] and there is one column per class even if a class has no files.
y = np.array(pd.get_dummies(pd.Categorical(labels, categories=classes)))



In [None]:
# Hold out 20% of the samples for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Oversample the training split because the dataset is unbalanced.
# 'not majority' resamples every class except the largest one; 'minority'
# would only resample the single rarest class and leave the others unbalanced.
# random_state makes the resampling repeatable across runs.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=0)

# The sampler works on 2-D input, so flatten each sample and restore the
# original per-sample shape afterwards.
x_over, y_over = oversample.fit_resample(x_train.reshape(len(x_train), -1), y_train)
x_over = x_over.reshape((len(x_over),) + x_train.shape[1:])



In [None]:
# Display the shape of the oversampled training features.
x_over.shape

(462, 128, 44, 1)

In [None]:

# Stop once validation accuracy has not improved for 3 epochs and keep the
# weights of the best epoch rather than those of the last one.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

model = keras.Sequential([
    # Convolutional part; the input shape is taken from the training data.
    layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu',
                  input_shape=x_over.shape[1:]),
    layers.Dropout(0.4),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D((2, 2)),

    # Dense part; one softmax unit per entry in `classes`.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(len(classes), activation='softmax'),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_over, y_over, epochs=10, validation_data=(x_val, y_val), callbacks=[callback])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2dfc1ba1d30>

<keras.callbacks.EarlyStopping at 0x1b73212baf0>

In [None]:
# Per-class softmax outputs for every test clip (one row per file, one column per class).
model.predict(x_test)

array([[4.1424915e-02, 2.6661947e-01, 1.1676506e-02, ..., 5.6115179e-03,
        1.4773860e-02, 6.1642868e-04],
       [5.0713264e-05, 7.8531597e-03, 8.1049884e-04, ..., 1.1979738e-03,
        1.4750972e-02, 2.2595678e-03],
       [8.1568267e-03, 2.5052436e-02, 1.5371993e-02, ..., 3.5901412e-03,
        8.0203693e-03, 1.0902823e-01],
       ...,
       [2.1205142e-07, 2.4664440e-04, 1.8867168e-11, ..., 5.7249540e-08,
        1.5243929e-08, 7.2283345e-14],
       [6.9529214e-03, 3.4139387e-02, 2.1249756e-02, ..., 2.6499940e-02,
        1.6465783e-02, 2.8134674e-02],
       [2.7536985e-03, 1.6786311e-02, 2.8536087e-02, ..., 1.1113281e-02,
        1.0527865e-02, 4.5975032e-03]], dtype=float32)

In [None]:
def predictSound(soundFile):
    """Return the predicted class name for the audio file ``soundFile``."""
    features = features_extractorcnn(soundFile)
    batch = features[np.newaxis, ...]  # add a leading batch axis of size 1
    scores = model.predict(batch)
    best_index = np.argmax(scores)
    return classes[best_index]
    
    

In [None]:
# Classify one sample clip; its file name starts with the label 'blue'.
predictSound('blue.unknown.2uelmobe.ingestion-5b9bcd5b54-nclkz.wav')

'blue'

In [None]:
def record():
    """Record one second of audio, play it back, and return its features.

    The recording is written to a temporary WAV file so that it goes through
    the same ``features_extractorcnn`` path as the dataset files.

    Returns:
        The array produced by ``features_extractorcnn`` for the recording.
    """
    import tempfile  # local import: only this function needs it

    fs = 22050
    duration = 1  # seconds
    print ("Recording Audio")
    myrecording = sd.rec(duration * fs, samplerate=fs, channels=1, dtype='float64')
    sd.wait()  # wait for the recording to finish before using the buffer
    print ("Audio recording complete , Play Audio")

    # A private temporary directory avoids overwriting an existing
    # ./output.wav and is removed even if feature extraction raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, 'output.wav')
        write(wav_path, fs, myrecording)
        x = features_extractorcnn(wav_path)

    sd.play(myrecording, fs)
    return x

Recording Audio
Audio recording complete , Play Audio


In [None]:
def predictRecord():
    """Record a clip with ``record`` and return its predicted class name."""
    features = record()
    batch = features[np.newaxis, ...]  # add a leading batch axis of size 1
    scores = model.predict(batch)
    return classes[np.argmax(scores)]

In [None]:
# Record from the microphone and display the predicted class.
predictRecord()

Recording Audio
Audio recording complete , Play Audio


'off'