In [2]:
import numpy as np
import librosa
import librosa.display
import os
import matplotlib.pyplot as plt
import pyaudio
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
%matplotlib inline

In [97]:
class melCNN(object):
    def __init__(self, sec=3, state=None, label=None, useCNN=None):
        self.FORMAT = pyaudio.paFloat32
        self.SEC = sec
        self.STATE = state
        self.useCNN = useCNN
        
        self.LABEL = label                
        self.labels = {0 : "nothing",
                       1 : "background_noise",
                       2 : "doorbell",
                       3 : "fire_alarm",
                       4 : "hair_dry",
                       5 : "typing",
                       6 : "someone_talking",
                       7 : "baby_crying"}
        
        self.data_dir = os.path.dirname("data/")
        os.makedirs(self.data_dir, exist_ok=True)
        self.train_dir = os.path.join(self.data_dir, "train")
        os.makedirs(self.train_dir, exist_ok=True)        
        self.test_dir = os.path.join(self.data_dir, "test")
        os.makedirs(self.test_dir, exist_ok=True)
        self.model_dir = os.path.dirname("model/")
        os.makedirs(self.model_dir, exist_ok=True)
        
        if self.LABEL != None:
            self.label_dir = os.path.join(self.train_dir, self.labels[self.LABEL])
            os.makedirs(self.label_dir, exist_ok=True)
        
        self.CHANNELS = 1
        self.RATE = 44100
        self.n_fft = 1024
        self.hop_length = 1024
        self.n_mels = 128
        self.f_min = 20
        self.f_max = 8000
        
        self.count = 0
        self.data = None
        self.mel = None
        self.total_len = self.RATE * self.SEC
        self.total_data = np.zeros(self.total_len)
        
        self.pa = pyaudio.PyAudio()
            
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=False,
                                   frames_per_buffer=self.RATE)
        
        if self.STATE == None:
            pass
        else:
            self.loop()
        
    def loop(self):
        try:            
            while True:
                start = time.time()
                
                self.audioinput()
                decibel = self.pltmel()                
                
                if self.STATE == "test":
                    pred, acc = self.test(self.useCNN)
                    print(f"dB : {round(decibel)}\t{pred}\tAcc : {acc}")
                end = time.time()
                print(str(round(end-start, 3))+"\tsec")
                self.count += 1
                
        except KeyboardInterrupt:
            self.stream.stop_stream()
            self.stream.close()
            self.pa.terminate()            
        
    def audioinput(self):
        for i in range(self.SEC):
            self.data = self.stream.read(self.RATE, exception_on_overflow=False)
            self.data = np.fromstring(self.data, np.float32)
            self.total_data[:-self.RATE] = self.total_data[self.RATE:]
            self.total_data[-self.RATE:] = self.data
            
    def pltmel(self):
        self.mel = librosa.feature.melspectrogram(y=self.total_data,
                                                  sr=self.RATE,
                                                  n_fft=self.n_fft,
                                                  hop_length=self.hop_length,
                                                  n_mels=self.n_mels,
                                                  power=1.0,
                                                  fmin=self.f_min,
                                                  fmax=self.f_max)
        
        plt.rcParams["figure.figsize"] = (2.24, 2.24)
        plt.axis("off")
        plt.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[])
        #plt.imshow(librosa.power_to_db(self.mel, ref=np.max))        
        self.db_spec = librosa.power_to_db(self.mel, ref=np.max)
        decibel = abs(np.min(self.db_spec))
        librosa.display.specshow(self.db_spec, y_axis="mel", x_axis="time")
        
        if self.STATE == "save_data":
            plt.savefig(str(os.path.join(self.label_dir, "{:03}.jpg".format(self.count))), bbox_inches=None, pad_inches=0, dpi=100)
        elif self.STATE == "test":
            plt.savefig(str(os.path.join(self.test_dir, "{:03}.jpg".format(self.count))), bbox_inches=None, pad_inches=0, dpi=100)
        plt.clf()
        
        return decibel
        
    def train(self, useCNN=True, epochs=10, hidden=128):
        n_classes = len(self.labels)
        model_name = "model_"+str(n_classes)+"class"        
        data = []
        label = []
        for i in os.listdir(self.train_dir):
            print(i)
            for k,v in self.labels.items():                
                if i == v:                    
                    label_dir = os.path.join(self.train_dir, i)
                    images = os.listdir(label_dir)
                    for img in images:                
                        data.append(plt.imread(os.path.join(label_dir, img)))
                        label.append(k)
        data = np.array(data)
        label = np.array(label)
        data, label = shuffle(data, label)
        
        data = data / 255.0
        data = data.astype("float32")
        label = label.astype("float32")
        
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(data, label, test_size=0.3, random_state=0)

        model_name = self.make_model(model_name=model_name, useCNN=useCNN, hidden=hidden, n_classes=n_classes)
        self.train_history = self.model.fit(self.train_x, self.train_y,
                                            epochs=epochs,
                                            validation_data=(self.test_x, self.test_y))
                        
        self.model.save_weights(os.path.join(self.model_dir, model_name+".h5"))
        print("Saved model to disk")
        
    def test(self, useCNN=True):        
        if self.count == 0:
            n_classes = len(self.labels)
            model_name = "model_"+str(n_classes)+"class"            
            
            hidden = 128
            model_name = self.make_model(model_name=model_name, useCNN=useCNN, hidden=hidden, n_classes=n_classes)
            self.model.load_weights(os.path.join(self.model_dir, model_name+".h5"))
        
        img = plt.imread(os.path.join(self.test_dir, "{:03}.jpg".format(self.count)))
        img = (np.expand_dims(img,0))
        prediction = self.model.predict(img)
        result = np.argmax(prediction[0])
        return self.labels[result], prediction[0][result]
    
    def make_model(self, model_name, n_classes, useCNN=True, hidden=128):        
        if useCNN:
            model_name += "_CNN"
            self.model = keras.Sequential([    
                keras.layers.Conv2D(filters=30, kernel_size=(3,3), activation="relu", padding="valid", input_shape=(224, 224, 3)),
                keras.layers.MaxPooling2D(pool_size=(3,3)),
                keras.layers.Dropout(0.5),
                keras.layers.Conv2D(filters=30, kernel_size=(3, 3), activation="relu", padding="valid"),
                keras.layers.MaxPooling2D(pool_size=(3,3)),
                keras.layers.Flatten(),
                keras.layers.Dense(128, activation="relu"),
                keras.layers.Dense(n_classes, activation="softmax")
            ])
        else:
            self.model = keras.Sequential([
                keras.layers.Flatten(input_shape=(224, 224, 3)),
                keras.layers.Dense(hidden, activation="relu"),
                keras.layers.Dense(hidden, activation="relu"),
                keras.layers.Dense(n_classes, activation="softmax")
            ])
        self.model.compile(optimizer='adam',
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
        return model_name

In [85]:
'''
label 종류
0 : nothing
1 : background_noise
2 : doorbell (3음)
3 : fire_alarm
4 : hair_dry
5 : typing
6 : someone_talking
7 : baby_crying
'''
mel = melCNN(state="save_data", label=7)
#mel = melCNN()



3.343	sec
2.977	sec
2.996	sec
3.004	sec
3.05	sec
2.939	sec
2.993	sec
3.003	sec
2.981	sec
3.011	sec
2.991	sec
3.029	sec
2.967	sec
2.995	sec
3.011	sec
2.985	sec
2.996	sec
3.014	sec
3.005	sec
2.993	sec
2.989	sec
3.01	sec
3.015	sec
2.96	sec
3.004	sec
2.982	sec
3.009	sec
3.014	sec
2.995	sec
3.014	sec
2.992	sec
2.997	sec
2.986	sec
3.008	sec
3.001	sec
2.99	sec
2.978	sec
3.029	sec
2.98	sec
3.008	sec
3.228	sec
2.793	sec
2.968	sec
3.009	sec
2.986	sec
2.995	sec
3.025	sec
2.978	sec
3.055	sec
3.01	sec
2.981	sec
3.024	sec
3.01	sec
2.923	sec
2.994	sec
2.999	sec
2.995	sec
2.989	sec
2.998	sec
3.013	sec
2.994	sec
3.006	sec
3.007	sec
3.012	sec
2.981	sec
3.007	sec
2.991	sec
3.016	sec
2.977	sec
3.013	sec
2.969	sec
3.036	sec
2.97	sec
2.991	sec
3.009	sec
3.005	sec
3.006	sec
3.006	sec
2.972	sec
3.022	sec
2.97	sec
3.009	sec
3.001	sec
2.995	sec
3.023	sec
3.009	sec
3.002	sec
2.974	sec
3.032	sec
2.995	sec
2.964	sec
3.016	sec
2.963	sec
3.033	sec
3.019	sec
2.992	sec
2.975	sec
2.983	sec
3.048	sec
2.973	sec
3.009	sec

<Figure size 161.28x161.28 with 0 Axes>

In [98]:
mel = melCNN()
mel.train(epochs=20)

baby_crying
background_noise
doorbell
fire_alarm
hair_dry
nothing
someone_talking
typing
Train on 1680 samples, validate on 720 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Saved model to disk


In [79]:
mel = melCNN()
mel.train(useCNN=False, epochs=20)

background_noise
doorbell
fire_alarm
hair_dry
nothing
someone_talking
typing
Train on 1470 samples, validate on 630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Saved model to disk


In [15]:
mel.train_history.history['val_acc']

[0.55925924,
 0.84814817,
 0.9259259,
 0.9851852,
 0.9962963,
 0.9962963,
 0.9962963,
 0.9962963,
 0.9962963,
 0.9962963]

In [99]:
mel_test = melCNN(state="test", useCNN=True)



dB : 34.0	background_noise	Acc : 1.0
5.098	sec
dB : 30.0	background_noise	Acc : 1.0
2.098	sec
dB : 41.0	background_noise	Acc : 1.0
2.994	sec
dB : 32.0	background_noise	Acc : 1.0
2.99	sec
dB : 29.0	background_noise	Acc : 1.0
2.993	sec
dB : 34.0	background_noise	Acc : 1.0
3.009	sec
dB : 34.0	background_noise	Acc : 1.0
3.029	sec
dB : 34.0	background_noise	Acc : 1.0
3.077	sec
dB : 34.0	hair_dry	Acc : 1.0
2.903	sec
dB : 35.0	hair_dry	Acc : 1.0
2.987	sec
dB : 35.0	hair_dry	Acc : 1.0
3.001	sec
dB : 42.0	hair_dry	Acc : 1.0
3.013	sec
dB : 37.0	hair_dry	Acc : 1.0
3.004	sec
dB : 34.0	hair_dry	Acc : 1.0
2.987	sec
dB : 38.0	hair_dry	Acc : 1.0
3.09	sec
dB : 30.0	background_noise	Acc : 1.0
2.909	sec
dB : 46.0	fire_alarm	Acc : 1.0
2.993	sec
dB : 55.0	fire_alarm	Acc : 1.0
3.019	sec
dB : 42.0	fire_alarm	Acc : 1.0
2.984	sec
dB : 35.0	fire_alarm	Acc : 1.0
2.98	sec
dB : 30.0	nothing	Acc : 1.0
3.095	sec
dB : 41.0	doorbell	Acc : 1.0
2.916	sec
dB : 43.0	doorbell	Acc : 1.0
3.0	sec
dB : 46.0	doorbell	Acc : 1.0


<Figure size 161.28x161.28 with 0 Axes>

In [82]:
mel_test = melCNN(state="test", useCNN=False)



dB : 39.0	background_noise	Acc : 1.0
4.467	sec
dB : 30.0	nothing	Acc : 1.0
2.056	sec
dB : 45.0	background_noise	Acc : 1.0
3.0	sec
dB : 45.0	typing	Acc : 1.0
2.997	sec
dB : 47.0	doorbell	Acc : 1.0
2.992	sec
dB : 50.0	typing	Acc : 1.0
3.015	sec
dB : 46.0	someone_talking	Acc : 1.0
2.997	sec
dB : 42.0	typing	Acc : 1.0
2.966	sec
dB : 43.0	background_noise	Acc : 1.0
3.004	sec
dB : 44.0	typing	Acc : 1.0
3.021	sec
dB : 44.0	doorbell	Acc : 1.0
2.989	sec
dB : 47.0	typing	Acc : 1.0
2.995	sec
dB : 44.0	typing	Acc : 1.0
3.025	sec
dB : 29.0	nothing	Acc : 1.0
2.975	sec
dB : 26.0	nothing	Acc : 1.0
3.004	sec
dB : 50.0	someone_talking	Acc : 1.0
3.014	sec
dB : 47.0	background_noise	Acc : 1.0
2.987	sec
dB : 36.0	nothing	Acc : 1.0
3.002	sec
dB : 30.0	nothing	Acc : 1.0
3.007	sec


<Figure size 161.28x161.28 with 0 Axes>