In [None]:
import os
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Reshape, Flatten
from keras.layers import Activation, UpSampling2D, Conv2D
from keras.layers.merge import _Merge
from keras.layers.convolutional import Convolution2D, Conv2DTranspose
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam
from keras.datasets import mnist
from keras.callbacks import TensorBoard
from keras.utils.np_utils import to_categorical
from keras import backend as K
from functools import partial
import tensorflow as tf
import pyaudio
import wave
import pydub
from pydub import AudioSegment
from pydub.silence import split_on_silence
import os
import re
import glob

import MySQLdb

import sys
sys.path.append("..") 
from utils import audio_tools as audio

In [9]:
# Training data archive (.npz) generated with preprocess_audio.py.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATAPATH = "/Users/jin/Desktop/DATA/training_data.npz"

# Path to the trained discriminator weights. No generator weight path is
# defined in this cell.
D_WEIGHT_PATH ="/Users/jin/Desktop/DATA/discriminator_epoch_1200_-2.32.h5"

In [10]:
# Open the training archive once and close it again when done. Indexing the
# archive materialises each array, so the values stay usable after the file
# handle is released.
with np.load(DATAPATH) as _training_data:
    # category info
    CATEGORIES = _training_data["category_names"]

    # statistics used for denormalizing mel-spectrograms (see denormalize)
    mel_means = _training_data["mean"]
    mel_stds = _training_data["std"]

nb_categories = len(CATEGORIES)


In [11]:
def make_discriminator(nb_categories):
    """Build the two-headed discriminator network.

    Two strided 5x5 convolutions, each followed by a LeakyReLU, are flattened
    and fed into two dense heads:

    * ``real_fake``  -- a single unit; no activation argument is given.
    * ``categories`` -- ``nb_categories`` units with a softmax activation.

    Args:
        nb_categories: number of units of the ``categories`` head.

    Returns:
        A Keras ``Model`` taking a ``(128, 128, 1)`` input and producing the
        list ``[real_fake, categories]``.
    """
    base_filters = 64  # model size
    spectrogram_in = Input(shape=(128, 128, 1))

    # Convolutional trunk (layers are created in the same order as before so
    # that previously saved weights still line up).
    features = Conv2D(base_filters, (5, 5), strides=(2, 2), padding='same')(spectrogram_in)
    features = LeakyReLU(alpha=0.2)(features)
    features = Conv2D(base_filters * 2, (5, 5), strides=(2, 2),
                      kernel_initializer='he_normal', padding='same')(features)
    features = LeakyReLU(alpha=0.2)(features)
    features = Flatten()(features)

    # Output heads.
    real_fake = Dense(1, kernel_initializer='he_normal', name='real_fake')(features)
    categories = Dense(nb_categories, kernel_initializer='he_normal',
                       name='categories', activation='softmax')(features)

    return Model(spectrogram_in, [real_fake, categories])

# Only the discriminator is built here. The generator construction below is
# commented out, so this cell does not create a `generator` object.
#generator = make_generator()
discriminator = make_discriminator(nb_categories)

# Load the trained discriminator weights (the generator counterpart is disabled).
#generator.load_weights(G_WEIGHT_PATH)
discriminator.load_weights(D_WEIGHT_PATH)

In [12]:
import librosa
from librosa import display
import matplotlib.pyplot as plt
reload(audio)
import IPython
from IPython.display import Audio
from IPython.display import clear_output

# Minimum real/fake score the discriminator must give a sound for it to be
# treated as real; used as the default `thresh` of classify_drums and
# classify_sound.
CONFIDENCE_THRESH = 0.80

def denormalize(norm_s):
    """Undo the mel-spectrogram normalization.

    Computes ``norm_s * (3.0 * mel_stds) + mel_means`` with the module-level
    statistics ``mel_means`` and ``mel_stds``.

    Args:
        norm_s: normalized spectrogram array.

    Returns:
        The rescaled array.

    Raises:
        AssertionError: if the first dimension of ``norm_s`` differs from the
            first dimension of ``mel_means``.
    """
    n_rows = norm_s.shape[0]
    assert n_rows == mel_means.shape[0]

    scale = 3.0 * mel_stds
    return (norm_s * scale) + mel_means

def save_audio(y, path, category=None, stereo=False):
    """Turn normalized mel-spectrogram data into audio and write it to ``path``.

    Singleton dimensions of ``y`` are squeezed away first. Each spectrogram is
    denormalized, inverted with ``audio.inv_melspectrogram`` and written with
    ``audio.save_wav``.

    Args:
        y: normalized spectrogram data. With ``stereo=True`` the squeezed
            array must have a first dimension of length 2 (one entry per
            channel).
        path: destination handed to ``audio.save_wav``.
        category: accepted for API compatibility; not used by this function.
        stereo: when true, the two channels are converted separately, stacked
            and transposed before saving.

    Raises:
        AssertionError: in stereo mode, if the squeezed array does not have a
            first dimension of length 2.
    """
    spec = np.squeeze(y)

    if not stereo:
        audio.save_wav(audio.inv_melspectrogram(denormalize(spec)), path)
        return

    assert spec.shape[0] == 2
    # One waveform per channel, in channel order.
    waveforms = [audio.inv_melspectrogram(denormalize(channel)) for channel in spec]
    # Stack the channels as rows, then transpose before handing to save_wav.
    audio.save_wav(np.transpose(np.vstack(waveforms)), path)
    
def show_spec(S, display=False):
    """Plot a spectrogram and optionally render it as playable audio.

    If the first dimension of ``S`` has length 2, only ``S[0]`` is used.
    Singleton dimensions are squeezed away before the array is drawn with
    ``librosa.display.specshow`` on a 10x4 figure.

    Args:
        S: spectrogram array.
        display: when true, the plotted spectrogram is also converted with
            ``save_audio`` into "/tmp/test.wav" and shown as an IPython
            ``Audio`` object.
    """
    spec = np.squeeze(S[0] if S.shape[0] == 2 else S)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spec)
    plt.show()

    if not display:
        return

    wav_path = "/tmp/test.wav"
    save_audio(spec, wav_path)
    IPython.display.display(Audio(wav_path))

def classify_drums(w, thresh=CONFIDENCE_THRESH):
    """Classify one spectrogram with the discriminator.

    ``w`` is squeezed and then given a leading and a trailing singleton axis
    before being passed to ``discriminator.predict``, whose two outputs are
    the real/fake score and the per-category scores.

    Args:
        w: spectrogram array; must be two-dimensional once squeezed.
        thresh: the real/fake score has to be strictly greater than this for
            the sound to be accepted.

    Returns:
        The index of the highest category score if the sound is accepted,
        otherwise ``-1``.
    """
    batch = np.squeeze(w)[np.newaxis, :, :, np.newaxis]
    real_score, category_scores = discriminator.predict([batch])
    return np.argmax(category_scores) if float(real_score) > thresh else -1

def generate_random_sound(max_try= 100):
    """Generate random sounds until the discriminator accepts one.

    Each attempt feeds a ``(1, 100)`` array of ``np.random.rand`` values to
    ``generator.predict`` and classifies the first result with
    ``classify_drums``. The first accepted sound is saved as
    ``./drums_<category>.wav``, its category name is printed and its
    spectrogram is shown with audio playback; then the function stops.
    If no attempt is accepted, nothing is saved.

    NOTE(review): ``generator`` must exist at module scope. The lines that
    would build it are commented out in this notebook, so calling this
    function as-is raises ``NameError``.

    Args:
        max_try: maximum number of generation attempts.
    """
    for _ in range(max_try):
        seed = np.random.rand(1, 100)
        w = generator.predict(seed)
        id_category = classify_drums(w[0])

        # classify_drums returns -1 for rejected sounds; skip them before
        # touching CATEGORIES so the sentinel is never used as an index.
        if id_category < 0:
            continue

        category = CATEGORIES[id_category]
        save_audio(w, "./drums_%s.wav" % category, category=category)
        # Parenthesised form works on both Python 2 and Python 3.
        print(category)
        show_spec(w, True)
        break

# Height and width of the spectrogram fed to the discriminator.
IMAGE_SIZE = 128
def classify_sound(w, thresh=CONFIDENCE_THRESH):
    """Score an audio file with the discriminator's real/fake output.

    The file is loaded at 16 kHz, converted with ``audio.melspectrogram``,
    padded on the right with columns of -80 and cropped to
    ``IMAGE_SIZE`` columns, then passed to ``discriminator.predict``.

    NOTE(review): the spectrogram is passed on exactly as
    ``audio.melspectrogram`` returns it. If the discriminator was trained on
    normalized data (cf. ``denormalize``), confirm that the helper already
    normalizes its output.

    Args:
        w: path of the audio file to score.
        thresh: accepted for signature parity with ``classify_drums``; not
            used by this function.

    Returns:
        The real/fake score as a ``float``.

    Raises:
        AssertionError: if the mel-spectrogram does not have ``IMAGE_SIZE``
            rows.
    """
    y, _ = librosa.core.load(w, sr=16000)

    db_mel = audio.melspectrogram(y)
    assert db_mel.shape[0] == IMAGE_SIZE

    # Right-pad with a full block of -80 so that clips with fewer than
    # IMAGE_SIZE columns are filled up, then crop to exactly IMAGE_SIZE columns.
    # (-80 is presumably the dB floor of the spectrogram -- TODO confirm.)
    padding = np.ones((IMAGE_SIZE, IMAGE_SIZE)) * (-80)
    frame = np.hstack((db_mel, padding))[:, :IMAGE_SIZE]

    # Add the leading and trailing singleton axes expected by the model input.
    batch = np.squeeze(frame)[np.newaxis, :, :, np.newaxis]

    # Only the real/fake output is needed; the category scores are ignored.
    real_score, _ = discriminator.predict([batch])
    return float(real_score)


In [13]:
## scanning split files
def calaverage(dirname):
    """Average the discriminator scores of all ``.wav`` files in a directory.

    Every directory entry whose extension is exactly ``.wav`` is scored with
    ``classify_sound``; each score is truncated with ``int`` before being
    averaged. The quotient is computed with ``/``, so it is an integer (floor
    division) under Python 2 and a float under Python 3.

    Args:
        dirname: directory to scan (not recursive).

    Returns:
        The average score, or the sentinel ``-99`` when the directory cannot
        be listed, contains no ``.wav`` file, or a file fails to be scored.
    """
    try:
        scores = [
            int(classify_sound(os.path.join(dirname, filename)))
            for filename in os.listdir(dirname)
            if os.path.splitext(filename)[-1] == '.wav'
        ]
    except Exception:
        # Best effort: any ordinary failure maps to the sentinel. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt / SystemExit through,
        # so a long-running caller can still be interrupted.
        return -99

    if not scores:
        # Nothing to average.
        return -99

    return sum(scores) / len(scores)

def remove_sound(dirname='seungjin'):
    """Delete everything inside ``dirname``; the directory itself is kept.

    Files are unlinked and sub-directories are removed recursively. A missing
    directory is silently ignored, because ``os.walk`` yields nothing for it.

    Args:
        dirname: directory to empty. Defaults to ``'seungjin'``, the directory
            this function always used before the parameter was added.
    """
    # Local import: the notebook's import cells do not import shutil, which
    # previously made the rmtree branch fail with NameError.
    import shutil

    for root, dirs, files in os.walk(dirname):
        for f in files:
            os.unlink(os.path.join(root, f))
        for d in dirs:
            target = os.path.join(root, d)
            if os.path.islink(target):
                # rmtree refuses symbolic links; remove the link itself.
                os.unlink(target)
            else:
                shutil.rmtree(target)
        # Everything below `root` is gone now; keep os.walk from descending
        # into directories that no longer exist.
        del dirs[:]

# Sound data recognition

In [21]:
# ---------------------------------------------------------------------------
# Live recognition loop: record a short clip, split it on silence, score the
# pieces with the discriminator and upload the average score to MySQL.
# Runs until interrupted.
# ---------------------------------------------------------------------------

# Recording settings (hoisted out of the loop: they never change).
CHUNK = 128
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
RECORD_SECONDS = 3
WAVE_OUTPUT_FILENAME = "output.wav"

# Segmentation settings.
SPLIT_DIR = "seungjin"        # scanned by calaverage, emptied by remove_sound
MIN_SILENCE_LEN = 100         # min_silence_len passed to split_on_silence
FIRST_SILENCE_THRESH = -17    # silence_thresh of the first pass
RETRY_SILENCE_THRESH = -22    # silence_thresh of the second pass
MIN_ACCEPTED_AVERAGE = 150    # a first-pass average below this triggers the second pass

# SECURITY / FIXME: credentials must not live in a notebook. They can now be
# supplied through environment variables; the literals are kept only as
# fallbacks so existing setups keep working. Rotate the password and delete
# the fallbacks.
DB_HOST = os.environ.get("DRUM_DB_HOST", "smurf1213.cafe24.com")
DB_USER = os.environ.get("DRUM_DB_USER", "smurf1213")
DB_PASSWORD = os.environ.get("DRUM_DB_PASSWORD", "1q2w3e4r!")
DB_NAME = os.environ.get("DRUM_DB_NAME", "smurf1213")


def _record_clip(path):
    """Record RECORD_SECONDS of audio from the input stream into a WAV file at `path`."""
    pa = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = pa.get_sample_size(FORMAT)
        stream = pa.open(format=FORMAT,
                         channels=CHANNELS,
                         rate=RATE,
                         input=True,
                         frames_per_buffer=CHUNK)
        try:
            n_chunks = int(RATE / CHUNK * RECORD_SECONDS)
            frames = [stream.read(CHUNK) for _ in range(n_chunks)]
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    wf = wave.open(path, 'wb')
    try:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    finally:
        wf.close()


def _split_into_segments(path, silence_thresh):
    """Split the WAV at `path` on silence and export each piece into SPLIT_DIR."""
    # The export below fails if the target directory is missing.
    if not os.path.isdir(SPLIT_DIR):
        os.makedirs(SPLIT_DIR)

    sound_file = AudioSegment.from_wav(path)
    segments = split_on_silence(sound_file,
                                min_silence_len=MIN_SILENCE_LEN,
                                silence_thresh=silence_thresh)
    for i, segment in enumerate(segments):
        segment.export(os.path.join(SPLIT_DIR, "jin{0}.wav".format(i)), format="wav")


def _upload_average(ave):
    """Insert the average score into the RaspberryPi table."""
    db = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
    try:
        cursor = db.cursor()
        # Parameterized query; int() mirrors the former '%d' formatting.
        cursor.execute("INSERT INTO RaspberryPi(value) VALUES (%s)", (int(ave),))
        db.commit()
    finally:
        # Close the connection even when the insert fails.
        db.close()


while True:
    _record_clip(WAVE_OUTPUT_FILENAME)

    try:
        # First pass with the stricter silence threshold.
        _split_into_segments(WAVE_OUTPUT_FILENAME, FIRST_SILENCE_THRESH)
        ave = calaverage(SPLIT_DIR)

        if ave < MIN_ACCEPTED_AVERAGE:
            # Score too low (or scoring failed): retry with the looser threshold.
            remove_sound()
            _split_into_segments(WAVE_OUTPUT_FILENAME, RETRY_SILENCE_THRESH)
            ave = calaverage(SPLIT_DIR)

        _upload_average(ave)
    finally:
        # Always clear the segment directory before the next recording.
        remove_sound()


KeyboardInterrupt: 