In [3]:
import os
import soundfile
import librosa
import numpy as np
import tensorflow as tf
import pickle
import pandas as pd
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)
from sklearn.model_selection import train_test_split

from src.features import feature_extraction_VGGish

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [7]:
%load_ext autoreload
%autoreload 2

## Functions

In [8]:
# %%writefile '/Users/greenapple/project3/src/features/feature_extraction_VGGish.py'

import os
import soundfile
import librosa
import numpy as np

import tensorflow as tf
from src.features import vggish_input
from src.features import vggish_params
from src.features import vggish_postprocess
from src.features import vggish_slim

# Alias for TF-Slim (TensorFlow 1.x contrib). Not referenced by the functions
# defined in this cell.
slim = tf.contrib.slim

# %load_ext autoreload
# %autoreload 2

def read_audio(path, target_fs=None):
    """Load an audio file as a mono waveform, optionally resampled.

    Args:
        path: Location of the audio file, passed to ``soundfile.read``.
        target_fs: Desired sample rate. When ``None`` (the default) or equal
            to the file's own rate, the samples are returned untouched.

    Returns:
        A ``(waveform, sample_rate)`` tuple. ``sample_rate`` is ``target_fs``
        when resampling took place, otherwise the rate stored in the file.
    """
    waveform, sample_rate = soundfile.read(path)

    # Anything with more than one dimension is averaged over axis 1 to get mono.
    is_multichannel = waveform.ndim > 1
    if is_multichannel:
        waveform = np.mean(waveform, axis=1)

    # Nothing else to do unless a different sample rate was requested.
    needs_resampling = target_fs is not None and sample_rate != target_fs
    if not needs_resampling:
        return waveform, sample_rate

    resampled = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_fs)
    return resampled, target_fs
    

# Feature extraction
def extract_audioset_embedding(audio_file):
    """Compute PCA-postprocessed VGGish embeddings for one audio file.

    The file is read as mono audio at ``vggish_params.SAMPLE_RATE``, converted
    to log-mel examples, pushed through the VGGish network restored from the
    checkpoint, and finally run through the VGGish post-processor.

    The network is built in a private ``tf.Graph`` with a session that is
    closed before returning. The previous version opened the session *before*
    calling ``tf.reset_default_graph()``, which left the session bound to the
    stale graph and leaked one session per call.

    Args:
        audio_file: File name inside the house-activities wav folder, or an
            absolute path (``os.path.join`` returns an absolute second
            argument unchanged).

    Returns:
        The array produced by ``Postprocessor.postprocess`` for the batch of
        embeddings (the recorded runs of this notebook show shapes such as
        ``(5, 128)`` and ``(10, 128)``).
    """
    sample_rate = vggish_params.SAMPLE_RATE

    # The hop between consecutive examples is set by EXAMPLE_HOP_SECONDS in
    # vggish_params.py.

    # Paths
    path = '/Users/greenapple/project3/data/house_activities_wavs/'
    audio_path = os.path.join(path, audio_file)
    checkpoint_path = '/Users/greenapple/project3/src/features/vggish_model.ckpt'
    pca_params_path = '/Users/greenapple/project3/src/features/vggish_pca_params.npz'

    # Read the audio first so a missing/corrupt file fails before the
    # comparatively expensive model restore.
    (audio, _) = read_audio(audio_path, target_fs=sample_rate)

    # Log-mel examples that are fed to the network.
    logmel = vggish_input.waveform_to_examples(audio, sample_rate)

    # Build and run the model in an isolated graph; the context managers
    # guarantee the session is released even if inference raises.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

        [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: logmel})

    # PCA post-processing
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)

    print('Audio length: {}'.format(len(audio)))
    print('Log mel shape: {}'.format(logmel.shape))
    print('Embedding feature shape: {}'.format(postprocessed_batch.shape))

    return postprocessed_batch

In [9]:
# Load pre-trained models
model_names = [
    'logreg1',
    'logreg2',
    'logreg3',
    'KNN',
    'NBmultinomial',
    'RF',
    'GBM'
]

model_fldr = '/Users/greenapple/project3/aws/models'

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
for model_name in model_names:
    model_path = os.path.join(model_fldr, f'{model_name}_10_cls_model_rand.pkl')
    # `with` closes the file even when unpickling fails.
    with open(model_path, 'rb') as model_file:
        # Bind the estimator to a notebook-level variable named after the
        # model (logreg1, KNN, ...) without resorting to exec(); later cells
        # refer to these names directly.
        globals()[model_name] = pickle.load(model_file)

In [10]:
# Converts an audio file to a dataframe with features that are ready for a model prediction
def audio_to_df(audio_file, audio_folder):
    """Turn one audio file into a single-row feature DataFrame.

    Args:
        audio_file: Name of the audio file.
        audio_folder: Folder that contains ``audio_file``.

    Returns:
        A one-row ``pandas.DataFrame`` with 640 columns: the first 5 embedding
        frames returned by ``extract_audioset_embedding``, flattened.

    Raises:
        ValueError: If the file yields fewer than 5 embedding frames (the clip
            is too short) or the frames cannot be reshaped to 640 values.
    """
    n_frames = 5       # frames kept from the start of the recording
    n_features = 640   # width of the flattened row (5 frames x 128 in the recorded runs)

    # Extract features
    audio_path = os.path.join(audio_folder, audio_file)
    features = extract_audioset_embedding(audio_path)

    # Fail with an actionable message instead of an opaque reshape error.
    if len(features) < n_frames:
        raise ValueError(
            f'{audio_file}: expected at least {n_frames} embedding frames, '
            f'got {len(features)} (recording too short)'
        )

    # Take data from the first frames of the wav file and flatten into one row
    features_row = features[:n_frames].reshape(1, n_features)

    return pd.DataFrame(features_row)

In [11]:
# Makes a dataframe with features extracted from the house activity audio files

def audio_list_to_df(audio_file_list, audio_folder):
    """Build a feature DataFrame with one row per audio file.

    Args:
        audio_file_list: Iterable of audio file names.
        audio_folder: Folder that contains the audio files.

    Returns:
        A ``pandas.DataFrame`` whose rows are the ``audio_to_df`` results in
        input order, re-indexed from 0. An empty input gives an empty frame.
    """
    rows = [audio_to_df(audio_file, audio_folder) for audio_file in audio_file_list]

    # pd.concat raises on an empty list; keep the original "empty in, empty
    # out" behaviour.
    if not rows:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(rows, ignore_index=True)

In [13]:
# Folder holding the home-activity recordings
audio_folder = '/Users/greenapple/project3/data/house_activities_wavs/'

# Home-activity sound files to run predictions on
audio_file_list = [
    '20191029_073112.wav',
    '20191029_073337.wav',
    '20191029_073459.wav',
    '20191025_073237.wav',
    '20191025_073858.wav',
    '20191025_074029.wav',
    '20191025_074057.wav',
]

# One feature row per recording
audio_df = audio_list_to_df(audio_file_list, audio_folder)

INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 80000
Log mel shape: (5, 96, 64)
Embedding feature shape: (5, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 160128
Log mel shape: (10, 96, 64)
Embedding feature shape: (10, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 80299
Log mel shape: (5, 96, 64)
Embedding feature shape: (5, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 160128
Log mel shape: (10, 96, 64)
Embedding feature shape: (10, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 160086
Log mel shape: (10, 96, 64)
Embedding feature shape: (10, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggi

In [14]:
# Takes in a list of sound files and models - returns predicted values

def sound_recognition(audio_files, models, folder=None):
    """Predict a sound class for every recording with every model.

    Args:
        audio_files: Mapping of column label -> audio file name.
        models: Mapping of model display name -> fitted estimator exposing
            ``predict``.
        folder: Folder containing the audio files. Defaults to the
            notebook-level ``audio_folder`` variable, which is what this
            function always used before the parameter existed.

    Returns:
        A ``pandas.DataFrame`` with one row per model: a leading ``'model'``
        column holding the model names, followed by one column per entry of
        ``audio_files`` holding that model's prediction for the recording.
    """
    if folder is None:
        folder = audio_folder

    labels = list(audio_files.keys())

    # Extract features (one row per recording)
    audio_df = audio_list_to_df(audio_files.values(), folder)

    # Predict: one row of predictions per model. The reshape doubles as a
    # check that the model returned exactly one prediction per recording.
    rows = [model.predict(audio_df).reshape(len(labels)) for model in models.values()]
    audio_df_pred = pd.DataFrame(rows, columns=labels)

    # Model names go in the first column
    audio_df_pred.insert(0, 'model', list(models.keys()))

    return audio_df_pred

## Predicting sound class for house recordings

In [28]:
# Predict sound class from a list of sound files

# List of home activities sound files for prediction
audio_file_list = [
    '20191029_micr1.wav',
    '20191029_073337_t.wav',
    '20191029_073112.wav',
    '20191029_073337.wav',
    '20191029_073459.wav',
    '20191025_073237.wav',
    '20191025_073858.wav',
    '20191025_074029.wav',
    '20191025_074057.wav'
]

audio_folder = '/Users/greenapple/project3/data/house_activities_wavs/'

# Home activities labels, in the same order as audio_file_list
audio_file_keys = [
    'micr1',
    'micr2',
    'vacuum cleaner',
    'microwave',
    'opening doors',
    'clarinet',
    'water',
    'human_cat_sound',
    'cat'
]

models = {
    'Logistic_regression1':logreg1,
    'Logistic_regression2':logreg2,
    'Logistic_regression3':logreg3,
    'KNN':KNN,
    'Naive_Bayes_multinomial':NBmultinomial,
    'Random_Forest':RF,
    'Gradient_boosting':GBM
}

# zip() silently truncates to the shorter list and dict() silently collapses
# duplicate keys, so check both before pairing labels with files.
assert len(audio_file_keys) == len(audio_file_list), 'every sound file needs exactly one label'
assert len(set(audio_file_keys)) == len(audio_file_keys), 'labels must be unique'

audio_files = dict(zip(audio_file_keys, audio_file_list))

audio_df_pred = sound_recognition(audio_files, models)

INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 89750
Log mel shape: (5, 96, 64)
Embedding feature shape: (5, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 96299
Log mel shape: (6, 96, 64)
Embedding feature shape: (6, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 80000
Log mel shape: (5, 96, 64)
Embedding feature shape: (5, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 160128
Log mel shape: (10, 96, 64)
Embedding feature shape: (10, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_model.ckpt
Audio length: 80299
Log mel shape: (5, 96, 64)
Embedding feature shape: (5, 128)
INFO:tensorflow:Restoring parameters from /Users/greenapple/project3/src/features/vggish_mod

In [29]:
# Classification of recorded sounds: one row per model, one column per
# recording label (the column names are the true activities).
audio_df_pred

Unnamed: 0,model,micr1,micr2,vacuum cleaner,microwave,opening doors,clarinet,water,human_cat_sound,cat
0,Logistic_regression1,footsteps,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow
1,Logistic_regression2,microwave,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow
2,Logistic_regression3,meow,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow
3,KNN,microwave,door,vacuum_cleaner,door,music,music,water_tap,water_tap,microwave
4,Naive_Bayes_multinomial,microwave,door,vacuum_cleaner,door,speech,clarinet,meow,speech,meow
5,Random_Forest,music,speech,vacuum_cleaner,speech,music,music,music,music,music
6,Gradient_boosting,water_tap,blender,vacuum_cleaner,blender,water_tap,music,water_tap,speech,speech


## Little voting classifier

In [30]:
# Training a voting or meta-classifier takes more than 8 hours, so build a small majority-vote classifier instead.

In [31]:
# Little voting classifier
def little_vote(df_pred):
    """Append a majority-vote row to a table of per-model predictions.

    For every column except ``'model'`` the vote is the most frequent value
    in that column; on a tie the first value returned by ``Series.mode()`` is
    used. The ``'model'`` cell of the new row is ``'little_VotingClassifier'``.

    Args:
        df_pred: DataFrame with one row per model.

    Returns:
        A new DataFrame: the rows of ``df_pred`` followed by the vote row,
        re-indexed from 0. ``df_pred`` itself is not modified.
    """
    vote_row = {
        column: 'little_VotingClassifier' if column == 'model' else df_pred[column].mode()[0]
        for column in df_pred.columns
    }

    # DataFrame.append was removed in pandas 2.0; concat is the supported
    # equivalent.
    return pd.concat([df_pred, pd.DataFrame([vote_row])], ignore_index=True)
    

In [32]:
# Add the majority-vote row to the per-model predictions and display the result
audio_df_pred_vc = little_vote(audio_df_pred)
audio_df_pred_vc

Unnamed: 0,model,micr1,micr2,vacuum cleaner,microwave,opening doors,clarinet,water,human_cat_sound,cat
0,Logistic_regression1,footsteps,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow
1,Logistic_regression2,microwave,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow
2,Logistic_regression3,meow,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow
3,KNN,microwave,door,vacuum_cleaner,door,music,music,water_tap,water_tap,microwave
4,Naive_Bayes_multinomial,microwave,door,vacuum_cleaner,door,speech,clarinet,meow,speech,meow
5,Random_Forest,music,speech,vacuum_cleaner,speech,music,music,music,music,music
6,Gradient_boosting,water_tap,blender,vacuum_cleaner,blender,water_tap,music,water_tap,speech,speech
7,little_VotingClassifier,microwave,blender,vacuum_cleaner,blender,meow,clarinet,water_tap,meow,meow


In [None]:
# Note: the "opening doors" recording was contaminated by cat sounds, which is consistent with the "meow" predictions for it above.

In [19]:
# Estimate model train time. Explore later.
from scitime import Estimator 