# Machine Learning: Urban Sounds NaiveBayes Classifier

In [1]:
# Base
import numpy as np
import pandas as pd
import json
import re
import string
from os import listdir
import math
import time
import csv
import sys
import datetime

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn import metrics, cross_validation

from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer, OneHotEncoder, LabelEncoder

from sklearn.cross_validation import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

#from sknn.mlp import Regressor, Classifier, Layer
#from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Import Pre-Processed Wav File Data Set
wavData = pd.read_csv('feature_quant.csv')

In [3]:
wavData[0:5]

Unnamed: 0.1,Unnamed: 0,std,mean,power2,power7,power5,kurtosis,skewness,power10,power4,power9,power6,power1,power8,class,power3
0,100032.wav,5387.262558,-0.173201,0.218719,0.021339,0.107915,8.378661,0.290676,0.004058,0.286047,0.007275,0.049451,0.030802,0.011321,dog_bark,0.263073
1,100263.wav,172.622405,-2.877835,0.077798,0.102353,0.091911,10.631201,-0.037141,0.094223,0.112095,0.095986,0.145623,0.082727,0.108314,children_playing,0.08897
2,101729.wav,,,,,,,,,,,,,,air_conditioner,
3,101848.wav,1100.104596,-0.486906,0.170541,0.056053,0.109598,7.45959,-0.010953,0.053987,0.127636,0.0502,0.066393,0.156979,0.052119,street_music,0.156496
4,102102.wav,697.540407,-1.028463,0.058669,0.154061,0.125748,11.638605,0.204312,0.098656,0.118089,0.133428,0.113967,0.043734,0.075375,dog_bark,0.078275


In [4]:
# Remove Empty Rows
wavData = wavData[-np.isnan(wavData['mean'])]

In [5]:
feat = list(wavData.columns)
feat.remove('class')
feat.remove('Unnamed: 0')
feat

['std',
 'mean',
 'power2',
 'power7',
 'power5',
 'kurtosis',
 'skewness',
 'power10',
 'power4',
 'power9',
 'power6',
 'power1',
 'power8',
 'power3']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(wavData.loc[:,feat], wavData.loc[:,'class'], \
                                                    test_size=0.3, random_state=0)

In [37]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_predict = gnb.predict(X_test)

In [13]:
classes = set(wavData['class']); classes

{'air_conditioner',
 'car_horn',
 'children_playing',
 'dog_bark',
 'drilling',
 'engine_idling',
 'gun_shot',
 'jackhammer',
 'siren',
 'street_music'}

In [40]:
y_probs = gnb.predict_proba(X_test)
list(zip(y_probs[1], gnb.classes_))

[(0.041894552496326669, 'air_conditioner'),
 (0.053680435681601975, 'car_horn'),
 (0.31175245334618928, 'children_playing'),
 (0.019736285610748833, 'dog_bark'),
 (0.059446802262466153, 'drilling'),
 (0.093300648831557989, 'engine_idling'),
 (1.8605243121137103e-06, 'gun_shot'),
 (0.20499154300224065, 'jackhammer'),
 (0.024375700986609173, 'siren'),
 (0.19081971725794714, 'street_music')]

In [52]:
y_probs[0]

array([  2.98790581e-204,   0.00000000e+000,   0.00000000e+000,
         4.92943638e-195,   0.00000000e+000,   0.00000000e+000,
         1.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         2.09718511e-295])

In [90]:
y_logprobs = gnb.predict_log_proba(X_test)
zip(y_logprobs[1],gnb.classes_)

[(-3.1725994725271836, 'air_conditioner'),
 (-2.9247066700773932, 'car_horn'),
 (-1.165545824858123, 'children_playing'),
 (-3.9252964282977327, 'dog_bark'),
 (-2.822673445980552, 'drilling'),
 (-2.3719282169026661, 'engine_idling'),
 (-13.19465222172901, 'gun_shot'),
 (-1.5847865543422905, 'jackhammer'),
 (-3.71416850408675, 'siren'),
 (-1.6564261853808055, 'street_music')]

In [83]:
y_predict[1]

'children_playing'

In [19]:
print('Accuracy: %.2f' % accuracy_score(y_test,y_predict))
#print('Precision: %.2f' % precision_score(y_test,y_predict))
#print('Recall: %.2f' % recall_score(y_test,y_predict))
#print('F1: %.2f' % f1_score(y_test,y_predict))
confmat=confusion_matrix(y_true=y_test, y_pred=y_predict)
print(confmat)

Accuracy: 0.23
[[ 1  0  1  1  0  0  0  0  0  5]
 [ 0  0  4  3  0  1  2  2  0  8]
 [ 0  1  5  1  0  0  0  4  0 10]
 [ 1  2 10  9  3  2  5  6  0  5]
 [ 0  1  3  1  1  3  0  3  0  8]
 [ 1  0  1  1  1  3  0  1  0  3]
 [ 1  0  2  7  0  3  7  0  0  0]
 [ 0  0  0  0  0  0  0  2  0  2]
 [ 0  0  3  2  1  2  0  2  1  6]
 [ 1  1  1  0  0  2  0  2  0 13]]


In [61]:
# Set up loop to get Accuracy for each class as 1-vs-All
def runNBonevsall(var, DF, featList):
    # Create new response variable
    DF[var] = 0
    DF.loc[DF['class'] == var,var] = 1
    #feat = list(DF.columns)
    #print(feat)
    #feat.remove('class')
    #feat.remove('Unnamed: 0')
    #feat.remove(var)
    #print(feat)
    X_train, X_test, y_train, y_test = train_test_split(DF.loc[:,featList], DF.loc[:,var], \
                                                        test_size=0.35, random_state=0)
    print(var)
    
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_predict = gnb.predict(X_test)
    
    print('Accuracy: %.2f' % accuracy_score(y_test,y_predict))
    print('Precision: %.2f' % precision_score(y_test,y_predict))
    print('Recall: %.2f' % recall_score(y_test,y_predict))
    print('F1: %.2f' % f1_score(y_test,y_predict))
    
    confmat=confusion_matrix(y_true=y_test, y_pred=y_predict, labels = [1,0])
    print(confmat)
    print('\n')
    
    return {var:(accuracy_score(y_test,y_predict),precision_score(y_test,y_predict), \
                 recall_score(y_test,y_predict), f1_score(y_test,y_predict))}

In [62]:
resultsNB = [runNBonevsall(var, wavData, feat) for var in classes]

siren
Accuracy: 0.35
Precision: 0.09
Recall: 0.74
F1: 0.17
[[ 14   5]
 [135  60]]


air_conditioner
Accuracy: 0.31
Precision: 0.05
Recall: 0.88
F1: 0.09
[[  7   1]
 [146  60]]


children_playing
Accuracy: 0.48
Precision: 0.18
Recall: 0.96
F1: 0.30
[[ 24   1]
 [110  79]]


car_horn
Accuracy: 0.33
Precision: 0.12
Recall: 0.76
F1: 0.21
[[ 19   6]
 [137  52]]


gun_shot
Accuracy: 0.88
Precision: 0.45
Recall: 0.38
F1: 0.41
[[  9  15]
 [ 11 179]]


dog_bark
Accuracy: 0.28
Precision: 0.22
Recall: 0.80
F1: 0.35
[[ 41  10]
 [145  18]]


drilling
Accuracy: 0.36
Precision: 0.12
Recall: 0.82
F1: 0.21
[[ 18   4]
 [132  60]]


engine_idling
Accuracy: 0.33
Precision: 0.08
Recall: 0.93
F1: 0.15
[[ 13   1]
 [142  58]]


jackhammer
Accuracy: 0.40
Precision: 0.03
Recall: 1.00
F1: 0.06
[[  4   0]
 [128  82]]


street_music
Accuracy: 0.38
Precision: 0.13
Recall: 0.91
F1: 0.23
[[ 20   2]
 [131  61]]




In [23]:
# Set up loop to get Accuracy for each class as 1-vs-All
def runNBstacked(var, X_train, X_test, y_train, y_test):
    y_train = y_train[var]
    y_test = y_test[var] 
    
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_predict = gnb.predict(X_test)
    y_probs = gnb.predict_proba(X_test)
    
    probs1 = list(zip(*y_probs))[1]
    print(gnb.classes_)
    
    #print('Accuracy: %.2f' % accuracy_score(y_test,y_predict))
    #print('Precision: %.2f' % precision_score(y_test,y_predict))
    #print('Recall: %.2f' % recall_score(y_test,y_predict))
    #print('F1: %.2f' % f1_score(y_test,y_predict))
    
    #confmat=confusion_matrix(y_true=y_test, y_pred=y_predict, labels = [1,0])
    #print(confmat)
    #print('\n')
    
    #return {var:(accuracy_score(y_test,y_predict),precision_score(y_test,y_predict), \
    #             recall_score(y_test,y_predict), f1_score(y_test,y_predict))}
    return probs1

In [42]:
for var in classes:
    wavData[var] = 0
    wavData.loc[wavData['class'] == var,var] = 1

In [43]:
X_train, X_test, y_train, y_test = train_test_split(wavData.loc[:,feat], wavData.loc[:,list(classes)], \
                                                        test_size=0.3, random_state=0)

In [44]:
y_train[0:5]

Unnamed: 0,siren,gun_shot,children_playing,engine_idling,car_horn,drilling,jackhammer,air_conditioner,dog_bark,street_music
717,0,0,0,0,0,0,0,0,1,0
897,0,0,0,1,0,0,0,0,0,0
524,0,0,1,0,0,0,0,0,0,0
333,0,0,0,0,0,1,0,0,0,0
613,0,0,1,0,0,0,0,0,0,0


In [45]:
probsNB = {}
for var in classes:
    probsNB[var] = runNBstacked(var, X_train, X_test, y_train, y_test)

[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]


In [46]:
probsNB_DF = pd.DataFrame(probsNB)

In [47]:
probsNB_DF[0:5]

Unnamed: 0,air_conditioner,car_horn,children_playing,dog_bark,drilling,engine_idling,gun_shot,jackhammer,siren,street_music
0,5.116845e-124,0.0,0.0,1.535959e-127,0.0,0.0,1.0,0.0,0.0,4.2279190000000006e-219
1,0.9439267,0.9581969,0.9943343,0.9169558,0.965637,0.9765647,1e-05,0.9882577,0.9046644,0.9894239
2,7.694832e-129,3.061638e-96,6.360519e-07,3.564533e-36,0.0,5.179569e-182,1.0,6.819093999999999e-226,3.179036e-241,2.635884e-276
3,0.2970149,0.01679657,2.659761e-21,0.9557133,0.043617,0.9999642,6.1e-05,3.477917e-09,0.9867769,0.8826682
4,0.08654982,0.002069258,1.736786e-25,0.9611261,0.001361,0.99995,0.000486,5.917216e-12,0.9986835,0.4333


In [48]:
probsNB_DF['response'] = probsNB_DF.idxmax(axis=1)

In [49]:
y_predict = probsNB_DF['response']

In [50]:
y_predict[0:5]

0            gun_shot
1    children_playing
2            gun_shot
3       engine_idling
4       engine_idling
Name: response, dtype: object

In [51]:
y_test = y_test.idxmax(axis=1)

In [52]:
y_test[0:5]

885    dog_bark
77     drilling
830    gun_shot
634    drilling
80     dog_bark
dtype: object

In [53]:
print('Accuracy: %.2f' % accuracy_score(y_test,y_predict))
print('Precision: %.2f' % precision_score(y_test,y_predict))
print('Recall: %.2f' % recall_score(y_test,y_predict))
print('F1: %.2f' % f1_score(y_test,y_predict))

Accuracy: 0.24
Precision: 0.31
Recall: 0.24
F1: 0.21


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
