In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import seaborn as sns
sns.set()
path = '/home/huseinzol05/Documents/UrbanSound8K/audio/fold'

In [2]:
dataset = pd.read_csv('/home/huseinzol05/Documents/UrbanSound8K/metadata/UrbanSound8K.csv')
dataset = dataset.iloc[np.random.permutation(len(dataset))]
dataset.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
6302,39854-5-1-4.wav,39854,146.073254,150.073254,1,6,5,engine_idling
1813,145611-6-3-0.wav,145611,17.855555,18.975026,2,5,6,gun_shot
8274,84359-2-0-1.wav,84359,0.5,4.5,2,2,2,children_playing
2944,162134-7-5-0.wav,162134,120.266429,122.246864,1,10,7,jackhammer
4426,182739-2-0-17.wav,182739,8.5,12.5,1,2,2,children_playing


In [3]:
def extract_feature(path, t):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfcss = librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40)
    chroma = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    mel = librosa.feature.melspectrogram(Y, sr = sample_rate)
    contrast = librosa.feature.spectral_contrast(S = stft, sr = sample_rate)
    tonnetz = librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate)
    return mfcss[:, :t], chroma[:, :t], mel[:, :t], contrast[:, :t], tonnetz[:, :t]

def full_extract_feature(path, t):
    Y, sample_rate = librosa.load(path)
    stft = np.abs(librosa.stft(Y))
    mfcss = librosa.feature.mfcc(y = Y, sr = sample_rate, n_mfcc = 40)
    rmse = librosa.feature.rmse(y = Y)
    chroma_stft = librosa.feature.chroma_stft(S = stft, sr = sample_rate)
    chroma_cqt = librosa.feature.chroma_cqt(C = stft, sr = sample_rate)
    chroma_cens = librosa.feature.chroma_cens(C = stft, sr = sample_rate)
    mel = librosa.feature.melspectrogram(Y, sr = sample_rate)
    contrast = librosa.feature.spectral_contrast(S = stft, sr = sample_rate)
    centroid = librosa.feature.spectral_centroid(S = stft, sr = sample_rate)
    rolloff = librosa.feature.spectral_rolloff(S = stft, sr = sample_rate)
    bandwidth = librosa.feature.spectral_bandwidth(S = stft, sr = sample_rate)
    tonnetz = librosa.feature.tonnetz(y = librosa.effects.harmonic(Y), sr = sample_rate)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y = Y)
    return mfcss[:, :t], rmse[:, :t], chroma_stft[:, :t], chroma_cqt[:, :t], chroma_cens[:, :t], mel[:, :t], contrast[:, :t], centroid[:, :t], rolloff[:, :t], bandwidth[:, :t], tonnetz[:, :t], zero_crossing_rate[:, :t]

def parse_audio_file(dataset, shape, t = 60, full_extract = False, normalize = True):
    features, labels = np.empty((0, t, shape)), []
    dataset = dataset.ix[:, :].values
    for i in xrange(dataset.shape[0]):
        try:
            p = path + str(dataset[i, -3]) + '/' + str(dataset[i, 0])
            if full_extract:
                mfcss, rmse, chroma_stft, chroma_cqt, chroma_cens, mel, contrast, centroid, rolloff, bandwidth, tonnetz, zero_crossing_rate = full_extract_feature(p, t)
                ext_features = np.hstack([mfcss.T, rmse.T, chroma_stft.T, chroma_cqt.T, chroma_cens.T, mel.T, contrast.T, centroid.T, rolloff.T, bandwidth.T, tonnetz.T, zero_crossing_rate.T])
            else:
                mfcss, chroma, mel, contrast, tonnetz = extract_feature(p, t)
                ext_features = np.hstack([mfcss.T, chroma.T, mel.T, contrast.T, tonnetz.T])
            
            features = np.vstack([features, np.array([ext_features])])
            labels.append(int(dataset[i, -2]))
        except Exception as e:
            print e
            print 'skipped: ' + str(dataset[i, :])
            continue
    
    if normalize:
        features = (features - features.min()) / (features.max() - features.min())
        
    return features, labels
        

def one_hot(labels, shape):
    onehot = np.zeros((len(labels), shape))
    for i in xrange(onehot.shape[0]):
        onehot[i, labels[i]] = 1.0
    return onehot



In [4]:
class Model:
    
    def __init__(self, num_layers, size_layer, dimension_input, dimension_output, learning_rate, delta):
        
        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(size_layer, activation = tf.nn.relu)
        
        self.rnn_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in xrange(num_layers)])
        
        self.X = tf.placeholder(tf.float32, [None, None, dimension_input])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        self.outputs, self.last_state = tf.nn.dynamic_rnn(self.rnn_cells, self.X, dtype = tf.float32)
        
        self.rnn_W = tf.Variable(tf.random_normal((size_layer, dimension_output)))
        self.rnn_B = tf.Variable(tf.random_normal([dimension_output]))
        
        self.logits = tf.matmul(self.outputs[-1], self.rnn_W) + self.rnn_B
        
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        
        l2 = sum(delta * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        
        self.cost += l2
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

In [5]:
num_layers = 2
size_layer = 256
learning_rate = 0.001
EPOCH = 10
BATCH_SIZE = 128
delta = 0.00005
FULL_EXTRACT = False
period = 30
dimension_output = np.unique(dataset['classID']).shape[0]
dimension_input = 222 if FULL_EXTRACT else 193

In [None]:
sess = tf.InteractiveSession()
model = Model(num_layers, size_layer, dimension_input, dimension_output, learning_rate, delta)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())

train_dataset = dataset.iloc[:int(dataset.shape[0] * 0.80), :]
test_dataset = dataset.iloc[int(dataset.shape[0] * 0.80):, :]

In [None]:
import time
from sklearn import metrics

ACCURACY, ACCURACY_TEST, LOST = [], [], []

for i in xrange(EPOCH):
    total_cost = 0; total_accuracy = 0; last_time = time.time()
    
    for x in xrange(0, (train_dataset.shape[0] // BATCH_SIZE) * BATCH_SIZE, BATCH_SIZE):
        print 'current batch: ' + str(x)
        features, labels = parse_audio_file(train_dataset.iloc[x : x + BATCH_SIZE], shape = dimension_input, t = period)
        batch_x = np.zeros((period, features.shape[0], dimension_input))
        for k in xrange(features.shape[0]):
            batch_x[:, k, :] = features[k, :, :]
        onehot = one_hot(labels, dimension_output)
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = {model.X : batch_x, model.Y : onehot})
        total_accuracy += sess.run(model.accuracy, feed_dict = {model.X : batch_x, model.Y : onehot})
        total_cost += loss
    
    diff = time.time() - last_time
    total_accuracy /= (train_dataset.shape[0] // BATCH_SIZE)
    total_cost /= (train_dataset.shape[0] // BATCH_SIZE)
    ACCURACY.append(total_accuracy)
    LOST.append(total_cost)
    
    print "total accuracy during training: " + str(total_accuracy)
    print "epoch: " + str(i + 1) + ", loss: " + str(total_cost) + ", speed: " + str(diff / (train_dataset.shape[0] // BATCH_SIZE)) + " s / batch"
    
    features, labels = parse_audio_file(test_dataset, shape = dimension_input, t = period)
    batch_x = np.zeros((period, features.shape[0], dimension_input))
    for k in xrange(features.shape[0]):
        batch_x[:, k, :] = features[k, :, :]
    onehot = one_hot(labels, dimension_output)
    accuracy_test = sess.run(model.accuracy, feed_dict = {model.X : features, model.Y : onehot})
    ACCURACY_TEST.append(accuracy_test)
    print "total accuracy during testing: " + str(accuracy_test)
    
    saver.save(sess, "./checkpoint/model.ckpt")
    if(i + 1) % 1 == 0:
        logits = sess.run(tf.cast(tf.argmax(model.logits, 1), tf.int32), feed_dict = {model.X : features})
        print(metrics.classification_report(np.array(labels), logits, target_names = np.unique(dataset['class'])))
            
    

current batch: 0
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['180156-1-8-0.wav' 180156 25.488681 26.086759 2 9 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['43784-3-0-0.wav' 43784 0.934147 1.116784 1 7 3 'dog_bark']
current batch: 128
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['66587-3-1-0.wav' 66587 3.077173 3.4286449999999995 1 5 3 'dog_bark']
current batch: 256


  if np.any(X < 0) or np.any(X_ref < 0):
  bad_idx = (Z < np.finfo(dtype).tiny)


Audio buffer is not finite everywhere
skipped: ['87275-1-2-0.wav' 87275 1.817251 1.871768 2 1 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['43805-8-1-0.wav' 43805 64.181737 64.461469 1 7 8 'siren']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['147672-3-0-0.wav' 147672 0.644886 0.997283 1 2 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['163460-6-0-0.wav' 163460 0.0 0.672222 1 2 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['175856-1-1-0.wav' 175856 0.9320440000000001 1.230957 1 8 1 'car_horn']
current batch: 384




all the input array dimensions except for the concatenation axis must match exactly
skipped: ['23161-6-1-0.wav' 23161 59.26566700000001 59.581943 2 4 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['187356-1-0-0.wav' 187356 20.664779 21.032153 2 4 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['160094-3-0-0.wav' 160094 0.12069500000000001 0.31345300000000004 1 1 3
 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['89207-3-0-0.wav' 89207 0.028387 0.389303 1 8 3 'dog_bark']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['117536-1-0-0.wav' 117536 31.064967 31.640588 2 4 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['151359-1-1-0.wav' 151359 9.100817999999999 9.255836 1 3 1 'car_horn']
all the input array di

all the input array dimensions except for the concatenation axis must match exactly
skipped: ['175848-1-0-0.wav' 175848 0.094818 0.632591 1 6 1 'car_horn']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159702-6-0-0.wav' 159702 0.0 0.597821 1 6 6 'gun_shot']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159708-6-2-0.wav' 159708 0.6251680000000001 0.9147790000000001 1 7 6
 'gun_shot']
current batch: 1536
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['159754-8-2-0.wav' 159754 9.989627 10.552889 1 5 8 'siren']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['34621-4-8-0.wav' 34621 12.288458 12.770989 1 2 4 'drilling']
all the input array dimensions except for the concatenation axis must match exactly
skipped: ['77509-1-0-0.wav' 77509 2.9673439999999998 3.5573080000000004 2 5 1
 'car_horn']
all the inp

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/lib/python2.7/dist-packages/audioread/gstdec.py", line 149, in run
    self.loop.run()
  File "/usr/lib/python2.7/dist-packages/gi/overrides/GLib.py", line 576, in run
    raise KeyboardInterrupt
KeyboardInterrupt



all the input array dimensions except for the concatenation axis must match exactly
skipped: ['125520-1-4-0.wav' 125520 7.3182979999999995 7.885858 1 8 1 'car_horn']
