In [2]:
import glob
import os
import pickle

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
%matplotlib inline
# Draw every figure in this notebook with matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

# Number of target classes; the model cells copy it into `num_labels`, which
# sizes the label placeholder and the output layer.
NUMBER_OF_CLASSES = 2

In [3]:
def windows(data, window_size):
    """Yield ``(start, end)`` index pairs of half-overlapping windows over ``data``.

    Consecutive windows are shifted by half a window. The indices are plain
    integers, so they can be used directly as slice bounds. The last windows
    may reach past ``len(data)``; callers are expected to discard slices that
    come out shorter than ``window_size``.

    Args:
        data: Any sized sequence (only ``len(data)`` is used).
        window_size: Positive integer length of each window.

    Yields:
        Tuples ``(start, start + window_size)``.

    Raises:
        ValueError: If ``window_size`` is not positive (the hop would be zero
            or negative and the generator would never terminate).
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive, got %r" % (window_size,))

    n_samples = len(data)
    k = 0
    # Window k starts at floor(k * window_size / 2). Working in integer
    # half-steps keeps the indices integral even for odd window sizes, and
    # `k * window_size < 2 * n_samples` is the integer form of `start < n_samples`.
    while k * window_size < 2 * n_samples:
        start = (k * window_size) // 2
        yield start, start + window_size
        k += 1
        
def _log_mel_spectrogram(signal, bands):
    """Return the log-scaled mel spectrogram of ``signal`` using ``bands`` mel bins."""
    # The audio is passed by keyword because recent librosa releases reject it
    # as a positional argument.
    melspec = librosa.feature.melspectrogram(y=signal, n_mels=bands)
    # `logamplitude` was renamed `power_to_db`; fall back for old installs.
    to_db = getattr(librosa, "power_to_db", None) or librosa.logamplitude
    return to_db(melspec)


def extract_features(parent_dir, sub_dirs, file_ext="*.wav", bands=60, frames=41):
    """Build two-channel log-mel features and integer labels from audio folders.

    Every file matching ``file_ext`` under ``parent_dir/sub_dir`` is loaded and
    cut into half-overlapping windows of ``512 * (frames - 1)`` samples
    (presumably 512 is librosa's default hop length, so each window yields
    ``frames`` spectrogram columns -- confirm against the installed version).
    Windows shorter than that are dropped. Channel 0 of each feature is the
    log-mel spectrogram, channel 1 its delta.

    The label of a file is the second ``-``-separated field of its base name
    and must parse as an integer.

    Args:
        parent_dir: Directory that contains the sub-directories.
        sub_dirs: Iterable of sub-directory names to scan.
        file_ext: Glob pattern selecting the audio files.
        bands: Number of mel bands per spectrogram.
        frames: Number of spectrogram frames per window.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_windows, bands, frames, 2)`` and ``labels`` is a 1-D integer
        array with one entry per window.
    """
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    for l, sub_dir in enumerate(sub_dirs):
        print(l, sub_dir)
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            sound_clip, _ = librosa.load(fn)
            # basename() works with either path separator and any directory
            # depth, unlike splitting the full path on a backslash.
            label = os.path.basename(fn).split('-')[1]
            for start, end in windows(sound_clip, window_size):
                signal = sound_clip[int(start):int(end)]
                if len(signal) != window_size:
                    continue  # trailing partial window
                # Keep the spectrogram in its native (bands, frames) layout;
                # transposing before flattening would scramble the axes once
                # the data is reshaped to (bands, frames) below.
                log_specgrams.append(_log_mel_spectrogram(signal, bands))
                labels.append(label)

    # The reshape adds the channel axis and also copes with zero windows.
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    # The builtin `int` replaces the `np.int` alias, which newer NumPy removed.
    return features, np.array(labels, dtype=int)

def one_hot_encode(labels, n_classes=None):
    """Convert integer class labels into a one-hot matrix.

    Args:
        labels: 1-D sequence of non-negative integer class ids.
        n_classes: Number of columns in the result. Defaults to
            ``max(labels) + 1`` so that a split which happens to miss some
            classes still indexes correctly. Pass it explicitly to give
            several splits the same width.

    Returns:
        Float array of shape ``(len(labels), n_classes)`` with a single 1 per
        row. Empty input gives shape ``(0, n_classes or 0)``.

    Raises:
        ValueError: If a label is negative or not smaller than ``n_classes``.
    """
    labels = np.asarray(labels)
    n_labels = len(labels)
    if n_labels == 0:
        return np.zeros((0, n_classes or 0))

    if labels.min() < 0:
        raise ValueError("labels must be non-negative integers")
    if n_classes is None:
        # Counting unique values would under-size the matrix whenever a class
        # is absent (e.g. only label 1 present), so size by the largest id.
        n_classes = int(labels.max()) + 1
    elif labels.max() >= n_classes:
        raise ValueError("label %d is out of range for n_classes=%d" % (labels.max(), n_classes))

    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), labels] = 1
    return encoded

In [5]:
# Folder layout: every entry below is a sub-directory of `parent_dir`.
parent_dir = 'Sound-Data/renameThese'
tr_sub_dirs = [
    "Ambient_1_3",
    "Ambient_1_4",
    "Ambient_1_11",
    "toilet_sounds",
    "Ambient_1_15",
]
ts_sub_dirs = [
    "Ambient_1_16",
    "Ambient_1_17",
    "Ambient_1_18",
]

# Training split: extract the features, then one-hot encode the labels.
print("starting parse")
tr_features, tr_labels = extract_features(parent_dir, tr_sub_dirs)
tr_labels = one_hot_encode(tr_labels)

# Test split: same pipeline on the held-out folders.
print("done with train, starting test")
ts_features, ts_labels = extract_features(parent_dir, ts_sub_dirs)
ts_labels = one_hot_encode(ts_labels)

print("extraction done")

starting parse
0 Ambient_1_3
1 Ambient_1_4
2 Ambient_1_11
3 toilet_sounds
4 Ambient_1_15
done with train, starting test
0 Ambient_1_16
1 Ambient_1_17
2 Ambient_1_18
extraction done


In [6]:
# save the extracted data
#
# Each array goes to its own pickle file in the working directory. The `with`
# block closes the handle even when dumping fails part-way through.
for path, array in (
    ('tr_features.pckl', tr_features),
    ('tr_labels.pckl', tr_labels),
    ('ts_features.pckl', ts_features),
    ('ts_labels.pckl', ts_labels),
):
    with open(path, 'wb') as fh:
        pickle.dump(array, fh)

In [4]:
# load the extracted data
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load files that this
    # notebook (or another trusted source) produced.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


tr_features = _load_pickle("tr_features.pckl")
tr_labels = _load_pickle("tr_labels.pckl")

ts_features = _load_pickle("ts_features.pckl")
ts_labels = _load_pickle("ts_labels.pckl")

In [5]:
def weight_variable(shape):
    """Create a trainable variable of ``shape`` drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Create a trainable variable of ``shape`` with every entry initialised to 1.0."""
    return tf.Variable(tf.constant(1.0, shape=shape))

def conv2d(x, W):
    """Convolve ``x`` with filter ``W`` using 'SAME' padding and a stride of 2 on the two middle axes."""
    stride_spec = [1, 2, 2, 1]
    return tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')

def apply_convolution(x,kernel_size,num_channels,depth):
    """Apply one convolution layer to ``x``: conv2d, add bias, then ReLU.

    A fresh ``kernel_size x kernel_size`` filter bank mapping ``num_channels``
    input channels to ``depth`` output channels is created on every call,
    together with a ``depth``-sized bias.
    """
    filter_shape = [kernel_size, kernel_size, num_channels, depth]
    weights = weight_variable(filter_shape)
    biases = bias_variable([depth])
    pre_activation = tf.add(conv2d(x, weights), biases)
    return tf.nn.relu(pre_activation)

def apply_max_pool(x,kernel_size,stride_size):
    """Max-pool ``x`` with a square ``kernel_size`` window, ``stride_size`` stride and 'SAME' padding."""
    pool_window = [1, kernel_size, kernel_size, 1]
    pool_strides = [1, stride_size, stride_size, 1]
    return tf.nn.max_pool(x, ksize=pool_window, strides=pool_strides, padding='SAME')

In [6]:
# --- Input geometry ---------------------------------------------------------
bands = 60                      # mel bands per spectrogram
frames = 41                     # spectrogram frames per window
feature_size = bands * frames   # 60x41 = 2460
num_channels = 2
num_labels = NUMBER_OF_CLASSES

# --- Network ----------------------------------------------------------------
kernel_size = 30
depth = 20
num_hidden = 200

# --- Optimisation -----------------------------------------------------------
batch_size = 50
learning_rate = 0.02
total_iterations = 2000

In [7]:
# Placeholders: a batch of (bands x frames x channels) inputs and their one-hot labels.
X = tf.placeholder(tf.float32, shape=[None, bands, frames, num_channels])
Y = tf.placeholder(tf.float32, shape=[None, num_labels])

# Single convolution layer.
cov = apply_convolution(X, kernel_size, num_channels, depth)

# Flatten the conv output so it can feed the fully connected layer.
shape = cov.get_shape().as_list()
flat_dim = shape[1] * shape[2] * shape[3]
cov_flat = tf.reshape(cov, [-1, flat_dim])

# Fully connected hidden layer with sigmoid activation.
f_weights = weight_variable([flat_dim, num_hidden])
f_biases = bias_variable([num_hidden])
hidden_pre_activation = tf.add(tf.matmul(cov_flat, f_weights), f_biases)
f = tf.nn.sigmoid(hidden_pre_activation)

# Output layer: softmax over the classes.
out_weights = weight_variable([num_hidden, num_labels])
out_biases = bias_variable([num_labels])
y_ = tf.nn.softmax(tf.matmul(f, out_weights) + out_biases)

In [8]:
# Cross-entropy summed over the batch. The softmax output is clipped away from
# zero first: log(0) would turn the loss (and then every gradient) into NaN.
loss = -tf.reduce_sum(Y * tf.log(tf.clip_by_value(y_, 1e-10, 1.0)))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Accuracy: fraction of rows whose arg-max prediction matches the arg-max label.
correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [None]:
# Per-iteration training loss. Collected in a list: starting from np.empty()
# would plot one uninitialised value, and np.append copies the array each step.
cost_history = []

init = tf.global_variables_initializer()

# Number of distinct batch start offsets. Clamped to 1 so the modulo below
# stays valid when the training set is not larger than one batch.
offset_span = max(1, tr_labels.shape[0] - batch_size)

print("starting model")
with tf.Session() as session:
    session.run(init)

    for itr in range(total_iterations):
        # Walk through the training set in consecutive slices, wrapping around.
        offset = (itr * batch_size) % offset_span
        batch_x = tr_features[offset:(offset + batch_size), :, :, :]
        batch_y = tr_labels[offset:(offset + batch_size), :]

        _, c = session.run([optimizer, loss],feed_dict={X: batch_x, Y : batch_y})
        cost_history.append(c)

    # Evaluate on the whole test set in a single run.
    print('Test accuracy: ',round(session.run(accuracy, feed_dict={X: ts_features, Y: ts_labels}) , 3))

    cost_history = np.asarray(cost_history, dtype=float)
    fig = plt.figure(figsize=(15,10))
    plt.plot(cost_history)
    if cost_history.size:
        # Fix the axes to the full run and the largest observed loss.
        plt.axis([0,total_iterations,0,np.max(cost_history)])
    plt.show()
print("model finished")

starting model
