In [1]:
import os
from IPython.display import display,Audio,HTML
import scipy.io.wavfile as wav
import numpy as np
import speechpy
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import time
import xgboost as xgb
import tensorflow as tf
from sklearn import metrics
sns.set()



In [2]:
def extract_features(signal, fs):
    """Turn one audio clip into a single flat feature vector using speechpy.

    Four per-frame quantities are computed with ``frame_length=0.020`` and
    ``frame_stride=0.01`` (presumably seconds -- confirm against speechpy),
    and one value per frame is kept from each before concatenating:

    * column 0 of the power spectrum of the rectangular-windowed,
      zero-padded frames (computed with ``fft_points=1``);
    * column 0 of the log filterbank energies (``num_filters=1``);
    * column 0 of the MFCCs after windowed mean/variance normalisation;
    * element ``[:, 0, 1]`` of the derivative feature cube, which is built
      from the raw MFCCs, not the normalised ones.

    Args:
        signal: sample array for one clip, as returned by ``wav.read``.
        fs: sampling frequency of ``signal``.

    Returns:
        1-D numpy array with the four per-frame series laid end to end.
        Its length depends on how many frames the clip yields.
    """
    frame_length, frame_stride = 0.020, 0.01

    # Keyword arguments shared verbatim by the lmfe and mfcc calls.
    filterbank_args = dict(
        sampling_frequency=fs,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=1,
        fft_length=512,
        low_frequency=0,
        high_frequency=None,
    )

    # All-ones filter, i.e. no tapering is applied to the frames.
    stacked = speechpy.processing.stack_frames(
        signal,
        sampling_frequency=fs,
        frame_length=frame_length,
        frame_stride=frame_stride,
        filter=lambda x: np.ones((x,)),
        zero_padding=True,
    )
    spectrum = speechpy.processing.power_spectrum(stacked, fft_points=1)
    log_energy = speechpy.feature.lmfe(signal, **filterbank_args)
    mfcc = speechpy.feature.mfcc(signal, **filterbank_args)
    mfcc_normalised = speechpy.processing.cmvnw(mfcc, win_size=301, variance_normalization=True)
    derivative_cube = speechpy.feature.extract_derivative_feature(mfcc)

    per_frame_series = [
        spectrum[:, 0],
        log_energy[:, 0],
        mfcc_normalised[:, 0],
        derivative_cube[:, 0, 1],
    ]
    return np.hstack(per_frame_series)

def extract_files(folder):
    """Extract a labelled feature row for every readable clip in ``folder``.

    Every directory entry is tried as a WAV file. Entries that cannot be
    read or featurised (non-audio files, sub-directories, corrupt clips)
    are skipped on purpose, so one bad file does not abort the whole run.

    Only ``Exception`` subclasses are treated as "bad file". Anything else,
    such as ``KeyboardInterrupt`` or ``SystemExit``, propagates so a long
    extraction can still be interrupted.

    Args:
        folder: path of the directory to scan; it is also used verbatim as
            the label stored in column 0 of each row.

    Returns:
        List of rows, each ``[folder, feature_0, feature_1, ...]``. Entries
        are visited in sorted name order so the result is deterministic.

    Raises:
        OSError: if ``folder`` itself cannot be listed.
    """
    results = []
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        try:
            fs, signal = wav.read(path)
            results.append([folder] + extract_features(signal, fs).tolist())
        except Exception:
            # Best effort: skip anything that is not a usable clip.
            continue
    return results

In [3]:
# Name fragments that mark non-data entries in the working directory
# (docs, file lists, notebooks and their checkpoint folder, the licence).
_EXCLUDED_MARKERS = ('.md', '.txt', 'ipynb', 'LICENSE')

# One entry per class folder in the current working directory. Only real
# directories are kept: a stray file would otherwise reach extract_files(),
# whose os.listdir() call is not protected by its try/except.
folders = [
    entry
    for entry in os.listdir(os.getcwd())
    if os.path.isdir(entry)
    and not any(marker in entry for marker in _EXCLUDED_MARKERS)
]

In [4]:
# Collect the labelled feature rows of every folder into one flat list,
# printing each folder name as a progress marker before it is processed.
output = []
for folder_name in folders:
    print(folder_name)
    output.extend(extract_files(folder_name))

yes
marvin
off
happy
bed
house
up
six
go
four
nine
left
no
three
wow
sheila
_background_noise_




right
on
five
seven
zero
stop
one
down
bird


  variance_normalized[i, :] = mean_subtracted[i, :] / window_variance


tree
eight
dog
two
cat


In [11]:
# Each row is [label, feature values...]. Only rows with exactly 397 entries
# (1 label + 396 features) are kept, so clips whose feature vector has any
# other length are dropped. Presumably 396 corresponds to the standard clip
# duration -- TODO confirm.
output = [row for row in output if len(row) == 397]

dataset = np.array(output)
# In-place row shuffle. No seed is set, so the order differs between runs.
np.random.shuffle(dataset)

# Column 0 holds the folder name used as the class label.
label_column = dataset[:, 0]
labels = np.unique(label_column).tolist()
target = LabelEncoder().fit_transform(label_column)
labels

['bed',
 'bird',
 'cat',
 'dog',
 'down',
 'eight',
 'five',
 'four',
 'go',
 'happy',
 'house',
 'left',
 'marvin',
 'nine',
 'no',
 'off',
 'on',
 'one',
 'right',
 'seven',
 'sheila',
 'six',
 'stop',
 'three',
 'tree',
 'two',
 'up',
 'wow',
 'yes',
 'zero']

In [6]:
class Model:
    """Fully connected classifier built with the TensorFlow 1.x graph API.

    Layer widths are ``size_input -> 784 -> 256 -> 100 -> size_output``, with
    ReLU after each of the three hidden layers and raw logits at the output.

    Instance attributes:
        X: float32 placeholder of shape ``(None, size_input)``.
        Y: float32 placeholder of shape ``(None, size_output)`` fed with
            per-class targets (argmax of ``Y`` is taken as the true class).
        logits: output of the last dense layer.
        cost: mean softmax cross-entropy between ``logits`` and ``Y``.
        optimizer: Adam op minimising ``cost``.
        accuracy: mean of ``argmax(logits) == argmax(Y)``.
    """

    def __init__(self, size_input, size_output, learning_rate=None):
        """Build the graph in the current default TensorFlow graph.

        Args:
            size_input: number of input features per example.
            size_output: number of classes.
            learning_rate: Adam learning rate. When omitted, the
                module-level ``learning_rate`` variable is used, so existing
                ``Model(size_input, size_output)`` calls behave as before.

        Raises:
            NameError: if ``learning_rate`` is omitted and no module-level
                ``learning_rate`` exists.
        """
        if learning_rate is None:
            # Legacy path: the optimizer used to read this notebook global
            # implicitly. Keep that working, but make the lookup explicit.
            try:
                learning_rate = globals()['learning_rate']
            except KeyError:
                raise NameError(
                    "learning_rate was not passed and no module-level "
                    "'learning_rate' is defined"
                )

        self.X = tf.placeholder(tf.float32, (None, size_input))
        self.Y = tf.placeholder(tf.float32, (None, size_output))

        # Biases use stddev = 0, so they all start at exactly zero.
        # NOTE(review): w1 is scaled by its input width (size_input), whereas
        # w2..w4 are scaled by their output widths (256, 100, size_output).
        # Confirm whether that asymmetry is intended.
        w1 = tf.Variable(tf.random_normal([size_input, 784], stddev = np.sqrt(1/size_input)))
        b1 = tf.Variable(tf.random_normal([784], stddev = 0))

        w2 = tf.Variable(tf.random_normal([784, 256], stddev = np.sqrt(1/256.0)))
        b2 = tf.Variable(tf.random_normal([256], stddev = 0))

        w3 = tf.Variable(tf.random_normal([256, 100], stddev = np.sqrt(1/100.0)))
        b3 = tf.Variable(tf.random_normal([100], stddev = 0))

        w4 = tf.Variable(tf.random_normal([100, size_output], stddev = np.sqrt(1/(size_output * 1.0))))
        b4 = tf.Variable(tf.random_normal([size_output], stddev = 0))

        hidden1 = tf.nn.relu(tf.matmul(self.X, w1) + b1)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, w2) + b2)
        hidden3 = tf.nn.relu(tf.matmul(hidden2, w3) + b3)
        self.logits = tf.matmul(hidden3, w4) + b4
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_prediction = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

In [12]:
# Drop the label column (column 0) and cast the remaining columns to float64.
# NOTE: this cell is not idempotent. `dataset` is overwritten, so running it
# a second time strips another column; re-run the cell that builds `dataset`
# and `target` first.
dataset = dataset[:, 1:].astype('float64')

# Keep only rows in which every feature is finite (no NaN, no +/-inf), and
# keep `target` aligned with the surviving rows.
condition = np.isfinite(dataset).all(axis=1)
dataset = dataset[condition]
target = target[condition]

# Split first, then fit the scaler on the training rows only. Fitting it on
# the full dataset would leak the test set's min/max into training. As a
# result, scaled test values may fall slightly outside [0, 1].
train_X, test_X, train_Y, test_Y = train_test_split(dataset, target, test_size = 0.2)
scaler = MinMaxScaler().fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)

In [34]:
# Training hyperparameters:
#   learning_rate - step size handed to the Adam optimizer
#   batch_size    - number of rows per training mini-batch
#   epoch         - number of passes of the outer training loop
learning_rate, batch_size, epoch = 1e-4, 128, 1000

In [35]:
# Start from an empty default graph and a fresh interactive session, then
# build the model and initialise its variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(train_X.shape[1],len(labels))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())

# Only whole batches are used: trailing rows that do not fill a complete
# batch are never fed to the network.
num_batches = train_X.shape[0] // batch_size

# One-hot targets for the whole training set, built once:
# row j has 1.0 in column train_Y[j] and 0.0 elsewhere.
train_onehot = np.eye(len(labels))[train_Y]

# Per-epoch averages of training accuracy and loss.
ACCURACY, LOST = [], []
for i in range(epoch):
    last = time.time()
    total_acc, total_loss = 0, 0
    for step in range(num_batches):
        start = step * batch_size
        stop = start + batch_size
        feed = {model.X: train_X[start:stop, :], model.Y: train_onehot[start:stop]}
        # First run applies one optimizer step; the second run measures
        # accuracy on the same batch after that step.
        loss, _ = sess.run([model.cost, model.optimizer], feed_dict = feed)
        total_acc += sess.run(model.accuracy, feed_dict = feed)
        total_loss += loss
    total_loss /= num_batches
    total_acc /= num_batches
    ACCURACY.append(total_acc)
    LOST.append(total_loss)
    if (i+1) % 100 == 0:
        # 'avg time' is the wall-clock time of this epoch divided by its batch count.
        print('epoch: ', i + 1, 'avg loss: ', total_loss, 'avg acc: ', total_acc, 'avg time: ', (time.time() - last) / num_batches)

epoch:  100 avg loss:  2.2162935832 avg acc:  0.352056146978 avg time:  0.0022671091687548293
epoch:  200 avg loss:  1.87672025195 avg acc:  0.454069368132 avg time:  0.0022755704083285487
epoch:  300 avg loss:  1.63823067618 avg acc:  0.526850103022 avg time:  0.002270428688971551
epoch:  400 avg loss:  1.44041890725 avg acc:  0.584628262363 avg time:  0.002271171454544906
epoch:  500 avg loss:  1.24983836465 avg acc:  0.645368303571 avg time:  0.002266234748966091
epoch:  600 avg loss:  1.17506110128 avg acc:  0.666809752747 avg time:  0.0022591955059177273
epoch:  700 avg loss:  1.10258197866 avg acc:  0.687993646978 avg time:  0.0022657264719952593
epoch:  800 avg loss:  0.944048415665 avg acc:  0.738839285714 avg time:  0.00226986146235204
epoch:  900 avg loss:  0.822166120122 avg acc:  0.779361263736 avg time:  0.0022613956378056453
epoch:  1000 avg loss:  0.882767829266 avg acc:  0.758434924451 avg time:  0.0022641695462740385


In [36]:
# One-hot encode the held-out labels: row n has 1.0 in column test_Y[n].
batch_y = np.eye(len(labels))[test_Y]

# Predicted class index per row (argmax over the logits), cast to int32.
predicted_class = tf.cast(tf.argmax(model.logits, 1), tf.int32)

# Despite its name, `logits` receives the predicted class indices here,
# not the raw network outputs.
acc, logits = sess.run(
    [model.accuracy, predicted_class],
    feed_dict = {model.X : test_X, model.Y : batch_y},
)
print('testing accuracy: ' + str(acc))
print(metrics.classification_report(test_Y, logits, target_names = labels))

testing accuracy: 0.331731
             precision    recall  f1-score   support

        bed       0.14      0.11      0.13       298
       bird       0.18      0.10      0.13       313
        cat       0.21      0.34      0.26       299
        dog       0.17      0.19      0.18       302
       down       0.24      0.11      0.15       449
      eight       0.41      0.35      0.38       415
       five       0.25      0.23      0.24       427
       four       0.19      0.29      0.23       420
         go       0.13      0.15      0.14       456
      happy       0.57      0.65      0.61       341
      house       0.56      0.46      0.50       300
       left       0.32      0.34      0.33       429
     marvin       0.47      0.53      0.50       306
       nine       0.33      0.22      0.26       434
         no       0.24      0.21      0.22       420
        off       0.27      0.22      0.24       425
         on       0.38      0.16      0.22       466
        one       