In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.mlab
import scipy.io.wavfile
import scipy
import os
import time
from scipy import signal
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
sns.set()

In [2]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def pad_audio(samples, L=16000):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

In [3]:
folders = [i for i in os.listdir(os.getcwd())if i.find('.md') < 0 and i.find('.txt') < 0 and i.find('ipynb') < 0 and i.find('py')  < 0 and i.find('LICENSE') < 0 and i.find('_background_noise_') < 0]

In [9]:
new_sample_rate = 8000
Y = []
X = []
for i in folders:
    print(i)
    for k in os.listdir(os.getcwd()+'/'+i):
        sample_rate, samples = scipy.io.wavfile.read(os.path.join(os.getcwd(), i, k))
        samples = pad_audio(samples)
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else: n_samples = [samples]
        for samples in n_samples:
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            Y.append(i)
            X.append(scipy.misc.imresize(specgram,[45, 40]).flatten())

yes
marvin
off
bed
house
up
six
go
four
nine
left
no
three
wow
sheila
right
on
five
seven
zero
stop
one
down
bird
tree
eight
dog
two
cat


In [10]:
X = np.array(X)
print(X.shape)
len(Y)

(62979, 1800)


62979

In [11]:
import lightgbm as lgb
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
labels = np.unique(Y)
target = LabelEncoder().fit_transform(Y)
train_X, test_X, train_Y, test_Y = train_test_split(X, target, test_size = 0.2)



In [12]:
params_lgd = {
    'boosting_type': 'dart',
    'objective': 'multiclass',
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'silent': False,
    'n_estimators': 10000,
    'reg_lambda': 0.0005,
    'device':'gpu'
    }
clf = lgb.LGBMClassifier(**params_lgd)
lasttime = time.time()
clf.fit(train_X,train_Y, eval_set=[(test_X,test_Y)], 
        eval_metric='logloss', early_stopping_rounds=20, verbose=True)
print('time taken to fit lgb:', time.time()-lasttime, 'seconds ')

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	valid_0's multi_logloss: 3.15526
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's multi_logloss: 3.01132
[3]	valid_0's multi_logloss: 2.89098
[4]	valid_0's multi_logloss: 2.79342
[5]	valid_0's multi_logloss: 2.70512
[6]	valid_0's multi_logloss: 2.62507
[7]	valid_0's multi_logloss: 2.55135
[8]	valid_0's multi_logloss: 2.58867
[9]	valid_0's multi_logloss: 2.5208
[10]	valid_0's multi_logloss: 2.45682
[11]	valid_0's multi_logloss: 2.39718
[12]	valid_0's multi_logloss: 2.42141
[13]	valid_0's multi_logloss: 2.36492
[14]	valid_0's multi_logloss: 2.31165
[15]	valid_0's multi_logloss: 2.26279
[16]	valid_0's multi_logloss: 2.21751
[17]	valid_0's multi_logloss: 2.1751
[18]	valid_0's multi_logloss: 2.13462
[19]	valid_0's multi_logloss: 2.09548
[20]	valid_0's multi_logloss: 2.05903
[21]	valid_0's multi_logloss: 2.07195
[22]	valid_0's multi_logloss: 2.03659
[23]	valid_0's multi_logloss: 2.00345
[24]	valid_0's multi_logloss: 1.97242
[25]	valid_0's multi_logloss: 1.94248

[213]	valid_0's multi_logloss: 1.31775
Early stopping, best iteration is:
[193]	valid_0's multi_logloss: 1.31226
time taken to fit lgb: 2149.428333044052 seconds 


In [13]:
predicted = clf.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = labels))

accuracy validation set:  0.722689742775
             precision    recall  f1-score   support

        bed       0.64      0.68      0.66       359
       bird       0.82      0.74      0.78       327
        cat       0.77      0.72      0.74       353
        dog       0.78      0.59      0.67       340
       down       0.67      0.67      0.67       457
      eight       0.75      0.79      0.77       466
       five       0.73      0.65      0.69       514
       four       0.78      0.82      0.80       477
         go       0.54      0.61      0.57       463
      house       0.87      0.75      0.80       324
       left       0.75      0.70      0.72       488
     marvin       0.79      0.76      0.78       370
       nine       0.70      0.75      0.72       447
         no       0.70      0.57      0.63       518
        off       0.71      0.74      0.72       450
         on       0.66      0.66      0.66       466
        one       0.72      0.75      0.73       485
    