In [1]:
#!pip install --user tflearn
#!pip install --user librosa 'llvmlite==0.19.0'

In [2]:
from scipy.io import wavfile
from tflearn.data_utils import to_categorical
import numpy as np
import random
import librosa
import glob

random.seed(1)

dataset = []
label_to_idx = dict()
idx_to_label = dict()
time_size = 16
feat_size = 40

rate, noise_data = wavfile.read('dataset/noise.wav')
noise_rest = len(noise_data) - rate

def normalize(data):
    data = data.astype(float)
    mn, mx = data.min(), data.max()
    data = (data - mn) / (mx - mn) * 65535 - 32768
    return data.astype('short')

def read_dataset():
    for fname in glob.glob('dataset/*/*.wav', recursive=True):
        label = fname.split('.')[0].split('/')[1:]
        rate, data = wavfile.read(fname)
        data = normalize(data)
        yield label, data

def augment_and_pad_dataset(dataset):
    for info, data in dataset:
        for _ in range(8):
            file_rest = rate - len(data)
            if file_rest <= 0:
                file_data = data[:rate]
            else:
                file_offset = int(random.uniform(0, file_rest))
                file_data = np.pad(data, (file_offset, file_rest - file_offset), 'constant', constant_values=(0,0))
            for noise_level in range(4):
                noise_offset = int(random.uniform(0, noise_rest))
                noise_sample = noise_data[noise_offset:noise_offset + rate] * noise_level
                file_data += noise_sample
                yield (*info, noise_level), file_data

for info, data in augment_and_pad_dataset(read_dataset()):
#     wavfile.write('temp/%s.wav' % (info,), rate, data)

    mfcc = np.transpose(librosa.feature.mfcc(data, rate, n_mfcc=feat_size))

    label = info[0]
    idx = label_to_idx.get(label, None)
    if idx is None:
        idx = len(label_to_idx)
        label_to_idx[label] = idx
        idx_to_label[idx] = label
    dataset.append((mfcc, idx, info))
random.shuffle(dataset)

inp_data = [r[0] for r in dataset]
out_size = len(label_to_idx)
out_data = to_categorical([r[1] for r in dataset], out_size)

len(inp_data), out_size, label_to_idx.keys(), inp_data[0].shape

(896, 7, dict_keys(['1', '2', '3', '4', '5', '6', 'sil']), (16, 40))

In [4]:
import tensorflow as tf
import tflearn
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell

tf.reset_default_graph()

check_size = 20 #int(len(inp_data) * 0.1)
learn_size = len(inp_data) - check_size
trainX = inp_data[:learn_size]
trainY = out_data[:learn_size]
testX = inp_data[-check_size:]
testY = out_data[-check_size:]

g = tflearn.input_data(shape=[None,time_size,feat_size])

g = tflearn.reshape(g, [-1,time_size,feat_size,1])
g = tflearn.conv_2d(g, 64, (20,8), activation='relu')
g = tflearn.max_pool_2d(g, 3, strides=2)
g = tflearn.local_response_normalization(g)
g = tflearn.reshape(g, [-1,time_size,feat_size*16])

g = tflearn.gru(g, 256, dropout=0.7)

g = tflearn.fully_connected(g, out_size, activation='softmax')
g = tflearn.regression(g, optimizer='adam', loss='categorical_crossentropy', learning_rate=0.001)
m = tflearn.DNN(g)
m.fit(trainX, trainY, n_epoch=20, show_metric=True, snapshot_epoch=False, batch_size=64)

Training Step: 280  | total loss: [1m[32m0.19588[0m[0m | time: 2.944s
[2K| Adam | epoch: 020 | loss: 0.19588 - acc: 0.9747 -- iter: 876/876


In [5]:
hits, miss = 0, 0
mxpe, mnpv = 0, 10
for i, out in enumerate(m.predict(testX)):
    i0 = out.argsort()[-1]
    lbl = idx_to_label[i0]
    info = dataset[learn_size + i][2]
    l, v, n = info
    p = int(out[i0]*10)
    ok = l == lbl
    if ok:
        hits += 1
        mnpv = min(mnpv, p)
    else:
        miss += 1
        mxpe = max(mxpe, p)
    print('%s %d%% %s: %s' % (('+' if ok else '-'), p, lbl, info))
print(hits, miss, mxpe, mnpv)
print('---')
for f, _, info in dataset[100:140]:
    out = m.predict([f])[0]
    i0 = out.argsort()[-1]
    lbl = idx_to_label[i0]
    print(('v' if info[0] == lbl else '-'), int(out[i0]*10), lbl, info)

+ 9% 3: ('3', 'a', 0)
+ 9% sil: ('sil', 'a', 3)
+ 9% 1: ('1', 'a', 0)
+ 9% 2: ('2', 'c', 3)
+ 9% sil: ('sil', 'b', 3)
+ 9% 6: ('6', 'b', 1)
+ 9% sil: ('sil', 'c', 1)
+ 9% 3: ('3', 'b', 0)
+ 9% 3: ('3', 'c', 0)
+ 9% 1: ('1', 'e', 2)
+ 9% 1: ('1', 'd', 0)
+ 9% 1: ('1', 'b', 3)
+ 8% 3: ('3', 'b', 2)
+ 9% 1: ('1', 'c', 2)
+ 9% 4: ('4', 'b', 2)
+ 9% 2: ('2', 'd', 3)
+ 9% 2: ('2', 'c', 2)
+ 9% 3: ('3', 'a', 0)
+ 9% 2: ('2', 'e', 3)
+ 8% 1: ('1', 'a', 1)
20 0 0 8
---
v 9 sil ('sil', 'f', 2)
v 9 3 ('3', 'b', 3)
v 9 sil ('sil', 'f', 1)
v 9 4 ('4', 'a', 3)
v 9 sil ('sil', 'a', 0)
v 9 sil ('sil', 'b', 3)
v 9 1 ('1', 'e', 3)
v 9 6 ('6', 'a', 3)
v 9 1 ('1', 'b', 3)
v 9 5 ('5', 'b', 1)
v 9 1 ('1', 'd', 1)
v 9 sil ('sil', 'g', 3)
v 9 3 ('3', 'e', 0)
v 9 1 ('1', 'b', 1)
v 9 sil ('sil', 'c', 2)
v 9 6 ('6', 'a', 2)
v 9 sil ('sil', 'a', 0)
v 9 4 ('4', 'a', 3)
v 9 4 ('4', 'a', 0)
v 9 1 ('1', 'a', 3)
v 9 1 ('1', 'b', 0)
v 9 2 ('2', 'c', 1)
v 9 sil ('sil', 'g', 0)
v 9 4 ('4', 'b', 2)
v 9 3 ('3', 'e', 3)
v 9

In [8]:
rate, wave = wavfile.read('dataset/sil/a.wav')
rate, wave = wavfile.read('test3.wav')
wave = normalize(wave)
f = np.transpose(librosa.feature.mfcc(wave, rate, n_mfcc=feat_size))
for i in range(0, f.shape[0] - time_size + 1):
    unt=f[i:i+time_size]
    for out in m.predict([unt]):
        i0 = out.argsort()[-1]
        lbl = idx_to_label[i0]
        print(out[i0], lbl)

0.986017 3
0.9848 3
0.986238 3
0.989401 3
