In [1]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os

from glob import glob
import gc
import re
from scipy.io import wavfile
from scipy import signal

# Root of the training clips; list_wavs_fname globs one sub-directory level below it.
train_data_path = "./data/train/audio/"
# Root of the test clips (not referenced by the cells visible here).
test_data_path = "./data/test/audio"

In [2]:
def list_wavs_fname(dirpath, ext='wav'):
    """List the audio clips stored one directory level below ``dirpath``.

    Files are looked up at ``<dirpath>/<label>/<name>.<ext>``. A path is kept
    only when both the label directory and the file stem consist solely of
    regex word characters; any other path is skipped entirely, so the two
    returned lists always stay aligned.

    Args:
        dirpath: Directory whose immediate sub-directories hold the clips.
            It is printed before the search starts.
        ext: File extension to look for, without the leading dot.

    Returns:
        A ``(labels, fnames)`` tuple of equal-length lists where ``labels[i]``
        is the name of the directory that contains the file ``fnames[i]``.
        Entries are ordered by sorted file path.
    """
    print(dirpath)
    # glob yields paths in arbitrary, filesystem-dependent order; sorting keeps
    # any downstream seeded shuffle/split reproducible across machines.
    fpaths = sorted(glob(os.path.join(dirpath, '*', '*.' + ext)))
    # A single pattern captures label and file name together, so a path can
    # never contribute to one list without contributing to the other.
    # Both separators are accepted, and ext is escaped so it is matched literally.
    pat = re.compile(r'.+[\\/](\w+)[\\/](\w+\.' + re.escape(ext) + r')$')
    labels = []
    fnames = []
    for fpath in fpaths:
        r = pat.match(fpath)
        if r:
            labels.append(r.group(1))
            fnames.append(r.group(2))
    return labels, fnames

In [3]:
# Collect the label (parent directory name) and file name lists for the training clips.
labels, fnames = list_wavs_fname(train_data_path)

./data/train/audio/


In [4]:
def pad_audio_array(audio_array, L=16000):
    """Return a length-``L`` copy of a 1-D audio array.

    Clips longer than ``L`` are cut down to their first ``L`` samples; shorter
    clips are right-padded with zeros. The result keeps the input's dtype.

    Args:
        audio_array: 1-D NumPy array of samples.
        L: Number of samples in the returned array.

    Returns:
        A new NumPy array of shape ``(L,)``.
    """
    padded = np.zeros((L,), dtype=audio_array.dtype)
    # Slicing already clamps to the available length, so `head` holds
    # min(len(audio_array), L) samples.
    head = audio_array[:L]
    padded[:head.shape[0]] = head
    return padded

In [5]:
# Encoder that turns the string class labels into one-hot rows; it is fitted in the data-loading cell.
ohc = OneHotEncoder()

In [6]:
# Every clip is forced to exactly L samples by pad_audio_array.
L = 16000
train_labels = 'yes no up down left right on off stop go silence unknown'.split()
x_train_raw = []
y_train_raw = []

for label, fname in zip(labels, fnames):
    # sample_rate is returned by wavfile.read but not used afterwards.
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio_array(samples, L)

    # Fold the directory name into a target class: background noise counts as
    # 'silence', and any word outside train_labels is lumped into 'unknown'.
    label = (
        'silence' if label == '_background_noise_'
        else label if label in train_labels
        else 'unknown'
    )

    x_train_raw.append(samples)
    y_train_raw.append(label)

# OneHotEncoder expects a 2-D input, so the labels become a single-column array.
y_train_raw = np.array(y_train_raw)[:, np.newaxis]
y_train = ohc.fit(y_train_raw.copy()).transform(y_train_raw)
x_train = np.array(x_train_raw)

  import sys


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the clips for validation; the fixed random_state makes the
# split repeatable for a given input order.
# NOTE(review): this rebinds x_train / y_train, so re-running the cell splits
# the already-reduced training set again.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=2017)

In [8]:
# Map the one-hot validation rows back to their string labels.
y_valid_raw = ohc.inverse_transform(y_valid)

In [9]:
# Boolean mask over the validation rows: True where the label is an actual
# command word, i.e. neither 'silence' nor 'unknown'.
real_words_valid_index = (
    (y_valid_raw != 'silence') & (y_valid_raw != 'unknown')
).flatten()

# Apply the same mask to inputs and targets so the two subsets stay row-aligned.
x_valid_real_words = x_valid[real_words_valid_index]
y_valid_real_words = y_valid[real_words_valid_index]



In [10]:
# Show (number of real-word validation clips, samples per clip).
x_valid_real_words.shape

(2428, 16000)

In [11]:
# Training split: add a length-1 middle axis so the inputs become
# (clips, 1, samples), then turn the sparse one-hot targets into a dense array.
x_train = x_train.reshape((x_train.shape[0], 1, x_train.shape[1]))
y_train = y_train.toarray()

# Full validation split, same treatment.
x_valid = x_valid.reshape((x_valid.shape[0], 1, x_valid.shape[1]))
y_valid = y_valid.toarray()

# Real-word validation subset, same treatment.
x_valid_real_words = x_valid_real_words.reshape(
    (x_valid_real_words.shape[0], 1, x_valid_real_words.shape[1])
)
y_valid_real_words = y_valid_real_words.toarray()

In [12]:
import tensorflow as tf
from tensorflow.keras import regularizers

from tensorflow.keras import layers, optimizers, losses, activations, models
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras import backend as K

In [13]:
# Width of the softmax output; equals the 12 entries of train_labels.
NUM_CLASSES = 12
# Per-clip input layout fed to the network: (timesteps, features).
INPUT_SHAPE = (1, 16000)

inp = layers.Input(shape=INPUT_SHAPE)

# NOTE(review): with a single timestep the bidirectional LSTM has no sequence
# to recur over; confirm that a (1, 16000) layout is intended rather than
# (16000, 1).
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(inp)
x = layers.BatchNormalization()(x)
# Collapse the timestep axis so the dense layers see one vector per clip.
x = layers.GlobalMaxPool1D()(x)
# NOTE(review): no activation is given here, so this layer is linear -- confirm intended.
x = layers.Dense(100)(x)
x = layers.Dense(NUM_CLASSES, activation="softmax")(x)

model = Model(inputs=inp, outputs=x)

In [14]:
# Print the per-layer output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1, 16000)]        0         
_________________________________________________________________
bidirectional (Bidirectional (None, 1, 256)            16516096  
_________________________________________________________________
batch_normalization (BatchNo (None, 1, 256)            1024      
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               25700     
_________________________________________________________________
dense_1 (Dense)              (None, 12)                1212      
Total params: 16,544,032
Trainable params: 16,543,520
Non-trainable params: 512
_______________________________________________

In [15]:
opt = optimizers.Adam()

# The targets are one-hot over mutually exclusive classes and the network ends
# in a softmax, so categorical cross-entropy is the matching loss. With
# binary_crossentropy each of the 12 outputs is scored as an independent
# yes/no problem and 'accuracy' resolves to binary accuracy, which overstates
# how often the predicted class is correct.
model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['accuracy'])

In [16]:
# Train for 4 epochs in mini-batches of 32. Validation metrics are computed on
# the real-word subset only (no 'silence' / 'unknown' clips).
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=4,
    validation_data=(x_valid_real_words, y_valid_real_words),
)

Train on 58254 samples, validate on 2428 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f3398244090>

In [17]:
# Show the shape of the dense one-hot targets for the real-word validation subset: (clips, classes).
y_valid_real_words.shape

(2428, 12)

In [18]:
# Show the shape of the matching inputs after the reshape: (clips, 1, samples per clip).
x_valid_real_words.shape

(2428, 1, 16000)