In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected under the "../ml/datasets/speech/" directory.
# Running this cell (click Run or press Shift+Enter) lists the contents of that directory.

import os
# Sanity check: show what the speech dataset directory contains.
dataset_listing = os.listdir('../ml/datasets/speech/')
print(dataset_listing)

['train.7z', 'test', 'train']


In [2]:
# Imports
import pickle
import keras
import random
import scipy

from pathlib import Path
from subprocess import check_output

from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt

%matplotlib inline

Using TensorFlow backend.


In [3]:
# Maps a predicted class index (position in the list below) to its label string.
answer_dict = dict(enumerate([
    'yes',
    'no',
    'up',
    'down',
    'left',
    'right',
    'on',
    'off',
    'stop',
    'go',
    'unknown',
    'silence',
]))

In [4]:
# Directories holding the test clips and the training clips.
test_audio_path = '../ml/datasets/speech/test/audio'
train_audio_path = '../ml/datasets/speech/train/audio'

# Keep only the wav files from the test directory, in directory-listing order.
files = os.listdir(test_audio_path)
wavs = [f for f in files if f.endswith('wav')]

In [5]:
# Wrap the test file names in a dict keyed by split name.
test_dict = dict(test=wavs)

In [6]:
# Data Augmenting
def bandpass(sample_rate, samples):
    """Band-pass filter a clip around 180-240 Hz and amplify it by 30x.

    Parameters
    ----------
    sample_rate : number
        Sampling frequency of ``samples`` in Hz.
    samples : array_like
        The audio samples to filter.

    Returns
    -------
    (sample_rate, filtered_samples)
        The unchanged sample rate and the filtered, amplified samples.
    """
    nyquist = sample_rate / 2
    low_cut = 180.0   # original note: human voices range from 85 Hz to 255 Hz
    high_cut = 240.0
    # Critical frequencies for a digital design are expressed as a fraction of Nyquist.
    band = [low_cut / nyquist, high_cut / nyquist]

    # Design a *digital* 3rd-order Butterworth band-pass. The previous code asked
    # for an analog design (analog=True) and then ran it through lfilter, which
    # applies a digital difference equation, so the response was not a
    # 180-240 Hz band-pass at all.
    b, a = scipy.signal.butter(3, band, btype='bandpass')
    samples = scipy.signal.lfilter(b, a, samples) * 30

    return sample_rate, samples

In [7]:
def spectrogram(file):
    """Return the transposed log-magnitude spectrogram of one test clip.

    ``file`` is a file name inside ``test_audio_path``. When the computed
    spectrogram has fewer than two dimensions, a (71, 129) array of zeros is
    returned instead.
    """
    eps = 1e-10  # avoids log(0)
    wav_path = test_audio_path + '/' + file

    sample_rate, samples = wavfile.read(wav_path)
    _, _, spec = signal.spectrogram(samples, sample_rate)

    if spec.ndim >= 2:
        return np.log(np.abs(spec).T + eps)

    # silence can end up being empty files, in this case we can just return one second of zeros
    return np.zeros((71, 129))

def stft(file):
    """Return the transposed log-magnitude STFT of one test clip.

    ``file`` is a file name inside ``test_audio_path``. The transform uses a
    segment length of ``sample_rate // 50`` samples and an overlap of
    ``sample_rate // 75`` samples. When the result has fewer than two
    dimensions, a (151, 161) array of zeros is returned instead.
    """
    path = test_audio_path + '/'

    eps = 1e-10  # avoids log(0)
    sample_rate, samples = wavfile.read(path + file)

    # scipy documents nperseg / noverlap as integers; integer division yields the
    # same truncated values the float quotients were being coerced to.
    frequencies, times, Zxx = signal.stft(
        samples,
        sample_rate,
        nperseg=sample_rate // 50,
        noverlap=sample_rate // 75,
    )

    # silence can end up being empty files, in this case we can just return one second of zeros
    if len(Zxx.shape) < 2:
        return np.zeros((151, 161))
    else:
        return np.log(np.abs(Zxx).T + eps)

In [8]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of two padded spectral inputs per clip.

    Every batch is ``[X1, X2]``: ``X1`` holds the output of ``spectrogram`` and
    ``X2`` the output of ``stft`` for the same clip IDs, each zero-padded to
    ``x1dim`` / ``x2dim`` and given a trailing channel axis. No labels are
    produced, so the generator is suited to prediction.
    """

    def __init__(self, list_IDs, batch_size=32, x1dim=(71,129), x2dim=(151,161), n_channels=1,
                 n_classes=12):
        """Store the configuration and build the initial index order.

        list_IDs   -- clip identifiers, passed unchanged to ``spectrogram`` / ``stft``
        batch_size -- maximum number of clips per batch
        x1dim      -- 2-D shape every spectrogram is padded to
        x2dim      -- 2-D shape every STFT is padded to
        n_channels -- size of the trailing channel axis
        n_classes  -- stored for reference; not used by the generator itself
        """
        self.x1dim = x1dim
        self.x2dim = x2dim
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, including a final partial batch."""
        # ceil, not floor: flooring silently dropped the last
        # len(list_IDs) % batch_size clips, which then never got a prediction.
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index`` and return it as ``[X1, X2]``."""
        # Slicing past the end is safe, so the last batch may simply be shorter.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X1, X2 = self.__data_generation(list_IDs_temp)
        return [X1, X2]

    def on_epoch_end(self):
        """Reset the index order (sequential, no shuffling)."""
        self.indexes = np.arange(len(self.list_IDs))

    @staticmethod
    def _zero_pad(features, shape):
        """Copy 2-D ``features`` into the top-left of a zero array of ``shape``; add a channel axis."""
        padded = np.zeros(shape)
        padded[:features.shape[0], :features.shape[1]] = features
        return padded[:, :, np.newaxis]

    def __data_generation(self, list_IDs_temp):
        """Return ``(X1, X2)`` for the given IDs, each shaped (n_samples, *dim, n_channels)."""
        # Size the arrays to the actual batch so a short final batch carries no
        # uninitialised rows.
        n_samples = len(list_IDs_temp)
        X1 = np.empty((n_samples, *self.x1dim, self.n_channels))
        X2 = np.empty((n_samples, *self.x2dim, self.n_channels))

        for i, ID in enumerate(list_IDs_temp):
            X1[i,] = self._zero_pad(spectrogram(ID), self.x1dim)
            X2[i,] = self._zero_pad(stft(ID), self.x2dim)

        return X1, X2

In [9]:
from keras.models import load_model

# Load the saved ensemble model from its HDF5 file.
model_path = '../ml/models/retrained_ensemble_model.h5'
ensemble_model = load_model(model_path)

In [10]:
# Parameters
test_params = {'x1dim': (71,129),
               'x2dim': (151,161),
               'batch_size': 15,
               'n_classes':12,
               'n_channels': 1}

# Generators
test_generator = DataGenerator(test_dict['test'], **test_params)

# Ask the generator how many batches it provides instead of hard-coding a step
# count that is only right for one particular dataset size and batch size.
results = ensemble_model.predict_generator(test_generator, steps=len(test_generator))

In [11]:
# Persist the raw prediction array so later cells can be re-run without predicting again.
with open('results.pickle', 'wb') as results_file:
    pickle.dump(results, results_file, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# One row per test file: its name and the label of the highest-scoring class.
fname_column = pd.Series(test_dict['test'], name='fname')
label_column = pd.Series(np.argmax(results, axis=1), name='label').map(answer_dict)

submission = pd.concat([fname_column, label_column], axis=1)
submission.to_csv("submission.csv", index=False)

In [13]:
# Collect the test files whose top class score exceeds 0.95, with their predicted labels.
thresh = []

top_class = np.argmax(results, axis=1)
top_score = results[np.arange(len(results)), top_class]
confident_rows = np.flatnonzero(top_score > .95)

file = [test_dict['test'][row] for row in confident_rows]
label = [answer_dict[top_class[row]] for row in confident_rows]
print(len(file))

107333


In [22]:
from shutil import copyfile

# Copy each confidently classified test clip into the training folder named
# after its predicted label, so it is picked up as extra training data.
# Destination folders are created up front so a label without a folder does
# not abort the copy half-way through.
for predicted in set(label):
    os.makedirs(os.path.join(train_audio_path, predicted), exist_ok=True)

for fname, predicted in zip(file, label):
    copyfile(
        os.path.join(test_audio_path, fname),
        os.path.join(train_audio_path, predicted, fname),
    )

In [23]:
# Any training clip whose samples are all zero is obviously silence: move it
# into the 'silence' folder.
from shutil import move

silence_dir = train_audio_path + '/silence'
os.makedirs(silence_dir, exist_ok=True)

for folder in os.listdir(train_audio_path):
    folder_path = train_audio_path + '/' + folder
    if folder == 'silence' or not os.path.isdir(folder_path):
        continue

    for fname in os.listdir(folder_path):
        if not fname.endswith('wav'):
            continue

        src = folder_path + '/' + fname
        try:
            sample_rate, samples = wavfile.read(src)
        except ValueError:
            print(fname, folder, 'had a value error')
            continue

        if samples.size == 0:
            # The previous np.argmax check raised ValueError on empty clips and
            # reported them the same way; they are left where they are.
            print(fname, folder, 'had a value error')
            continue

        # np.argmax(samples) == 0 only says the first sample is the maximum,
        # which is also true for many non-silent clips; test for all-zero directly.
        if np.any(samples):
            continue

        dest = silence_dir + '/' + fname
        if os.path.exists(dest):
            # The same file name can occur in several word folders; prefix the
            # source folder rather than overwrite a clip that was already moved.
            dest = silence_dir + '/' + folder + '_' + fname

        print(fname, 'will be moved to silence from', folder)
        move(src, dest)

e5dadd24_nohash_0.wav will be moved to silence from wow
7014b07e_nohash_0.wav will be moved to silence from tree
7014b07e_nohash_0.wav will be moved to silence from five
3e7124ba_nohash_0.wav will be moved to silence from bird




In [24]:
# Maps each contest label string to its numeric class index (its position in the list).
contest_dict = {
    word: index
    for index, word in enumerate([
        'yes',
        'no',
        'up',
        'down',
        'left',
        'right',
        'on',
        'off',
        'stop',
        'go',
        'unknown',
        'silence',
    ])
}

In [47]:
# Build the train / validation / test partitions and their numeric labels from
# the training audio folders, then pickle them for the training notebook.

def _read_split_list(list_path):
    """Return the set of 'label/file.wav' entries named in a split list file."""
    # 'with' closes the handle; a set gives exact, constant-time membership tests
    # in place of a substring scan over every line for every wav file.
    with open(list_path, "r") as list_file:
        return {line.strip() for line in list_file if line.strip()}

# Open test / validation lists
held_out_test = _read_split_list("../ml/datasets/speech/train/testing_list.txt")
held_out_val = _read_split_list("../ml/datasets/speech/train/validation_list.txt")

train_labels = os.listdir(train_audio_path)
print(f'Number of labels: {len(train_labels)}')

wavs = []
labels = []

# create a list of all the wav files and their labels which is NOT background noise
for label in train_labels:
    if label == '_background_noise_':
        continue
    for f in os.listdir(train_audio_path + '/' + label):
        if f.endswith('wav'):
            wavs.append(f)
            labels.append(label)

x_train, x_val, x_test = [], [], []
y_train, y_val, y_test = [], [], []

# Sort by comparing 'label/file' to the lists; anything not found on either list
# is used as training data. The x_* lists hold 'label/file' paths built from the
# original folder name, which is useful when training with a generator.
for wav_name, folder in zip(wavs, labels):
    rel_path = "{}/{}".format(folder, wav_name)
    if rel_path in held_out_test:
        x_test.append(rel_path)
        y_test.append(folder)
    elif rel_path in held_out_val:
        x_val.append(rel_path)
        y_val.append(folder)
    else:
        x_train.append(rel_path)
        y_train.append(folder)

# overwrite labels which are not present in the contest dictionary with the string 'unknown'
y_train = [name if name in contest_dict else 'unknown' for name in y_train]
y_val = [name if name in contest_dict else 'unknown' for name in y_val]
y_test = [name if name in contest_dict else 'unknown' for name in y_test]

# create a list of numeric identifiers for use with NN when feeding dictionaries
# (training and validation share one sequence, in that order)
train_sequences = [contest_dict[name] for name in y_train + y_val]
test_sequences = [contest_dict[name] for name in y_test]

label_list = x_train + x_val

# create label dictionaries ('label/file' path -> class index)
labels = dict(zip(label_list, train_sequences))
test_labels = dict(zip(x_test, test_sequences))

# create test, train, and validation dictionaries for training and final evaluation
# NOTE: this rebinds test_dict, which earlier held the test-directory file names.
test_dict = {'test': x_test}

partition = {'train': x_train,
             'validation': x_val}

# pickle the results
for pickle_name, payload in (
    ('SavedTestDict.pickle', test_dict),
    ('SavedPartition.pickle', partition),
    ('SavedLabels.pickle', labels),
    ('SavedTestLabels.pickle', test_labels),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

Number of labels: 33


In [48]:
# Persist the split dictionaries and label maps under ../ml/speech/.
for pickle_path, payload in (
    ('../ml/speech/SavedTestDict.pickle', test_dict),
    ('../ml/speech/SavedPartition.pickle', partition),
    ('../ml/speech/SavedLabels.pickle', labels),
    ('../ml/speech/SavedTestLabels.pickle', test_labels),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [49]:
import gc
# Run a garbage-collection pass now. As the last expression of the cell, the
# value returned by gc.collect() is what the notebook shows as the cell output.
gc.collect()

46166