In [1]:
%matplotlib inline

In [2]:
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import numpy as np
import tensorflow as tf

In [4]:
import pandas as pd
from scipy import signal
from scipy.io import wavfile
import seaborn as sns
import librosa
import librosa.display
from sklearn.preprocessing import StandardScaler

In [5]:
import os
import sys
import re

In [6]:
import time
from datetime import datetime, timedelta

In [7]:
# Tensorflow setup
# Module-level handle to the active session; it stays None until reset_tf()
# creates one, and reset_vars() runs its initializer through it.
sess = None
# Emit TensorFlow log messages at INFO level and above.
tf.logging.set_verbosity(tf.logging.INFO)

def reset_vars():
    """Run the global-variables initializer in the current session.

    Every variable that exists in the default graph at call time is
    (re)initialized, so call this after the whole graph has been built.

    Raises:
        RuntimeError: if no session has been opened yet (``sess`` is still
            None because ``reset_tf()`` has not been called).
    """
    if sess is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise RuntimeError("No active TensorFlow session; call reset_tf() first.")
    sess.run(tf.global_variables_initializer())

def reset_tf():
    """Replace the module-level session with a fresh one on a clean graph.

    Any session currently held in ``sess`` is closed, the default graph is
    reset, and a new ``tf.Session`` is stored back into ``sess``.
    """
    global sess
    previous = sess
    if previous:
        # Release the old session's resources before discarding its graph.
        previous.close()
    tf.reset_default_graph()
    sess = tf.Session()

In [8]:
# Functions to initialize weights and biases
def weight_variable(shape):
    """Create a variable of the given shape with small random initial values.

    Values are drawn from a truncated normal distribution with standard
    deviation 0.01 (default mean, so both signs occur).
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))


def bias_variable(shape):
    """Create a variable of the given shape with every element set to 0.01."""
    return tf.Variable(tf.constant(0.01, shape=shape))

In [9]:
# Conv2d, max pooling, and dropout wrapper functions for simplicity
def conv2d(x, W, sx=1, sy=1):
    """2-D convolution of ``x`` with filter ``W`` using 'SAME' padding.

    ``sx`` is the stride along the second axis of ``x`` and ``sy`` the stride
    along the third axis; the first and last axes always use stride 1.
    """
    stride_spec = [1, sx, sy, 1]
    return tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')


def max_pool_2d(x, k=2):
    """Max-pool ``x`` with a k-by-k window and matching stride ('SAME' padding)."""
    window = [1, k, k, 1]
    # The stride equals the window size, so pooling regions do not overlap.
    return tf.nn.max_pool(x, ksize=window, strides=list(window), padding='SAME')


def dropout(x, d, is_training):
    """Apply dropout to ``x`` only while training.

    Args:
        x: tensor to apply dropout to.
        d: keep probability handed to ``tf.nn.dropout``.
        is_training: ``None`` or a Python bool (decided while the graph is
            built), or a boolean tensor/placeholder (decided on every run).

    Returns:
        ``x`` itself when ``is_training`` is ``None`` or ``False``; the
        dropout of ``x`` when it is ``True``; otherwise a ``tf.cond`` that
        selects between the two from the fed value.
    """
    if is_training is None:
        return x
    if isinstance(is_training, bool):
        return tf.nn.dropout(x, d) if is_training else x
    # A tensor flag is never None, so the previous `is not None` test enabled
    # dropout unconditionally; it has to be branched on inside the graph.
    return tf.cond(is_training,
                   lambda: tf.nn.dropout(x, d),
                   lambda: tf.identity(x))

In [10]:
# Labels that get their own class; anything else maps to 'unknown'
# (see the num2label construction).
LABELS_REQUIRED = [
    'yes', 'no',
    'up', 'down',
    'left', 'right',
    'on', 'off',
    'stop', 'go',
    'silence',
]

In [11]:
# Audio format constants.
BITRATE = 16                         # Bits per sample (used as the bit depth below)
SAMRATE = 16000                      # Sample rate (Hz)
SAMTIME = 1000.0 / SAMRATE           # Duration of one sample (ms)
MAXAMPS = float(2 ** (BITRATE - 1))  # Max sample amplitude: half of the 2**BITRATE range

In [12]:
# Class index -> label: required labels are numbered from 1 in list order,
# and index 0 is reserved for 'unknown'.
num2label = dict(enumerate(LABELS_REQUIRED, 1))
num2label[0] = 'unknown'
# Inverse lookup: label -> class index.
label2num = dict((label, num) for num, label in num2label.items())

## Build model

In [13]:
# Start from an empty default graph and a new session before building the model.
reset_tf()

# --- Input feature geometry ---
melspec_shape = (64, 126)                           # Mel spectrum data, (f x t)
melspec_size = melspec_shape[0] * melspec_shape[1]  # Flattened Mel spectrum length
mfcc_shape = (19, 126)                              # MFCC data
mfcc_size = mfcc_shape[0] * mfcc_shape[1]           # Flattened MFCC length
sf_size = 122                                       # Length of 1D feature arrays e.g. ZCR and RMSE

# --- Output size: one class per entry of label2num ---
n_classes = len(label2num)

# --- Training schedule ---
batch_size, num_iterations, display_step = 50, 5000, 100

# --- Optimisation ---
learning_rate = 2e-4
dropout_prob_value = 0.60                           # Dropout, probability to keep units

In [14]:
# Parenthesised single-argument print works as a statement on Python 2 and as
# a function on Python 3, matching the print(...) call used in run_optimize.
# NOTE(review): sf_size is counted once although both the ZCR and RMSE inputs
# have that length -- confirm which total is intended.
print("Total feature size:  {}".format(melspec_size + mfcc_size + sf_size))

Total feature size:  10580


In [15]:
# MODEL
def conv_net_speech_model(x_mel, x_mfcc, x_zcr, x_rmse, dropout_prob=None, is_training=False):
    """Build the classifier graph and return its un-normalised class scores.

    Architecture: one strided convolution + ReLU over the Mel spectrogram,
    followed by three fully connected layers, with dropout after the
    convolution and after the first two fully connected layers.

    Args:
        x_mel: flattened Mel spectrogram batch, reshaped here to
            [-1, t_size, f_size, 1].
        x_mfcc, x_zcr, x_rmse: accepted for interface completeness but not
            used by the current architecture.
        dropout_prob: keep probability forwarded to ``dropout``.
        is_training: training flag forwarded to ``dropout``.

    Returns:
        Tensor of logits with ``n_classes`` columns (no softmax applied).
    """

    #======================================================
    # Setup the parameters for the model
    #======================================================

    # Mel Spectrogram input size
    f_size = melspec_shape[0]
    t_size = melspec_shape[1]

    # Parameters for Conv layer 1 filter
    filter_size_t = 32
    filter_size_f = 8
    filter_count = 256
    filter_stride_t = 1
    filter_stride_f = 4

    # Parameters for FC layers
    fc_output_channels_1 = 128
    fc_output_channels_2 = 128
    fc_output_channels_3 = n_classes

    # Number of elements entering the first FC layer.
    # conv2d() pads with 'SAME', so each spatial output length is
    # ceil(input / stride) and does not depend on the filter size. The
    # previous "1 + (input - filter) / stride" formula is the 'VALID' size and
    # did not match the conv output, which broke the flatten reshape below.
    conv_out_t = -(-t_size // filter_stride_t)   # ceil division
    conv_out_f = -(-f_size // filter_stride_f)   # ceil division
    fc_element_count = filter_count * conv_out_t * conv_out_f

    #======================================================
    # Setup dictionaries containing weights and biases
    #======================================================

    weights = {
        'wconv1': weight_variable([filter_size_t, filter_size_f, 1, filter_count]),
        'wfc1': weight_variable([fc_element_count, fc_output_channels_1]),
        'wfc2': weight_variable([fc_output_channels_1, fc_output_channels_2]),
        'wfc3': weight_variable([fc_output_channels_2, fc_output_channels_3]),
    }
    biases = {
        'bconv1': bias_variable([filter_count]),
        'bfc1': bias_variable([fc_output_channels_1]),
        'bfc2': bias_variable([fc_output_channels_2]),
        'bfc3': bias_variable([fc_output_channels_3]),
    }

    #======================================================
    # Model definition and calculations
    #======================================================

    # Reshape input to [audio file number, time size, freq size, channel]
    # NOTE(review): melspec_shape is documented as (f x t); a plain reshape to
    # (t, f) reinterprets the flat buffer rather than transposing it --
    # confirm the flattening order used when the features are produced.
    x_mel_rs = tf.reshape(x_mel, [-1, t_size, f_size, 1])

    # Layer 1: first Conv layer, BiasAdd and ReLU
    x_mel_1 = tf.nn.relu(conv2d(x_mel_rs, weights['wconv1'],
                                sx=filter_stride_t,
                                sy=filter_stride_f) + biases['bconv1'])

    # Dropout 1:
    x_mel_dropout_1 = dropout(x_mel_1, dropout_prob, is_training)

    # Flatten the conv output to one row per audio file
    x_mel_1_rs = tf.reshape(x_mel_dropout_1, [-1, fc_element_count])

    # Layer 2: first FC layer (linear, no activation)
    x_mel_2 = tf.matmul(x_mel_1_rs, weights['wfc1']) + biases['bfc1']

    # Dropout 2:
    x_mel_dropout_2 = dropout(x_mel_2, dropout_prob, is_training)

    # Layer 3: second FC layer (linear, no activation)
    x_mel_3 = tf.matmul(x_mel_dropout_2, weights['wfc2']) + biases['bfc2']

    # Dropout 3:
    x_mel_dropout_3 = dropout(x_mel_3, dropout_prob, is_training)

    # Layer 4: third FC layer, producing one logit per class
    x_mel_output = tf.matmul(x_mel_dropout_3, weights['wfc3']) + biases['bfc3']

    return x_mel_output

In [16]:
# Placeholder variables input
# Each placeholder takes a batch of flattened feature rows; the leading
# dimension is left open so any batch size can be fed.
x_mel = tf.placeholder(tf.float32, [None, melspec_size], name='x_mel')
x_mfcc = tf.placeholder(tf.float32, [None, mfcc_size], name='x_mfcc')
x_zcr = tf.placeholder(tf.float32, [None, sf_size], name='x_zcr')
# Graph name corrected from the misspelt 'x_rsme' so it matches the variable.
x_rmse = tf.placeholder(tf.float32, [None, sf_size], name='x_rmse')

In [17]:
# Placeholder variables output
# One row per sample with n_classes columns
# (presumably one-hot encoded labels -- confirm against the data pipeline).
y_true = tf.placeholder(tf.float32, shape=[None, n_classes], name='y_true')
# Index of the largest entry in each row, i.e. the true class number.
y_true_class = tf.argmax(y_true, 1)

In [18]:
# Dropout keep probability and training flag
# Both are fed on every sess.run call so the same graph serves training and
# evaluation.
dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')
is_training = tf.placeholder(tf.bool, name="is_training")

In [19]:
# Prediction from model
# y_pred holds un-normalised logits (the model applies no softmax);
# building it also creates all of the model's weight and bias variables.
y_pred = conv_net_speech_model(x_mel, x_mfcc, x_zcr, x_rmse, dropout_prob=dropout_prob, is_training=is_training)

In [20]:
# Cross entropy loss function with softmax then takes mean
# The loss op applies the softmax itself, so y_pred is passed as raw logits.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true))
# Record the loss as a scalar summary for the summary writers.
tf.summary.scalar('loss', loss)

# Train and backprop gradients function
# Running this op performs one Adam update step that minimizes `loss`.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Evaluation and accuracy
# Predicted class = index of the largest logit; accuracy = fraction of rows
# whose predicted class equals the true class.
y_pred_class = tf.argmax(y_pred, 1)
correct_prediction = tf.equal(y_pred_class, y_true_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Record the accuracy as a scalar summary alongside the loss.
tf.summary.scalar('accuracy', accuracy)

<tf.Tensor 'accuracy:0' shape=() dtype=string>

In [21]:
# Merge all summaries
# A single op that evaluates every summary registered so far (loss and
# accuracy), so one sess.run call yields one serialized summary.
merged = tf.summary.merge_all()

In [22]:
# Initialize
# Must run after the full graph (model and optimizer) has been built, so that
# every variable created so far gets initialized.
reset_vars()

# Set path to summary logs
# A relative directory named after the current timestamp, so each run writes
# to its own folder.
now = datetime.now()
logs_path = now.strftime("%Y%m%d-%H%M%S") + '/summaries'

# Create summary writers
# Separate train/ and test/ sub-directories; both writers are given the
# default graph.
train_writer = tf.summary.FileWriter(logs_path + '/train', graph=tf.get_default_graph())
test_writer = tf.summary.FileWriter(logs_path + '/test', graph=tf.get_default_graph())

## Train model

In [111]:
def next_batch(num, arrays=None):
    """Return ``num`` randomly chosen, row-aligned samples from each array.

    The same random row indices (drawn without replacement) are applied to
    every array, so features and labels stay aligned.

    Args:
        num: number of samples to draw.
        arrays: optional sequence of equally long arrays that support integer
            array indexing. When omitted, the notebook-level ``data_list`` is
            used; an earlier cell must have defined it.

    Returns:
        A list with one entry per input array, each holding the selected rows.

    Raises:
        NameError: if ``arrays`` is omitted and ``data_list`` is not defined.
        ValueError: from ``np.random.choice`` if ``num`` exceeds the number
            of available samples.
    """
    if arrays is None:
        # TODO: get files, do signal processing and return features
        arrays = data_list

    # One shared index set, drawn from the length of the first array.
    picked = np.random.choice(len(arrays[0]), size=num, replace=False)
    return [data[picked] for data in arrays]

In [80]:
# TRAINING
def _build_feed(features, labels, keep_prob, training):
    """Map one (mel, mfcc, zcr, rmse) feature tuple plus labels onto the graph placeholders."""
    mel, mfcc, zcr, rmse = features
    return {x_mel: mel,
            x_mfcc: mfcc,
            x_zcr: zcr,
            x_rmse: rmse,
            y_true: labels,
            dropout_prob: keep_prob,
            is_training: training}


def run_optimize(num_iterations, min_loss=0):
    """Train the model for up to ``num_iterations`` steps.

    Each step draws a batch with ``next_batch(batch_size)`` and runs one
    optimizer update. Every ``display_step`` steps, and on the last step,
    loss and accuracy are evaluated on the current training batch and on the
    notebook-level test arrays (``x_mel_test``, ``x_mfcc_test``,
    ``x_zcr_test``, ``x_rmse_test``, ``y_true_test``). Both results are
    written to their summary writers and the test metrics are printed.

    Args:
        num_iterations: maximum number of optimization steps.
        min_loss: training stops early once the evaluated training-batch loss
            falls below this value (checked only on reporting steps).
    """
    # Start-time
    start_time = time.time()

    # range() works on Python 2 and 3; on Python 2 it builds a list, which is
    # negligible for typical iteration counts.
    for i in range(num_iterations):

        # Get the training batch
        x_mel_batch, x_mfcc_batch, x_zcr_batch, x_rmse_batch, y_true_batch = next_batch(batch_size)
        batch_features = (x_mel_batch, x_mfcc_batch, x_zcr_batch, x_rmse_batch)

        # Training optimization (dropout active)
        sess.run(optimizer,
                 feed_dict=_build_feed(batch_features, y_true_batch, dropout_prob_value, True))

        if (i % display_step == 0) or (i == num_iterations - 1):
            # Evaluation feeds use keep probability 1.0 so no units are
            # dropped while measuring loss and accuracy.

            # Training summary
            train_summary, loss_train, _ = sess.run(
                [merged, loss, accuracy],
                feed_dict=_build_feed(batch_features, y_true_batch, 1.0, False))
            train_writer.add_summary(train_summary, i + 1)

            # Testing summary
            test_features = (x_mel_test, x_mfcc_test, x_zcr_test, x_rmse_test)
            test_summary, loss_test, acc_test = sess.run(
                [merged, loss, accuracy],
                feed_dict=_build_feed(test_features, y_true_test, 1.0, False))
            test_writer.add_summary(test_summary, i + 1)

            # Display message (reports the test-set metrics)
            msg = "  OPTIMIZE STEP: {:6d}, LOSS, {:.6f}, ACC: {:.6f}"
            print(msg.format(i + 1, loss_test, acc_test))

            # Check if loss is below minimum (uses the training-batch loss)
            if loss_train < min_loss:
                break

    # End-time
    end_time = time.time()
    print("Time usage: {}".format(timedelta(seconds=int(round(end_time - start_time)))))