In [1]:
%matplotlib inline

In [2]:
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import os
import sys
import re

In [6]:
import numpy as np
import tensorflow as tf

In [7]:
import time
from datetime import datetime, timedelta

## Load custom libraries

In [8]:
import config as cfg
from dataload import load_data, load_batch
from preprocessing import signalProcessBatch

## Tensorflow setup

In [9]:
# TensorFlow setup: show log messages of severity INFO and above
tf.logging.set_verbosity(tf.logging.INFO)

# Shared session handle; stays None until reset_tf() opens a session
sess = None

def reset_vars():
    """Runs the global-variables initializer op in the module-level session `sess`"""
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

def reset_tf():
    """Closes any open session, resets the default graph and opens a new session.

    The new session replaces the module-level `sess`.
    """
    global sess
    previous = sess
    if previous:
        previous.close()
    tf.reset_default_graph()
    sess = tf.Session()

In [10]:
# Functions to initialize weights and biases
def weight_variable(shape, name):
    """Creates a variable of size shape drawn from a truncated normal (stddev 0.01)

    No mean is passed, so values are centred on the distribution's default
    mean: they are small in magnitude, not necessarily positive.
    """
    init_values = tf.truncated_normal(shape, stddev=0.01)
    weights = tf.Variable(init_values, name=name)
    return weights


def bias_variable(shape, name):
    """Creates a variable of size shape with every element set to 0.01"""
    return tf.Variable(tf.constant(0.01, shape=shape), name=name)

In [11]:
# Conv2d, max pooling, and dropout wrapper functions for simplicity (No padding)
def conv2d(x, W, sx=1, sy=1):
    """Convolves x with filter W using 'VALID' (no) padding.

    sx and sy are the strides along the second and third axes of x; the first
    and last axes always use a stride of 1.
    """
    stride_spec = [1, sx, sy, 1]
    return tf.nn.conv2d(x, W, strides=stride_spec, padding='VALID')


def max_pool_2d(x, k=2):
    """Max-pools x with a k x k window and a matching k x k stride, 'VALID' (no) padding."""
    window = [1, k, k, 1]
    step = [1, k, k, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='VALID')


def dropout(x, d, is_training):
    """Applies dropout to x only when training.

    Args:
        x: input tensor.
        d: keep probability handed to tf.nn.dropout.
        is_training: controls whether dropout is applied.
            None or False -> x is returned unchanged.
            True          -> dropout is always applied.
            anything else -> treated as a boolean tensor (e.g. a tf.bool
                             placeholder) and the choice is made at run time
                             through tf.cond.

    Returns:
        x unchanged, x with dropout applied, or a tf.cond selecting between
        the two.
    """
    # Identity comparisons are used so that a tensor is never evaluated as a
    # Python bool. The earlier check (`is_training is not None`) was also true
    # for False and for every tensor, so the flag could never switch dropout off.
    if is_training is None or is_training is False:
        return x
    if is_training is True:
        return tf.nn.dropout(x, d)
    return tf.cond(is_training, lambda: tf.nn.dropout(x, d), lambda: x)

## Build model

In [27]:
# Start from an empty default graph and a new session
reset_tf()

# ---- Feature geometry ----
melspec_shape = (122, 64)     # Shape of Mel spectrum data (t x f)
mfcc_shape = (122, 13)        # Shape of MFCC data (t x mfcc)
sf_size = 122                 # Length of 1D feature arrays e.g. ZCR and RMSE

# Flattened element counts of the two 2D features
melspec_size = melspec_shape[0] * melspec_shape[1]
mfcc_size = mfcc_shape[0] * mfcc_shape[1]

# ---- Labels ----
n_classes = len(cfg.NUM2LABEL)

# ---- Batching and training schedule ----
batch_size = 128
silence_size = 4
num_iterations = 200
display_step = 10             # Iterations between training progress messages
checkpoint_step = 100         # Iterations between checkpoint + validation passes

# ---- Optimisation ----
learning_rate = 5e-4
dropout_prob_value = 0.50     # Dropout, probability to keep units

# ---- Noise settings fed to the preprocessing graph during training ----
noise_factor_value = 0.1
noise_frac_value = 0.25

In [28]:
# Combined length of the flattened mel spectrogram, MFCC and 1D feature arrays.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print("Total feature size:  {}".format(melspec_size + mfcc_size + sf_size))

Total feature size:  9516


In [29]:
# MODEL
def conv_net_speech_model(x_mel_in, x_mfcc_in, x_zcr_in, x_rmse_in, dropout_prob=None, is_training=False):
    """Builds a one-conv-layer, three-FC-layer network on the mel spectrogram.

    Only x_mel_in feeds the network. x_mfcc_in, x_zcr_in and x_rmse_in are
    accepted but not used by this function.

    Pipeline: reshape -> conv + bias + ReLU -> dropout -> flatten -> FC1 ->
    dropout -> FC2 -> dropout -> FC3. The FC layers are matmul + bias only,
    with no activation between them.

    Args:
        x_mel_in: mel spectrogram batch, reshaped here to
            [-1, melspec_shape[0], melspec_shape[1], 1].
        x_mfcc_in, x_zcr_in, x_rmse_in: unused.
        dropout_prob: forwarded to dropout() as its second argument.
        is_training: forwarded to dropout() as its third argument.

    Returns:
        Output of the last FC layer, n_classes values per example.
    """

    # ---- Mel spectrogram input size (module-level melspec_shape) ----
    n_time = melspec_shape[0]
    n_freq = melspec_shape[1]

    # ---- Conv layer 1: filter extent, count and stride per axis ----
    conv_t, conv_f = 61, 8
    n_filters = 180
    stride_t, stride_f = 1, 4

    # ---- Output widths of the three FC layers ----
    fc1_units = 128
    fc2_units = 128
    fc3_units = n_classes

    # Unpadded convolution output extent per axis: 1 + (input - filter) / stride
    out_t = int(1 + (n_time - conv_t) / stride_t)
    out_f = int(1 + (n_freq - conv_f) / stride_f)
    # Number of elements entering the first FC layer
    flat_size = int(n_filters * out_t * out_f)

    # ---- Variables: all weights first, then all biases ----
    w_conv1 = weight_variable([conv_t, conv_f, 1, n_filters], 'wconv1')
    w_fc1 = weight_variable([flat_size, fc1_units], 'wfc1')
    w_fc2 = weight_variable([fc1_units, fc2_units], 'wfc2')
    w_fc3 = weight_variable([fc2_units, fc3_units], 'wfc3')

    b_conv1 = bias_variable([n_filters], 'bconv1')
    b_fc1 = bias_variable([fc1_units], 'bfc1')
    b_fc2 = bias_variable([fc2_units], 'bfc2')
    b_fc3 = bias_variable([fc3_units], 'bfc3')

    # ---- Forward pass ----
    # Reshape input to [audio file number, time size, freq size, channel]
    images = tf.reshape(x_mel_in, [-1, n_time, n_freq, 1])

    # Conv layer with bias and ReLU, followed by dropout
    conv_out = conv2d(images, w_conv1, sx=stride_t, sy=stride_f)
    hidden = tf.nn.relu(conv_out + b_conv1)
    hidden = dropout(hidden, dropout_prob, is_training)

    # Flatten the feature maps for the FC layers
    hidden = tf.reshape(hidden, [-1, flat_size])

    # First FC layer, then dropout
    hidden = tf.matmul(hidden, w_fc1) + b_fc1
    hidden = dropout(hidden, dropout_prob, is_training)

    # Second FC layer, then dropout
    hidden = tf.matmul(hidden, w_fc2) + b_fc2
    hidden = dropout(hidden, dropout_prob, is_training)

    # Third FC layer produces the per-class outputs
    logits = tf.matmul(hidden, w_fc3) + b_fc3
    return logits

In [30]:
# Placeholders for signals preprocessing inputs
# X_data is a float32 batch with cfg.SAMRATE values per row and any number of rows
# (presumably raw audio samples, one clip per row -- confirm against load_batch)
X_data = tf.placeholder(tf.float32, [None, cfg.SAMRATE], name='X_data')

# Scalar float32 noise settings passed on to signalProcessBatch
noise_factor = tf.placeholder(tf.float32, shape=(), name='noise_factor')
noise_frac = tf.placeholder(tf.float32, shape=(), name='noise_frac')

In [31]:
# Define the audio features
# signalProcessBatch (from the preprocessing module) turns X_data into four
# feature tensors; num_mel_bins and num_mfccs match the second dimension of
# melspec_shape and mfcc_shape
x_mfcc, x_mel, x_zcr, x_rmse = signalProcessBatch(X_data,
                                                  noise_factor=noise_factor,
                                                  noise_frac=noise_frac,
                                                  window=512,
                                                  maxamps=cfg.MAXAMPS, sr=cfg.SAMRATE,
                                                  num_mel_bins=64, num_mfccs=13)

In [32]:
# Placeholder variables output (1-hot vectors of size n_classes)
y_true = tf.placeholder(tf.float32, shape=[None, n_classes], name='y_true')
# Index of the largest entry in each row of y_true, i.e. the class number
y_true_class = tf.argmax(y_true, 1, name='y_true_class')

In [33]:
# Dropout keep probability and training flag
# Both are fed on every sess.run call that evaluates the model
dropout_prob = tf.placeholder(tf.float32, shape=(), name='dropout_prob')
is_training = tf.placeholder(tf.bool, name="is_training")

In [34]:
# Prediction from model
# y_pred holds the raw outputs of the last FC layer (no softmax applied here);
# the model itself only consumes x_mel, the other three features are passed but unused
y_pred = conv_net_speech_model(x_mel, x_mfcc, x_zcr, x_rmse, dropout_prob=dropout_prob, is_training=is_training)

In [35]:
# Cross entropy loss function with softmax then takes mean
# (softmax is applied inside the loss op, so y_pred is passed as logits)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true))
tf.summary.scalar('loss', loss)

# Train and backprop gradients function
# NOTE: `optimizer` holds the result of .minimize(loss); it is what
# run_optimize passes to sess.run to perform one training step
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Evaluation and accuracy
y_pred_class = tf.argmax(y_pred, 1, name='y_pred_class')
correct_prediction = tf.equal(y_pred_class, y_true_class)
# Confusion matrix built with true classes as first argument and predictions as second
confusion_matrix = tf.confusion_matrix(y_true_class, y_pred_class, num_classes=n_classes)
# Fraction of examples in the batch whose predicted class equals the true class
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)

<tf.Tensor 'accuracy:0' shape=() dtype=string>

In [36]:
# Merge all summaries
# Covers the 'loss' and 'accuracy' scalar summaries registered above
merged = tf.summary.merge_all()

In [37]:
# Saver for checkpoints
# Saves every global variable that exists at this point in the graph
saver = tf.train.Saver(tf.global_variables())

In [38]:
# Inspect every variable in the graph: besides the model weights and biases,
# the listing includes the extra variables created by the Adam optimizer
tf.global_variables()

[<tf.Variable 'wconv1:0' shape=(61, 8, 1, 180) dtype=float32_ref>,
 <tf.Variable 'wfc1:0' shape=(167400, 128) dtype=float32_ref>,
 <tf.Variable 'wfc2:0' shape=(128, 128) dtype=float32_ref>,
 <tf.Variable 'wfc3:0' shape=(128, 12) dtype=float32_ref>,
 <tf.Variable 'bconv1:0' shape=(180,) dtype=float32_ref>,
 <tf.Variable 'bfc1:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'bfc2:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'bfc3:0' shape=(12,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'wconv1/Adam:0' shape=(61, 8, 1, 180) dtype=float32_ref>,
 <tf.Variable 'wconv1/Adam_1:0' shape=(61, 8, 1, 180) dtype=float32_ref>,
 <tf.Variable 'wfc1/Adam:0' shape=(167400, 128) dtype=float32_ref>,
 <tf.Variable 'wfc1/Adam_1:0' shape=(167400, 128) dtype=float32_ref>,
 <tf.Variable 'wfc2/Adam:0' shape=(128, 128) dtype=float32_ref>,
 <tf.Variable 'wfc2/Adam_1:0' shape=(128, 128) dtype=float3

In [39]:
# Inspect only the trainable variables (the model's weights and biases)
tf.trainable_variables()

[<tf.Variable 'wconv1:0' shape=(61, 8, 1, 180) dtype=float32_ref>,
 <tf.Variable 'wfc1:0' shape=(167400, 128) dtype=float32_ref>,
 <tf.Variable 'wfc2:0' shape=(128, 128) dtype=float32_ref>,
 <tf.Variable 'wfc3:0' shape=(128, 12) dtype=float32_ref>,
 <tf.Variable 'bconv1:0' shape=(180,) dtype=float32_ref>,
 <tf.Variable 'bfc1:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'bfc2:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'bfc3:0' shape=(12,) dtype=float32_ref>]

## Train model

In [40]:
# TRAINING
def run_optimize(num_iterations, logs_path, min_loss=0):
    """Runs the training loop with periodic checkpointing, validation and logging.

    Each iteration loads a random 'train' batch, runs the preprocessing graph
    on it with the training noise settings, and performs one optimizer step.

    On every `checkpoint_step`-th iteration and on the last iteration a
    checkpoint is saved under `logs_path` and the 'val' data is evaluated in
    batches; the summed confusion matrix and the accumulated accuracy are logged.

    On the first iteration, every `display_step`-th iteration and the last
    iteration, loss and accuracy are re-evaluated on the current training
    batch with dropout_prob 1.0, written to `train_writer` and logged.

    Relies on module-level names: sess, saver, train_writer, the graph tensors
    and placeholders, and the hyper-parameter constants.

    Args:
        num_iterations: maximum number of training iterations.
        logs_path: directory in which 'model.ckpt' checkpoints are saved.
        min_loss: training stops early once the displayed training loss drops
            below this value (only checked on display steps).

    Returns:
        None.
    """
    # Start-time
    start_time = time.time()
    msg = "\n====================\nStarting training...\n===================="
    tf.logging.info(msg)
    
    # Data index used by load_batch for both training and validation batches
    df = load_data(cfg.DATA_DIR)

    # Passed to load_batch as `w` for training batches (validation uses 1.0);
    # its meaning is defined in dataload and is not visible here
    w = 0.075
    tf.logging.info("Begin iterations...")
    for i in xrange(num_iterations):
        
        # Get the training batch
        X_train, y_true_batch = load_batch(df, cfg.DATA_DIR,
                                           batch_size=batch_size, silence_size=silence_size,
                                           label='train',
                                           random=True, seed=None,
                                           w=w, samples=cfg.SAMRATE)
        
        # Preprocess the training batch
        # The features are computed once here and fed back in below, so the
        # training step and the display step see the same noisy features
        x_mfcc_batch, x_mel_batch, x_zcr_batch, x_rmse_batch = sess.run(
            [x_mfcc, x_mel, x_zcr, x_rmse],
            feed_dict={X_data: X_train,
                       noise_factor: noise_factor_value,
                       noise_frac: noise_frac_value})

        # Training optimization
        sess.run(optimizer, feed_dict={x_mel: x_mel_batch,
                                       x_mfcc: x_mfcc_batch,
                                       x_zcr: x_zcr_batch,
                                       x_rmse: x_rmse_batch, 
                                       y_true: y_true_batch,
                                       dropout_prob: dropout_prob_value,
                                       is_training: True})
        
        # Checkpoint save and validation step
        if ((i + 1) % checkpoint_step == 0) or (i == num_iterations - 1):
            
            # Checkpoint
            checkpoint_path = os.path.join(logs_path, 'model.ckpt')
            msg = "Saving checkpoint to: {}-{}"
            tf.logging.info(msg.format(checkpoint_path, i + 1))
            saver.save(sess, checkpoint_path, global_step=i + 1)
            
            # Load the validation batches
            val_batch_size = 100
            total_val_accuracy = 0
            total_conf_matrix = None
            # NOTE(review): hard-coded; presumably the size of the validation set -- confirm
            set_size = 6700
            for j in xrange(0, set_size, val_batch_size):
                # Non-random batch; the running offset j is passed as `seed`
                X_val, y_true_val = load_batch(df, cfg.DATA_DIR,
                                               batch_size=val_batch_size, silence_size=silence_size,
                                               label='val',
                                               random=False, seed=j,
                                               w=1.0, samples=cfg.SAMRATE)
    
                # Preprocess the validation batch
                # Noise settings are zero for validation
                x_mfcc_val, x_mel_val, x_zcr_val, x_rmse_val = sess.run(
                    [x_mfcc, x_mel, x_zcr, x_rmse],
                    feed_dict = {X_data: X_val,
                                 noise_factor: 0.0,
                                 noise_frac: 0.0})
                
                # Validation summary
                # NOTE(review): val_summary and loss_val are fetched but not used afterwards
                val_summary, loss_val, acc_val, conf_matrix = sess.run(
                    [merged, loss, accuracy, confusion_matrix],
                    feed_dict={x_mel: x_mel_val,
                               x_mfcc: x_mfcc_val,
                               x_zcr: x_zcr_val,
                               x_rmse: x_rmse_val,
                               y_true: y_true_val,
                               dropout_prob: 1.0,
                               is_training: False})
                # Each batch contributes val_batch_size / set_size of the total,
                # which assumes every batch holds exactly val_batch_size examples
                total_val_accuracy += (acc_val * val_batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            
            msg = "Confusion Matrix:\n {}"
            tf.logging.info(msg.format(total_conf_matrix))
            msg = "VALIDATION ACC: {:6f}, (N = {})"
            tf.logging.info(msg.format(total_val_accuracy, set_size))

        # Display step
        if (i == 0) or ((i + 1) % display_step == 0) or (i == num_iterations - 1):
            # Training summary, loss and accuracy
            # Measured after the optimizer step, on the batch just trained on
            train_summary, loss_train, acc_train = sess.run(
                [merged, loss, accuracy],
                feed_dict={x_mel: x_mel_batch,
                           x_mfcc: x_mfcc_batch,
                           x_zcr: x_zcr_batch,
                           x_rmse: x_rmse_batch,
                           y_true: y_true_batch,
                           dropout_prob: 1.0,
                           is_training: False})
            train_writer.add_summary(train_summary, i + 1)
            
            # Display message
            msg = "  OPTIMIZE STEP: {:6d}, LOSS, {:.6f}, ACC: {:.6f}"
            tf.logging.info(msg.format(i + 1, loss_train, acc_train))
    
            # Check if loss is below minimum
            # With the default min_loss=0 this only triggers for a negative loss
            if loss_train < min_loss:
                break
    
    # End-time
    end_time = time.time()
    msg = "Time usage: {}"
    tf.logging.info(msg.format(timedelta(seconds=int(round(end_time - start_time)))))
    return

## Run Training (short test)

In [41]:
# Initialize
# Runs the global-variables initializer in the current session
reset_vars()

# Set path to summary logs
# One timestamped directory per run under cfg.OUT_DIR; run_optimize also
# saves its checkpoints into logs_path
now = datetime.now()
logs_path = os.path.join(cfg.OUT_DIR, now.strftime("%Y%m%d-%H%M%S"), 'summaries')

# Create summary writers
# NOTE(review): only train_writer is written to by run_optimize; test_writer
# is not referenced anywhere else in the visible part of the notebook
train_writer = tf.summary.FileWriter(os.path.join(logs_path, 'train'), graph=tf.get_default_graph())
test_writer = tf.summary.FileWriter(os.path.join(logs_path, 'test'), graph=tf.get_default_graph())

In [42]:
# Short training run: num_iterations steps; min_loss=0 leaves early stopping
# effectively off, since it only triggers for a loss below zero
run_optimize(num_iterations, logs_path, min_loss=0)

INFO:tensorflow:
Starting training...
INFO:tensorflow:Begin iterations...
INFO:tensorflow:  OPTIMIZE STEP:      1, LOSS, 3.348485, ACC: 0.140625
INFO:tensorflow:  OPTIMIZE STEP:     10, LOSS, 2.487481, ACC: 0.156250
INFO:tensorflow:  OPTIMIZE STEP:     20, LOSS, 2.438335, ACC: 0.140625
INFO:tensorflow:  OPTIMIZE STEP:     30, LOSS, 2.671888, ACC: 0.125000
INFO:tensorflow:  OPTIMIZE STEP:     40, LOSS, 2.434289, ACC: 0.117188
INFO:tensorflow:  OPTIMIZE STEP:     50, LOSS, 4.500966, ACC: 0.093750
INFO:tensorflow:  OPTIMIZE STEP:     60, LOSS, 2.481188, ACC: 0.132812
INFO:tensorflow:  OPTIMIZE STEP:     70, LOSS, 3.206299, ACC: 0.164062
INFO:tensorflow:  OPTIMIZE STEP:     80, LOSS, 2.535654, ACC: 0.117188
INFO:tensorflow:  OPTIMIZE STEP:     90, LOSS, 2.693345, ACC: 0.085938
INFO:tensorflow:Saving checkpoint to: ../models/20180215-125515/summaries/model.ckpt-100
INFO:tensorflow:Confusion Matrix:
 [[   0    0   46  280    0 1075 1682  166   18  234  316  142]
 [   0    0    4    9    0   