# Audio Recognition Lab

## Introduction 


In [34]:
import tensorflow as tf
import pandas as pd 
import numpy as np 
import oci
import os 
import tarfile 
import random 
import math 
import re 
import hashlib 

from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import compat

In [29]:
def which_set(filename, validation_percentage, testing_percentage):
    """Assigns a sample file to the training, validation, or testing partition.

    The decision is derived from a SHA-1 hash of the file's base name, so it
    depends only on that name and on the requested proportions. Adding more
    files later therefore never moves an existing file into another
    partition, which keeps testing samples from leaking into training when
    long runs are restarted.

    Everything from '_nohash_' onwards is removed from the name before
    hashing. This lets the data set creator group related recordings (for
    example words spoken by the same person): 'bobby_nohash_0.wav' and
    'bobby_nohash_1.wav' always end up in the same partition.

    Args:
        filename: File path of the data sample.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.

    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """
    # Only the base name matters, and only the part before '_nohash_'.
    grouping_key = re.sub(r'_nohash_.*$', '', os.path.basename(filename))
    digest = hashlib.sha1(compat.as_bytes(grouping_key)).hexdigest()

    # Fold the digest into 0..MAX_NUM_WAVS_PER_CLASS and rescale that to a
    # percentage-like value that is stable for a given file name.
    bucket = int(digest, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
    percentage_hash = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < (testing_percentage + validation_percentage):
        return 'testing'
    return 'training'

In [63]:
def prepare_words_list(wanted_words):
    """Builds the full label list by putting the reserved tokens first.

    Args:
        wanted_words: List of strings containing the custom words.

    Returns:
        A new list holding the silence token, then the unknown token, then
        the custom words in their original order.
    """
    reserved_tokens = [SILENCE_LABEL, UNKNOWN_WORD_LABEL]
    return reserved_tokens + wanted_words

# Push Data to OCI (for JR)

../data/data_speech_commands_v0.02.tar.gz


In [10]:
# Bucket and object name(s) used by the upload cell below.
bucket_name = "oow-2018"
filenames = ['data_speech_commands_v0.02.tar.gz']
# from_file() is called without arguments, so the SDK's defaults decide which
# config file and profile are read.
config = oci.config.from_file()
object_storage_client = oci.object_storage.ObjectStorageClient(config)
# The Object Storage namespace is required by every bucket/object call.
namespace = object_storage_client.get_namespace().data
# NOTE(review): bucket_list is not read by any visible cell, and this lookup
# requires a 'compartment-id' entry in the loaded config - confirm it exists.
bucket_list = object_storage_client.list_buckets(namespace, config['compartment-id'])

In [14]:
# Upload the local archive to the bucket under the same object name. The file
# is opened in binary mode and the open file object is passed as the body.
# NOTE(review): the put_object return value kept in `obj` is not used by any
# visible cell.
with open("../data/data_speech_commands_v0.02.tar.gz", 'rb') as f:
    obj = object_storage_client.put_object(namespace, bucket_name, "data_speech_commands_v0.02.tar.gz", f)

# Pull Data From OCI 

In [5]:
# Bucket and object name(s) to download.
bucket_name = "oow-2018"
filenames = ['data_speech_commands_v0.02.tar.gz']
# from_file() is called without arguments, so the SDK's defaults decide which
# config file and profile are read.
config = oci.config.from_file()
object_storage_client = oci.object_storage.ObjectStorageClient(config)
namespace = object_storage_client.get_namespace().data
bucket_list = object_storage_client.list_buckets(namespace, config['compartment-id'])

# Make sure the download directory exists. makedirs also creates missing
# parents, and the isdir() check rejects a plain file named "../data" here
# instead of failing later inside open().
download_dir = "../data"
if not os.path.isdir(download_dir):
    os.makedirs(download_dir)

# Stream every object to disk in 1 MiB chunks so the whole archive is never
# held in memory. The loop variable is named `filename` rather than `file`,
# which is a builtin in Python 2.
for filename in filenames:
    get_obj = object_storage_client.get_object(namespace, bucket_name, filename)
    with open(os.path.join(download_dir, filename), 'wb') as f:
        for chunk in get_obj.data.raw.stream(1024 * 1024, decode_content=False):
            f.write(chunk)

# The Data 

In [8]:
# Raise TensorFlow's log level to INFO for the rest of the notebook.
tf.logging.set_verbosity(tf.logging.INFO)
# Start an interactive session and keep a handle to it in `sess`.
# NOTE(review): the background-noise cell further down rebinds `sess` through
# `with tf.Session(...) as sess`, so this handle is lost after that cell runs.
sess = tf.InteractiveSession()

## Extract the tarfile 

In [17]:
#audio_processor = input_data.AudioProcessor(
#      FLAGS.data_url, FLAGS.data_dir,
#      FLAGS.silence_percentage, FLAGS.unknown_percentage,
#      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
#      FLAGS.testing_percentage, model_settings, FLAGS.summaries_dir)

#self.data_dir = data_dir
#    self.maybe_download_and_extract_dataset(data_url, data_dir)
#    self.prepare_data_index(silence_percentage, unknown_percentage,
#                            wanted_words, validation_percentage,
#                            testing_percentage)
#    self.prepare_background_data()
#    self.prepare_processing_graph(model_settings, summaries_dir)

# Directory the speech-commands archive is unpacked into; later cells glob
# '<data_dir>/*/*.wav' to build the sample index.
data_dir = "../data/wav/"
# Open the archive in a `with` block so its file handle is closed once the
# extraction finishes (the bare tarfile.open(...).extractall(...) call never
# closed it).
# NOTE: extractall() writes whatever member paths the archive contains, so it
# must only be used on archives from a trusted source.
with tarfile.open("../data/data_speech_commands_v0.02.tar.gz", 'r:gz') as archive:
    archive.extractall(data_dir)

## Prepare the data, split it into partitions, and create the labels

In [64]:
# Words that get their own label. Wavs found under any other word folder are
# pooled as candidates for the unknown label.
wanted_words = ['dog', 'bed', 'bird', 'cat', 'left', 'house', 'happy', 'go']
# Folder under data_dir that is skipped while indexing words and is read
# separately as background audio.
BACKGROUND_NOISE_DIR_NAME = "_background_noise_"
# Partition sizes (in percent) passed to which_set(); whatever is left over
# goes to training.
validation_percentage = 10.0
testing_percentage = 10.0
# Modulus applied to the filename hash in which_set() before the value is
# scaled to a percentage.
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 
# Reserved labels and their ids. prepare_words_list() puts them first, and
# the wanted words are numbered from 2 upwards.
SILENCE_LABEL = '_silence_'
SILENCE_INDEX = 0
UNKNOWN_WORD_LABEL = '_unknown_'
UNKNOWN_WORD_INDEX = 1
# Seed handed to random.seed() before the index is built and shuffled.
RANDOM_SEED = 59185
# Number of silence / unknown entries added to each partition, expressed as a
# percentage of that partition's wanted-word sample count (rounded up).
silence_percentage = 5.0
unknown_percentage = 5.0

In [65]:
# Seed the RNG first so the shuffles in the following cells are repeatable.
random.seed(RANDOM_SEED)

# Ids 0 and 1 belong to the silence and unknown labels, so the wanted words
# are numbered from 2 in the order they were listed.
wanted_words_index = {
    wanted_word: position + 2
    for position, wanted_word in enumerate(wanted_words)
}

# One (initially empty) sample list per partition, for wanted words and for
# everything else respectively.
data_index = {
    partition: [] for partition in ('validation', 'testing', 'training')
}
unknown_index = {
    partition: [] for partition in ('validation', 'testing', 'training')
}

# Every word folder seen on disk is recorded here as a key.
all_words = dict()

# Each word has its own sub-folder of wav files below data_dir.
search_path = os.path.join(data_dir, '*', '*.wav')
print(search_path)

../data/wav/*/*.wav


In [66]:
for wav_path in gfile.Glob(search_path):
    # The name of the folder that holds the wav is the spoken word.
    word = os.path.basename(os.path.dirname(wav_path)).lower()
    # The '_background_noise_' folder holds long clips that get mixed in to
    # improve training; it is not a word, so leave it out of the index.
    if word == BACKGROUND_NOISE_DIR_NAME:
        continue
    all_words[word] = True
    set_index = which_set(wav_path, validation_percentage, testing_percentage)
    # Wanted words go into the main index; anything else becomes a candidate
    # for training the unknown label.
    destination = data_index if word in wanted_words_index else unknown_index
    destination[set_index].append({'label': word, 'file': wav_path})

if not all_words:
    raise Exception('No .wavs found at ' + search_path)

# Every wanted word must have shown up as a folder on disk.
for wanted_word in wanted_words:
    if wanted_word in all_words:
        continue
    raise Exception('Expected to find ' + wanted_word +
                    ' in labels but only found ' +
                    ', '.join(all_words))

# The silence samples need some file to load. Its content is multiplied by
# zero later, so any training wav will do.
first_training_entry = data_index['training'][0]
silence_wav_path = first_training_entry['file']

In [67]:
for set_index in ['validation', 'testing', 'training']:
    partition = data_index[set_index]
    # Both extra counts are based on the partition size before padding.
    set_size = len(partition)

    # Add the silence entries; each one is its own dict.
    silence_size = int(math.ceil(set_size * silence_percentage / 100))
    partition.extend(
        {'label': SILENCE_LABEL, 'file': silence_wav_path}
        for _ in range(silence_size))

    # Draw a random selection of unknown-word samples for this partition.
    unknown_pool = unknown_index[set_index]
    random.shuffle(unknown_pool)
    unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
    partition.extend(unknown_pool[:unknown_size])

# Randomise the order inside every partition.
for set_index in ['validation', 'testing', 'training']:
    random.shuffle(data_index[set_index])

# Finish the lookup structures: the full label list and a word -> id map in
# which every word that is not wanted shares the unknown id.
words_list = prepare_words_list(wanted_words)
word_to_index = {
    word: wanted_words_index.get(word, UNKNOWN_WORD_INDEX)
    for word in all_words
}
word_to_index[SILENCE_LABEL] = SILENCE_INDEX

# Process the Data

In [70]:
# Decoded background-noise clips, one flattened sample array per wav file.
background_data = []
background_dir = os.path.join(data_dir, BACKGROUND_NOISE_DIR_NAME)
print(background_dir)
# A throw-away graph and session are used only to decode the wav files.
# NOTE(review): `as sess` rebinds the notebook-level `sess` created earlier;
# after this block it refers to a session that has already been closed.
with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    search_path = os.path.join(background_dir, '*.wav')
    for wav_path in gfile.Glob(search_path):
        wav_data = sess.run(
            wav_decoder,
            feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
        # The append must happen inside the loop so every clip is kept.
        # Outside it, only the last clip was stored, and an empty glob hit an
        # undefined `wav_data` before the check below could run.
        background_data.append(wav_data)
    if not background_data:
        raise Exception('No background wav files were found in ' + search_path)

../data/wav/_background_noise_


In [79]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

'1.9.0'

# JUNK ---------------------------------------------------------

In [None]:
  # NOTE(review): this is written as a method (it takes `self`), but no
  # enclosing class is visible in this notebook.
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.
    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then either average-pools it or builds an MFCC fingerprint from it,
    depending on model_settings['preprocess'].
    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:
      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.
    It also sets two summary-related attributes:
      - merged_summaries_: Merge of the summaries in the 'data' scope.
      - summary_writer_: FileWriter created at summaries_dir + '/data'.
    Args:
      model_settings: Information about the current model being trained. The
        keys read here are 'desired_samples', 'window_size_samples',
        'window_stride_samples', 'preprocess', and, depending on the mode,
        'average_window_width' or 'fingerprint_width'.
      summaries_dir: Path to save training summary information to.
    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      # Decode to a single channel with desired_samples samples requested.
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          scaled_foreground,
          self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      # Cut the padded clip back down to desired_samples rows, starting at
      # the fed offset.
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      # Keep the mixed signal within [-1.0, 1.0].
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
      # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
      spectrogram = contrib_audio.audio_spectrogram(
          background_clamp,
          window_size=model_settings['window_size_samples'],
          stride=model_settings['window_stride_samples'],
          magnitude_squared=True)
      tf.summary.image(
          'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
      # The number of buckets in each FFT row in the spectrogram will depend on
      # how many input samples there are in each window. This can be quite
      # large, with a 160 sample window producing 127 buckets for example. We
      # don't need this level of detail for classification, so we often want to
      # shrink them down to produce a smaller result. That's what this section
      # implements. One method is to use average pooling to merge adjacent
      # buckets, but a more sophisticated approach is to apply the MFCC
      # algorithm to shrink the representation.
      if model_settings['preprocess'] == 'average':
        self.output_ = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
      elif model_settings['preprocess'] == 'mfcc':
        self.output_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
        tf.summary.image(
            'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
      else:
        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                         ' "average")' % (model_settings['preprocess']))

      # Merge the summaries created in the 'data' scope and create a writer
      # that stores them, together with the default graph, under
      # summaries_dir + '/data'.
      self.merged_summaries_ = tf.summary.merge_all(scope='data')
      self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                   tf.get_default_graph())

In [None]:
# Build the model settings from FLAGS values; the label count is the length
# of the word list returned by prepare_words_list for the comma-separated
# FLAGS.wanted_words.
# NOTE(review): `models`, `input_data` and `FLAGS` are not imported or defined
# in any visible cell of this notebook - confirm where they come from before
# running this cell.
model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)

In [None]:
  # Build the model settings from FLAGS values, then pull out the values that
  # are used most often.
  # NOTE(review): `models`, `input_data` and `FLAGS` are not imported or
  # defined in any visible cell of this notebook, and this cell is indented as
  # if it had been copied out of a function body - confirm before running.
  model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)

  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  # Convert the time shift from milliseconds to a whole number of samples:
  # ms * (samples per second) / 1000, truncated by int().
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

In [None]:
# The Model 

In [None]:
# Training the Model 

In [None]:
# Results 

In [None]:
# Deployment 