# Train a Simple Audio Recognition Model

This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.

The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers/overview).

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>


**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and setting **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.

## Configure Defaults

**MODIFY** the following constants for your specific use case.

In [1]:
# Comma-delimited list of the labels to train for. This notebook trains on the
# two classes "snoring" and "no_snoring".
# All the other words will be used to train an "unknown" label and silent
# audio data with no spoken words will be used to train a "silence" label.
WANTED_WORDS = "snoring,no_snoring"

# Training schedule. The number of steps and the learning rates are
# comma-separated lists with one entry per stage. For example,
# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001
# will run 15,000 training loops in total, with a rate of 0.001 for the first
# 12,000, and 0.0001 for the final 3,000.
TRAINING_STEPS = "25,25"
LEARNING_RATE = "0.005,0.005"

# Feature-extraction settings and filesystem locations.
WINDOW_STRIDE = 20
PREPROCESS = "micro"
DATASET_DIR = '/home/jinhao/Snoring-Detection/Snoring_Dataset_@16000/'
LOGS_DIR = 'logs/'

# Total number of steps across all stages, kept as a string because it is used
# to identify the checkpoint file name.
TOTAL_STEPS = str(sum(int(stage_steps) for stage_steps in TRAINING_STEPS.split(",")))

# Print the configuration to confirm it
print("\n".join([
    "Training these words: %s" % WANTED_WORDS,
    "Training steps in each stage: %s" % TRAINING_STEPS,
    "Learning rate in each stage: %s" % LEARNING_RATE,
    "Total number of training steps: %s" % TOTAL_STEPS,
]))

Training these words: snoring,no_snoring
Training steps in each stage: 25,25
Learning rate in each stage: 0.005,0.005
Total number of training steps: 50


In [2]:
import tensorflow as tf
# Show the installed TensorFlow version; the saved output of this cell is 1.14.0.
print(tf.__version__)

2023-03-04 16:29:21.719929: E tensorflow/core/platform/hadoop/hadoop_file_system.cc:132] HadoopFileSystem load error: libhdfs.so: cannot open shared object file: No such file or directory


1.14.0


In [4]:
import sys
sys.path.append("../deployment/tensorflow1/tensorflow/examples/speech_commands/")
import input_data
import models

# Audio clip and spectrogram settings passed to models.prepare_model_settings.
SAMPLE_RATE = 16000
CLIP_DURATION_MS = 1000
WINDOW_SIZE_MS = 30.0
FEATURE_BIN_COUNT = 40

# Background-noise and time-shift settings (all zero in this notebook).
BACKGROUND_FREQUENCY = 0
BACKGROUND_VOLUME_RANGE = 0
TIME_SHIFT_MS = 0.0

# Dataset composition and splits.
SILENCE = 10
# NOTE(review): the fourth positional argument of AudioProcessor is
# `unknown_percentage` in the upstream speech_commands code -- confirm against
# the local copy under ../deployment/.
UNKNOWN_PERCENTAGE = 10
VALIDATION_PERCENTAGE = 10
TESTING_PERCENTAGE = 10

# Left empty here; the value that was commented out is
# 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
DATA_URL = ''

# Split the label list once and reuse it for both calls below.
wanted_words_list = WANTED_WORDS.split(',')
label_count = len(input_data.prepare_words_list(wanted_words_list))

model_settings = models.prepare_model_settings(
    label_count,
    SAMPLE_RATE,
    CLIP_DURATION_MS,
    WINDOW_SIZE_MS,
    WINDOW_STRIDE,
    FEATURE_BIN_COUNT,
    PREPROCESS)

audio_processor = input_data.AudioProcessor(
    DATA_URL,
    DATASET_DIR,
    SILENCE,
    UNKNOWN_PERCENTAGE,
    wanted_words_list,
    VALIDATION_PERCENTAGE,
    TESTING_PERCENTAGE,
    model_settings,
    LOGS_DIR)


2023-03-04 16:29:40.089197: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 6091720 exceeds 10% of system memory.


In [5]:
# Print the path of the Python interpreter this kernel is running on
# (presumably to confirm the notebook uses the intended environment).
# Reference on adding a conda environment to Jupyter:
# https://medium.com/@nrk25693/how-to-add-your-conda-environment-to-your-jupyter-notebook-in-just-4-steps-abeab8b8d084
import sys
print(sys.executable)

/usr/bin/python3


In [9]:
import numpy as np
from scipy.io.wavfile import read as wav_read
import io,os
import ffmpeg
#import librosa
import scipy.io.wavfile
import tensorflow as tf

In [10]:
# Helper function to run inference (on a single input this time)
# Note: this also includes additional manual pre-processing
def run_tflite_inference_singleFile(tflite_model_path, custom_audio, sr_custom_audio, model_type="Float"):
  """Run a single clip through a TFLite model and print the predicted label.

  Relies on notebook-level state defined in other cells: the `input_data`
  module, `model_settings`, `WANTED_WORDS`, and an open TensorFlow session
  bound to the global name `TF_SESS`.

  Args:
    tflite_model_path: Path of the .tflite model file to load.
    custom_audio: Passed unchanged to `AudioProcessor.get_features_for_wav`;
      the callers in this notebook pass the path of a .wav file.
    sr_custom_audio: Sample rate of the clip. Currently unused: the resampling
      step that consumed it is disabled, so the clip is used as-is.
    model_type: "Quantized" to quantize the input with the model's input
      scale/zero-point before inference; any other value (default "Float")
      feeds the features unchanged. Also used as the prefix of the printed
      message.

  Returns:
    None. The result is printed.

  Raises:
    ValueError: If the top-scoring output index has no label mapping.
  """
  #
  # Preprocess the sample to get the features we pass to the model
  #
  # Resampling to SAMPLE_RATE and trimming to the loudest one-second section
  # (via librosa / extract_loudest_section) are disabled in this notebook; the
  # clip goes straight to the feature extractor.
  custom_audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
                                                     model_settings, None)
  custom_audio_preprocessed = custom_audio_processor.get_features_for_wav(
      custom_audio, model_settings, TF_SESS)

  # Flatten the features of the first (only) result and add a leading batch
  # dimension of 1, since the model expects a (1, N) matrix.
  custom_audio_input = custom_audio_preprocessed[0].flatten()
  test_data = np.reshape(custom_audio_input, (1, len(custom_audio_input)))

  #
  # Initialize the interpreter
  #
  interpreter = tf.lite.Interpreter(tflite_model_path)
  interpreter.allocate_tensors()
  input_details = interpreter.get_input_details()[0]
  output_details = interpreter.get_output_details()[0]

  #
  # For quantized models, manually quantize the input data from float to integer
  #
  if model_type == "Quantized":
    input_scale, input_zero_point = input_details["quantization"]
    test_data = test_data / input_scale + input_zero_point
    test_data = test_data.astype(input_details["dtype"])

  #
  # Run the interpreter
  #
  interpreter.set_tensor(input_details["index"], test_data)
  interpreter.invoke()
  output = interpreter.get_tensor(output_details["index"])[0]
  top_prediction = output.argmax()

  #
  # Translate the output
  #
  # NOTE(review): indices 0/1 are mapped to the wanted words and 2 to
  # 'silence'. Upstream speech_commands orders labels as silence, unknown,
  # then the wanted words -- confirm this mapping matches the label order the
  # model was trained with.
  wanted_words = WANTED_WORDS.split(',')
  if top_prediction == 0 or top_prediction == 1:
    top_prediction_str = wanted_words[top_prediction]
  elif top_prediction == 2:
    top_prediction_str = 'silence'
  else:
    # The original called an undefined `raiseError`, which surfaced as a
    # NameError; raise a real exception instead.
    raise ValueError('Label not defined')

  print('%s model guessed the value to be %s' % (model_type, top_prediction_str))


In [12]:
# Smoke-test the quantized model on one known clip per class.
# run_tflite_inference_singleFile reads the session from the global TF_SESS,
# so it has to be created before the calls. The try/finally closes it even
# when an inference call raises, so a failed run does not leave it open.
TF_SESS = tf.compat.v1.InteractiveSession()
try:
  tflite_model_file = 'conv_micro/conv_model.tflite'
  # Then test the model -- do they all work as you'd expect?
  print("Testing snoring")
  run_tflite_inference_singleFile(tflite_model_file, DATASET_DIR+'snoring/1_44.wav', SAMPLE_RATE, model_type="Quantized")
  print("Testing no snoring")
  run_tflite_inference_singleFile(tflite_model_file, DATASET_DIR+'no_snoring/0_14.wav', SAMPLE_RATE, model_type="Quantized")
finally:
  TF_SESS.close()

Testing snoring




TypeError: __init__() missing 1 required positional argument: 'summaries_dir'