In [10]:
import os
import numpy as np
import tensorflow as tf

import time
import pyaudio
import wave


# Set seed for experiment reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [11]:
wake_word_model = tf.keras.models.load_model("models/Own_Wake_Word_model_augmentation.h5")
# wake_word_model = tf.keras.models.load_model("models/Prerecorded_Wake_Word_model.h5")
wake_word_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resizing (Resizing)          (None, 32, 32, 1)         0         
_________________________________________________________________
normalization (Normalization (None, 32, 32, 1)         3         
_________________________________________________________________
conv2d (Conv2D)              (None, 30, 30, 32)        320       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 14, 14, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 12544)             0

In [12]:
data_dir = 'data/augmented_data'
# data_dir = 'data/commands'

commands = os.listdir(data_dir)
print("Commands: ", commands)

Commands:  ['background', 'hallo', 'ja', 'moin', 'nein']


In [13]:
def decode_audio(audio_binary):
  """
  Decodes a .wav file to a float tensor.

  Args:
      audio_binary: Binary audio to decode
  """
  audio, _ = tf.audio.decode_wav(audio_binary)
  return tf.squeeze(audio, axis=-1)

def get_label(file_path):
  """
  Get the class label from file path

  Args:
      file_path:  Path to extract the class label
  """
  parts = tf.strings.split(file_path, os.path.sep)
  return parts[-2] 

def get_waveform_and_label(file_path, is_label):
  """
  Decodes .wav file and gets class label

  Args:
      file_path:  Path to extract the class label
      is_label:   Is there a label from the .wav file
  """
  label=None
  if is_label:
    label = get_label(file_path)
  audio_binary = tf.io.read_file(file_path)
  waveform = decode_audio(audio_binary)
  return waveform, label

def get_spectrogram(waveform):
  """
  Creates a spectrogram of the audio file.

  Args:
      waveform: Audio signal
  """
  # Padding for files with less than 16000 samples
  zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)

  # Concatenate audio with padding so that all audio clips will be of the 
  # same length
  waveform = tf.cast(waveform, tf.float32)
  equal_length = tf.concat([waveform, zero_padding], 0)
  spectrogram = tf.signal.stft(
      equal_length, frame_length=255, frame_step=128)
      
  spectrogram = tf.abs(spectrogram)

  return spectrogram

In [14]:
def record_audio_file(filename = "prediction.wav", seconds = 1, fs = 16000, channels = 1, sample_format = pyaudio.paInt16, chunk = 1024):
    """
    Records an audio file and saves it in a .wav file

    Args:
        filename:       Name of the .wav file
        seconds:        Length if the audio file
        fs:             Sampling rate
        channels:       Number of channels of the audio signal
        sample_format:  Format in which the audio signal is stored
        chunk:          Splits the audio signal into pieces
    """
    p = pyaudio.PyAudio()  # Create an interface to PortAudio

    print('Recording')

    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)

    frames = []  # Initialize array to store frames

    # Store data in chunks for 1 seconds
    for i in range(0, int(fs / chunk * seconds)):
        data = stream.read(chunk)
        frames.append(data)

    # Stop and close the stream 
    stream.stop_stream()
    stream.close()
    # Terminate the PortAudio interface
    p.terminate()

    print('Finished recording')

    # Save the recorded data as a WAV file
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))
    wf.close()


In [15]:
def start_prediction(filename = "prediction.wav"):
    """
    Records an audio file and predicts what was detected.

    Args:
        filename:   Name of the .wav file
    """
    while True:
        input(f"Press Enter and say a word which should be predicted or stop the prediction: ")

        record_audio_file()

        wvf, lab = get_waveform_and_label(filename, False)
        spec = get_spectrogram(wvf)
        spec = np.expand_dims(spec,axis=0)

        prediction = wake_word_model.predict(spec)
        print(prediction)

        print(commands[np.argmax(prediction)])

        time.sleep(1)


In [16]:
start_prediction()

Recording
Finished recording
[[ 13.6180525  -1.9106178 -11.243571   -3.3285751  -2.9715655]]
background
Recording
Finished recording
[[  5.1767397   1.510049   -8.251047    9.282818  -10.038045 ]]
moin
Recording
Finished recording
[[-5.2694993  6.4521894  5.2985816 -8.378808   1.8166587]]
hallo
Recording
Finished recording
[[ -3.352351    5.1266603   4.2976036 -15.989975   10.711103 ]]
nein
Recording
Finished recording
[[  5.5594635  15.044405   15.737747  -15.701364  -15.442806 ]]
ja
Recording
Finished recording
[[-0.8916323 14.042105  -0.9712932 -8.632631  -1.9799287]]
hallo
Recording
Finished recording
[[ 2.6649547 -3.3254304 -4.244253   5.1195564 -4.4556823]]
moin
Recording
Finished recording
[[ 1.8692368 -3.048441  -0.9631272  2.5073102 -2.3059633]]
moin
Recording
Finished recording
[[ 2.1796489  3.0457776 -3.0928876  2.7128122 -5.749606 ]]
hallo
Recording
Finished recording
[[ 3.268942  4.860447 -5.190403  4.319694 -8.204845]]
hallo
Recording
Finished recording
[[ 1.6974376 -2.07

KeyboardInterrupt: 