Note that this is a hacked-together version of Home Assistant's custom wake word tutorial script. We do not want to train an openWakeWord model; we only use it to generate and augment our samples. Some code has been added to save the augmented samples as wav files for use in training.

In [None]:
# @title  { display-mode: "form" }
# @markdown # 1. Test Example Training Clip Generation
# @markdown Since openWakeWord models are trained on synthetic examples of your
# @markdown target wake word, it's a good idea to make sure that the examples
# @markdown sound correct. Type in your target wake word below, and run the
# @markdown cell to listen to it.
# @markdown
# @markdown Here are some tips that can help get the wake word to sound right:

# @markdown - If your wake word isn't being pronounced in the way
# @markdown you want, try spelling out the sounds phonetically with underscores
# @markdown separating each part.
# @markdown For example: "hey siri" --> "hey_seer_e".

# @markdown - Spell out numbers ("2" --> "two")

# @markdown - Avoid all punctuation except for "?" and "!", and remove unicode characters

import os
import sys
from IPython.display import Audio
# First run only: fetch the piper-sample-generator checkout and the
# en_US-libritts_r-medium voice model it is used with below. The check is on
# the checkout directory, so a failed model download is not retried while
# that directory exists.
if not os.path.exists("./piper-sample-generator"):
    !git clone https://github.com/rhasspy/piper-sample-generator
    !wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'

# Install Python dependencies (not part of the `if` above: runs on every execution)
!pip install piper-phonemize
!pip install webrtcvad

# Make the checkout importable without adding a duplicate entry on re-runs.
if "piper-sample-generator/" not in sys.path:
    sys.path.append("piper-sample-generator/")

# Resolved through the sys.path entry added above.
from generate_samples import generate_samples

target_word = 'hey jar_vis' # @param {type:"string"}
# target_word = 'ow_kay na_boo' # @param {type:"string"}

def text_to_speech(text):
    """Request one synthetic clip of *text* from ``generate_samples``.

    The call asks for a single sample, written to the current directory
    under the file name ``test_generation.wav``, using fixed length/noise
    scales. Nothing is returned.
    """
    synthesis_options = dict(
        max_samples=1,
        length_scales=[1.1],
        noise_scales=[0.7],
        noise_scale_ws=[0.7],
        output_dir='./',
        batch_size=1,
        auto_reduce_batch_size=True,
        file_names=["test_generation.wav"],
    )
    generate_samples(text=text, **synthesis_options)

# Generate the preview clip, then play it back as the cell's output.
text_to_speech(target_word)
Audio("test_generation.wav", autoplay=True)


In [None]:
# @title  { display-mode: "form" }
# @markdown # 2. Download Data
# @markdown Training custom models requires downloading a wide variety of data
# @markdown that will help make the model perform well in real-world scenarios.
# @markdown This example notebook will download small samples of background noise,
# @markdown music, and Room Impulse Responses (to add echo). This will still produce
# @markdown a custom model that performs well, but if you are interested in adding even more,
# @markdown feel free to extend this notebook to download the full datasets and even add
# @markdown your own!
# @markdown
# @markdown Downloading this example data will usually take about 15 minutes.

# @markdown **Important note!** The data downloaded here has a mixture of different
# @markdown licenses and usage restrictions. As such, any custom models trained with this
# @markdown data should be considered as appropriate for **non-commercial** personal use only.

# ## Install all dependencies
# !pip install datasets
# !pip install scipy
# !pip install tqdm

import locale
def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Return ``"UTF-8"`` unconditionally.

    ``do_setlocale`` is accepted only so the function can be called the same
    way as ``locale.getpreferredencoding``; its value is ignored.
    """
    return "UTF-8"

# Swap the stub in for the standard-library function for the rest of the session.
locale.getpreferredencoding = getpreferredencoding

# # install openwakeword (full installation to support training)
# !git clone https://github.com/dscripka/openwakeword
# !pip install -e ./openwakeword
# !cd openwakeword

# # install other dependencies
# !pip install mutagen==1.47.0
# !pip install torchinfo==1.8.0
# !pip install torchmetrics==1.2.0
# !pip install speechbrain==0.5.14
# !pip install audiomentations==0.33.0
# !pip install torch-audiomentations==0.11.0
# !pip install acoustics==0.2.6
# !pip uninstall tensorflow -y
# !pip install tensorflow-cpu==2.8.1
# !pip install tensorflow_probability==0.16.0
# !pip install onnx_tf==1.10.0
# !pip install pronouncing==0.2.0
# !pip install datasets==2.14.6
# !pip install deep-phonemizer==0.0.19

# Download required models (workaround for Colab)
import os
# os.makedirs("./openwakeword/openwakeword/resources/models")
# !wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx -O ./openwakeword/openwakeword/resources/models/embedding_model.onnx
# !wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite -O ./openwakeword/openwakeword/resources/models/embedding_model.tflite
# !wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx -O ./openwakeword/openwakeword/resources/models/melspectrogram.onnx
# !wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite -O ./openwakeword/openwakeword/resources/models/melspectrogram.tflite

# Imports
import sys

if "piper-sample-generator/" not in sys.path:
    sys.path.append("piper-sample-generator/")
from generate_samples import generate_samples

import numpy as np
import torch
import sys
from pathlib import Path
import uuid
import yaml
import datasets
import scipy
from tqdm import tqdm

## Download all data

## Download MIR RIR data (takes about ~2 minutes)
output_dir = "./mit_rirs"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    rir_dataset = datasets.load_dataset("davidscripka/MIT_environmental_impulse_responses", split="train", streaming=True)
    # Save clips to 16-bit PCM wav files
    for row in tqdm(rir_dataset):
        name = row['audio']['path'].split('/')[-1]
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

## Download noise and background audio (takes about ~3 minutes)

# Audioset Dataset (https://research.google.com/audioset/dataset/index.html)
# Download one part of the audioset .tar files, extract, and convert to 16khz
# For full-scale training, it's recommended to download the entire dataset from
# https://huggingface.co/datasets/agkphysics/AudioSet, and
# even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)

if not os.path.exists("audioset"):
    os.mkdir("audioset")

    fname = "bal_train09.tar"
    out_dir = f"audioset/{fname}"
    link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/" + fname
    !wget -O {out_dir} {link}
    !cd audioset && tar -xvf bal_train09.tar

    output_dir = "./audioset_16k"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Save clips to 16-bit PCM wav files
    audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("audioset/audio").glob("**/*.flac")]})
    audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
    for row in tqdm(audioset_dataset):
        name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# Free Music Archive dataset
# https://github.com/mdeff/fma

output_dir = "./fma"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    fma_dataset = datasets.load_dataset("rudraml/fma", name="small", split="train", streaming=True)
    fma_dataset = iter(fma_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000)))

    # Save clips to 16-bit PCM wav files
    n_hours = 1  # use only 1 hour of clips for this example notebook, recommend increasing for full-scale training
    for i in tqdm(range(n_hours*3600//30)):  # this works because the FMA dataset is all 30 second clips
        row = next(fma_dataset)
        name = row['audio']['path'].split('/')[-1].replace(".mp3", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))
        i += 1
        if i == n_hours*3600//30:
            break

# Download pre-computed openWakeWord features for training and validation
# Each file is fetched only when no file of that name exists in the working
# directory, so a partial file left by an interrupted download must be deleted
# by hand before re-running this cell.
# NOTE(review): both files are referenced by the training config written later
# in this notebook; whether clip generation alone needs them is not visible
# here -- confirm before skipping these downloads.

# training set (~2,000 hours from the ACAV100M Dataset)
# See https://huggingface.co/datasets/davidscripka/openwakeword_features for more information
if not os.path.exists("./openwakeword_features_ACAV100M_2000_hrs_16bit.npy"):
    !wget https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy

# validation set for false positive rate estimation (~11 hours)
if not os.path.exists("validation_set_features.npy"):
    !wget https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/validation_set_features.npy


In [None]:
!pip install -e ./openwakeword
!pip install piper-phonemize
!pip install webrtcvad
!pip install torch
!pip install mutagen
!pip install torchinfo
!pip install torchmetrics
!pip install speechbrain
!pip install audiomentations
!pip install torch-audiomentations
!pip install acoustics
# !pip uninstall tensorflow -y
# !pip install tensorflow-cpu==2.8.1
# !pip install tensorflow_probability==0.16.0
!pip install onnx_tf
!pip install pronouncing
!pip install datasets==2.14.6
!pip install deep-phonemizer

In [None]:
# @title  { display-mode: "form" }
# @markdown # 3. Train the Model
# @markdown Now that you have verified your target wake word and downloaded the data,
# @markdown the last step is to adjust the training parameters (or keep
# @markdown the defaults below) and start the training!

# @markdown Each parameter controls a different aspect of training:
# @markdown - `number_of_examples` controls how many examples of your wakeword
# @markdown are generated. The default (1,000) usually produces a good model,
# @markdown but between 30,000 and 50,000 is often the best.

# @markdown - `number_of_training_steps` controls how long to train the model.
# @markdown Similar to the number of examples, the default (10,000) usually works well
# @markdown but training longer usually helps.

# @markdown - `false_activation_penalty` controls how strongly false activations
# @markdown are penalized during the training process. Higher values can make the model
# @markdown much less likely to activate when it shouldn't, but may also cause it
# @markdown to not activate when the wake word isn't spoken clearly and there is
# @markdown background noise.

# @markdown With the default values shown below,
# @markdown this takes about 30 - 60 minutes total on the normal CPU Colab runtime.
# @markdown If you want to train on more examples or train for longer,
# @markdown try changing the runtime type to a GPU to significantly speedup
# @markdown the example generating and model training.

# @markdown When the model finishes training, you can navigate to the `my_custom_model` folder
# @markdown in the file browser on the left (click on the folder icon), and download
# @markdown the [your target wake word].onnx or  <your target wake word>.tflite files.
# @markdown You can then use these as you would any other openWakeWord model!

import yaml
import sys
sys.path.append('./openwakeword/openwakeword')

target_word = 'hey jar_vis'

# Load default YAML config file for training
config = yaml.load(open("./openwakeword/examples/custom_model.yml", 'r').read(), yaml.Loader)

# Modify values in the config and save a new version
number_of_examples = 100000 # @param {type:"slider", min:100, max:50000, step:50}
number_of_training_steps = 10000  # @param {type:"slider", min:0, max:50000, step:100}
false_activation_penalty = 1500  # @param {type:"slider", min:100, max:5000, step:50}
config["target_phrase"] = [target_word]
config["model_name"] = config["target_phrase"][0].replace(" ", "_")
config["n_samples"] = number_of_examples
config["n_samples_val"] = max(500, number_of_examples//10)
config["steps"] = number_of_training_steps
config["target_accuracy"] = 0.5
config["target_recall"] = 0.25
config["output_dir"] = "./my_custom_model_100k"
config["max_negative_weight"] = false_activation_penalty

config["background_paths"] = ['./audioset_16k', './fma']  # multiple background datasets are supported
config["false_positive_validation_data_path"] = "validation_set_features.npy"
config["feature_data_files"] = {"ACAV100M_sample": "openwakeword_features_ACAV100M_2000_hrs_16bit.npy"}

with open('my_model.yaml', 'w') as file:
    documents = yaml.dump(config, file)

# Generate clips
!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --generate_clips


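You will need to adjust the hard-coded directory paths in the following code so that they point to where your generated clips were written and to where the augmented clips should be saved!
(TODO: Make this more seamless)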

In [None]:
import numpy as np
import torch
import sys
from pathlib import Path
import uuid
import yaml
import datasets
import scipy
from tqdm import tqdm
import os

sys.path.append('./openwakeword/openwakeword')
# import openwakeword as oww
from data import augment_clips
from utils import compute_features_from_generator

from scipy.io.wavfile import write

# Number of clips requested from each augmentation generator per batch.
batch_size=16

# Directory holding the four sets of generated clips. Adjust this (and
# `augmented_clips_directory` below) to match your own storage layout.
generated_clips_root = os.path.join("/workspace","external_storage","my_custom_model","hey_jar_vis")
positive_train_clips_directory = os.path.join(generated_clips_root, "positive_train")
negative_train_clips_directory = os.path.join(generated_clips_root, "negative_train")
positive_test_clips_directory = os.path.join(generated_clips_root, "positive_test")
negative_test_clips_directory = os.path.join(generated_clips_root, "negative_test")

augmented_clips_directory = os.path.join("/workspace","external_storage", "augmented_hey_jarvis")

# makedirs also creates missing parent directories and, with exist_ok=True,
# is a no-op when the directory is already there.
os.makedirs(augmented_clips_directory, exist_ok=True)

# Paths of every .wav file directly inside each source directory (non-recursive).
positive_clips_train = [str(i) for i in Path(positive_train_clips_directory).glob("*.wav")]
negative_clips_train = [str(i) for i in Path(negative_train_clips_directory).glob("*.wav")]
positive_clips_test = [str(i) for i in Path(positive_test_clips_directory).glob("*.wav")]
negative_clips_test = [str(i) for i in Path(negative_test_clips_directory).glob("*.wav")]

# Background-noise and room-impulse-response data saved by the download cell.
audioset_clips_directory = "./audioset_16k"
fma_clips_directory = "./fma"
rirs_directory = "./mit_rirs"

rir_paths = [str(i) for i in Path(rirs_directory).glob("*.wav")]



def _build_augmenter(clip_paths):
    """Call ``augment_clips`` on *clip_paths* with the settings shared by all four sets."""
    return augment_clips(
        clip_paths,
        total_length=24000,  # presumably a length in samples -- TODO confirm against augment_clips
        batch_size=batch_size,
        background_clip_paths=[audioset_clips_directory, fma_clips_directory],
        RIR_paths=rir_paths,
    )


# One augmenter per clip set; all four share the same settings.
positive_train_generator = _build_augmenter(positive_clips_train)
negative_train_generator = _build_augmenter(negative_clips_train)
positive_test_generator = _build_augmenter(positive_clips_test)
negative_test_generator = _build_augmenter(negative_clips_test)

# Destination directory for each augmented set, mirroring the source set names.
augmented_positive_train_directory = os.path.join(augmented_clips_directory, "positive_train")
augmented_negative_train_directory = os.path.join(augmented_clips_directory, "negative_train")
augmented_positive_test_directory = os.path.join(augmented_clips_directory, "positive_test")
augmented_negative_test_directory = os.path.join(augmented_clips_directory, "negative_test")

# Each entry: [augmentation generator, output directory, source directory].
generator_outputs = [[positive_train_generator, augmented_positive_train_directory, positive_train_clips_directory],
                     [negative_train_generator, augmented_negative_train_directory, negative_train_clips_directory],
                     [positive_test_generator, augmented_positive_test_directory, positive_test_clips_directory],
                     [negative_test_generator, augmented_negative_test_directory, negative_test_clips_directory]]

for generator, output_directory, source_directory in generator_outputs:
    # Creates missing parents too and tolerates re-runs.
    os.makedirs(output_directory, exist_ok=True)

    # Count only the .wav files, i.e. the same files that were handed to the
    # generator, and round up so a final partial batch is included.
    # NOTE(review): assumes the generator yields one batch per `batch_size`
    # input clips; this only affects the progress-bar total.
    n_total = len(list(Path(source_directory).glob("*.wav")))
    n_batches = -(-n_total // batch_size)
    counter = 0

    # A single loop handles every batch. An empty source set simply writes
    # nothing instead of raising StopIteration from a leading next() call.
    for audio_data in tqdm(generator, total=n_batches, desc="Augmenting clips"):
        for clip in audio_data:
            # Clips are numbered sequentially within each output directory.
            wav_path = os.path.join(output_directory, str(counter) + '.wav')
            write(wav_path, 16000, clip)
            counter += 1