In [None]:
# Downloads audio data for augmentation
# Borrowed from openWakeWord's automatic_model_training.ipynb, accessed March 4, 2024

import datasets
import scipy
import os

import numpy as np

from pathlib import Path
from tqdm import tqdm

## Download MIT RIR data
# Streams the impulse-response dataset from the Hugging Face hub and writes each
# clip into ./mit_rirs as a 16-bit PCM wav file.

output_dir = "./mit_rirs"
# Download when the directory is missing *or* empty. Checking only for existence
# would permanently skip the download after a run that created the directory and
# then failed (e.g. a network error) before writing any clip.
if not os.path.isdir(output_dir) or not os.listdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    rir_dataset = datasets.load_dataset("davidscripka/MIT_environmental_impulse_responses", split="train", streaming=True)
    # Save clips to 16-bit PCM wav files
    for row in tqdm(rir_dataset):
        name = os.path.basename(row['audio']['path'])
        # Clip to [-1, 1] before scaling so out-of-range samples saturate
        # instead of wrapping around when cast to int16.
        samples = (np.clip(row['audio']['array'], -1.0, 1.0) * 32767).astype(np.int16)
        # NOTE(review): written as 16 kHz without resampling -- assumes the
        # dataset is already 16 kHz; confirm.
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, samples)

## Download noise and background audio

# Audioset Dataset (https://research.google.com/audioset/dataset/index.html)
# Download one part of the audioset .tar files, extract, and convert to 16khz
# For full-scale training, it's recommended to download the entire dataset from
# https://huggingface.co/datasets/agkphysics/AudioSet, and
# even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)

if not os.path.exists("audioset"):
    os.mkdir("audioset")

    fname = "bal_train09.tar"
    out_dir = f"audioset/{fname}"
    link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/" + fname
    !wget -O {out_dir} {link}
    !cd audioset && tar -xvf bal_train09.tar

    output_dir = "./audioset_16k"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Save clips to 16-bit PCM wav files
    audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("audioset/audio").glob("**/*.flac")]})
    audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
    for row in tqdm(audioset_dataset):
        name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# Free Music Archive dataset
# https://github.com/mdeff/fma
# Streams the "small" FMA split, decodes at 16 kHz, and writes the first
# `n_hours` worth of clips into ./fma as 16-bit PCM wav files.

output_dir = "./fma"
# Download when the directory is missing *or* empty, so a run that failed right
# after creating the directory is retried instead of silently skipped.
if not os.path.isdir(output_dir) or not os.listdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    fma_dataset = datasets.load_dataset("rudraml/fma", name="small", split="train", streaming=True)
    fma_dataset = fma_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))

    # Save clips to 16-bit PCM wav files
    n_hours = 1  # use only 1 hour of clips for this example notebook, recommend increasing for full-scale training
    n_clips = n_hours*3600//30  # this works because the FMA dataset is all 30 second clips
    # zip() ends at whichever iterable runs out first: after `n_clips` rows, or
    # earlier if the stream is shorter -- without raising StopIteration. The
    # range comes first so no extra row is fetched once the limit is reached.
    for clip_index, row in zip(tqdm(range(n_clips)), fma_dataset):
        name = Path(row['audio']['path']).with_suffix(".wav").name
        # Clip to [-1, 1] before scaling so out-of-range samples (e.g. overshoot
        # after resampling) saturate instead of wrapping around in int16.
        samples = (np.clip(row['audio']['array'], -1.0, 1.0) * 32767).astype(np.int16)
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, samples)

In [None]:
# Downloads already generated spectrogram features (made for microWakeWord in particular) for various negative datasets 
output_dir = './negative_datasets'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    link_root = "https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main/"
    filenames = ['dinner_party_background.zip', 'no_speech_background.zip', 'speech_backround.zip']
    for fname in filenames:
        link = link_root + fname
        
        out_dir = f"negative_datasets/{fname}"
        !wget -O {out_dir} {link}
        !cd {output_dir} && unzip -q {fname}

In [None]:
# Loads mp3 clips in generated_samples/positive/validation and prepares to augment them

from microwakeword.audio.augmentation import Augmentation
from microwakeword.audio.clips import Clips
from microwakeword.audio.spectrograms import SpectrogramGeneration

# Source clips: every *.mp3 under the positive validation samples directory.
clips = Clips(
    input_directory='generated_samples/positive/validation',
    file_pattern='*.mp3',
    max_clip_duration_s=None,
    remove_silence=True,  # HA Cloud TTS samples have extra silence at end, so trim it off first.
)

# Values handed to Augmentation as `augmentation_probabilities`, keyed by
# augmentation name.
augmentation_probabilities = {
    "SevenBandParametricEQ": 0.25,
    "TanhDistortion": 0.25,
    "PitchShift": 0.25,
    "BandStopFilter": 0.25,
    "AddColorNoise": 0.25,
    "AddBackgroundNoise": 0.75,
    "Gain": 1.0,
    "RIR": 0.5,
}

# NOTE(review): the download cell writes 16 kHz wavs to ./audioset_16k, while
# background_paths points at 'audioset' -- confirm which directory is intended.
augmenter = Augmentation(
    augmentation_duration_s=3.9,
    augmentation_probabilities=augmentation_probabilities,
    impulse_paths=['mit_rirs'],
    background_paths=['fma', 'audioset'],
    background_min_snr_db=-10,
    background_max_snr_db=0,
    min_jitter_s=0.1,
    max_jitter_s=0.2,
)

# slide_frames=5: uses the same spectrogram 5 times, just shifted over by one
# frame. This simulates the streaming inferences while training/validating in
# nonstreaming mode.
spectrograms = SpectrogramGeneration(
    clips=clips,
    augmenter=augmenter,
    slide_frames=5,
)

In [None]:
# Augment a random clip and play it back to verify it works well
import IPython
from microwakeword.audio.audio_utils import save_clip

# Draw one clip, augment it, and write the result to disk so it can be
# auditioned in the player rendered below.
random_clip = clips.get_random_clip()
augmented_clip = augmenter.augment_clip(random_clip)
save_clip(augmented_clip, 'augmented_clip.wav')

# Build the inline audio player for the saved file, then display it.
playback_widget = IPython.display.Audio("augmented_clip.wav")
IPython.display.display(playback_widget)

In [None]:
# Augment samples and save them for a validation set

import os
from mmap_ninja.ragged import RaggedMmap

# Make sure generated_sets/validation (and its parent) exists before the
# memory-mapped output is written underneath it.
os.makedirs('generated_sets/validation', exist_ok=True)

# Write the spectrograms yielded by spectrogram_generator(repeat=10) to a
# RaggedMmap on disk, in batches of 100.
validation_generator = spectrograms.spectrogram_generator(repeat=10)
RaggedMmap.from_generator(
    out_dir='generated_sets/validation/wakeword_mmap',
    sample_generator=validation_generator,
    batch_size=100,
    verbose=True,
)

In [None]:
# Generate augmented spectrograms for the testing set
# Don't use slide_frames for testing set
# You really should use a completely new set of samples from new voices rather than reuse the validation set...

clips = Clips(input_directory='generated_samples/positive/validation', 
              file_pattern='*.mp3', 
              max_clip_duration_s=None,
              remove_silence=True, # HA Cloud TTS samples have extra silence at end, so trim it off first.
              )
# NOTE(review): the download cell writes 16 kHz wavs to ./audioset_16k, while
# background_paths points at 'audioset' -- confirm which directory is intended.
augmenter = Augmentation(augmentation_duration_s=3.9,
                         augmentation_probabilities = {
                                "SevenBandParametricEQ": 0.25,
                                "TanhDistortion": 0.25,
                                "PitchShift": 0.25,
                                "BandStopFilter": 0.25,
                                "AddColorNoise": 0.25,
                                "AddBackgroundNoise": 0.75,
                                "Gain": 1.0,
                                "RIR": 0.5,
                            },
                         impulse_paths = ['mit_rirs'],
                         background_paths = ['fma', 'audioset'],
                         background_min_snr_db = -10,
                         background_max_snr_db = 0,
                         min_jitter_s = 0.1,
                         max_jitter_s = 0.2,
                         )
# No slide_frames here: it is deliberately left unset for the testing set.
spectrograms = SpectrogramGeneration(clips=clips,
                                     augmenter=augmenter,
                                     )

# Create the parent directory first, as the validation cell does for its own
# output; nothing else in the notebook creates generated_sets/testing.
os.makedirs('generated_sets/testing', exist_ok=True)

RaggedMmap.from_generator(
    out_dir='generated_sets/testing/wakeword_mmap',
    sample_generator=spectrograms.spectrogram_generator(repeat=10),
    batch_size=100,
    verbose=True,
)