In [1]:
import os
import pyaudio
import numpy as np
import wave
from datetime import datetime
import librosa
from scipy.io import wavfile

C:\Users\dk100\Anaconda3\envs\Wake_Word2\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\dk100\Anaconda3\envs\Wake_Word2\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


To generate more data for training the neural network, data augmentation is applied. Four different augmentations are applied to the data: noise, shift, pitch and speed.

## Noise

In [2]:
def manipulate_noise(data, noise_factor):
    """
    Adds Gaussian noise to the audio signal.

    Args:
        data:           Audio signal which should be augmented
        noise_factor:   Scale (standard deviation) of the added noise, in the
                        same units as the samples; 0 leaves the signal unchanged

    Returns:
        Noisy copy of the signal with the same shape and dtype as ``data``.
    """
    samples = np.asarray(data)
    # One noise value per sample, whatever the shape of the input is
    noise = np.random.randn(*samples.shape)
    augmented_data = samples + noise_factor * noise
    # Integer formats wrap around when an out-of-range value is cast back,
    # which turns loud peaks into clicks; saturate at the dtype limits instead
    if np.issubdtype(samples.dtype, np.integer):
        limits = np.iinfo(samples.dtype)
        augmented_data = np.clip(augmented_data, limits.min, limits.max)
    # Cast back to same data type
    return augmented_data.astype(samples.dtype)

## Shift

In [3]:
def manipulate_shift(data, sampling_rate, shift_max=1, shift_direction='both'):
    """
    Randomly shift audiodata to the left or right and silence the gap.

    Args:
        data:               Audio signal which should be augmented
        sampling_rate:      Sampling rate
        shift_max:          Upper bound of the shift in seconds; the offset is
                            drawn from [0, sampling_rate * shift_max) samples
        shift_direction:    'right' negates the offset, 'both' picks the sign
                            at random, any other value keeps it positive

    Returns:
        Shifted copy of the signal; the samples that would wrap around are
        replaced by zeros.

    NOTE(review): a positive offset is handed to np.roll, which moves samples
    toward higher indices. 'right' therefore moves the audio toward the start
    of the array - confirm that this naming is intended.
    """
    max_offset = int(sampling_rate * shift_max)
    if max_offset <= 0:
        # No room to shift (randint would raise on an empty range)
        return np.array(data, copy=True)

    shift = np.random.randint(max_offset)
    if shift_direction == 'right':
        shift = -shift
    elif shift_direction == 'both':
        direction = np.random.randint(0, 2)
        if direction == 1:
            shift = -shift

    augmented_data = np.roll(data, shift)
    # Set to silence for heading/ tailing
    if shift > 0:
        augmented_data[:shift] = 0
    elif shift < 0:
        augmented_data[shift:] = 0
    # shift == 0 must be left alone: augmented_data[0:] = 0 would wipe the
    # whole signal
    return augmented_data

## Pitch

In [4]:
def manipulate_pitch(data, sampling_rate, pitch_factor):
    """
    Shifts the pitch of the audio signal by a fixed number of steps.

    Args:
        data:           Audio signal which should be augmented
                        (augment_data passes a float copy of the samples)
        sampling_rate:  Sampling rate
        pitch_factor:   Number of steps to shift the pitch; with 32 bins per
                        octave one step is 1/32 of an octave

    Returns:
        Pitch-shifted signal converted to int16 samples.
    """
    # sr / n_steps are passed by keyword: newer librosa releases no longer
    # accept them positionally, while older releases accept both forms
    shifted = librosa.effects.pitch_shift(
        data, sr=sampling_rate, n_steps=pitch_factor, bins_per_octave=32
    )
    # Saturate instead of wrapping around if the result overshoots int16
    limits = np.iinfo(np.int16)
    return np.clip(shifted, limits.min, limits.max).astype('int16')

## Speed

In [5]:
def manipulate_speed(data, speed_factor, target_length=16000):
    """
    Time-stretches the audio signal and fits the result to a fixed length.

    Args:
        data:           Audio signal which should be augmented
                        (augment_data passes a float copy of the samples)
        speed_factor:   Factor to Speed up or slow down the audio signal
        target_length:  Number of samples of the returned signal
                        (default 16000)

    Returns:
        int16 signal of exactly ``target_length`` samples: zero-padded at the
        end when the stretched audio is shorter, truncated when it is longer.
    """
    # rate is passed by keyword: newer librosa releases no longer accept it
    # positionally, while older releases accept both forms
    stretched = librosa.effects.time_stretch(data, rate=speed_factor)
    # Saturate instead of wrapping around if the result overshoots int16
    limits = np.iinfo(np.int16)
    augmented_data_speed = np.clip(stretched, limits.min, limits.max).astype('int16')
    # A slowed-down (or simply long) recording may exceed the target; cut it
    # so the padding size below can never become negative
    augmented_data_speed = augmented_data_speed[:target_length]
    zero_padding = np.zeros(target_length - augmented_data_speed.shape[0], dtype=np.int16)
    return np.concatenate([augmented_data_speed, zero_padding], 0)

In [6]:
def save_augmented_audio(save_path, label, augmented_data, sample_format = pyaudio.paInt16, channels = 1, fs = 16000):
    """
    Saves the augmented data as .wav file.

    The file is written to ``<save_path>/<label>/<timestamp>.wav``; missing
    folders are created.

    Args:
        save_path:      Path to store the augmented data
        label:          Label of the data
        augmented_data: Augmented timeseries (numpy array, or an iterable of
                        bytes-like chunks)
        sample_format:  Format in which the audio signal is stored 
        channels:       Number of channels of the audio signal
        fs:             Sampling rate
    """
    label_dir = os.path.join(save_path, label)
    # makedirs also creates save_path itself when it does not exist yet
    os.makedirs(label_dir, exist_ok=True)

    # Millisecond timestamps can repeat when files are written in quick
    # succession; append a counter rather than overwrite an earlier file
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")[:-3]
    file_path = os.path.join(label_dir, stamp + ".wav")
    duplicate = 0
    while os.path.exists(file_path):
        duplicate += 1
        file_path = os.path.join(label_dir, "{}_{}.wav".format(stamp, duplicate))

    if isinstance(augmented_data, np.ndarray):
        # Same bytes as joining the individual samples, without the
        # per-sample Python loop
        frames = augmented_data.tobytes()
    else:
        frames = b''.join(augmented_data)

    # Save the recorded data as a WAV file; the context manager closes the
    # file even if writing fails
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(channels)
        # The module-level helper gives the sample width without opening a
        # PortAudio session that would then have to be terminated
        wf.setsampwidth(pyaudio.get_sample_size(sample_format))
        wf.setframerate(fs)
        wf.writeframes(frames)

In [9]:
def augment_data(save_path, data_to_augment, fs = 16000):
    """
    Executes the data augmentation.

    Every .wav file found below the input folder is written out in 40
    variants (10 noise levels, 10 random shifts, 10 pitch steps, 10 speed
    factors) to ``<save_path>/<label>/``, where the label is the name of the
    folder that contains the recording.

    Args:
        save_path:          Path to store the augmented data
        data_to_augment:    Class (subfolder of data/recorded_data) whose data
                            should be augmented; if no such folder exists, all
                            subfolders are processed
        fs:                 Sampling rate used for the augmentations and for
                            the written files
    """
    if os.path.isdir('/'.join(["data", "recorded_data", data_to_augment])):
        data_path = "data/recorded_data/" + data_to_augment
        print("Audio from folder ", data_path," get augmented")
    else:
        data_path = "data/recorded_data"
        print("Audio from all subfolders of ", data_path," get augmented")

    # Works for nested output paths and for an already existing folder
    os.makedirs(save_path, exist_ok=True)

    for path, _subdirs, files in os.walk(data_path):
        label = os.path.basename(os.path.normpath(path))

        for name in files:
            # Skip anything that is not a recording (hidden files, notes, ...)
            if not name.lower().endswith(".wav"):
                continue

            print("Next data to augment: ",name)
            # NOTE(review): the sampling rate stored in the file is ignored
            # and fs is assumed - confirm all recordings use fs.
            _, data = wavfile.read(os.path.join(path, name))
            # The librosa based augmentations are fed floating point samples
            data_float = data.astype("float")

            # Noise factor 0 adds nothing, so the first variant is a plain copy
            for i in range(0,50,5):
                augmented_data_noise = manipulate_noise(data, i)
                save_augmented_audio(save_path, label, augmented_data_noise, fs=fs)

            for i in range(0,10):
                # Shift is random, will be repeated 10 times
                augmented_data_shift = manipulate_shift(data, fs, shift_max=0.1, shift_direction='both')
                save_augmented_audio(save_path, label, augmented_data_shift, fs=fs)

            for i in range(-4,6):
                augmented_data_pitch = manipulate_pitch(data_float, fs, i)
                save_augmented_audio(save_path, label, augmented_data_pitch, fs=fs)

            # Speed factors 1.0, 1.1, ..., 1.9
            for i in range(0,10):
                augmented_data_speed = manipulate_speed(data_float, 1.0+(i/10.0))
                save_augmented_audio(save_path, label, augmented_data_speed, fs=fs)

Call the `augment_data` function and apply the 4 different augmentations to your data.

In [10]:
# If data/recorded_data has no subfolder named "all", augment_data falls back
# to processing every subfolder of data/recorded_data.
augment_data("data/augmented_data", "all")

Audio from all subfolders of  data/recorded_data  get augmented
Next data to augment:  2021-09-13_15-08-00.wav
Next data to augment:  2021-09-13_15-08-01.wav
Next data to augment:  2021-09-13_15-08-02.wav
Next data to augment:  2021-09-13_15-08-03.wav
Next data to augment:  2021-09-13_15-08-04.wav
Next data to augment:  2021-09-13_15-08-05.wav
Next data to augment:  2021-09-13_15-08-06.wav
Next data to augment:  2021-09-13_15-08-07.wav


KeyboardInterrupt: 