In [1]:
import numpy as np
import os
import random
import tqdm
import math

from scripts.audio_file import AudioFile



In [2]:
# Positive class: paths of every recording found in the "marvin" directory.
marvin_path = '../data/raw/SpeechDataset/marvin/'
marvin_filenames = [marvin_path + p for p in os.listdir(marvin_path)]

# Entries of the dataset root that must not be treated as negative words:
# the positive word itself, the background noise folder and metadata files.
exclude = ['_background_noise_', 'marvin', 'LICENSE', 'README.md',
           'testing_list.txt', 'validation_list.txt']
speechdata_path = '../data/raw/SpeechDataset/'

# Every remaining entry of the dataset root is taken to be a negative word.
negative_words = []
for entry in os.listdir(speechdata_path):
    if entry not in exclude:
        negative_words.append(entry)

# Negative class: paths of every recording inside each negative-word directory.
nw_filenames = [
    speechdata_path + word + '/' + clip
    for word in negative_words
    for clip in os.listdir(speechdata_path + word)
]

In [3]:
# Size the dataset: each positive clip is augmented once, and the positives
# (original + augmented) should make up 10% of all clips.
n_positive = len(marvin_filenames)
n_positive_with_aug = n_positive*2
n_negative_needed = int(n_positive_with_aug / 0.1 - n_positive_with_aug)

print('Number of marvin recordings:', n_positive)
print('Number of recordings if we augment each sample once:',
      n_positive_with_aug)
print('Desired ratio is 1:9 between positive and negative words, '
      '\ntherefore number of negative words must be:',
      n_negative_needed)

Number of marvin recordings: 2100
Number of recordings if we augment each sample once: 4200
Desired ratio is 1:9 between positive and negative words, 
therefore number of negative words must be: 37800


### Augmenting positive audio clips

In [6]:
# Write one pitch-shifted copy of every positive clip into augmented_path,
# keeping the base name of the original file.
augmented_path = 'data/augmented/'

# permissible factor values [-5, 5), in steps of 0.1
pitch_factors = np.arange(-5.0, 5.0, 0.1)

print('Augmenting positive words...')
for source_path in tqdm.tqdm(marvin_filenames):
    clip = AudioFile(source_path)
    # one factor is drawn at random per clip
    clip.shift_pitch(factor=np.random.choice(pitch_factors))
    clip.save_wav(augmented_path + source_path.split('/')[-1])

# get filenames of augmented clips (everything present in augmented_path)
marvin_filenames_aug = []
for name in os.listdir(augmented_path):
    marvin_filenames_aug.append(augmented_path + name)

Augmenting positive words...


100%|██████████| 2100/2100 [01:48<00:00, 19.41it/s]


### Stratified split

In [17]:
# Shuffle the clips and cut them into train (80%) / val (10%) / test (10%).
#
# The augmented copy of a clip is always placed in the same split as its
# original. Shuffling the original and augmented lists independently and then
# slicing both at the same indices would let a pitch-shifted copy of a
# training clip end up in val/test, which leaks data between the splits.

SPLIT_SEED = 42              # fixed seed so the split is reproducible
NEGATIVES_PER_POSITIVE = 9   # desired 1:9 positive:negative ratio
split_rng = random.Random(SPLIT_SEED)

# sort first so the shuffled order depends neither on os.listdir order nor on
# how many times this cell has already been run
marvin_filenames.sort()
nw_filenames.sort()
split_rng.shuffle(marvin_filenames)
split_rng.shuffle(nw_filenames)

# align the augmented clips with the shuffled originals; an augmented clip is
# saved under the base name of its original, so the base name is the join key.
# A KeyError here means an original has no augmented copy on disk.
aug_by_name = {path.split('/')[-1]: path for path in marvin_filenames_aug}
marvin_filenames_aug = [aug_by_name[path.split('/')[-1]]
                        for path in marvin_filenames]

# select fewer negative words: positives are the originals plus one augmented
# copy each, and every positive is matched by NEGATIVES_PER_POSITIVE negatives
n_negatives = len(marvin_filenames) * 2 * NEGATIVES_PER_POSITIVE
nw_filenames_trim = nw_filenames[:n_negatives]

# split boundaries (index of the first val clip / first test clip)
marvin_train_line = math.ceil(len(marvin_filenames)*0.8)
nw_train_line = math.ceil(len(nw_filenames_trim)*0.8)

marvin_val_line = math.ceil(len(marvin_filenames)*0.9)
nw_val_line = math.ceil(len(nw_filenames_trim)*0.9)

marvin_filenames_train = marvin_filenames[:marvin_train_line]
marvin_filenames_aug_train = marvin_filenames_aug[:marvin_train_line]
nw_filenames_train = nw_filenames_trim[:nw_train_line]

marvin_filenames_val = marvin_filenames[marvin_train_line:marvin_val_line]
marvin_filenames_aug_val = marvin_filenames_aug[marvin_train_line:marvin_val_line]
nw_filenames_val = nw_filenames_trim[nw_train_line:nw_val_line]

marvin_filenames_test = marvin_filenames[marvin_val_line:]
marvin_filenames_aug_test = marvin_filenames_aug[marvin_val_line:]
nw_filenames_test = nw_filenames_trim[nw_val_line:]

# finally add original and augmented positive clips together
marvin_train = marvin_filenames_train + marvin_filenames_aug_train
marvin_val = marvin_filenames_val + marvin_filenames_aug_val
marvin_test = marvin_filenames_test + marvin_filenames_aug_test

# mix originals and augmented copies inside each split
split_rng.shuffle(marvin_train)
split_rng.shuffle(marvin_val)
split_rng.shuffle(marvin_test)

### Create data files

In [34]:
# find the maximum length of a positive file, measured as the size of the
# first dimension of the clip's data array
max_length = 0  # initialised once, before the loop, so it tracks the running maximum
for path in tqdm.tqdm(marvin_train + marvin_val + marvin_test):
    sample = AudioFile(path)
    max_length = max(max_length, sample.data.shape[0])

print(max_length)

100%|██████████| 4200/4200 [00:36<00:00, 113.62it/s]

22050





In [54]:
def create_batches(files, set_name='train', batch_size=100):
    """Load audio clips and save them to disk as fixed-size numpy batches.

    Clips are grouped in the order given into batches of ``batch_size``. For
    batch number ``k`` (starting at 0) two files are written:
    ``data/{set_name}/X{k}.npy`` with one zero-padded clip per row and
    ``data/{set_name}/y{k}.npy`` with the matching labels as a column. The
    last batch only contains as many rows as there are remaining clips.

    Relies on the notebook-level ``max_length`` for the row width of X.

    Args:
        files: iterable of clip paths. A clip is labelled 1 when its path
            contains '/marvin/' or '/augmented/', otherwise 0.
        set_name: name of the sub-directory of ``data/`` to write to.
        batch_size: number of clips per saved batch.

    Returns:
        None. Nothing is written when ``files`` is empty.
    """
    files = list(files)
    if not files:
        # nothing to batch; avoids saving from an uninitialised batch
        return

    out_dir = f'data/{set_name}'
    os.makedirs(out_dir, exist_ok=True)
    last_index = len(files) - 1

    # wrap the list (not the enumerate object) so tqdm knows the total
    for i, file in enumerate(tqdm.tqdm(files)):
        row = i % batch_size

        # initiate empty batch
        if row == 0:
            X_batch = np.zeros((batch_size, max_length))
            y_batch = np.zeros((batch_size, 1))

        # add 1 if positive word
        if '/marvin/' in file or '/augmented/' in file:
            y_batch[row, :] = 1

        # add audio data to the batch; shorter clips stay zero-padded on the
        # right and longer clips are cut to max_length so they fit the row
        # (assumes clip.data is one-dimensional -- TODO confirm)
        clip_data = AudioFile(file).data
        n_samples = min(clip_data.shape[0], max_length)
        X_batch[row, :n_samples] = clip_data[:n_samples]

        # save as soon as the batch is full, or when the files run out.
        # i // batch_size is the batch number in both cases, so a final batch
        # holding a single clip cannot overwrite the batch before it.
        if row == batch_size - 1 or i == last_index:
            batch_number = i // batch_size
            np.save(f'{out_dir}/X{batch_number}.npy', X_batch[:row + 1])
            np.save(f'{out_dir}/y{batch_number}.npy', y_batch[:row + 1])

In [55]:
# Build each split from its positive and negative clips, shuffle it so the
# two classes are mixed, then write it to disk in batches of 100 samples.
train_files = marvin_train + nw_filenames_train
val_files = marvin_val + nw_filenames_val
test_files = marvin_test + nw_filenames_test

for split_files in (train_files, val_files, test_files):
    random.shuffle(split_files)

generation_plan = (
    ('Generating train data...', train_files, 'train'),
    ('Generating val data...', val_files, 'val'),
    ('Generating test data...', test_files, 'test'),
)
for message, split_files, split_name in generation_plan:
    print(message)
    create_batches(split_files, set_name=split_name, batch_size=100)

Generating train data...


33600it [09:26, 59.29it/s]


Generating val data...


4200it [01:10, 59.85it/s]


Generating test data...


4200it [01:10, 59.63it/s]
