# Preprocessing

**GOAL**: Create data that can be fed to the deep neural net.

### Properties

- *.npy* file for every sample
- length: 5s
- sample rate: 16kHz
- max padding: 1.5s
- Total: 100'000 samples

### Distribution

- Languages are evenly distributed (33.33% each)
- 50% Youtube, 50% Voxforge
- 80% training set, 10% validation set, 10% test set

In [1]:
# Libraries
import os
import numpy as np
import librosa

from helper_functions import *

In [10]:
# Parameters
# Length of one sample in audio frames: 5 seconds at a 16 kHz sample rate
sample_duration = 16000 * 5
# Shortest usable clip in seconds; clips between 3.5 s and 5 s are
# filled blank up to 5.0 s
min_duration = 3.5

In [52]:
# Partitioning
input_path = '../data/raw/'

# How many samples to save from each source.
# Every entry is (train, validate, test, margin); the dict keeps the order
# in which the sources are listed here.
distribution = {
    input_path + source: np.array(counts, dtype='int32')
    for source, counts in (
        ('youtube/fr/bfmtv',      (4444, 555, 555, 5)),
        ('youtube/fr/franceinfo', (4444, 555, 555, 5)),
        ('youtube/fr/france24',   (4445, 556, 556, 5)),
        ('youtube/en/cnn',        (6666, 833, 833, 5)),
        ('youtube/en/bbc',        (6667, 834, 834, 5)),
        ('youtube/de/ard',        (5167, 726, 726, 5)),
        ('youtube/de/dw',         (5167, 686, 686, 5)),
        ('youtube/de/zdf',        (3000, 255, 255, 5)),
        ('voxforge/en',           (13333, 1666, 1666, 0)),
        ('voxforge/de',           (13333, 1667, 1667, 0)),
        ('voxforge/fr',           (13334, 1667, 1667, 0)),
    )
}

# Sanity check: prints the per-column totals (train, validate, test, margin).
# train + validate + test should add up to 100000; the margin column is not
# a sample count.
print(sum(distribution.values()))

# Output directories: sub-directory (below output_path/<split>/) that each
# source is written to.
output_path = 'preprocessed_data/'
destination = {
    input_path + source: sub_dir
    for source, sub_dir in (
        ('youtube/fr/bfmtv',      'youtube/fr/'),
        ('youtube/fr/franceinfo', 'youtube/fr/'),
        ('youtube/fr/france24',   'youtube/fr/'),
        ('youtube/en/cnn',        'youtube/en/'),
        ('youtube/en/bbc',        'youtube/en/'),
        ('youtube/de/ard',        'youtube/de/'),
        ('youtube/de/dw',         'youtube/de/'),
        ('youtube/de/zdf',        'youtube/de/'),
        ('voxforge/en',           'voxforge/en'),
        ('voxforge/de',           'voxforge/de'),
        ('voxforge/fr',           'voxforge/fr'),
    )
}

[40000  5000  5000     0]


In [53]:
def create_samples(path, partition, output_sub_path, min_duration=3.5):
    """Pick random samples from one source and write train/val/test sets.

    The pointers returned by ``get_sample_pointers`` are cut into three
    contiguous pools whose sizes follow the requested train/val/test
    proportions. The requested number of samples is then drawn at random,
    without replacement, from each pool and handed to ``make_samples``.

    Parameters
    ----------
    path : str
        Location of the raw audio source; forwarded to
        ``get_sample_pointers`` and ``make_samples``.
    partition : sequence of int
        ``(num_train, num_val, num_test, margin)``. ``margin`` is forwarded
        to ``get_sample_pointers`` and is always cut from the beginning and
        end of the raw audio (e.g. because of YouTube intros).
    output_sub_path : str
        Sub-directory appended to ``<output_path>/<split>/`` for the output.
    min_duration : float, optional
        Forwarded to ``get_sample_pointers``.

    Raises
    ------
    ValueError
        If one of the pools holds fewer pointers than the number of samples
        requested for that split.
    """
    # Split of data, shape is [num_train, num_val, num_test]
    division = partition[:3]
    margin = partition[3]

    # Fractions at which the pointer list is cut: train|val and val|test
    total = sum(division)
    limit1 = division[0] / total
    limit2 = sum(division[:2]) / total

    # Choice of samples
    # Pointers are simply strings with the file path and the time to start
    # reading the file at
    print("pointers...")
    pointers = get_sample_pointers(path, min_duration=min_duration, margin=margin)
    pools = np.split(
        pointers,
        [int(len(pointers) * limit1), int(len(pointers) * limit2)],
    )

    # Draw the requested number of pointers from each pool, in the order
    # train, val, test. Nothing is written until every draw has succeeded.
    selections = []
    for split_name, pool, wanted in zip(('train', 'val', 'test'), pools, division):
        if wanted > len(pool):
            # np.random.choice would raise a generic ValueError here; name
            # the source and split so the distribution table can be fixed.
            raise ValueError(
                "{}: {} {} samples requested but only {} pointers available"
                .format(path, wanted, split_name, len(pool))
            )
        picked = np.random.choice(len(pool), wanted, replace=False)
        selections.append((split_name, pool[picked]))

    # Create Numpy arrays filled with samples
    print("samples...")
    for split_name, chosen in selections:
        make_samples(chosen, path, os.path.join(output_path, split_name, output_sub_path))

In [54]:
# Walk through every source listed in the distribution table
for path in distribution:
    array = distribution[path]
    # Show which output sub-directory this source is written to
    print(destination[path])
    # Select and write the train/val/test samples for this source
    create_samples(path, array, destination[path])

voxforge/en
pointers...
73985 3.5 0
samples...
function call
got paths
function call
got paths
function call
got paths
voxforge/de
pointers...
18424 3.5 0
samples...
function call
got paths
function call
got paths
function call
got paths
voxforge/fr
pointers...
23246 3.5 0
samples...
function call
got paths
function call
got paths
function call
got paths


## Listening

In [16]:
# Look up the preprocessed French YouTube training output.
# NOTE(review): get_paths comes from helper_functions; presumably it returns
# an indexable collection of file paths (files[0] is passed to np.load
# in the next cell) -- confirm.
files = get_paths('preprocessed_data/train/youtube/fr')

In [None]:
# Load the first preprocessed sample and hand it to an audio widget at
# a rate of 16000 (matches the 16 kHz sample rate listed under Properties).
# NOTE(review): `ipd` is not imported in any visible cell -- presumably
# IPython.display, possibly provided by the helper_functions star import;
# confirm, otherwise this cell raises NameError.
ipd.Audio(np.load(files[0]), rate=16000)