# Data Splitting

- discard all files (and leftover ends of files) shorter than 2s
- create samples with duration = 5s
    - pad samples shorter than 5s with zeros at the end
- one-hot encode labels
- shuffle
- create .npy files with data, labels

In [11]:
import os
import numpy as np
import librosa

In [157]:
# Folder that holds the raw audio; get_paths() looks for one
# sub-directory per language below it.
relative_path = '../data/raw'

# Total number of fixed-length samples to create (all languages together).
num_samples = 30000

# Intended minimum length, in seconds, of a piece of audio that is kept.
min_duration = 2

# Length of one sample in audio frames: 5 seconds at 16000 frames/second.
sample_rate = 16000
sample_seconds = 5
sample_duration = sample_seconds * sample_rate

In [158]:
def get_paths(lang):
    """Return the paths of all entries in the raw-data folder of a language.

    Parameters
    ----------
    lang : str
        Name of the language sub-directory below ``relative_path``.

    Returns
    -------
    list of str
        One path per directory entry, each prefixed with the language
        directory, in sorted order.

    Raises
    ------
    OSError
        If the language directory cannot be listed (e.g. does not exist).
    """
    relative_lang = os.path.join(relative_path, lang)
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # choice of files reproducible when only part of them is used.
    return [os.path.join(relative_lang, name)
            for name in sorted(os.listdir(relative_lang))]

In [159]:
# One row per sample, pre-filled with zeros so that short pieces of audio
# are automatically zero-padded at the end.
# librosa.load returns float32 audio by default, so float32 storage loses
# no precision and needs half the memory of NumPy's default float64
# (num_samples * sample_duration values).
data = np.zeros((num_samples, sample_duration), dtype=np.float32)

# One-hot language label per sample: one column per language.
labels = np.zeros((num_samples, 3))

In [160]:
# Cut the recordings of every language into fixed-length samples and write
# them, with their one-hot label, into `data` / `labels`. Every language
# contributes the same number of samples.
languages = ['french', 'english', 'german']

# Integer share per language. Floor division keeps the total within
# num_samples, so sample_index can never run past the last row of `data`
# even when num_samples is not divisible by the number of languages.
max_per_language = num_samples // len(languages)

sample_index = 0  # next free row in data / labels

# for every language
for i, lang in enumerate(languages):
    samples_per_language = 0

    # for all files
    for file in get_paths(lang):
        # balance achieved: stop before loading another file
        if samples_per_language >= max_per_language:
            break

        # load file with librosa; sr=None keeps the file's own sample rate
        # NOTE(review): sample_duration is a frame count based on 16 kHz, so
        # the pieces are only 5 s long if the files are 16 kHz - confirm.
        signal, sr = librosa.load(file, sr=None)

        # number of complete, non-overlapping pieces in this file ...
        num_signal_samples = len(signal) // sample_duration
        # ... plus the remainder when it lasts at least min_duration seconds
        if (len(signal) % sample_duration) / sr >= min_duration:
            num_signal_samples += 1

        # take as many pieces as the file offers, but not more than this
        # language still needs
        still_needed = max_per_language - samples_per_language
        for k in range(min(num_signal_samples, still_needed)):
            # a remainder piece is shorter than the row; the rest of the
            # row keeps its initial zeros
            part = signal[k * sample_duration:(k + 1) * sample_duration]
            data[sample_index, :len(part)] = part
            labels[sample_index, i] = 1.

            sample_index += 1  # global sample counter
            samples_per_language += 1  # counter per language

    # Too little audio leaves all-zero rows with all-zero labels in the
    # arrays; report it instead of failing silently.
    if samples_per_language < max_per_language:
        print('warning: only %d of %d samples found for %s'
              % (samples_per_language, max_per_language, lang))

### Shuffle

[Stackoverflow](https://stackoverflow.com/questions/43229034/randomly-shuffle-data-and-labels-from-different-files-in-the-same-order)

In [161]:
# Draw one random ordering of the row indices and apply it to both arrays,
# so that every sample keeps its own label.
shuffled_order = np.random.permutation(num_samples)
data = data[shuffled_order]
labels = labels[shuffled_order]

### Check

In [162]:
import IPython.display as ipd

In [163]:
# Spot check: play row 3 of the shuffled data with a playback rate of 16000.
ipd.Audio(data[3], rate=16000) # sounded German in the recorded run

In [164]:
# Label of the same row; the columns follow the order french, english, german.
labels[3] # is german ;)

array([0., 0., 1.])

### Save to .npy

In [165]:
# Write both arrays to ../data; the number of samples is part of the file
# name (np.save adds the .npy extension).
count_tag = str(num_samples)
np.save('../data/data_' + count_tag, data)
np.save('../data/labels_' + count_tag, labels)