# Data Splitting

- create samples with duration = 5s
- zero-pad shorter clips to the full 5s (at most 1.5s of padding, i.e. clips must be at least 3.5s long)
- one-hot encode labels
- create .hdf5 files with data, labels

Total: 100'000 samples

The data set is organised as follows; within each split the three languages (French, English, German) are evenly distributed:
- Voxforge (50'000)
    - Training (40'000)
    - Validation (5'000)
    - Test (5'000)
- Youtube (50'000)
    - Training (40'000)
    - Validation (5'000)
    - Test (5'000)

Help: https://www.uetke.com/blog/python/how-to-use-hdf5-files-in-python/

In [9]:
import os
import numpy as np
import h5py
import librosa

from my_data_methods import *

In [2]:
# Fixed length of every stored sample, in audio frames: 5 seconds at 16 kHz.
sample_duration = 16000 * 5
# Minimum clip length handed to get_split (presumably in seconds; shorter
# clips are presumably rejected and the rest zero padded -- confirm in my_data_methods).
min_duration = 3.5

## Voxforge

In [3]:
# Root folder of the raw Voxforge recordings; the language name is appended to it.
relative_path = 'raw/voxforge/'
# Number of Voxforge samples in the training / validation / test split.
num_training, num_validation, num_test = 40000, 5000, 5000

In [37]:
# Build the Voxforge part of the HDF5 file.
# NOTE: mode 'w' creates the file from scratch (truncating an existing one),
# so re-running this cell also discards any YouTube data written afterwards.
with h5py.File('split/data_100.hdf5', 'w') as f:

    # Create Dataset: one row per sample, labels are vectors of length 3
    # (one entry per language).
    dset_train_data = f.create_dataset("voxforge/train/data", (num_training, sample_duration), compression="gzip")
    dset_train_labels = f.create_dataset("voxforge/train/labels", (num_training, 3), compression="gzip")
    dset_val_data = f.create_dataset("voxforge/val/data", (num_validation, sample_duration), compression="gzip")
    dset_val_labels = f.create_dataset("voxforge/val/labels", (num_validation, 3), compression="gzip")
    dset_test_data = f.create_dataset("voxforge/test/data", (num_test, sample_duration), compression="gzip")
    dset_test_labels = f.create_dataset("voxforge/test/labels", (num_test, 3), compression="gzip")

    # returns array such that sum(arr) = tot and all elem ~ tot/3
    train_parts = compute_partitions(num_training, 3)
    val_parts = compute_partitions(num_validation, 3)
    test_parts = compute_partitions(num_test, 3)

    # Running write offsets. They advance by the number of rows actually
    # written, so the per-language blocks stay contiguous even though the
    # partitions cannot all be equal (40000 and 5000 are not divisible by 3).
    # Deriving the offset from i * (total // 3) instead would make blocks
    # overlap or leave trailing rows unwritten.
    tsi, vsi, tesi = 0, 0, 0

    for i, lang in enumerate(['french', 'english', 'german']):
        n_train, n_val, n_test = int(train_parts[i]), int(val_parts[i]), int(test_parts[i])

        print(f"{lang}, sampling...") # progress
        (train_data, train_labels,
         val_data, val_labels,
         test_data, test_labels) = get_split(relative_path+lang, i, n_train, n_val, n_test, min_duration=min_duration)

        # Write Voxforge
        print(f"{lang}, writing...") # progress
        dset_train_data[tsi: tsi+n_train] = train_data[:]
        dset_train_labels[tsi: tsi+n_train] = train_labels[:]

        dset_val_data[vsi: vsi+n_val] = val_data[:]
        dset_val_labels[vsi: vsi+n_val] = val_labels[:]

        dset_test_data[tesi: tesi+n_test] = test_data[:]
        dset_test_labels[tesi: tesi+n_test] = test_labels[:]

        tsi += n_train
        vsi += n_val
        tesi += n_test

    # Every row of every dataset must have been written exactly once.
    assert (tsi, vsi, tesi) == (num_training, num_validation, num_test), \
        f"unexpected number of rows written: {(tsi, vsi, tesi)}"
print("done")

french, sampling...
pointers...
samples...
function call
got paths
function call
got paths
function call
got paths
french, writing...
english, sampling...
pointers...
samples...
function call
got paths
function call
got paths
function call
got paths
english, writing...
german, sampling...
pointers...
samples...
function call
got paths
function call
got paths
function call
got paths
german, writing...
done


## Youtube


In [7]:
# Partitioning
rp = 'raw/youtube/'
# Maps each channel folder to (num_train, num_val, num_test, lang_index),
# where lang_index is 0 for fr, 1 for en and 2 for de.
distribution = {
    rp+'fr/bfmtv': (4444, 555, 555, 0),
    rp+'fr/franceinfo': (4444, 555, 555, 0),
    rp+'fr/france24': (4445, 556, 556, 0),
    rp+'en/cnn': (6666, 833, 833, 1),
    rp+'en/bbc': (6667, 834, 834, 1),
    rp+'de/ard': (5167, 556, 556, 2),
    rp+'de/dw': (5167, 556, 556, 2),
    rp+'de/zdf': (3000, 555, 555, 2),
}

# check that distribution is ok: the per-channel counts must add up to the
# intended split sizes. Generator expressions are used on purpose so that no
# loop variable leaks into the notebook namespace (a plain for-loop unpacking
# into `num_test` would overwrite the split size defined earlier).
train_sum = sum(counts[0] for counts in distribution.values())
val_sum = sum(counts[1] for counts in distribution.values())
test_sum = sum(counts[2] for counts in distribution.values())
print(f"Train_sum: {train_sum}")
print(f"Val_sum: {val_sum}")
print(f"Test_sum: {test_sum}")

Train_sum: 40000
Val_sum: 5000


In [8]:
# Append the YouTube samples to the HDF5 file, channel by channel.

# Next free row in the train / validation / test datasets.
twi, vwi, tewi = 0, 0, 0

# Dataset sizes are derived from `distribution` itself, so they always match
# the number of rows written below and do not depend on other notebook state.
yt_train_total = sum(counts[0] for counts in distribution.values())
yt_val_total = sum(counts[1] for counts in distribution.values())
yt_test_total = sum(counts[2] for counts in distribution.values())

with h5py.File('split/data_100.hdf5', 'r+') as f:

    # Open the datasets, creating them first if they do not exist yet (which
    # is the case after the file has been rebuilt with mode 'w'). dtype 'f4'
    # is what create_dataset uses by default; require_dataset raises a
    # TypeError if an existing dataset has an incompatible shape or dtype.
    dset_train_data = f.require_dataset("youtube/train/data", shape=(yt_train_total, sample_duration), dtype="f4", compression="gzip")
    dset_train_labels = f.require_dataset("youtube/train/labels", shape=(yt_train_total, 3), dtype="f4", compression="gzip")
    dset_val_data = f.require_dataset("youtube/val/data", shape=(yt_val_total, sample_duration), dtype="f4", compression="gzip")
    dset_val_labels = f.require_dataset("youtube/val/labels", shape=(yt_val_total, 3), dtype="f4", compression="gzip")
    dset_test_data = f.require_dataset("youtube/test/data", shape=(yt_test_total, sample_duration), dtype="f4", compression="gzip")
    dset_test_labels = f.require_dataset("youtube/test/labels", shape=(yt_test_total, 3), dtype="f4", compression="gzip")

    # Loop variables are deliberately not called num_train / num_val / num_test
    # so that the notebook-level split sizes are not overwritten.
    for path, (n_train, n_val, n_test, lang_index) in distribution.items():

        print(f"{path}, sampling...") # progress
        (train_data, train_labels,
         val_data, val_labels,
         test_data, test_labels) = get_split(path, lang_index, n_train, n_val, n_test, min_duration=min_duration, margin=5)

        print(f"{path}, writing...") # progress
        dset_train_data[twi: twi+n_train] = train_data[:]
        dset_train_labels[twi: twi+n_train] = train_labels[:]
        dset_val_data[vwi: vwi+n_val] = val_data[:]
        dset_val_labels[vwi: vwi+n_val] = val_labels[:]
        dset_test_data[tewi: tewi+n_test] = test_data[:]
        dset_test_labels[tewi: tewi+n_test] = test_labels[:]

        # Advance the write positions past the rows just written.
        twi += n_train
        vwi += n_val
        tewi += n_test

raw/youtube/fr/bfmtv, sampling...
pointers...
7646 3.5 5
samples...
function call
got paths
function call
got paths
function call
got paths
raw/youtube/fr/bfmtv, writing...
raw/youtube/fr/franceinfo, sampling...
pointers...
12652 3.5 5
samples...
function call
got paths
function call
got paths
function call
got paths
raw/youtube/fr/franceinfo, writing...
raw/youtube/fr/france24, sampling...
pointers...
6349 3.5 5
samples...
function call
got paths
function call
got paths
function call
got paths
raw/youtube/fr/france24, writing...
raw/youtube/en/cnn, sampling...
pointers...
11492 3.5 5
samples...
function call
got paths
function call
got paths
function call
got paths
raw/youtube/en/cnn, writing...
raw/youtube/en/bbc, sampling...
pointers...
11337 3.5 5
samples...
function call
got paths
function call
got paths
function call
got paths
raw/youtube/en/bbc, writing...
raw/youtube/de/ard, sampling...
pointers...
7877 3.5 5
samples...
function call
got paths
function call
got paths
function c

### Check

In [10]:
import IPython.display as ipd

In [28]:
# Read a small window (rows 2000-2009) of the YouTube training split back
# from disk so that it can be inspected in the following cells.
with h5py.File('split/data_100.hdf5', mode='r') as f:
    youtube_train = f['youtube/train']
    listen_data_set = youtube_train['data']
    labels_dset = youtube_train['labels']
    # Slicing copies the rows into memory, so they stay usable after the file is closed.
    listen_data = listen_data_set[2000:2010]
    listen_data_labels = labels_dset[2000:2010]
    print(labels_dset.shape)

(40000, 3)


In [34]:
# Play the 8th clip of the inspected window at a 16 kHz sampling rate.
# Original listening note: "sounds german" -- NOTE(review): compare this with
# the labels stored for the same window before trusting either.
ipd.Audio(data=listen_data[7], rate=16000)

In [30]:
# Per-class label count over the inspected window. With this notebook's
# encoding (0 = fr, 1 = en, 2 = de) a count in the first position means French.
listen_data_labels.sum(axis=0)

array([10.,  0.,  0.], dtype=float32)