### Generator
- https://keras.io/models/sequential/
- https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html

In [130]:
from keras.datasets import reuters
from keras import models, layers
import numpy as np
import matplotlib.pyplot as plt
from kapre.time_frequency import Melspectrogram

In [160]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    
    def __init__(self, datagroup, sets, split, batch_size=32, dim=(1, 80000), n_classes=3, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.datagroup = datagroup
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.sets = sets    # voxforge/youtube
        self.split = split  # train/val/test
        
        self.get_id_list()
        self.on_epoch_end()
        
    def get_id_list(self):
        
        # first dimension
        if (len(self.sets) == 2):
            num_vox = datagroup['voxforge/'+split+'/labels'].len()
            num_you = datagroup['youtube/'+split+'/labels'].len()
            num_tot_samples = (num_you + num_vox)
            
            self.id_list = np.zeros((num_tot_samples, 2))
            self.id_list[:num_vox, 0] = np.arange(num_vox)
            self.id_list[num_vox:, 0] = np.arange(num_you)
            self.id_list[num_you:, 1] = 1.
            
            
        # train either voxforge xor youtube
        else:
            size = datagroup[self.sets[0]+'/'+self.split+'/labels'].len()
            self.id_list = np.concatenate((np.arange(size).reshape(size, 1), np.zeros((size, 1))), 1)
        

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.id_list) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.id_list[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, Y = self.__data_generation(indexes)

        return X, Y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        if self.shuffle == True:
            np.random.shuffle(self.id_list)

    def __data_generation(self, id_list_temp):
        'Generates data containing batch_size samples' 
        # Initialization
        X = np.empty((self.batch_size, *self.dim))
        Y = np.empty((self.batch_size, self.n_classes))

        # Generate data
        for i, (ID, s) in enumerate(id_list_temp):
            # Store sample
            X[i,0] = datagroup[self.sets[int(s)] + '/' + self.split + '/data'][ID]

            # Store class
            Y[i] = datagroup[self.sets[int(s)] + '/' + self.split + '/labels'][ID]
        print("got batch")
        return X, Y

### Training

In [161]:
params = {'batch_size': 64,
          'shuffle': True}

with h5py.File('../preprocessing/datasets/data_100.hdf5', 'r') as datagroup:

    # Generators
    training_generator = DataGenerator(datagroup, ['voxforge', 'youtube'], 'train', **params)
    validation_generator = DataGenerator(datagroup, ['youtube'], 'val', **params)

In [162]:
model = models.Sequential()
model.add(Melspectrogram(n_dft=512, input_shape=(1, 5 * 16000,),
                         padding='same', sr=16000, n_mels=28,
                         fmin=0.0, fmax=10000, power_melgram=1.0,
                         return_decibel_melgram=False, trainable_fb=False,
                         trainable_kernel=False))
model.add(layers.Flatten())
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

In [163]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melspectrogram_10 (Melspectr (None, 28, 313, 1)        270364    
_________________________________________________________________
flatten_10 (Flatten)         (None, 8764)              0         
_________________________________________________________________
dense_28 (Dense)             (None, 50)                438250    
_________________________________________________________________
dense_29 (Dense)             (None, 3)                 153       
Total params: 708,767
Trainable params: 438,403
Non-trainable params: 270,364
_________________________________________________________________


In [164]:
model.compile(optimizer='Rmsprop',
              metrics=['accuracy'],
              loss='categorical_crossentropy')

In [165]:
with h5py.File('../preprocessing/datasets/data_100.hdf5', 'r') as datagroup:

# Train model on dataset
    model.fit_generator(generator=training_generator,
                        validation_data=validation_generator,
                        verbose=2)

Epoch 1/1
got batch
got batch
got batch


KeyboardInterrupt: 

In [93]:
a = np.ones((5, 1), dtype='int64')

In [94]:
b = np.zeros((5, 1), dtype='int64')

In [95]:
np.concatenate((a, b), 1)

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0]])

In [104]:
np.arange(15).reshape(15, 1)

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13],
       [14]])