# "Berlin Net"

In [1]:
from comet_ml import Experiment
import keras
from keras import models, layers
import numpy as np
import matplotlib.pyplot as plt
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
import sys
sys.path.append('../')
from utils import DataFeed
from keras import regularizers

Using TensorFlow backend.


## Model

The architecture is similar to the one described in the [Deep Audio paper](https://github.com/twerkmeister/iLID/blob/master/Deep%20Audio%20Paper%20Thomas%20Werkmeister%2C%20Tom%20Herold.pdf) by Thomas Werkmeister and Tom Herold.

Changes relative to the architecture in the paper:
- add dropout
- no pooling stride
- remove batch normalization

In [2]:
# Front end: a kapre layer with frozen filterbank and kernel that maps the raw
# waveform (1 channel, 5 * 16000 samples, i.e. 5 s at sr=16000) to a 28-band
# mel spectrogram returned in decibels.
# NOTE(review): fmax=10000 is above sr / 2 = 8000 Hz — confirm this is intended.
mel_frontend = Melspectrogram(
    n_dft=512,
    input_shape=(1, 5 * 16000,),
    padding='same',
    sr=16000,
    n_mels=28,
    fmin=0.0,
    fmax=10000,
    power_melgram=1.0,
    return_decibel_melgram=True,
    trainable_fb=False,
    trainable_kernel=False,
)
normalizer = Normalization2D(str_axis='data_sample')

# Three conv blocks, each a 3x3 ReLU convolution followed by 2x2 max pooling.
conv_blocks = []
for n_filters in (64, 64, 128):
    conv_blocks.append(layers.Conv2D(n_filters, (3, 3), activation='relu'))
    conv_blocks.append(layers.MaxPooling2D((2, 2)))

# Classifier head: dropout, flatten, one hidden dense layer and a 3-way softmax.
classifier_head = [
    layers.Dropout(0.3),
    layers.Flatten(),
    layers.Dense(1024, activation='relu'),
    layers.Dense(3, activation='softmax'),
]

model = models.Sequential([mel_frontend, normalizer] + conv_blocks + classifier_head)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melspectrogram_1 (Melspectro (None, 28, 313, 1)        270364    
_________________________________________________________________
normalization2d_1 (Normaliza (None, 28, 313, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 26, 311, 64)       640       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 155, 64)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 153, 64)       36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 5, 76, 64)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 3, 74, 128)        73856     
__________

In [3]:
# Configure training: RMSprop on categorical cross-entropy, additionally
# reporting accuracy and mean absolute error.
model.compile('RMSprop',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'mae'])

## Train on 50'000 samples

In [3]:
# Root directory handed to the DataFeed loaders; the path is relative to the
# directory this notebook is run from.
data_path = '../preprocessing/preprocessed_data'

In [3]:
# Load 50000 training samples from the voxforge and youtube train splits.
train_data, train_labels = DataFeed.Dataset.create(
    data_path,
    ['train/voxforge', 'train/youtube'],
    num=50000,
    use_premade=False,
)

In [5]:
# Load the shuffled validation set from the youtube and voxforge val splits.
# NOTE(review): num=-1 presumably selects every available sample — confirm in DataFeed.
val_data, val_labels = DataFeed.Dataset.create(
    data_path,
    ['val/youtube', 'val/voxforge'],
    num=-1,
    shuffle=True,
)

In [27]:
# Stop training once validation accuracy has gone 5 epochs without improving,
# and save the model to 'berlin_net_small.h5' only when validation loss improves.
# NOTE(review): the two callbacks monitor different quantities (val_acc vs
# val_loss) — confirm that this is intended.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_acc', patience=5)
checkpoint = keras.callbacks.ModelCheckpoint(filepath='berlin_net_small.h5',
                                             monitor='val_loss',
                                             save_best_only=True)
callbacks = [early_stopping, checkpoint]

In [37]:
# First training run on the in-memory training set: up to 16 epochs with
# mini-batches of 128, validated against the in-memory validation set.
history = model.fit(train_data,
                    train_labels,
                    epochs=16,
                    batch_size=128,
                    shuffle=True,
                    validation_data=(val_data, val_labels),
                    callbacks=callbacks)

Train on 50000 samples, validate on 10000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [38]:
# Continue training the same `model` for up to 5 more epochs with the same
# settings. Note that this rebinds `history`, so the record of the previous
# run is no longer reachable through that name.
history = model.fit(train_data,
                    train_labels,
                    epochs=5,
                    batch_size=128,
                    shuffle=True,
                    validation_data=(val_data, val_labels),
                    callbacks=callbacks)

Train on 50000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
11008/50000 [=====>........................] - ETA: 27s - loss: 0.1268 - acc: 0.9555 - mean_absolute_error: 0.0424

KeyboardInterrupt: 

In [39]:
# Reload the checkpoint written by the ModelCheckpoint callback of the
# 50'000-sample run. The kapre layers are custom classes, so they have to be
# passed to the loader explicitly.
small_model = keras.models.load_model(
    'berlin_net_small.h5',
    custom_objects={'Melspectrogram': Melspectrogram,
                    'Normalization2D': Normalization2D})

# Load the test set. The original cell bound the result to `tet_data`, so
# `test_data` was never defined and the evaluation below only worked when a
# stale `test_data` happened to be left in the kernel.
test_data, test_labels = DataFeed.Dataset.create(
    data_path, ['test/youtube', 'test/voxforge'], num=-1, shuffle=True)

# Display the metric values next to their names. The names are read from the
# model that was actually evaluated rather than from the in-memory `model`.
small_model.evaluate(x=test_data,
                     y=test_labels), small_model.metrics_names

int_axis=0 passed but is ignored, str_axis is used instead.


([0.18541387659870087, 0.9369, 0.05716399599462747],
 ['loss', 'acc', 'mean_absolute_error'])

## Train Model on 100'000 samples

In [6]:
# Batch generator over the voxforge and youtube train splits, yielding
# batches of 128 instead of holding the training set in memory.
# NOTE(review): num=-1 presumably selects every available sample — confirm in DataFeed.
training_generator = DataFeed.DataGenerator(
    data_path,
    ['train/voxforge', 'train/youtube'],
    num=-1,
    batch_size=128,
)

In [7]:
# Same callback pair as for the small run, but with a patience of 3 epochs and
# the checkpoint written to 'berlin_net.h5'.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_acc', patience=3)
checkpoint = keras.callbacks.ModelCheckpoint(filepath='berlin_net.h5',
                                             monitor='val_loss',
                                             save_best_only=True)
callbacks = [early_stopping, checkpoint]

In [8]:
# Train from the batch generator for up to 10 epochs, using 8 worker processes
# that prefetch into a queue of at most 20 batches. Validation still uses the
# in-memory validation set.
# NOTE(review): when the notebook is run top to bottom, `model` has already
# been trained in the 50'000-sample section — confirm whether a freshly built
# model was intended here.
history = model.fit_generator(training_generator,
                              epochs=10,
                              validation_data=(val_data, val_labels),
                              callbacks=callbacks,
                              shuffle=True,
                              workers=8,
                              use_multiprocessing=True,
                              max_queue_size=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Continue generator-based training of the same `model` for up to 5 more
# epochs with unchanged settings. This rebinds `history`, replacing the record
# of the previous run under that name.
history = model.fit_generator(training_generator,
                              epochs=5,
                              validation_data=(val_data, val_labels),
                              callbacks=callbacks,
                              shuffle=True,
                              workers=8,
                              use_multiprocessing=True,
                              max_queue_size=20)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [5]:
# Reload the model trained with the generator. The kapre layers are custom
# classes, so they have to be passed to the loader explicitly.
# NOTE(review): the checkpoint callback in this notebook writes 'berlin_net.h5',
# while this cell reads 'berlin_net/berlin_net.h5' — confirm the file was moved.
big_model = keras.models.load_model(
    'berlin_net/berlin_net.h5',
    custom_objects={'Melspectrogram': Melspectrogram,
                    'Normalization2D': Normalization2D})

# The original cell had the test-set load commented out and relied on
# `test_data` / `test_labels` still being in the kernel. Load them only when
# they are missing, so the cell also works after a kernel restart without
# paying for a second load when they are already there.
if 'test_data' not in globals() or 'test_labels' not in globals():
    test_data, test_labels = DataFeed.Dataset.create(
        data_path, ['test/youtube', 'test/voxforge'], num=-1, shuffle=True)

# Display the metric values next to their names.
big_model.evaluate(x=test_data,
                   y=test_labels), big_model.metrics_names

int_axis=0 passed but is ignored, str_axis is used instead.


([0.141640976549685, 0.9511, 0.04648835944905877],
 ['loss', 'acc', 'mean_absolute_error'])