# "Conv2d to rnn"

In [43]:
from comet_ml import Experiment
import keras
from keras import models, layers
import numpy as np
import matplotlib.pyplot as plt
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
import sys
sys.path.append('../')
from utils import DataFeed
from keras.regularizers import l2
from keras.optimizers import RMSprop

### Load Data

In [3]:
# Root directory of the preprocessed data; every DataFeed call below is given this path.
data_path = '../preprocessing/preprocessed_data'

In [None]:
# In-memory training set drawn from the voxforge and youtube train folders.
# NOTE(review): `num=50000` presumably caps the number of samples and
# `use_premade=True` presumably reuses a cached dataset -- confirm in utils.DataFeed.
train_data, train_labels = DataFeed.Dataset.create(
    data_path,
    ['train/voxforge', 'train/youtube'],
    num=50000,
    use_premade=True,
)

In [41]:
# Batch generator over the same train folders; this is what the training cells feed
# to fit_generator. NOTE(review): `num=-1` presumably means "use everything" -- confirm.
training_generator = DataFeed.DataGenerator(
    data_path,
    ['train/voxforge', 'train/youtube'],
    num=-1,
    batch_size=64,
)

In [4]:
# Validation set, loaded fully into memory and passed as `validation_data` when training.
# NOTE(review): `num=-1` presumably means "use everything" -- confirm in utils.DataFeed.
val_data, val_labels = DataFeed.Dataset.create(
    data_path,
    ['val/youtube', 'val/voxforge'],
    num=-1,
    shuffle=True,
)

In [5]:
# Shared training callbacks:
#   * EarlyStopping   -- watches `val_acc` with a patience of 5 epochs.
#   * ModelCheckpoint -- writes 'conv2d_to_rnn.h5' only when `val_loss` improves.
callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=5),
    keras.callbacks.ModelCheckpoint('conv2d_to_rnn.h5', monitor='val_loss', save_best_only=True),
]

### Simple First Model

In [42]:
# L2 penalty applied to every Conv2D kernel in this model.
weight_decay = 0.01

# One entry per Conv2D -> MaxPooling2D -> BatchNormalization stage:
# (number of filters, conv kernel size, pool size).
# Only the first stage pools both spatial axes; the (2, 1) pools that follow
# halve the first (mel) axis and leave the second axis untouched.
conv_stages = [
    (16, (4, 4), (2, 2)),
    (32, (2, 2), (2, 1)),
    (32, (2, 2), (2, 1)),
    (64, (2, 2), (2, 1)),
    (64, (2, 2), (2, 1)),
]

model1 = models.Sequential()

# Front end: raw audio (1 channel, 5 * 16000 samples) -> 32-band decibel
# mel-spectrogram with fixed (non-trainable) STFT kernel and filterbank,
# followed by kapre's per-'data_sample' normalization.
model1.add(Melspectrogram(n_dft=512, input_shape=(1, 5 * 16000,),
                          padding='same', sr=16000, n_mels=32,
                          fmin=0.0, fmax=5000, power_melgram=1.0,
                          return_decibel_melgram=True, trainable_fb=False,
                          trainable_kernel=False))
model1.add(Normalization2D(str_axis='data_sample'))

# Convolutional feature extractor.
for n_filters, kernel_size, pool_size in conv_stages:
    model1.add(layers.Conv2D(n_filters, kernel_size, activation='relu', padding='same',
                             kernel_regularizer=l2(weight_decay)))
    model1.add(layers.MaxPooling2D(pool_size))
    model1.add(layers.BatchNormalization())

# Five halvings take the 32 mel bands down to 1, so this reshape turns the
# feature map into a sequence of 64-dimensional vectors for the GRU.
model1.add(layers.Reshape((-1, 64)))
model1.add(layers.GRU(64, dropout=0.3))
model1.add(layers.Dense(3, activation='softmax'))  # 3-way softmax classifier
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melspectrogram_11 (Melspectr (None, 32, 313, 1)        271392    
_________________________________________________________________
normalization2d_10 (Normaliz (None, 32, 313, 1)        0         
_________________________________________________________________
conv2d_51 (Conv2D)           (None, 32, 313, 16)       272       
_________________________________________________________________
max_pooling2d_51 (MaxPooling (None, 16, 156, 16)       0         
_________________________________________________________________
batch_normalization_51 (Batc (None, 16, 156, 16)       64        
_________________________________________________________________
conv2d_52 (Conv2D)           (None, 16, 156, 32)       2080      
_________________________________________________________________
max_pooling2d_52 (MaxPooling (None, 8, 156, 32)        0         
__________

=> 94%

### Model 2
derived from https://github.com/YerevaNN/Spoken-language-identification/blob/master/theano/networks/tc_net_rnn_onernn.py

In [None]:
# A single GRU whose weights are shared by every slice that TimeDistributed feeds it.
shared_GRU = layers.CuDNNGRU(15)

# (number of filters, conv kernel size) per stage; every stage is followed by the
# same 3x3 / stride-2 'same' max-pool and a BatchNormalization.
conv_stages = [
    (16, (7, 7)),
    (32, (5, 5)),
    (32, (3, 3)),
    (32, (3, 3)),
]

model = models.Sequential()

# Front end: raw audio (1 channel, 5 * 16000 samples) -> 223-band linear-scale
# (non-decibel) mel-spectrogram with a hop of 256 samples; nothing is trainable.
model.add(Melspectrogram(n_dft=512, input_shape=(1, 5 * 16000,),
                         padding='same', sr=16000, n_mels=223, n_hop=256,
                         fmin=0.0, fmax=5500, power_melgram=1.0,
                         return_decibel_melgram=False, trainable_fb=False,
                         trainable_kernel=False))

for n_filters, kernel_size in conv_stages:
    model.add(layers.Conv2D(n_filters, kernel_size, activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(layers.BatchNormalization())

# Swap the first and last non-batch axes so TimeDistributed iterates over what
# used to be the channel axis, running the shared GRU once per channel.
model.add(layers.Permute((3, 2, 1)))
model.add(layers.TimeDistributed(shared_GRU))

# Flatten the per-channel GRU outputs and classify into 3 classes.
model.add(layers.Reshape((-1,)))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(3, activation='softmax'))
#model.summary()

=> 91%

### Model 3: Inception
derived from https://github.com/HPI-DeepLearning/crnn-lid/blob/master/keras/models/inceptionv3_crnn.py

In [42]:
from keras.applications.inception_v3 import InceptionV3

In [68]:
# Raw audio input: 1 channel x 80000 samples (= 5 * 16000, the same clip length
# the Sequential models above declare via `input_shape`).
input_tensor = layers.Input(shape=(1, 80000))

# 128-band linear-scale mel-spectrogram, hop 256, with fixed (non-trainable) kernels.
x = Melspectrogram(n_dft=512,
                   padding='same', sr=16000, n_mels=128, n_hop=256,
                   fmin=0.0, fmax=5500, power_melgram=1.0,
                   return_decibel_melgram=False, trainable_fb=False,
                   trainable_kernel=False)(input_tensor)

# InceptionV3 convolutional trunk, randomly initialised (weights=None) and
# without its classification head, applied directly to the spectrogram.
inception_model = InceptionV3(include_top=False, weights=None, input_tensor=x)

x = inception_model.output

# (bs, y, x, c) --> (bs, x, y, c)
x = layers.Permute((2, 1, 3))(x)

# (bs, x, y, c) --> (bs, x, y * c): one feature vector per step along the x axis.
_steps, _y, _c = [int(s) for s in x.shape[1:]]
x = layers.Reshape((_steps, _y * _c))(x)

# Use the public `layers.Bidirectional` export rather than reaching through the
# `layers.wrappers` submodule, which is an implementation detail of the package layout.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=False), merge_mode="concat")(x)
x = layers.Dense(3, activation='softmax')(x)

model = models.Model(input_tensor, outputs=x)
#model.summary()

=> This model is too big.

### Model 4: MobileNet

In [55]:
from keras.applications.mobilenetv2 import MobileNetV2

In [69]:
# Raw audio input: 1 channel x 80000 samples (= 5 * 16000, the same clip length
# the Sequential models above declare via `input_shape`).
input_tensor = layers.Input(shape=(1, 80000))

# 223-band linear-scale mel-spectrogram, hop 360, with fixed (non-trainable) kernels.
x = Melspectrogram(n_dft=512,
                   padding='same', sr=16000, n_mels=223, n_hop=360,
                   fmin=0.0, fmax=5500, power_melgram=1.0,
                   return_decibel_melgram=False, trainable_fb=False,
                   trainable_kernel=False)(input_tensor)

# MobileNetV2 trunk (width multiplier 0.25), randomly initialised and without
# its classification head or global pooling. Named `mobilenet_model` so it
# neither masquerades as, nor overwrites, the InceptionV3 handle `inception_model`.
mobilenet_model = MobileNetV2(include_top=False, weights=None, input_tensor=x,
                              alpha=0.25, pooling=None)

x = mobilenet_model.output

# (bs, y, x, c) --> (bs, x, y, c)
x = layers.Permute((2, 1, 3))(x)

# (bs, x, y, c) --> (bs, x, y * c): one feature vector per step along the x axis.
_steps, _y, _c = [int(s) for s in x.shape[1:]]
x = layers.Reshape((_steps, _y * _c))(x)
x = layers.GRU(64, return_sequences=False, dropout=0.5)(x)
x = layers.Dense(3, activation='softmax')(x)

model = models.Model(input_tensor, outputs=x)
#model.summary()

KeyboardInterrupt: 

## Train

In [44]:
# Stage 1: train model1 with RMSprop at lr=1e-3 for up to 15 epochs.
model1.compile(optimizer=RMSprop(lr=0.001),
              metrics=['accuracy', 'mae'],
              loss='categorical_crossentropy')

# Pass the shared `callbacks` list instead of constructing a new ModelCheckpoint
# inline. A freshly built ModelCheckpoint(save_best_only=True) starts with
# "best so far" = infinity, so a later fit call would overwrite 'conv2d_to_rnn.h5'
# on its first epoch even when an earlier run had saved a better model. Reusing
# one instance keeps that best value across runs. The list's checkpoint has the
# same file, monitor and save_best_only settings as the inline one it replaces,
# and the list also brings in the EarlyStopping callback on `val_acc`.
history = model1.fit_generator(generator=training_generator,
                              epochs=15,
                              validation_data=(val_data, val_labels),
                              shuffle=True,
                              use_multiprocessing=True,
                              workers=8,
                              max_queue_size=20,
                              callbacks=callbacks)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [45]:
# Stage 2: fine-tune model1 for 5 more epochs at a 10x smaller learning rate
# (lr=1e-4). Re-compiling installs a new RMSprop optimizer; the layer weights
# are kept.
model1.compile(optimizer=RMSprop(lr=0.0001),
              metrics=['accuracy', 'mae'],
              loss='categorical_crossentropy')

# Pass the shared `callbacks` list instead of constructing a new ModelCheckpoint
# inline. A freshly built ModelCheckpoint(save_best_only=True) starts with
# "best so far" = infinity, so this run would overwrite 'conv2d_to_rnn.h5' on
# its first epoch even when an earlier run had saved a better model. Reusing one
# instance keeps that best value across runs. The list's checkpoint has the
# same file, monitor and save_best_only settings as the inline one it replaces,
# and the list also brings in the EarlyStopping callback on `val_acc`.
history = model1.fit_generator(generator=training_generator,
                              epochs=5,
                              validation_data=(val_data, val_labels),
                              shuffle=True,
                              use_multiprocessing=True,
                              workers=8,
                              max_queue_size=20,
                              callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
# Stage 3: a further 5 fine-tuning epochs at lr=1e-4. Re-compiling installs a
# new RMSprop optimizer; the layer weights are kept.
model1.compile(optimizer=RMSprop(lr=0.0001),
              metrics=['accuracy', 'mae'],
              loss='categorical_crossentropy')

# Pass the shared `callbacks` list instead of constructing a new ModelCheckpoint
# inline. A freshly built ModelCheckpoint(save_best_only=True) starts with
# "best so far" = infinity, so this run would overwrite 'conv2d_to_rnn.h5' on
# its first epoch even when an earlier run had saved a better model. Reusing one
# instance keeps that best value across runs. The list's checkpoint has the
# same file, monitor and save_best_only settings as the inline one it replaces,
# and the list also brings in the EarlyStopping callback on `val_acc`.
history = model1.fit_generator(generator=training_generator,
                              epochs=5,
                              validation_data=(val_data, val_labels),
                              shuffle=True,
                              use_multiprocessing=True,
                              workers=8,
                              max_queue_size=20,
                              callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [49]:
# The checkpoint contains kapre layers, so their classes are handed to
# load_model via `custom_objects` for deserialization.
kapre_layers = {
    'Melspectrogram': Melspectrogram,
    'Normalization2D': Normalization2D,
}
best_model = keras.models.load_model('conv2d_to_rnn.h5', custom_objects=kapre_layers)

# Held-out test split from both corpora.
test_data, test_labels = DataFeed.Dataset.create(
    data_path,
    ['test/voxforge', 'test/youtube'],
    num=-1,
    shuffle=True,
)

# Display the metric values next to their names.
test_scores = best_model.evaluate(x=test_data, y=test_labels)
test_scores, best_model.metrics_names

int_axis=0 passed but is ignored, str_axis is used instead.


([0.1806356432557106, 0.9583, 0.03331518059950322],
 ['loss', 'acc', 'mean_absolute_error'])