# "Conv2d to rnn"

In [7]:
import keras
from keras import models, layers
import numpy as np
import matplotlib.pyplot as plt
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
import sys
sys.path.append('../')
from utils import DataFeed
from keras.regularizers import l2

### Load Data

In [15]:
data_path='../preprocessing/preprocessed_data'

In [16]:
train_data, train_labels = DataFeed.Dataset.create(data_path, ['train/voxforge', 'train/youtube'], num=50000, use_premade=True)
val_data, val_labels = DataFeed.Dataset.create(data_path, ['val/youtube', 'val/voxforge'], num=-1, shuffle=True)

In [17]:
callback_stopearly = keras.callbacks.EarlyStopping(monitor='val_acc',
                                                   patience=5)

### Simple First Model

In [3]:
shared_GRU = layers.GRU(1)

model = models.Sequential()
model.add(Melspectrogram(n_dft=512, input_shape=(1, 5 * 16000,),
                         padding='same', sr=16000, n_mels=32,
                         fmin=0.0, fmax=8000, power_melgram=1.0,
                         return_decibel_melgram=False, trainable_fb=False,
                         trainable_kernel=False))
model.add(layers.Conv2D(16, (4, 4), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 2 )))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(16, (2, 2), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 1)))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 1)))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 1)))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 1)))
model.add(layers.BatchNormalization())

model.add(layers.Reshape((32, -1)))
model.add(layers.GRU(32))
model.add(layers.Dense(3, activation='softmax'))
#model.summary()

=> 88%

### Model 2
derived from https://github.com/YerevaNN/Spoken-language-identification/blob/master/theano/networks/tc_net_rnn_onernn.py

In [5]:
shared_GRU = layers.GRU(1)
model = models.Sequential()
model.add(Melspectrogram(n_dft=512, input_shape=(1, 5 * 16000,),
                         padding='same', sr=16000, n_mels=128, n_hop=256,
                         fmin=0.0, fmax=5500, power_melgram=1.0,
                         return_decibel_melgram=False, trainable_fb=False,
                         trainable_kernel=False))
model.add(layers.Conv2D(16, (7, 7), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((3, 3 ), strides=(2,2), padding='same'))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(32, (5, 5), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))
model.add(layers.BatchNormalization())

model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))

model.add(layers.Reshape((-1, 32)))
model.add(layers.TimeDistributed(shared_GRU))
model.add(layers.Dense(3, activation='softmax'))
#model.summary()

### Model 3
derived from https://github.com/HPI-DeepLearning/crnn-lid/blob/master/keras/models/crnn.py

In [29]:
weight_decay = 0.001

model = models.Sequential()
model.add(Melspectrogram(n_dft=512, input_shape=(1, 5 * 16000,),
                         padding='same', sr=16000, n_mels=129, n_hop=313,
                         fmin=0.0, fmax=5500, power_melgram=1.0,
                         return_decibel_melgram=False, trainable_fb=False,
                         trainable_kernel=False))

model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
#model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

model.add(layers.Conv2D(512, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
#model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

model.add(layers.Conv2D(512, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

model.add(layers.Conv2D(512, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2), strides=(2,2)))

# (bs, y, x, c) --> (bs, x, y, c)
model.add(layers.Permute((2, 1, 3)))

# (bs, x, y, c) --> (bs, x, y * c)
bs, x, y, c = model.layers[-1].output_shape
model.add(layers.Reshape((x, y*c)))

model.add(layers.wrappers.Bidirectional(layers.LSTM(256, return_sequences=False), merge_mode="concat"))
model.add(layers.Dense(3, activation="softmax"))

In [30]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melspectrogram_16 (Melspectr (None, 129, 256, 1)       296321    
_________________________________________________________________
conv2d_89 (Conv2D)           (None, 129, 256, 64)      640       
_________________________________________________________________
batch_normalization_87 (Batc (None, 129, 256, 64)      256       
_________________________________________________________________
max_pooling2d_69 (MaxPooling (None, 64, 128, 64)       0         
_________________________________________________________________
conv2d_90 (Conv2D)           (None, 64, 128, 128)      73856     
_________________________________________________________________
batch_normalization_88 (Batc (None, 64, 128, 128)      512       
_________________________________________________________________
max_pooling2d_70 (MaxPooling (None, 32, 64, 128)       0         
__________

## Train

In [24]:
model.compile(optimizer='Rmsprop',
              metrics=['accuracy'],
              loss='categorical_crossentropy')

In [25]:
history = model.fit(x=train_data,
                        y=train_labels,
                        batch_size=64, 
                        epochs=20,
                        validation_data=(val_data, val_labels), 
                        shuffle=True,
                        callbacks=[callback_stopearly])

Train on 50000 samples, validate on 10000 samples
Epoch 1/20

KeyboardInterrupt: 