# Simple RNN models

Now that MFCC and Mel filter bank features have been extracted from the audio, this notebook tests the simplest neural network models: a single recurrent layer (LSTM or GRU) followed by a softmax classifier.

Trained on GeForce GTX 1080.

So far the results on the validation set are:

| Feature | Model  | Hidden Dim  | Accuracy  |
|:--------:|:-----:|:-----------:|:---------:|
| MFCC (39d)| LSTM | 128        |  .9206 |
| MFCC (39d)| LSTM | 256        |  .9297 |
| MFCC (39d)| LSTM | 512        |  .9356 |
| MFCC (39d)| LSTM | 1024       |  **.9415** |
| MFCC (39d)| GRU  | 128        |  .9238 |
| MFCC (39d)| GRU  | 256        |  .9316 |
| MFCC (39d)| GRU  | 512        |  .9278 |


In [2]:
import pandas as pd
import numpy as np
from collections import Counter

from keras.layers import Dense, LSTM, GRU, Activation, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

from keras.utils import to_categorical
from sklearn.metrics import classification_report

## load data

In [3]:
# List the contents of the preprocessed/ directory that the loading cell reads from.
%ls preprocessed

test_mel.npy   test_spec.npy  train_mfcc.npy  valid_mel.npy   valid_spec.npy
test_mfcc.npy  train_mel.npy  train_spec.npy  valid_mfcc.npy


In [4]:
features = ["mel", "mfcc"]


def _load_feature_array(path):
    """Load one preprocessed ``.npy`` array and reorder its axes.

    The array is fully transposed and then its last two axes are swapped.
    For a 3-D input this moves the original last axis to the front and keeps
    the other two axes in their original order. The recorded output shows the
    resulting shapes as (n_examples, 99, n_features).
    """
    return np.swapaxes(np.load(path).T, 1, 2)


train, valid = {}, {}

# Training split first, then validation; one "loaded" message per feature type.
for f in features:
    train[f] = _load_feature_array("preprocessed/train_%s.npy" % f)
    print("loaded %s" % str(train[f].shape))

for f in features:
    valid[f] = _load_feature_array("preprocessed/valid_%s.npy" % f)
    print("loaded %s" % str(valid[f].shape))

loaded (51088, 99, 40)
loaded (51088, 99, 39)
loaded (6798, 99, 40)
loaded (6798, 99, 39)


In [5]:
# Read the "label" column of the training list as a plain Python list.
# NOTE(review): presumably the row order matches the training feature arrays - confirm.
train_list = pd.read_csv("./data/train_list.csv")
train_labels = train_list["label"].tolist()
(len(train_labels), train_labels[:5])

(51088, ['right', 'right', 'right', 'right', 'right'])

In [6]:
# Read the "label" column of the validation list as a plain Python list.
# NOTE(review): presumably the row order matches the validation feature arrays - confirm.
valid_list = pd.read_csv("./data/valid_list.csv")
valid_labels = valid_list["label"].tolist()
(len(valid_labels), valid_labels[:5])

(6798, ['right', 'right', 'right', 'right', 'right'])

In [7]:
# The ten command words that keep their own class; they are written as one
# comma-separated string and split into a list.
_target_words = "yes, no, up, down, left, right, on, off, stop, go"
target_labels = _target_words.split(", ")
target_labels

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

In [8]:
# Collapse every label that is not one of the target words into a single "unknown" class.
train_labels = [label if label in target_labels else "unknown" for label in train_labels]
valid_labels = [label if label in target_labels else "unknown" for label in valid_labels]

In [9]:
# Count how often each class occurs in the training labels.
label_counter = Counter(train_labels)

# Rank the classes by descending training-set frequency and keep only the
# label names; a label's position in this tuple is later used as its class index.
# 11 = the ten target words plus the "unknown" class.
# The counts are dropped inside the generator rather than unpacked into `_`,
# because `_` is IPython's last-output variable and should not be rebound
# in the notebook namespace.
label_order = tuple(label for label, _count in label_counter.most_common(11))
label_order

('unknown',
 'stop',
 'on',
 'go',
 'yes',
 'no',
 'right',
 'up',
 'down',
 'off',
 'left')

In [10]:
# Replace each label string with its position in label_order
# (index 0 is the most frequent class in the training set).
train_labels = [label_order.index(label) for label in train_labels]
valid_labels = [label_order.index(label) for label in valid_labels]

In [11]:
# One-hot encode the integer class indices; the recorded output shows the
# result as a (51088, 11) array, matching the 11-way softmax trained below.
# NOTE(review): this rebinds train_labels, so re-running this cell on its own
# would pass the already-encoded array back into to_categorical.
train_labels = to_categorical(train_labels, num_classes=11)
train_labels.shape

(51088, 11)

In [12]:
# One-hot encode the validation class indices the same way as the training
# labels; the recorded output shows the result as a (6798, 11) array.
# NOTE(review): this rebinds valid_labels, so re-running this cell on its own
# would pass the already-encoded array back into to_categorical.
valid_labels = to_categorical(valid_labels, num_classes=11)
valid_labels.shape

(6798, 11)

## Using only MFCC as features

In [17]:
# Stop training when validation accuracy stops improving: the callback watches
# 'val_categorical_accuracy' (mode='max', i.e. higher is better) and ends the
# run once `patience` (2) epochs have passed without an improvement.
# The same callback instance is passed to every fit() call in this notebook.
early_stop_callback = EarlyStopping(monitor='val_categorical_accuracy', 
                                    patience=2, mode='max')

### LSTM hidden_size=128

In [None]:
# Check the shape of the MFCC training array; the loading cell reported
# (51088, 99, 39), and the models below are built with input_shape=(99, 39).
train["mfcc"].shape

In [15]:
# Settings for the LSTM-128 experiment.
# NOTE(review): this is the only run with batch_size=32; every other
# experiment in this notebook uses 64 - confirm this is intentional.
batch_size = 32
hidden_size = 128  # number of LSTM units

In [None]:
# Single recurrent layer followed by an 11-way softmax classifier.
lstm_model = Sequential([
    LSTM(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                                # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
lstm_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)


In [None]:
# Train on the MFCC features for at most 10 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
lstm_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=10,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f34a0215898>

validation accuracy = 0.9206

### LSTM hidden_size=256

In [14]:
# Settings for the LSTM-256 experiment.
batch_size = 64
hidden_size = 256  # number of LSTM units

In [15]:
# Single recurrent layer followed by an 11-way softmax classifier.
lstm_model = Sequential([
    LSTM(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                                # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
lstm_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)

In [16]:
# Train on the MFCC features for at most 20 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
lstm_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=20,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


<keras.callbacks.History at 0x7f7d556a9fd0>

final validation accuracy = 0.9297

### LSTM hidden_size=512

In [20]:
# Settings for the LSTM-512 experiment.
batch_size = 64
hidden_size = 512  # number of LSTM units

In [21]:
# Single recurrent layer followed by an 11-way softmax classifier.
lstm_model = Sequential([
    LSTM(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                                # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
lstm_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)

In [22]:
# Train on the MFCC features for at most 20 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
lstm_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=20,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.callbacks.History at 0x7f4345e16b38>

validation accuracy 0.9356

### LSTM hidden_size=1024

In [23]:
# Settings for the LSTM-1024 experiment.
batch_size = 64
hidden_size = 1024  # number of LSTM units

In [24]:
# Single recurrent layer followed by an 11-way softmax classifier.
lstm_model = Sequential([
    LSTM(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                                # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
lstm_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)

In [25]:
# Train on the MFCC features for at most 20 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
lstm_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=20,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.History at 0x7f42301a6828>

validation accuracy 0.9415

### GRU hidden_size=128

In [None]:
# Settings for the GRU-128 experiment.
batch_size = 64
hidden_size = 128  # number of GRU units

In [None]:
# Same architecture as the LSTM experiments, with a GRU as the recurrent layer.
gru_model = Sequential([
    GRU(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                               # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
gru_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)


In [None]:
# Train on the MFCC features for at most 10 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
gru_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=10,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<keras.callbacks.History at 0x7f5a8e755b00>

validation accuracy = 0.9238

### GRU hidden_size=256

In [14]:
# Settings for the GRU-256 experiment.
batch_size = 64
hidden_size = 256  # number of GRU units

In [15]:
# Same architecture as the LSTM experiments, with a GRU as the recurrent layer.
gru_model = Sequential([
    GRU(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                               # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
gru_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)


In [18]:
# Train on the MFCC features for at most 10 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
gru_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=10,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x7f43cf9ddc50>

validation accuracy 0.9316

### GRU hidden_size=512

In [19]:
# Settings for the GRU-512 experiment.
batch_size = 64
hidden_size = 512  # number of GRU units

In [None]:
# Same architecture as the LSTM experiments, with a GRU as the recurrent layer.
gru_model = Sequential([
    GRU(hidden_size, input_shape=(99, 39)),  # 99 time steps x 39 MFCC features
    Dense(11),                               # one output per class
    Activation("softmax"),
])

# Multi-class cross-entropy on one-hot targets; the accuracy metric is the
# one the early-stopping callback monitors on the validation set.
gru_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['categorical_accuracy'],
)


In [None]:
# Train on the MFCC features for at most 10 epochs; the early-stopping
# callback may end the run sooner. Validation data is evaluated each epoch.
gru_model.fit(
    train["mfcc"],
    train_labels,
    batch_size=batch_size,
    epochs=10,
    shuffle=True,
    callbacks=[early_stop_callback],
    validation_data=(valid["mfcc"], valid_labels),
)

Train on 51088 samples, validate on 6798 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.callbacks.History at 0x7f7bbc116f60>

validation accuracy 0.9278