# TensorFlow speech recognition challenge (Kaggle)

In [0]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Input, layers

In [0]:
# Directory holding the preprocessed .npy arrays, the saved model checkpoint and
# the submission CSVs (presumably a mounted Google Drive folder when run in Colab).
data_path = "drive/My Drive/Colab Notebooks/data/preprocessed"
# List the directory to confirm which artifacts are available.
os.listdir(data_path)

['sample_submission.csv',
 'wav_all.npy',
 'label_all.npy',
 'unknown_wav.npy',
 'background_wav.npy',
 'test_wav_final.npy',
 'model-089-0.875049.h5',
 'submission.csv']

In [0]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# use it on files produced by your own preprocessing step.
background_wav = np.load(os.path.join(data_path, "background_wav.npy"), allow_pickle=True)
unknown_wav = np.load(os.path.join(data_path, "unknown_wav.npy"))

# Flatten the stored labels to a 1-D array with one entry per clip.
label_all = np.load(os.path.join(data_path, "label_all.npy")).reshape(-1)

# Rebuild the target-word clips into one dense array
# (presumably stored as an object array of per-clip arrays — verify).
wav_all = np.array([
    list(clip)
    for clip in np.load(os.path.join(data_path, "wav_all.npy"), allow_pickle=True)
])

---
### Generating training data
#### Generating silence samples from background noise
Since we have around 2131 samples of every label in our dataset, we will generate a similar number of additional samples (2130, which divides evenly across the 6 background recordings) from background noise and label them silence, because they don't contain any speech.

In [0]:
# Count how many clips exist for each label.
label_count = Counter(label_all)
label_count

Counter({'down': 2152,
         'go': 2101,
         'left': 2165,
         'no': 2098,
         'off': 2143,
         'on': 2105,
         'right': 2155,
         'stop': 2174,
         'up': 2062,
         'yes': 2157})

In [0]:
# Average number of clips per label; used to size the synthetic classes below.
np.mean(list(label_count.values()))

2131.2

In [0]:
# Number of synthetic "silence" clips: the mean per-label count (~2131) rounded
# down so it splits evenly across the 6 background-noise recordings.
n_silence_samples = 2130  # divisible by 6

In [0]:
def get_noise_sample(noise_num=0, sample_len=8000):
    """Return a random contiguous clip cut from one background-noise recording.

    Args:
        noise_num: Index into ``background_wav`` selecting the recording
            (one out of 6 noise types in this notebook).
        sample_len: Number of consecutive samples to return. Defaults to 8000,
            the clip length used throughout the notebook.

    Returns:
        A slice of ``background_wav[noise_num]`` containing exactly
        ``sample_len`` samples.

    Raises:
        ValueError: If the selected recording is shorter than ``sample_len``.
    """
    selected_noise = background_wav[noise_num]
    last_start = len(selected_noise) - sample_len
    if last_start < 0:
        raise ValueError(
            "noise recording %d has %d samples, fewer than the requested %d"
            % (noise_num, len(selected_noise), sample_len)
        )
    # random.randint is inclusive on both ends, so `last_start` itself is a valid
    # offset. The previous bound (len - 1 - 8000) could never reach the final
    # sample and raised on a recording exactly 8000 samples long.
    start_idx = random.randint(0, last_start)
    return selected_noise[start_idx:start_idx + sample_len]

In [0]:
# Build the "silence" class: draw the same number of random clips from every
# background-noise recording, one recording after another.
n_samples_per_noise = n_silence_samples // len(background_wav)
silence_wav = np.array([
    get_noise_sample(noise_idx)
    for noise_idx in range(len(background_wav))
    for _ in range(n_samples_per_noise)
])
# One 'silence' label per generated clip.
silence_label = np.array(['silence'] * (n_samples_per_noise * len(background_wav)))
silence_wav.shape

(2130, 8000)

#### Creating samples of unknown speech
Samples from all other labels. To keep the classes balanced, I will only use a random sample of 2130 elements from the unknown labels.

In [0]:
# Size of the full pool of "unknown" clips.
unknown_wav.shape

(41115, 8000)

In [0]:
# Keep a random subset of the "unknown" clips, the same size as the silence class.
# Sampling row indices picks the same rows as sampling the rows themselves.
unknown_wav = unknown_wav[random.sample(range(len(unknown_wav)), n_silence_samples)]
unknown_label = ["unknown"] * n_silence_samples

---
#### Data Processing pipeline
The next step would be to create a data processing pipeline: sample elements from the whole dataset instead of choosing a fixed set of unknown samples, and also apply data augmentation (mixing with noise) during training.

---

In [0]:
# Sanity check: the three sources must share the clip length (axis 1) before they are concatenated.
unknown_wav.shape, wav_all.shape, silence_wav.shape

((2130, 8000), (21312, 8000), (2130, 8000))

In [0]:
# Stack clips and labels in the same source order so row i of `data`
# corresponds to entry i of `labels`.
data = np.concatenate((wav_all, unknown_wav, silence_wav), axis=0)
labels = np.concatenate((label_all, unknown_label, silence_label), axis=0)

In [0]:
# Clips and labels must have the same number of rows.
data.shape, labels.shape

((25572, 8000), (25572,))

---
## Converting data to feed into model

In [0]:
# Hold out 20% of the clips; the fixed random_state makes the shuffle repeatable.
train_wav, test_wav, train_label, test_label = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [0]:
# Conv1D expects a trailing channel axis: (n_clips, 8000) -> (n_clips, 8000, 1).
train_wav, test_wav = (
    split.reshape(-1, 8000, 1) for split in (train_wav, test_wav)
)

In [0]:
# Assign each label name a class index, in sorted name order
# (these indices are one-hot encoded in a later cell).
label_values = np.unique(labels)
label_dict = dict(zip(sorted(label_values), range(len(label_values))))
label_dict

{'down': 0,
 'go': 1,
 'left': 2,
 'no': 3,
 'off': 4,
 'on': 5,
 'right': 6,
 'silence': 7,
 'stop': 8,
 'unknown': 9,
 'up': 10,
 'yes': 11}

In [0]:
# Map label names to class indices, then one-hot encode them.
# The name arrays are replaced in place, so run this cell once per kernel session.
train_label = keras.utils.to_categorical(
    [label_dict[name] for name in train_label], len(label_dict))
test_label = keras.utils.to_categorical(
    [label_dict[name] for name in test_label], len(label_dict))

In [0]:
# One row per training clip, one column per class.
train_label.shape

(20457, 12)

In [0]:
# Full (unsplit) dataset in model-ready form: channel axis added, labels one-hot.
# Note that `label_all` is rebound here from label names to one-hot vectors.
data_all = data.reshape(-1, 8000, 1)
label_all = keras.utils.to_categorical(
    [label_dict[name] for name in labels], len(label_dict))

---
## Model

In [0]:
# Parameters
lr = 0.001  # learning rate handed to the Adam optimizer
batch_size = 512  # mini-batch size used by model.fit
dropout_rate = 0.5  # rate shared by every Dropout layer in the model
input_shape = (8000,1)  # 8000 samples per clip, 1 channel

In [0]:
# Conv1D model: three Conv1D + MaxPooling stages (dropout after the first two),
# then two dense layers with dropout and a softmax classifier.
input_tensor = Input(shape=input_shape)

x = layers.Conv1D(8, 11, padding='valid', activation='relu', strides=1)(input_tensor)
x = layers.MaxPooling1D(2)(x)
x = layers.Dropout(dropout_rate)(x)
x = layers.Conv1D(16, 7, padding='valid', activation='relu', strides=1)(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Dropout(dropout_rate)(x)
x = layers.Conv1D(32, 5, padding='valid', activation='relu', strides=1)(x)
x = layers.MaxPooling1D(10)(x)

x = layers.Flatten()(x)

x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(dropout_rate)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(dropout_rate)(x)

# One output unit per class (12 labels, including 'silence' and 'unknown').
output_tensor = layers.Dense(12, activation='softmax')(x)

model = tf.keras.Model(input_tensor, output_tensor)

# `learning_rate` is the supported keyword. The old `lr` alias is deprecated and,
# depending on the Keras version, is warned about, ignored, or rejected.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(learning_rate=lr),
              metrics=['accuracy'])

In [0]:
# Resume from a previously saved checkpoint. This rebinds `model`, replacing the
# freshly built network from the architecture cell.
model = keras.models.load_model(os.path.join(data_path, "model-089-0.875049.h5"))
# Re-compile with a new Adam optimizer. `learning_rate` is the supported keyword;
# the old `lr` alias is deprecated and may be ignored or rejected by newer Keras.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(learning_rate=lr),
              metrics=['accuracy'])

In [0]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8000, 1)]         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 7990, 8)           96        
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 3995, 8)           0         
_________________________________________________________________
dropout (Dropout)            (None, 3995, 8)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3989, 16)          912       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1994, 16)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1994, 16)          0     

In [0]:
# Train for 10 epochs; Keras holds out 20% of the training split for validation.
history = model.fit(
    train_wav,
    train_label,
    validation_split=0.2,
    shuffle=True,
    batch_size=batch_size,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Running model on test data

In [0]:
# Label name -> class index, spelled out explicitly so the inference section
# can run without rebuilding the training data. Order defines the index.
label_dict = {
    name: index
    for index, name in enumerate(
        ['down', 'go', 'left', 'no', 'off', 'on',
         'right', 'silence', 'stop', 'unknown', 'up', 'yes']
    )
}

In [0]:
# Load the competition test clips and add the trailing channel axis,
# matching the (8000, 1) input shape used for training.
test_path = os.path.join(data_path, "test_wav_final.npy")
test_all = np.load(test_path).reshape(-1, 8000, 1)  # may take a while (~3 minutes) (it's almost 5GB)

In [0]:
# Number of test clips and their per-clip shape.
test_all.shape

(158538, 8000, 1)

In [0]:
# Run inference on every test clip; each row of `preds` holds one score per class.
preds = model.predict(test_all)

In [0]:
# Expect (number of test clips, number of classes).
preds.shape

(158538, 12)

In [0]:
# Invert label_dict so class indices can be mapped back to label names.
num_to_label = dict(zip(label_dict.values(), label_dict.keys()))
num_to_label

{0: 'down',
 1: 'go',
 2: 'left',
 3: 'no',
 4: 'off',
 5: 'on',
 6: 'right',
 7: 'silence',
 8: 'stop',
 9: 'unknown',
 10: 'up',
 11: 'yes'}

In [0]:
# Pick the highest-scoring class for each clip and translate it to its label name.
predicted_labels = [num_to_label[class_idx] for class_idx in preds.argmax(axis=1)]

In [0]:
# Use the sample submission as a template: it already has the fname and label columns.
submission_file = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
submission_file.head()

Unnamed: 0,fname,label
0,clip_000044442.wav,silence
1,clip_0000adecb.wav,silence
2,clip_0000d4322.wav,silence
3,clip_0000fb6fe.wav,silence
4,clip_0001d1559.wav,silence


In [0]:
# Overwrite the placeholder labels with the model's predictions.
# NOTE(review): assumes the rows of test_wav_final.npy are in the same order as
# the rows of sample_submission.csv — confirm against the preprocessing step.
submission_file["label"] = predicted_labels

In [0]:
# index=False keeps the DataFrame index out of the file, so no extra column is written.
submission_file.to_csv(os.path.join(data_path, "submission.csv"), index=False)

In [0]:
# Read the file back to verify what was written.
pd.read_csv(os.path.join(data_path, "submission.csv")).head()

Unnamed: 0,fname,label
0,clip_000044442.wav,no
1,clip_0000adecb.wav,unknown
2,clip_0000d4322.wav,unknown
3,clip_0000fb6fe.wav,silence
4,clip_0001d1559.wav,unknown


In [0]:
# Distribution of predicted labels across the test set.
Counter(submission_file.label.values)

Counter({'down': 5808,
         'go': 8570,
         'left': 8553,
         'no': 7738,
         'off': 9232,
         'on': 17531,
         'right': 13643,
         'silence': 6497,
         'stop': 7438,
         'unknown': 58311,
         'up': 5577,
         'yes': 9640})