In [43]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
os.environ["TENSORBOARD_BINARY"] = ".speech_recognition/bin/tensorboard"

# tensorboard --logdir ./tensorboard --host "0.0.0.0" --port 8501

In [3]:
import h5py
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras import mixed_precision
from tensorflow.keras.optimizers import LossScaleOptimizer
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import jiwer

plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False

pd.options.display.float_format = '{:.2f}'.format

os.makedirs("training_images", exist_ok=True)
os.makedirs('checkpoints', exist_ok=True)
log_dir = "tensorboard/"
os.makedirs(log_dir, exist_ok=True)

char_to_num = {char: idx + 1 for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz ")}
char_to_num['<PAD>'] = 0
num_to_char = {v: k for k, v in char_to_num.items()}
n_mfcc = 13
BATCH_SIZE = 128

policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
mixed_precision.global_policy()

<DTypePolicy "mixed_float16">

In [13]:
def check_dataset_size(dataset_filename: str):
    """Counts the number of samples in the HDF5 dataset"""
    with h5py.File(f"data/CommonVoice/{dataset_filename}", "r") as hf:
        mfcc_group = hf["mfcc"]
        labels_group = hf["labels"]
        i = 0
        for key in mfcc_group.keys():
            i += 1
    return i


def dataset_from_generator(h5_file_path):
    def generator():
        with h5py.File(h5_file_path, "r") as hf:
            mfcc_group = hf["mfcc"]
            labels_group = hf["labels"]
            for key in mfcc_group.keys():
                yield mfcc_group[key][:], labels_group[key][:]
    
    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None, 13), dtype=tf.float32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        ))

def generate_padded_data(h5_file_path, batch_size=BATCH_SIZE):
    dataset = dataset_from_generator(h5_file_path)
    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes=([None, 13], [None]),
        padding_values=(0.0, char_to_num['<PAD>'])
    )
    return dataset.cache()


def plot_loss_curves(history, model_name):
    """Return separate loss curves for training and validation results."""
    
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    epochs = range(len(loss))

    plt.figure(figsize=(10, 5))

    plt.plot(epochs, loss, label="training loss")
    plt.plot(epochs, val_loss, label="val loss")
    plt.title(f"CTC loss for {model_name}")
    plt.xlabel("Epochs")
    plt.legend()
    plt.savefig(f"training_images/{model_name}.png")
    plt.show()


def build_model(model_name, layer_list, input_shape=(None, 13), vocab_size=len(char_to_num), learning_rate=1e-03, load_weights=""):
    """
    Creates a model from a list of layers using keras functional api; input and output is permanently defined.
    The returned model is compiled using CTC and Loss scale optimizer.
    """
        
    inputs = layers.Input(shape=input_shape, name="input_layer")

    x = inputs
    for layer in layer_list:
        x = layer(x)

    outputs = layers.Dense(units=vocab_size, name="output_layer")(x)

    model = Model(inputs, outputs, name=model_name)

    if len(load_weights) > 0:
         model.load_weights(f"checkpoints/{load_weights}.ckpt.weights.h5", skip_mismatch=True)

    model.compile(
        optimizer=LossScaleOptimizer(tf.keras.optimizers.Adam(learning_rate=learning_rate)),
        loss=tf.keras.losses.CTC())
    
    return model


def train_model(model,
                train_data,
                val_data,
                train_data_size,
                val_data_size,
                batch_size=BATCH_SIZE,
                epochs=20,
                initial_epoch=0,
                callbacks=None):
    "Trains model, plots and returns history."
    
    history = model.fit(
                train_data,
                validation_data=val_data,
                epochs=epochs,
                steps_per_epoch=int(train_data_size/batch_size),
                validation_steps=int(val_data_size/batch_size),
                callbacks=callbacks,
                verbose=1,
                initial_epoch=initial_epoch
                )
    
    plot_loss_curves(history, model.name)

    return history

## CTC - Connectionist Temporal Classification
* Enables comparative analysis of sequences with different lengths, where there is no clear alignment between input and output, e.g., audio and transcription.
* Predictions are in the form of logits, blank index - 0; no additional function is used in the output layer.
* Loss value is normalized by default at the batch level - allows comparison of batches with samples of varying sizes.

# USA data

In [14]:
train_data_size = check_dataset_size("train.h5")
train_dataset = (generate_padded_data("data/CommonVoice/train.h5", batch_size=BATCH_SIZE)
                 .shuffle(buffer_size=train_data_size//BATCH_SIZE, reshuffle_each_iteration=True)
                 .repeat()
                 .prefetch(tf.data.AUTOTUNE)
)

val_data_size = check_dataset_size("val.h5")
val_dataset = generate_padded_data("data/CommonVoice/val.h5", batch_size=BATCH_SIZE).repeat().prefetch(tf.data.AUTOTUNE)
test_data_size = check_dataset_size("test.h5")
test_dataset = generate_padded_data("data/CommonVoice/test.h5", batch_size=BATCH_SIZE).repeat().prefetch(tf.data.AUTOTUNE)

print(f"Train: {int(train_data_size)}, Val: {int(val_data_size)}, Test: {test_data_size}")

Train: 90322, Val: 10036, Test: 25090


In [21]:
# example tests

models = {
    f"BLSTM64x3_USA_b{BATCH_SIZE}": [
        layers.Bidirectional(layers.LSTM(64, return_sequences=True), name="blstm_1"),
        layers.Bidirectional(layers.LSTM(64, return_sequences=True), name="blstm_2"),
        layers.Bidirectional(layers.LSTM(64, return_sequences=True), name="blstm_3"),
    ],
    f"BLSTM128x3_USA_b{BATCH_SIZE}": [
        layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="blstm_1"),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="blstm_2"),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="blstm_3")
    ],
    f"BLSTM256x3_USA_b{BATCH_SIZE}": [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
    ],
    f"BLSTM256x3_USA_b{BATCH_SIZE}_l2": [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True, kernel_regularizer=regularizers.l2(0.001)), name="blstm_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True, kernel_regularizer=regularizers.l2(0.001)), name="blstm_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True, kernel_regularizer=regularizers.l2(0.001)), name="blstm_3"),
    ],
    f"BLSTM256x3_USA_b{BATCH_SIZE}_dropout": [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.Dropout(0.2, name="dropout_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.Dropout(0.2, name="dropout_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
        layers.Dropout(0.2, name="final_dropout")
    ],
    f"BLSTM256x3_USA_b{BATCH_SIZE}_spdropout": [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
        layers.Dropout(0.2, name="final_dropout")
    ]
}

all_results = []

for name, layer_list in models.items():  
    model = build_model(name, layer_list)
    print(f"Training model {name}....")
    # print(model.summary())

    experiment_dir = f"{log_dir}usa/{name}"

    callbacks = [
    ModelCheckpoint(f"checkpoints/{name}.ckpt.weights.h5",
                   save_weights_only=True),
    EarlyStopping(patience=3,
                  restore_best_weights=True),
    TensorBoard(log_dir=experiment_dir,
                histogram_freq=1,
                write_steps_per_second=True)
    ]
    
    history = train_model(model, train_dataset, val_dataset,
                          callbacks=callbacks,
                          train_data_size=train_data_size,
                          val_data_size=val_data_size,
                          epochs=20)

    results = {
        "Model": name,
        "Params": model.count_params(),
        "Train loss": history.history["loss"][-1],
        "Val loss": history.history["val_loss"][-1],
        "Test loss": model.evaluate(test_dataset, verbose=0, steps=test_data_size//BATCH_SIZE)
    }
    all_results.append(results)

all_results = pd.DataFrame(all_results)
all_results.to_csv("results_00.csv", index=False)
all_results.style.background_gradient()
# all_results.iloc[:, [0, 2, 3, 4]].style.background_gradient()

Unnamed: 0,Model,Train loss,Val loss,Test loss
0,BLSTM64x3_USA_b128,67.938469,68.590004,69.356064
1,BLSTM128x3_USA_b128,43.971088,52.030102,52.767967
2,BLSTM256x3_USA_b128,27.838634,47.835159,47.276829
3,BLSTM256x3_USA_b128_l2,33.08604,50.292,50.354992
4,BLSTM256x3_USA_b128_dropout,30.83498,40.273403,40.723652
5,BLSTM256x3_USA_b128_spdropout,34.111958,39.54636,40.216934


# Data augmentation

In [17]:
def augment_mfcc(mfcc, label):

    # gaussian noise
    if tf.random.uniform(()) > 0.3:
        noise_std = tf.random.uniform([], 0.005, 0.025)
        noise = tf.random.normal(tf.shape(mfcc), stddev=noise_std)
        mfcc = mfcc + noise
    
    # frequency masking
    if tf.random.uniform(()) > 0.5:
        num_mfcc = tf.shape(mfcc)[0]  # 13
        max_freq_mask = 2 
        
        freq_start = tf.random.uniform([], 0, num_mfcc - max_freq_mask, dtype=tf.int32)
        freq_len = tf.random.uniform([], 1, max_freq_mask + 1, dtype=tf.int32)
        
        freq_indices = tf.range(num_mfcc)
        freq_mask = tf.logical_and(freq_indices >= freq_start, 
                                  freq_indices < freq_start + freq_len)
        
        mfcc = tf.where(freq_mask[:, None], 0.0, mfcc)
    
    # time masking 
    if tf.random.uniform(()) > 0.5:
        seq_len = tf.shape(mfcc)[1]
        
        max_time_mask = tf.maximum(1, seq_len // 10)  # 10%
        max_time_mask = tf.minimum(max_time_mask, 10)  # max 10 time steps
        
        safe_range = tf.maximum(1, seq_len - max_time_mask)
        time_start = tf.random.uniform([], 0, safe_range, dtype=tf.int32)
        time_len = tf.random.uniform([], 1, max_time_mask + 1, dtype=tf.int32)
        
        time_indices = tf.range(seq_len)
        time_mask = tf.logical_and(time_indices >= time_start,
                                  time_indices < time_start + time_len)

        mfcc = tf.where(time_mask[None, :], 0.0, mfcc)
    
    # amplitude scaling
    if tf.random.uniform(()) > 0.4:
        scale = tf.random.uniform([], 0.85, 1.15)
        mfcc = mfcc * scale
    
    # feature dropout
    if tf.random.uniform(()) > 0.6:
        dropout_prob = tf.random.uniform([], 0.05, 0.15)
        dropout_mask = tf.random.uniform(tf.shape(mfcc)) > dropout_prob
        mfcc = tf.where(dropout_mask, mfcc, 0.0)
    
    return mfcc, label


def dataset_from_generator(h5_file_path, train=True, batch_size=BATCH_SIZE):
    def generator():
        with h5py.File(h5_file_path, "r") as hf:
            mfcc_group = hf["mfcc"]
            labels_group = hf["labels"]
            for key in mfcc_group.keys():
                yield mfcc_group[key][:], labels_group[key][:]

    if train:
    
        return (tf.data.Dataset.from_generator(
            generator,
            output_signature=(
                tf.TensorSpec(shape=(None, 13), dtype=tf.float32),
                tf.TensorSpec(shape=(None,), dtype=tf.int32)
            ))
                .cache()
                .repeat()
                .map(augment_mfcc, num_parallel_calls=tf.data.AUTOTUNE)
                .padded_batch(batch_size=batch_size,
                              padded_shapes=([None, 13], [None]),
                              padding_values=(0.0, char_to_num['<PAD>']))
               .shuffle(buffer_size=train_data_size//batch_size, reshuffle_each_iteration=True)
               .prefetch(tf.data.AUTOTUNE)
               )
        
    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None, 13), dtype=tf.float32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        ))


def generate_padded_data(h5_file_path, batch_size=BATCH_SIZE):
    
    dataset = dataset_from_generator(h5_file_path, train=False)
    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes=([None, 13], [None]),
        padding_values=(0.0, char_to_num['<PAD>'])
    )
    return dataset.cache().repeat().prefetch(tf.data.AUTOTUNE)

In [18]:
train_dataset = dataset_from_generator("data/CommonVoice/train.h5", train=True)

test_dataset = generate_padded_data("data/CommonVoice/test.h5")
test_data_size = check_dataset_size("test.h5")

val_data_size = check_dataset_size("val.h5")
val_dataset = generate_padded_data("data/CommonVoice/val.h5")

print(f"Train: {int(train_data_size)}, Val: {int(val_data_size)}, Test: {test_data_size}")

Train: 90322, Val: 10036, Test: 25090


In [23]:
models = {
    f"BLSTM128x3_USA_b{BATCH_SIZE}_augm": [
        layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="blstm_1"),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="blstm_2"),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True), name="blstm_3")
    ],
    f"BLSTM256x3_USA_b{BATCH_SIZE}_dropout_augm": [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.Dropout(0.2, name="dropout_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.Dropout(0.2, name="dropout_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
        layers.Dropout(0.2, name="final_dropout")
    ],
    f"BLSTM256x3_USA_b{BATCH_SIZE}_spdropout_augm": [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
        layers.Dropout(0.2, name="final_dropout")
    ]
}

all_results = []

experiment_dir = f"{log_dir}usa/augm/{name}"

callbacks = [
ModelCheckpoint(f"checkpoints/{name}.ckpt.weights.h5",
               save_weights_only=True),
EarlyStopping(patience=3,
              restore_best_weights=True),
TensorBoard(log_dir=experiment_dir,
            histogram_freq=1,
            write_steps_per_second=True)
]

for name, layer_list in models.items():  
    model = build_model(name, layer_list)
    print(f"Training model {name}....")
    # print(model.summary())

    history = train_model(model, train_dataset, val_dataset,
                          callbacks=callbacks,
                          train_data_size=train_data_size,
                          val_data_size=val_data_size,
                          epochs=30)

    results = {
        "Model": name,
        "Train loss": history.history["loss"][-1],
        "Val loss": history.history["val_loss"][-1],
        "Test loss": model.evaluate(test_dataset, verbose=0, steps=test_data_size//BATCH_SIZE)
    }
    all_results.append(results)

all_results = pd.DataFrame(all_results)
all_results.to_csv("results_augm.csv", index=False)

all_results.style.background_gradient()

Unnamed: 0,Model,Train loss,Val loss,Test loss
0,BLSTM128x3_USA_b128_augm,52.819092,49.725716,50.191483
1,BLSTM256x3_USA_b128_dropout_augm,40.085445,37.723663,38.406605
2,BLSTM256x3_USA_b128_spdropout_augm,41.654919,36.684639,37.386055


In [27]:
name = f"BLSTM256x3_USA_b{BATCH_SIZE}_spdropout_augm"

# model = build_model(name, layer_list, load_weights=name)

history = train_model(model, train_dataset, val_dataset,
                      callbacks=callbacks,
                      train_data_size=train_data_size,
                      val_data_size=val_data_size,
                      epochs=60,
                      initial_epoch=30)

results = {
    "Model": name,
    "Params": model.count_params(),
    "Train loss": history.history["loss"][-1],
    "Val loss": history.history["val_loss"][-1],
    "Test loss": model.evaluate(test_dataset, verbose=0, steps=test_data_size//BATCH_SIZE)
}
pd.DataFrame(results, index=[0])

Unnamed: 0,Model,Params,Train loss,Val loss,Test loss
0,BLSTM256x3_USA_b128_spdropout_augm,3717148,32.96,33.08,33.78


In [33]:
model.save("usa_model.keras")

# Different accents, balanced

In [40]:
train_dataset = dataset_from_generator("data/CommonVoice/balanced_train.h5", train=True)
train_data_size = check_dataset_size("balanced_train.h5")

test_dataset = generate_padded_data("data/CommonVoice/balanced_test.h5")
test_data_size = check_dataset_size("balanced_test.h5")

val_data_size = check_dataset_size("balanced_val.h5")
val_dataset = generate_padded_data("data/CommonVoice/balanced_val.h5")

print(f"Train: {int(train_data_size)}, Val: {int(val_data_size)}, Test: {test_data_size}")

Train: 79236, Val: 8805, Test: 22011


In [51]:
name = f"BLSTM256x3_balanced_b{BATCH_SIZE}_spdropout_augm"
layer_list = [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
        layers.Dropout(0.2, name="final_dropout")
]

model = build_model(name, layer_list)

model.compile(
    optimizer=LossScaleOptimizer(tf.keras.optimizers.Adam(learning_rate=1e-03)),
    loss="ctc")

model.summary()

experiment_dir = f"{log_dir}{name}"
callbacks = [
ModelCheckpoint("checkpoints/{model_name}_epoch{{epoch:02d}}.ckpt.weights.h5".format(model_name=name),
               save_weights_only=True, save_freq="epoch"),
EarlyStopping(patience=4,
              restore_best_weights=True),
TensorBoard(log_dir=experiment_dir,
            histogram_freq=1,
            write_steps_per_second=True),
ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=2, min_lr=1e-07)
]

history = train_model(model, train_dataset, val_dataset,
                      callbacks=callbacks,
                      train_data_size=train_data_size,
                      val_data_size=val_data_size,
                      epochs=50,
                      initial_epoch=0)

results = {
    "Model": name,
    "Params": model.count_params(),
    "Train loss": history.history["loss"][-1],
    "Val loss": history.history["val_loss"][-1],
    "Test loss": model.evaluate(test_dataset, verbose=0, steps=test_data_size//BATCH_SIZE)
}
pd.DataFrame(results, index=[0])

Unnamed: 0,Model,Params,Train loss,Val loss,Test loss
0,BLSTM256x3_balanced_b128_spdropout_augm,3717148,45.9,45.94,46.21


In [117]:
model.save("mixed_acc_balanced.keras")

# Different acccents, unbalanced

In [28]:
BATCH_SIZE = 128

train_data_size = check_dataset_size("unbalanced_train.h5")
train_dataset = dataset_from_generator("data/CommonVoice/unbalanced_train.h5", train=True, train_data_size=train_data_size)

test_dataset = generate_padded_data("data/CommonVoice/unbalanced_test.h5")
test_data_size = check_dataset_size("unbalanced_test.h5")

val_data_size = check_dataset_size("unbalanced_val.h5")
val_dataset = generate_padded_data("data/CommonVoice/unbalanced_val.h5")

print(f"Train: {int(train_data_size)}, Val: {int(val_data_size)}, Test: {test_data_size}")

Train: 76709, Val: 8524, Test: 21309


In [30]:
name = f"BLSTM256x3_unbalanced_b{BATCH_SIZE}_spdropout_augm"
layer_list = [
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_1"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_1"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_2"),
        layers.SpatialDropout1D(0.2, name="spatial_dropout_2"),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True), name="blstm_3"),
        layers.Dropout(0.2, name="final_dropout")
]

model = build_model(name, layer_list)

model.compile(
    optimizer=LossScaleOptimizer(tf.keras.optimizers.Adam(learning_rate=1e-03)),
    loss="ctc")

model.summary()

experiment_dir = f"{log_dir}{name}"
callbacks = [
ModelCheckpoint("checkpoints/{model_name}_epoch{{epoch:02d}}.ckpt.weights.h5".format(model_name=name),
               save_weights_only=True, save_freq="epoch"),
EarlyStopping(patience=4,
              restore_best_weights=True),
TensorBoard(log_dir=experiment_dir,
            histogram_freq=1,
            write_steps_per_second=True),
ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=2, min_lr=1e-07)
]

history = train_model(model, train_dataset, val_dataset,
                      callbacks=callbacks,
                      train_data_size=train_data_size,
                      val_data_size=val_data_size,
                      epochs=50,
                      initial_epoch=0)

print(f"Test loss: {round(model.evaluate(test_dataset, verbose=0, steps=test_data_size//BATCH_SIZE), 3)}")

Test loss: 47.844


In [119]:
model.save("mixed_acc_unabalanced.keras")