Data Set Link https://www.kaggle.com/datasets/mathurinache/the-lj-speech-dataset

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers
from IPython import display
import matplotlib.pyplot as plt
from jiwer import wer 


In [2]:
meta_df = pd.read_csv("metadata.csv")
meta_df.shape

(13100, 5)

In [3]:
# split = int(len(meta_df) * 0.8)
train = meta_df.iloc[:200,:]
test = meta_df.iloc[200:250,:]


In [4]:
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)



In [5]:
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalarITensor. The number of samples to step.
frame_step = 160 
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384

In [6]:
def encode_single_simple_train(wav_file, label):
    print(wav_file)
    file = tf.io.read_file("train/"+wav_file + ".wav")
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label

In [7]:
def encode_single_simple_test(wav_file, label):
    print(wav_file)
    file = tf.io.read_file("test/"+wav_file + ".wav")
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label

In [8]:
meta_df.head()

Unnamed: 0.1,Unnamed: 0,id,sentence,file_name,audio_path
0,0,LJ001-0001,"Printing, in the only sense with which we are ...",LJ001-0001.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
1,1,LJ001-0002,in being comparatively modern.,LJ001-0002.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
2,2,LJ001-0003,For although the Chinese took impressions from...,LJ001-0003.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
3,3,LJ001-0004,"produced the block books, which were the immed...",LJ001-0004.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
4,4,LJ001-0005,the invention of movable metal letters in the ...,LJ001-0005.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...


In [9]:
batch_size= 32
train_dataset = tf.data.Dataset.from_tensor_slices(
    (list(train["id"]),list(train["sentence"]))
)
train_dataset = (
    train_dataset.map(encode_single_simple_train,num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
   .prefetch(buffer_size=tf.data.AUTOTUNE)
)
validation_dataset  = tf.data.Dataset.from_tensor_slices(
    (list(test["id"]),list(test["sentence"]))
)
validation_dataset=(
    validation_dataset.map(encode_single_simple_test,num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

Tensor("args_0:0", shape=(), dtype=string)
Tensor("args_0:0", shape=(), dtype=string)


In [10]:
# fig = plt.figure(figsize=(8, 5))
# for batch in train_dataset.take(1):
#     spectrogram = batch[0][0].numpy()
#     spectrogram = np.array([np.trim_zeros(x) for x in np.transpose(spectrogram)])  
#     label = batch[1][0]
#     label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
#     ax = plt.subplot(2, 1, 1)
#     ax.imshow(spectrogram, vmax=1) 
#     ax.set_title(label)
#     ax.axis("off")
    
#     file = tf.io.read_file("wavs/" + list(train["file_name"])[0] )
#     audio, _ = tf.audio.decode_wav(file)
#     audio = audio.numpy()
#     ax = plt.subplot(2, 1, 2)
#     plt.plot(audio)
#     ax. set_title("Signal Wave")
#     ax. set_xlim(0, len(audio))
#     display.display(display.Audio (np.transpose(audio), rate=16000))
# plt. show()

In [11]:
def CTCLoss(y_true, y_pred):
# Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf. cast(tf.shape(y_true)[1], dtype="int64")
    
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    # print(y_true , "\n",y_pred )
    # print(input_length ,label_length )
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [12]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    input_spectrogram = layers.Input((None, input_dim), name="input")
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    
    # for i in range(1, rnn_layers + 1):
    recurrent = layers.GRU(
        units=rnn_units,
        activation="tanh",
        recurrent_activation="sigmoid",
        use_bias=True,
        return_sequences=True,
        reset_after=True,
        name=f"gru",
    )
    x = layers.Bidirectional(
        recurrent, name=f"bidirectional", merge_mode="concat"
    )(x)


    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)

    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")

    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)

    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model

# Assuming fft_length and char_to_num are defined earlier
# Get the model
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)

model.summary(line_length=110)

In [13]:
# def decode_batch_predictions(pred):
#     input_len = np.ones(pred.shape[0]) * pred.shape[1]
#     results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
#     output_text = []
#     for result in results:
#         result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
#         output_text.append(result)
#     return output_text

In [14]:
# class CallbackEval(keras.callbacks.Callback):
#     def __init__(self, dataset):
#         super().__init__()
#         self.dataset = dataset
        
#     def on_epoch_end(self, epoch: int, logs=None):
#         predictions = []
#         targets = []
#         for batch in self.dataset:
#             X, y = batch
#             batch_predictions = model.predict(X)
#             batch_predictions = decode_batch_predictions(batch_predictions)
#             predictions.extend(batch_predictions)
#             for label in y:
#                 label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
#                 targets.append(label)
#         wer_score = wer(targets, predictions)
#         print("-" * 100)
#         print(f"word error rate: {wer_score:.4f}")
#         print("-" * 100)
#         for i in np.random.randint(0, len(predictions), 2):
#             print(f"Target: {targets[i]}")
#             print(f"Predictions: {predictions[i]}")
#             print("-" * 100)

In [15]:
epochs = 10
histroy = model.fit(
    train_dataset,
    validation_data = validation_dataset,
    epochs = epochs,
)

Epoch 1/10

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 14s/step - loss: 853.2272 - val_loss: 1093.0642
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 13s/step - loss: 485.9417 - val_loss: 811.2612
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 14s/step - loss: 438.9968 - val_loss: 533.3189
Epoch 4/10
[1m1/7[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:35[0m 16s/step - loss: 382.6378