In [7]:
import pandas as pd 
import os 

In [8]:
meta_df = pd.read_csv("metadata.csv")
meta_df.shape

(13100, 5)

In [9]:
train_meta_df = meta_df.iloc[:200,:]

In [10]:
train_meta_df.isnull().sum()

Unnamed: 0    0
id            0
sentence      0
file_name     0
audio_path    0
dtype: int64

In [11]:
train_meta_df.count()

Unnamed: 0    200
id            200
sentence      200
file_name     200
audio_path    200
dtype: int64

In [12]:
compare = train_meta_df["file_name"] == os.listdir("train/")

In [None]:
len(set(compare)) == 1

True

In [13]:
test_meta_df = meta_df.iloc[200:250,:]

In [14]:
train_meta_df.head()

Unnamed: 0.1,Unnamed: 0,id,sentence,file_name,audio_path
0,0,LJ001-0001,"Printing, in the only sense with which we are ...",LJ001-0001.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
1,1,LJ001-0002,in being comparatively modern.,LJ001-0002.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
2,2,LJ001-0003,For although the Chinese took impressions from...,LJ001-0003.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
3,3,LJ001-0004,"produced the block books, which were the immed...",LJ001-0004.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...
4,4,LJ001-0005,the invention of movable metal letters in the ...,LJ001-0005.wav,/kaggle/input/ljspeech-dataset/LJSpeech-1.1/wa...


In [15]:
test_meta_df.isnull().sum()

Unnamed: 0    0
id            0
sentence      0
file_name     0
audio_path    0
dtype: int64

In [16]:
test_meta_df.count()

Unnamed: 0    50
id            50
sentence      50
file_name     50
audio_path    50
dtype: int64

In [18]:
compare = test_meta_df["file_name"] == os.listdir("test/")

In [19]:
len(set(compare)) == 1

True

In [20]:
import tensorflow as tf 
from tensorflow import keras 

In [21]:
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)


In [22]:
frame_length = 256
frame_step = 160 
fft_length = 384

In [23]:
def encode_single_simple_train(wav_file, label):
    file = tf.io.read_file("train/"+wav_file + ".wav")
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    spectrogram = tf.math.pow(spectrogram, 0.5)
    spectrogram = tf.abs(spectrogram)
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label

In [24]:
def encode_single_simple_test(wav_file, label):
    file = tf.io.read_file("test/"+wav_file + ".wav")
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    spectrogram = tf.math.pow(spectrogram, 0.5)
    spectrogram = tf.abs(spectrogram)
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return spectrogram, label

In [25]:
batch_size= 32
train_dataset = tf.data.Dataset.from_tensor_slices(
    (list(train_meta_df["id"]),list(train_meta_df["sentence"]))
)
train_dataset = (
    train_dataset.map(encode_single_simple_train,num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
   .prefetch(buffer_size=tf.data.AUTOTUNE)
)


In [26]:
test_dataset  = tf.data.Dataset.from_tensor_slices(
    (list(test_meta_df["id"]),list(test_meta_df["sentence"]))
)
test_dataset=(
    test_dataset.map(encode_single_simple_test,num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [27]:
train_dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(None, None, 193), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [28]:
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf. cast(tf.shape(y_true)[1], dtype="int64")
    
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [29]:
from keras import Sequential , layers
from keras.layers import Reshape , Conv2D ,Input,BatchNormalization ,Bidirectional , GRU , Dense, LSTM, Dropout

In [30]:
# model = Sequential()
# input_spectrogram = layers.Input((None, fft_length // 2 + 1))
# model.add(input_spectrogram)
# model.add(Reshape((-1, fft_length // 2 + 1, 1)))
# model.add(Conv2D(filters=32 , kernel_size= (3,3) ,strides=(2,2) , padding='same'  , activation='relu'))
# model.add(BatchNormalization())
# model.add(Reshape(target_shape=((-1, model.output_shape[-2] * model.output_shape[-1]))))
# recurrent = layers.GRU(
#         units=512,
#         activation="tanh",
#         recurrent_activation="sigmoid",
#         use_bias=True,
#         return_sequences=True,
#         reset_after=True,
#         name=f"gru_",
#     )
# model.add(Bidirectional(recurrent , merge_mode='concat'))
# model.add(Dense(512 , activation='relu'))
# model.add(Dropout(0.4))
# model.add(Dense(32 , activation='softmax'))

# model.summary()

In [40]:
input_dim=fft_length // 2 + 1
output_dim=char_to_num.vocabulary_size()
rnn_units=512
print(type(input_dim))
print(input_dim)
input_spectrogram = Input(shape=(None, input_dim), name="input")
x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)

x = layers.Conv2D(
    filters=32,
    kernel_size=[11, 41],
    strides=[2, 2],
    padding="same",
    use_bias=False,
    name="conv_1",
)(x)
x = layers.BatchNormalization(name="conv_1_bn")(x)
x = layers.ReLU(name="conv_1_relu")(x)

x = layers.Conv2D(
    filters=32,
    kernel_size=[11, 21],
    strides=[1, 2],
    padding="same",
    use_bias=False,
    name="conv_2",
)(x)
x = layers.BatchNormalization(name="conv_2_bn")(x)
x = layers.ReLU(name="conv_2_relu")(x)

x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

# for i in range(1, rnn_layers + 1):
recurrent = layers.GRU(
    units=rnn_units,
    activation="tanh",
    recurrent_activation="sigmoid",
    use_bias=True,
    return_sequences=True,
    reset_after=True,
    name=f"gru",
)
x = layers.Bidirectional(
    recurrent, name=f"bidirectional", merge_mode="concat"
)(x)


# Dense layer
x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
x = layers.ReLU(name="dense_1_relu")(x)
x = layers.Dropout(rate=0.5)(x)
output = layers.Dense(units=output_dim + 1, activation="softmax")(x)

model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")

# Optimizer
opt = keras.optimizers.Adam(learning_rate=1e-4)

# Compile the model and return
model.compile(optimizer=opt, loss=CTCLoss)

# Assuming fft_length and char_to_num are defined earlier
# Get the model


model.summary(line_length=110)

<class 'int'>
193


In [41]:
model.compile(optimizer='adam' , loss=CTCLoss , metrics=['accuracy'] ) 

In [42]:
hist = model.fit(train_dataset , validation_data= test_dataset ,epochs=20)

Epoch 1/20



InvalidArgumentError: Graph execution error:

Detected at node Equal defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "d:\Installation\Python\Lib\asyncio\base_events.py", line 639, in run_forever

  File "d:\Installation\Python\Lib\asyncio\base_events.py", line 1985, in _run_once

  File "d:\Installation\Python\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 359, in execute_request

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 446, in do_execute

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\DELL\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\DELL\AppData\Local\Temp\ipykernel_19424\3208831943.py", line 1, in <module>

  File "d:\Installation\Python\Lib\site-packages\keras\src\utils\traceback_utils.py", line 118, in error_handler

  File "d:\Installation\Python\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 323, in fit

  File "d:\Installation\Python\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 117, in one_step_on_iterator

  File "d:\Installation\Python\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 105, in one_step_on_data

  File "d:\Installation\Python\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 76, in train_step

  File "d:\Installation\Python\Lib\site-packages\keras\src\trainers\trainer.py", line 375, in compute_metrics

  File "d:\Installation\Python\Lib\site-packages\keras\src\trainers\compile_utils.py", line 331, in update_state

  File "d:\Installation\Python\Lib\site-packages\keras\src\trainers\compile_utils.py", line 18, in update_state

  File "d:\Installation\Python\Lib\site-packages\keras\src\metrics\reduction_metrics.py", line 199, in update_state

  File "d:\Installation\Python\Lib\site-packages\keras\src\metrics\accuracy_metrics.py", line 252, in sparse_categorical_accuracy

  File "d:\Installation\Python\Lib\site-packages\keras\src\ops\numpy.py", line 2464, in equal

  File "d:\Installation\Python\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 770, in equal

Incompatible shapes: [32,168] vs. [32,497]
	 [[{{node Equal}}]] [Op:__inference_one_step_on_iterator_6998]