In [2]:

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
pip install jiwer

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting jiwer
  Downloading jiwer-3.0.0-py3-none-any.whl (21 kB)
Installing collected packages: jiwer
Successfully installed jiwer-3.0.0
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import librosa
import librosa.display
import IPython.display as ipd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import jiwer
from jiwer import wer


In [5]:
#Loading data LJSpeech
wave_data = "/kaggle/input/ljspeech-sr16k-dataset/wavs/"
metadata_path = "/kaggle/input/ljspeech-sr16k-dataset/metadata.csv"

In [6]:
metadata_df = pd.read_csv(metadata_path,header=None)

metadata_df=metadata_df[[1,2]]
metadata_df.columns =['file_name','transcription']
metadata_df = metadata_df[metadata_df.file_name != 'id']

metadata_df = metadata_df.head(4000)
# metadata_df


In [7]:
df_training = metadata_df[:int(len(metadata_df)*0.80)]
df_validate = metadata_df[int(len(metadata_df)*0.80):]
print(f"size of train data is {len(df_training)}")

size of train data is 3200


In [8]:
char = [x for x in "abcdefghijklmnopqrstuvwxyz',.?! "]
char_to_num = keras.layers.StringLookup(vocabulary=char, oov_token="")
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary() , oov_token="", invert=True)


In [9]:
frame_length = 256
frame_step = 160
fft_length = 384

def encode_sample(wave_file, label):
    file = tf.io.read_file(wave_data + wave_file +'.wav')
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=1)
    audio = tf.cast(audio, tf.float32)
    
    spectogram = tf.signal.stft(
        audio, frame_length = frame_length, frame_step = frame_step, fft_length = fft_length
    )
    spectogram = tf.abs(spectogram)
    spectogram = tf.math.pow(spectogram,0.5)
    
    means = tf.math.reduce_mean(spectogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectogram,1, keepdims= True)
    spectogram = (spectogram - means)/ (stddevs + 1e-10)
    print(spectogram)
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    print(f"The array of spectogram: {spectogram}")
    return spectogram, label 


In [10]:
batch_size = 32
#Creating Dataset Object
train_dataset = tf.data.Dataset.from_tensor_slices(
(list(df_training["file_name"]), list(df_training["transcription"])))

train_dataset = (
train_dataset.map(encode_sample, num_parallel_calls = tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

validation_dataset = tf.data.Dataset.from_tensor_slices(
(list(df_validate["file_name"]), list(df_validate["transcription"])))

validation_dataset = (
validation_dataset.map(encode_sample, num_parallel_calls = tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

Tensor("truediv:0", shape=(None, 193), dtype=float32)
The array of spectogram: Tensor("truediv:0", shape=(None, 193), dtype=float32)
Tensor("truediv:0", shape=(None, 193), dtype=float32)
The array of spectogram: Tensor("truediv:0", shape=(None, 193), dtype=float32)


In [11]:
# #visualize 


# plt.figure(figsize=(15,17))
# plt.subplot(3,1,3)
# librosa.display.waveshow(spectogram,alpha=0.5)
# plt.ylim((-1,1))
# plt.show() 

In [12]:
#Model reducing losses
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
    
    input_length = input_length * tf.ones(shape=(batch_len, 1),dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1),dtype="int64")
    
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [13]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model's frame"""
    input_spectogram = layers.Input((None, input_dim), name="input")
    
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectogram)

    x = layers.Conv2D(
        filters=32,
        kernel_size=[11,41],
        strides=[2,2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11,21],
        strides=[1,2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    
    x = layers.Reshape((-1,x.shape[-2] * x.shape[-1]))(x)
    
    for i in range(1, rnn_layers +1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}",merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    
    x = layers.Dense(units=rnn_units * 2, name="dense1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    
    output = layers.Dense(units=output_dim +1, activation="softmax")(x)
    model = keras.Model(input_spectogram, output, name="DeepSpeech_2")
    
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    
    model.compile(optimizer=opt, loss=CTCLoss)
    return model

#Get the model
model = build_model(
    input_dim=fft_length // 2 +1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
model.summary(line_length=110)
            

Model: "DeepSpeech_2"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, None, 193)]                         0                
                                                                                                              
 expand_dim (Reshape)                            (None, None, 193, 1)                        0                
                                                                                                              
 conv_1 (Conv2D)                                 (None, None, 97, 32)                        14432            
                                                                                                              
 conv_1_bn (BatchNormalization)                  (None, None, 97, 32)                     

  **Training and Evaluating**

In [14]:
#A utility function to decode the output of the network
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text

# A callback class to ouput a few transcription during training
class CallbackEval(keras.callbacks.Callback):
   
    def __init__(self,dataset):
        super().__init__()
        self.dataset = dataset 
        
    def on_epoch_end(self, epoch: int, logs=None):
        predictions = []
        targets = []
        for batch in self.dataset:
            X, y = batch
            batch_predictions =  model.predict(X)
            batch_predictions = decode_batch_predictions(batch_predictions)
            predictions.extend(batch_predictions)
            for label in y:
                label = (
                    tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
                    
                )
                targets.append(label)
        wer_score = wer(targets, predictions)
        print("." *100)
        print(f"Word error rate: {wer_score:.4f}")
        print("."*100)
        for i in np.random.randint(0, len(predictions), 2):
            print(f"Target   : {targets[i]}")
            print(f"Prediction: {predictions[i]}")
            print("." * 100)
    

***Training Process***

In [15]:
epochs=100
validation_callback = CallbackEval(validation_dataset)
history = model.fit(
    train_dataset,
    validation_data = validation_dataset,
    epochs = epochs,
    callbacks=[validation_callback],
)

Epoch 1/100
....................................................................................................
Word error rate: 1.0000
....................................................................................................
Target   : returned a verdict of willful murder against some person unknown.
Prediction: 
....................................................................................................
Target   : having brought down the records of great frauds, forgeries, and thefts from about  to ,
Prediction: 
....................................................................................................
Epoch 2/100
....................................................................................................
Word error rate: 1.0000
....................................................................................................
Target   : the circumstances of this purchase of brilliants from a stranger at such an inadequate price was strongly com

In [16]:
predictions = []
targets = []
for batch in validation_dataset:
    X,y = batch
    batch_predictions = model.predict(X)
    batch_predictions = decode_batch_predictions(batch_predictions)
    predictions.extend(batch_predictions)
    
    for label in y:
        label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
        targets.append(label)
wer_score = wer(targets, predictions)
print("." * 100)
print(f"Word error rate: {wer_score: 0.4f}")
for i in np.random.randint(0, len(predictions), 5):
    print(f"Target   : {targets[i]}")
    print(f"Prediction: {predictions[i]}")
    print(predictions[i])
    print("." * 100)



....................................................................................................
Word error rate:  0.4979
Target   : but could not positively identify it, and ikey defied them to remove a single shoe.
Prediction: but could not positively ad andifiet and i pe defied them to remove as single sho.
but could not positively ad andifiet and i pe defied them to remove as single sho.
....................................................................................................
Target   : elucidated all dark and uncertain points in connection with the crime.
Prediction: the losedated al darkand uncertain points inconection with the crime.
the losedated al darkand uncertain points inconection with the crime.
....................................................................................................
Target   : courvoisier, when put on his trial, pleaded not guilty
Prediction: chu visiay when put on his trial, pleadid not guilty.
chu visiay when put on his trial,

In [17]:

model.save("/kaggle/working/notebook22.h5")


In [2]:
from keras.models import load_model
model = load_model('/kaggle/working/notebook22.h5')

OSError: No file or directory found at /kaggle/working/notebook22.h5