In [151]:
# %pip install tensorflow==2.15.0

In [128]:
from IPython.display import display, Audio, HTML, display
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import warnings
#import tensorflow_io as tfio
warnings.filterwarnings("ignore")

### Vocabulary

In [129]:
# Window length and hop size handed to tf.signal.stft in encodeSample.
frame_length = 256
frame_step = 160

# Characters in the vocabulary: lowercase letters, apostrophe, '?', '!' and space.
ch = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Character -> integer id lookup; the empty string is the out-of-vocabulary token.
stoi = keras.layers.StringLookup(vocabulary=ch, oov_token="")

# Inverse lookup (integer id -> character), built from stoi's own vocabulary.
itos = keras.layers.StringLookup(
    vocabulary=stoi.get_vocabulary(),
    oov_token="",
    invert=True,
)

### Loss function

In [132]:
# Function required by models
@keras.saving.register_keras_serializable()
def CTC_loss(y_true, y_pred):
    """Compute the CTC batch cost between labels and predictions.

    Every sample in the batch is given the same input length (the size of
    axis 1 of ``y_pred``) and the same label length (the size of axis 1 of
    ``y_true``).

    Args:
        y_true: Batch of label sequences; axis 0 is the batch axis.
        y_pred: Batch of model outputs; axis 1 is used as the input length.

    Returns:
        The tensor returned by ``tf.keras.backend.ctc_batch_cost``.
    """
    n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_width = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # One column of ones per sample, scaled to broadcast each scalar length.
    per_sample = tf.ones(shape=(n_samples, 1), dtype="int64")

    return tf.keras.backend.ctc_batch_cost(
        y_true,
        y_pred,
        time_steps * per_sample,
        label_width * per_sample,
    )

### Encode and Decode Functions

In [133]:
# Encode and decode functions
def encodeSample(wavFile):
    """Turn a WAV file into the normalized spectrogram batch fed to the model.

    The audio is decoded, down-mixed to a single channel, converted to a
    square-root magnitude STFT (using the notebook-level ``frame_length`` and
    ``frame_step``) and standardized per frame.

    Args:
        wavFile: Path of a WAV file readable by ``tf.audio.decode_wav``;
            may contain one or more channels.

    Returns:
        A float32 tensor of shape ``(1, frames, fft_bins)``: the spectrogram
        with a leading batch axis of size 1.
    """
    wav_bytes = tf.io.read_file(wavFile)
    # decode_wav yields float32 samples laid out as (samples, channels);
    # the sample rate is not needed here.
    wav, _ = tf.audio.decode_wav(wav_bytes)

    # Average across channels. For mono input this equals squeezing the
    # channel axis; for multi-channel input it is a real down-mix. The earlier
    # reshape to (-1, 1) interleaved the channels into one double-length
    # stream instead of mixing them.
    audio = tf.reduce_mean(wav, axis=-1)

    spectogram = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step)
    # Square-root compression of the magnitude spectrum.
    spectogram = tf.math.pow(tf.abs(spectogram), 0.5)

    # Standardize each frame over its frequency bins.
    means = tf.math.reduce_mean(spectogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectogram, 1, keepdims=True)
    # The epsilon guards the division, so it must sit inside the denominator;
    # otherwise zero-variance (e.g. silent) frames produce NaN/inf.
    spectogram = (spectogram - means) / (stddevs + 1e-9)

    # Add the batch axis expected by model.predict.
    return tf.expand_dims(spectogram, 0)

In [134]:
def decodeSample(pred):
    """Greedy CTC-decode a batch of model outputs into text.

    Args:
        pred: Batch of model outputs; axis 0 is the batch axis and axis 1 is
            used as the sequence length of every sample.

    Returns:
        A list with one decoded UTF-8 string per batch entry.
    """
    batch_size, time_steps = pred.shape[0], pred.shape[1]
    seq_lengths = np.ones(batch_size) * time_steps

    # ctc_decode returns (decoded_sequences, log_probs); take the first
    # decoded tensor.
    decoded = keras.backend.ctc_decode(pred, input_length=seq_lengths, greedy=True)[0][0]

    # Map ids back to characters and join them into one string per sample.
    return [
        tf.strings.reduce_join(itos(ids)).numpy().decode("utf-8")
        for ids in decoded
    ]

In [136]:
# Downloads model
!wget https://huggingface.co/Ayushkm10/SpeechToText/resolve/main/speech2text4.keras

--2024-04-25 13:11:55--  https://huggingface.co/Ayushkm10/SpeechToText/resolve/main/speech2text4.keras
Resolving huggingface.co (huggingface.co)... 54.230.18.84, 54.230.18.95, 54.230.18.110, ...
Connecting to huggingface.co (huggingface.co)|54.230.18.84|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/a5/8d/a58d2a3cebcdf6f7c935506be45b9dc1866c49af5590aea477ffc45bd830a527/e2b76b19490c1aea63e6a6ba13ef286bbdba5144698aa39ec678d61762ce468d?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27speech2text4.keras%3B+filename%3D%22speech2text4.keras%22%3B&Expires=1714309915&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNDMwOTkxNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2E1LzhkL2E1OGQyYTNjZWJjZGY2ZjdjOTM1NTA2YmU0NWI5ZGMxODY2YzQ5YWY1NTkwYWVhNDc3ZmZjNDViZDgzMGE1MjcvZTJiNzZiMTk0OTBjMWFlYTYzZTZhNmJhMTNlZjI4NmJiZGJhNTE0NDY5OGFhMzllYzY3OGQ2

In [137]:
# Download sample files
!wget https://huggingface.co/Ayushkm10/SpeechToText/resolve/main/0.wav
!wget https://huggingface.co/Ayushkm10/SpeechToText/resolve/main/Larynx-HiFi-GAN_speech_sample.wav

--2024-04-25 13:11:58--  https://huggingface.co/Ayushkm10/SpeechToText/resolve/main/0.wav
Resolving huggingface.co (huggingface.co)... 54.230.18.85, 54.230.18.95, 54.230.18.110, ...
Connecting to huggingface.co (huggingface.co)|54.230.18.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 425830 (416K) [audio/wave]
Saving to: ‘0.wav.1’


2024-04-25 13:11:58 (7.95 MB/s) - ‘0.wav.1’ saved [425830/425830]

--2024-04-25 13:11:58--  https://huggingface.co/Ayushkm10/SpeechToText/resolve/main/Larynx-HiFi-GAN_speech_sample.wav
Resolving huggingface.co (huggingface.co)... 54.230.18.85, 54.230.18.95, 54.230.18.110, ...
Connecting to huggingface.co (huggingface.co)|54.230.18.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255020 (249K) [audio/wave]
Saving to: ‘Larynx-HiFi-GAN_speech_sample.wav.1’


2024-04-25 13:11:58 (6.40 MB/s) - ‘Larynx-HiFi-GAN_speech_sample.wav.1’ saved [255020/255020]



In [140]:
# Sanity check: report the file type of the downloaded model file.
!file speech2text4.keras

speech2text4.keras: Zip archive data, at least v2.0 to extract, compression method=store


In [156]:
# Load the Keras model from the downloaded file into the notebook-level
# `model` used by speechToText.
# NOTE(review): CTC_loss is registered as Keras-serializable in this notebook,
# presumably because the saved model references it; run that cell first.
model = keras.models.load_model("speech2text4.keras")

In [157]:
import subprocess
def stereoToMono(input_file, output_file):
    """Convert an audio file to single-channel audio with FFmpeg.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the file to write; it is overwritten if it
            already exists (FFmpeg ``-y``).

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # Pass an argument list instead of a shell string, so paths containing
    # spaces or shell metacharacters reach FFmpeg verbatim and cannot be
    # interpreted as shell syntax.
    cmd = ["ffmpeg", "-y", "-i", str(input_file), "-ac", "1", str(output_file)]
    # check=True surfaces a failed conversion here instead of letting the
    # caller continue with a missing or stale output file.
    subprocess.run(cmd, check=True)

In [162]:
# Main function
def speechToText(wavFile):
    """Transcribe an audio file with the loaded model.

    The file is first converted to mono with ``stereoToMono``, encoded with
    ``encodeSample``, run through the notebook-level ``model`` and decoded
    with ``decodeSample``.

    Args:
        wavFile: Path of the audio file to transcribe.

    Returns:
        The decoded text for the file, as a string.
    """
    import os
    import tempfile

    # Write the mono conversion into a private scratch directory rather than
    # a fixed "output_mono.wav" in the working directory. A fixed name is
    # shared by every call, so a failed conversion would silently transcribe
    # the previous call's audio; here a failure leaves no file to read.
    with tempfile.TemporaryDirectory() as scratch_dir:
        mono_path = os.path.join(scratch_dir, "output_mono.wav")
        stereoToMono(wavFile, mono_path)
        # The file is read inside the block, before the directory is removed.
        spectogram = encodeSample(mono_path)

    pred = model.predict(spectogram)
    # encodeSample produces a batch of one, so return its single transcript.
    return decodeSample(pred)[0]

## Demo

In [168]:
# Render an audio player for the first sample, then show its transcription
# as the cell's output.
display(Audio("0.wav"))
speechToText("0.wav")



"printing in the only sense with which we are at present concerned differs from most if not from all the arts and craft's represented in the exibiion"

In [170]:
# Render an audio player for the second sample, then show its transcription
# as the cell's output.
display(Audio("Larynx-HiFi-GAN_speech_sample.wav"))
speechToText("Larynx-HiFi-GAN_speech_sample.wav")



'this is an example of sinfacize each that was created by a nural tho co'