In [1]:
import os
from pydub import AudioSegment

def convert_to_wav(source_path, target_path):
    """Transcode an audio file to WAV.

    The source is decoded with ``AudioSegment.from_file`` and exported to
    ``target_path`` with ``format="wav"``. A confirmation line is printed
    once the export has finished.

    :param source_path: Path of the audio file to read.
    :param target_path: Path the WAV file is written to.
    """
    AudioSegment.from_file(source_path).export(target_path, format="wav")
    print(f"Converted {source_path} to {target_path}")


def check_and_convert_to_wav(file_path):
    """Return the path of a WAV version of ``file_path``.

    A path whose extension is already ``.wav`` (case-insensitive) is returned
    unchanged. Any other file is converted with ``convert_to_wav`` into a
    sibling file that has the same stem and a ``.wav`` extension, and that new
    path is returned.

    :param file_path: Path of the audio file to check.
    :return: Path of a WAV file (the input itself or the converted copy).
    """
    # Nothing to do for files that already carry the WAV extension.
    if file_path.lower().endswith('.wav'):
        print(f"{file_path} is already in WAV format")
        return file_path

    # Swap the extension for ".wav" and convert next to the original.
    stem, _ = os.path.splitext(file_path)
    new_file_path = stem + ".wav"
    print("new file path: ", new_file_path)
    convert_to_wav(file_path, new_file_path)
    return new_file_path




In [2]:
import io
import logging
import string
from timeit import default_timer as timer
import numpy as np
import tensorflow as tf
import webrtcvad
from tensorflow import keras

import wav_split

# Run on CPU only: register an empty list of visible GPU devices.
tf.config.set_visible_devices([], 'GPU')

# STFT parameters, all expressed in samples.
frame_length = 256  # window length
frame_step = 160  # hop between consecutive windows
fft_length = 384  # size of the FFT applied to each window

# Mel filter-bank parameters: frequency range in Hz and number of bins.
lower_edge_hertz = 80.0
upper_edge_hertz = 7600.0
num_mel_bins = 80

# Path to the trained model in H5 format. Written as a raw string so the
# Windows backslashes are taken literally: in a normal string literal
# "\L", "\S" and "\m" are invalid escape sequences and trigger warnings.
model_path = r"C:\LearnIT\Speech_to_text\mymodel (1).h5"


def CTCLoss(y_true, y_pred):
    """
    CTC loss for one batch.

    Every sample in the batch is given the same input length (size of axis 1
    of ``y_pred``) and the same label length (size of axis 1 of ``y_true``).
    The result is whatever ``keras.backend.ctc_batch_cost`` returns for those
    arguments.
    """
    batch_size = tf.cast(tf.shape(y_true)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Expand the scalar lengths to one entry per sample, shape (batch, 1).
    per_sample_ones = tf.ones(shape=(batch_size, 1), dtype="int64")
    input_length = time_steps * per_sample_ones
    label_length = label_steps * per_sample_ones

    return keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)


# Load the saved network for inference. With compile=False the model is not
# compiled after loading; CTCLoss is still registered as a custom object.
model = tf.keras.models.load_model(
    model_path,
    custom_objects={'CTCLoss': CTCLoss},
    compile=False,
)
# Print the layer-by-layer architecture.
model.summary()


def _waveform_to_log_mel(audio, sampling_rate):
    """Convert decoded audio into normalised log-mel features.

    Shared by ``encode_single_buffer`` and ``encode_single_file`` so both
    entry points apply exactly the same feature pipeline.

    :param audio: Sample tensor as returned by ``tf.audio.decode_wav``; the
        last axis is treated as the channel axis.
    :param sampling_rate: Sample rate as returned by ``tf.audio.decode_wav``.
    :return: Log-mel spectrogram whose last axis has ``num_mel_bins`` entries,
        normalised along that axis.
    """
    # 1. Collapse the channel axis. Any multi-channel input (not only stereo)
    #    is averaged down to a single channel before the axis is dropped.
    if audio.shape[-1] != 1:
        audio = tf.reduce_mean(audio, axis=-1, keepdims=True)
    audio = tf.squeeze(audio, axis=-1)
    # 2. Change type to float
    audio = tf.cast(audio, tf.float32)
    # 3. Short-time Fourier transform
    stfts = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    # 4. Power spectrogram: squared magnitude of the complex STFT
    spectrogram = tf.math.pow(tf.abs(stfts), 2)

    # 5. Project the linear-frequency bins onto the mel scale
    num_spectrogram_bins = stfts.shape[-1]
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins,
                                                                        sampling_rate, lower_edge_hertz,
                                                                        upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrogram, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))
    # 6. Stabilised log (the small offset avoids log(0))
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

    # 7. Normalise along axis 1: subtract the mean and divide by the standard
    #    deviation (the epsilon guards against division by zero)
    means = tf.math.reduce_mean(log_mel_spectrograms, 1, keepdims=True)
    stddevs = tf.math.reduce_std(log_mel_spectrograms, 1, keepdims=True)
    return (log_mel_spectrograms - means) / (stddevs + 1e-10)


def encode_single_buffer(audio_numpy_array):
    """Compute model features from WAV data held in memory.

    :param audio_numpy_array: WAV-encoded data; it is converted to a string
        tensor and decoded with ``tf.audio.decode_wav``.
    :return: Normalised log-mel spectrogram (see ``_waveform_to_log_mel``).
    """
    wav_contents = tf.convert_to_tensor(audio_numpy_array, dtype=tf.string)
    audio, sampling_rate = tf.audio.decode_wav(wav_contents)
    return _waveform_to_log_mel(audio, sampling_rate)


def encode_single_file(wav_file):
    """
    Compute model features from a WAV file on disk.

    :param wav_file: Path of the WAV file; it is read with ``tf.io.read_file``
        and decoded with ``tf.audio.decode_wav``.
    :return: Normalised log-mel spectrogram (see ``_waveform_to_log_mel``).
    """
    wav_contents = tf.io.read_file(wav_file)
    audio, sampling_rate = tf.audio.decode_wav(wav_contents)
    return _waveform_to_log_mel(audio, sampling_rate)


def pad_tensor(tensor, desired_size):
    """Pad the end of the last axis of a rank-2 tensor up to ``desired_size``.

    When the last axis is already at least ``desired_size`` long the padding
    amount is 0. The first axis is never padded.

    NOTE(review): the callers in this file pass features whose last axis is
    the mel-bin axis (``num_mel_bins`` = 80) with ``desired_size`` = 16, so the
    padding amount is always 0 there. Padding the time axis may have been
    intended; confirm against how the model was trained.
    """
    shortfall = desired_size - tf.shape(tensor)[-1]
    pad_after = max(shortfall, 0)
    return tf.pad(tensor, [[0, 0], [0, pad_after]])


def convert_to_tensor_from_frame(audio):
    """Build a batched float32 model input from in-memory WAV data.

    The data is encoded with ``encode_single_buffer``, passed through
    ``pad_tensor`` with a size of 16, and given a leading batch axis.
    """
    features = pad_tensor(encode_single_buffer(audio), 16)
    batched = np.expand_dims(features, axis=0)  # add the batch axis
    return tf.convert_to_tensor(batched, dtype=tf.float32)


def convert_to_tensor_from_file(audio_file):
    """Build a batched float32 model input from a WAV file on disk.

    The file is encoded with ``encode_single_file``, passed through
    ``pad_tensor`` with a size of 16, and given a leading batch axis.
    """
    features = pad_tensor(encode_single_file(audio_file), 16)
    batched = np.expand_dims(features, axis=0)  # add the batch axis
    return tf.convert_to_tensor(batched, dtype=tf.float32)


# Turns the raw network output into text, one string per batch element
def decode_batch_predictions(pred):
    """Greedy CTC decoding of a batch of predictions.

    Every batch element is decoded with the full length of axis 1 of ``pred``
    as its input length; the resulting label indices are mapped back to
    characters with ``num_to_char`` and joined into UTF-8 strings.

    :param pred: Network output; axis 0 is the batch, axis 1 the time steps.
    :return: List with one decoded string per batch element.
    """
    batch_size, time_steps = pred.shape[0], pred.shape[1]
    input_len = np.ones(batch_size) * time_steps
    # Greedy search; beam search would be the alternative for harder tasks.
    decoded = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    return [
        tf.strings.reduce_join(num_to_char(label_sequence)).numpy().decode("utf-8")
        for label_sequence in decoded
    ]


def stt_from_frames(audio):
    """Transcribe in-memory WAV data.

    :param audio: WAV data accepted by ``convert_to_tensor_from_frame``.
    :return: Two-element list ``[texts, seconds]`` where ``texts`` is the
        output of ``decode_batch_predictions`` and ``seconds`` is the time
        spent in prediction plus decoding.
    """
    input_tensor = convert_to_tensor_from_frame(audio)

    logging.debug('Running inference...')
    started = timer()

    predictions = model.predict(input_tensor)
    texts = decode_batch_predictions(predictions)

    # The measured span covers both the forward pass and the CTC decoding.
    elapsed = timer() - started
    logging.debug('Inference took %0.3fs.' % elapsed)

    return [texts, elapsed]


def stt_from_file(audio):
    """Transcribe a WAV file on disk.

    :param audio: Path accepted by ``convert_to_tensor_from_file``.
    :return: Two-element list ``[texts, seconds]`` where ``texts`` is the
        output of ``decode_batch_predictions`` and ``seconds`` is the time
        spent in prediction plus decoding.
    """
    input_tensor = convert_to_tensor_from_file(audio)

    logging.debug('Running inference...')
    started = timer()

    predictions = model.predict(input_tensor)
    texts = decode_batch_predictions(predictions)

    # The measured span covers both the forward pass and the CTC decoding.
    elapsed = timer() - started
    logging.debug('Inference took %0.3fs.' % elapsed)

    return [texts, elapsed]


# Character set of the model: ASCII lowercase letters, Vietnamese accented
# letters, ASCII punctuation and the space character.
lowercase_chars = string.ascii_lowercase
accented_chars = "àáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹý"
punctuation_chars = string.punctuation
final_chars = lowercase_chars + accented_chars + punctuation_chars + " "
characters = [x for x in final_chars]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# The vocabulary is kept under its own name. Binding it to `list` would
# replace the builtin for the whole module and break every later `list(...)`
# call with "TypeError: 'list' object is not callable".
vocabulary_tokens = char_to_num.get_vocabulary()
char2num = " ".join(vocabulary_tokens)
char_to_num_ = char2num.encode("utf-8")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True)

print(f"The vocabulary is: {char_to_num.get_vocabulary()} "
      f"(size ={char_to_num.vocabulary_size()})")


def vad_segment_generator(wavFile, aggressiveness):
    """Split a WAV file into voiced segments using WebRTC VAD.

    :param wavFile: Path of the WAV file, read with ``wav_split.read_wave``.
    :param aggressiveness: VAD aggressiveness; converted with ``int`` and
        passed to ``webrtcvad.Vad``.
    :return: Tuple ``(segments, sample_rate, audio_length)`` where
        ``segments`` is the result of ``wav_split.vad_collector`` and the
        other two values come from ``wav_split.read_wave``.
    :raises AssertionError: If the sample rate reported for the file is not
        16000 Hz.
    """
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wav_split.read_wave(wavFile)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    # 30 ms frames. They are materialised by unpacking rather than with
    # `list(...)`, so this call cannot be broken by a module-level rebinding
    # of the name `list`.
    frames = [*wav_split.frame_generator(30, audio, sample_rate)]
    # 30 ms frames with a 300 ms padding window.
    segments = wav_split.vad_collector(sample_rate, 30, 300, vad, frames)

    return segments, sample_rate, audio_length





Model: "ASR"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, None, 80)]        0         
                                                                 
 expand_dim (Reshape)        (None, None, 80, 1)       0         
                                                                 
 conv_1 (Conv2D)             (None, None, 40, 32)      14432     
                                                                 
 conv_1_bn (BatchNormalizat  (None, None, 40, 32)      128       
 ion)                                                            
                                                                 
 conv_1_relu (ReLU)          (None, None, 40, 32)      0         
                                                                 
 conv_2 (Conv2D)             (None, None, 20, 32)      236544    
                                                            

In [3]:
import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_nonsilent


def split_audio_on_silence(audio_path, target_sample_rate=16000, silence_thresh=-50, min_silence_len=1000,
                           keep_silence=500):
    """
    Load an audio file, change its frame rate and cut it at silent stretches.

    :param audio_path: Path to the audio file.
    :param target_sample_rate: Frame rate (Hz) set on the audio before splitting.
    :param silence_thresh: Silence threshold in dB, forwarded to ``split_on_silence``
        (presumably audio quieter than this counts as silence; see the pydub docs).
    :param min_silence_len: Minimum length of silence in milliseconds to consider as a split.
    :param keep_silence: Amount of silence to leave at the beginning and end of each segment.
    :return: List of audio segments as returned by ``split_on_silence``.
    """
    resampled = AudioSegment.from_file(audio_path).set_frame_rate(target_sample_rate)

    return split_on_silence(
        resampled,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )


def segment_audio_on_silence(file_path, min_silence_len=1000, silence_thresh=-40, max_segment_duration=3000):
    """
    Segment an audio file based on silence and maximum segment duration.

    Non-silent ranges are located with ``detect_nonsilent``; every range that
    is longer than ``max_segment_duration`` is cut into consecutive pieces of
    at most that length.

    Parameters:
    file_path (str): Path to the audio file.
    min_silence_len (int): Minimum length of silence to be used for splitting (in ms).
    silence_thresh (int): Silence threshold (in dB).
    max_segment_duration (int): Maximum duration of a segment (in ms). Must be positive.

    Returns:
    List of np.ndarray: Each element holds the samples of one segment, taken
    from ``get_array_of_samples()``. The audio is not resampled or remixed
    here, so the arrays are at the file's own frame rate and channel layout.

    Raises:
    ValueError: If ``max_segment_duration`` is not positive.
    """
    # A non-positive limit would make the splitting loop below cut off empty
    # pieces forever, so reject it before any work is done.
    if max_segment_duration <= 0:
        raise ValueError("max_segment_duration must be a positive number of milliseconds")

    # Load the audio file
    audio = AudioSegment.from_file(file_path)

    # Detect non-silent chunks as (start_ms, end_ms) pairs
    nonsilent_chunks = detect_nonsilent(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    final_segments = []

    for start_i, end_i in nonsilent_chunks:
        segment = audio[start_i:end_i]
        # Peel off full-length pieces until the remainder fits the limit
        while len(segment) > max_segment_duration:
            split_segment, segment = segment[:max_segment_duration], segment[max_segment_duration:]
            final_segments.append(np.array(split_segment.get_array_of_samples()))
        final_segments.append(np.array(segment.get_array_of_samples()))

    return final_segments


In [4]:
import collections
import contextlib
import wave
import numpy as np
from scipy.signal import resample_poly

# def read_wave(path):
#     """Reads a .wav file.
#
#     Takes the path, and returns (PCM audio data, sample rate).
#     """
#     with contextlib.closing(wave.open(path, 'rb')) as wf:
#         num_channels = wf.getnchannels()
#         assert num_channels == 1
#         sample_width = wf.getsampwidth()
#         assert sample_width == 2
#         sample_rate = wf.getframerate()
#         assert sample_rate in (8000, 16000, 32000)
#         frames = wf.getnframes()
#         pcm_data = wf.readframes(frames)
#         duration = frames / sample_rate
#         return pcm_data, sample_rate, duration

def read_wave(path, target_sample_rate=16000):
    """Reads a 16-bit .wav file as mono PCM at the target sample rate.

    Multi-channel audio is averaged down to one channel. Audio whose sample
    rate differs from ``target_sample_rate`` is resampled, so the returned
    rate and duration always describe the returned data.

    Takes the path and target sample rate, and returns
    (PCM audio data as bytes, target sample rate, duration in seconds).

    Raises AssertionError if the file is not 16-bit.
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels, sample_width, sample_rate, frames, _, _ = wf.getparams()
        assert sample_width == 2, "Only supports 16-bit audio."
        pcm_data = wf.readframes(frames)

    # Interpret the raw bytes as 16-bit samples (interleaved by channel).
    pcm_array = np.frombuffer(pcm_data, dtype=np.int16)

    # Down-mix any multi-channel layout to mono by averaging the channels.
    if num_channels > 1:
        pcm_array = pcm_array.reshape(-1, num_channels).mean(axis=1).astype(np.int16)

    # Resample whenever the rates differ. Skipping this for some rates while
    # still reporting target_sample_rate would mislabel the data and give a
    # wrong duration. Empty input is left untouched.
    if sample_rate != target_sample_rate and pcm_array.size:
        resampled = resample_poly(pcm_array.astype(np.float64), target_sample_rate, sample_rate)
        # Filter overshoot can leave the int16 range; clip instead of wrapping.
        int16_range = np.iinfo(np.int16)
        pcm_array = np.clip(np.round(resampled), int16_range.min, int16_range.max).astype(np.int16)

    duration = len(pcm_array) / target_sample_rate
    return pcm_array.tobytes(), target_sample_rate, duration


def write_wave(path, audio, sample_rate):
    """Writes PCM data to a mono, 16-bit .wav file.

    Takes the destination path, the PCM audio data (bytes), and the sample
    rate to record in the header.
    """
    with wave.open(path, 'wb') as out:
        # Header: one channel, two bytes per sample, caller-supplied rate.
        out.setnchannels(1)
        out.setsampwidth(2)
        out.setframerate(sample_rate)
        out.writeframes(audio)


class Frame(object):
    """Represents a "frame" of audio data."""

    def __init__(self, bytes, timestamp, duration):
        # Raw PCM bytes of the frame.
        self.bytes = bytes
        # Start time of the frame, in seconds from the start of the audio.
        self.timestamp = timestamp
        # Length of the frame in seconds.
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.

    Takes the desired frame duration in milliseconds, the PCM data
    (16-bit samples, i.e. two bytes per sample), and the sample rate.

    Yields Frames of the requested duration. Only complete frames are
    produced; trailing bytes that do not fill a whole frame are dropped.

    Raises ValueError (on first iteration) if the frame size works out to
    zero bytes or less.
    """
    # Frame size in bytes: samples per frame times two bytes per sample.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        # A zero-sized frame would never advance the offset below.
        raise ValueError("frame_duration_ms and sample_rate must give a positive frame size")
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    # `<=` so that a final frame ending exactly at the end of the audio is
    # still yielded instead of being silently discarded.
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Groups voiced audio frames into segments.

    A sliding window holding the most recent
    ``int(padding_duration_ms / frame_duration_ms)`` frames drives a
    two-state machine:

    * idle -> collecting: more than 90% of the window size is voiced. The
      frames currently in the window become the start of the segment.
    * collecting -> idle: more than 90% of the window size is unvoiced. The
      collected frames are joined and yielded as one segment.

    Because the window contents are included at the start and collection
    continues until the window has filled with unvoiced frames, each segment
    carries a little context around the speech.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The length of the sliding window, in milliseconds.
    vad - Object with an ``is_speech(bytes, sample_rate)`` method, e.g. an
        instance of webrtcvad.Vad.
    frames - Iterable of objects exposing a ``bytes`` attribute.

    Returns: A generator that yields PCM audio data (bytes), one item per
    segment. Frames still being collected when the input ends are yielded as
    a final segment.
    """
    window = collections.deque(maxlen=int(padding_duration_ms / frame_duration_ms))
    # Both transitions use the same threshold, relative to the window capacity.
    switch_threshold = 0.9 * window.maxlen
    collecting = False
    segment_frames = []

    for frame in frames:
        voiced = vad.is_speech(frame.bytes, sample_rate)
        # Every frame enters the window, whichever state we are in.
        window.append((frame, voiced))

        if collecting:
            segment_frames.append(frame)
            unvoiced_count = sum(1 for _, flag in window if not flag)
            if unvoiced_count > switch_threshold:
                # Speech has ended: emit the segment and start over.
                collecting = False
                yield b''.join(item.bytes for item in segment_frames)
                window.clear()
                segment_frames = []
        else:
            voiced_count = sum(1 for _, flag in window if flag)
            if voiced_count > switch_threshold:
                # Speech has started: seed the segment with the window contents.
                collecting = True
                segment_frames.extend(item for item, _ in window)
                window.clear()

    # Flush whatever is left when the input runs out.
    if segment_frames:
        yield b''.join(item.bytes for item in segment_frames)


In [9]:
# Import the argparse library
import argparse
import logging
import os
import textwrap
import wave

import audio_converter
import model
import split

# Header parameters of the temporary WAV file written for each audio chunk.
sample_rate = 16000  # frames per second
num_channels = 1  # mono
sample_width = 2  # Sample width in bytes
# Buffer size; not referenced by the code visible in this cell.
buffer_size = 10 * 2048


def predict_small_file(file_name):
    """Transcribe one short audio file in a single model call.

    The file is read from ``./audio_sample/<file_name>`` (and converted to WAV
    first if needed). The transcription, wrapped at 70 columns and followed by
    the inference time, is written to ``output/<stem>.txt`` where ``<stem>``
    is ``file_name`` without its extension.

    :param file_name: Name of the audio file inside ``./audio_sample/``.
    """
    path = './audio_sample/' + file_name

    print(file_name)
    # splitext removes exactly the extension. str.rstrip(".wav") would strip
    # any trailing run of the characters ".", "w", "a", "v" (e.g. "java.wav"
    # would become "j") and would leave other extensions in place.
    output_file_path = 'output/' + os.path.splitext(file_name)[0] + ".txt"
    # Create the output folder up front so the final write cannot fail on a
    # missing directory after the inference has already been paid for.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    convertedFile = audio_converter.check_and_convert_to_wav(path)
    output, inference_time = model.stt_from_file(convertedFile)

    print("output:" + output[0] + "\ninference_time: " + str(inference_time))

    combined_transcription = textwrap.fill(output[0], width=70)

    combined_transcription += '\nCompleted in ' + str(inference_time)
    with open(output_file_path, 'w', encoding="utf-8") as file:
        file.write(combined_transcription)

    print("Prediction Ended!")


def predict_big_file(path):
    """Transcribe a long audio file chunk by chunk.

    The file is read from ``./audio_sample/<path>`` (and converted to WAV
    first if needed), cut into chunks with ``split.segment_audio_on_silence``,
    and every chunk is transcribed separately. The chunk texts are joined with
    spaces, wrapped at 70 columns, followed by the total inference time and
    the chunk count, and written to ``output/<stem>.txt`` where ``<stem>`` is
    ``path`` without its extension.

    :param path: Name of the audio file inside ``./audio_sample/``.
    """
    input_path = './audio_sample/' + path

    # Derive the output name from the bare file name (not from the prefixed
    # input path) and with splitext, which removes exactly the extension;
    # str.rstrip(".wav") strips a character set instead of a suffix.
    output_file_path = 'output/' + os.path.splitext(path)[0] + ".txt"
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    convertedFile = audio_converter.check_and_convert_to_wav(input_path)

    # segments = split.split_audio_on_silence(convertedFile)
    segments = split.segment_audio_on_silence(convertedFile)

    # One scratch file, used for both writing and reading each chunk.
    temp_wav_path = 'temp.wav'
    transcriptions = []
    predict_times = 0
    chunks = 0
    for i, segment in enumerate(segments):
        print("Processing chunk ", i)
        chunks += 1
        logging.debug("Processing chunk %002d" % (i,))

        # Convert the NumPy array to bytes
        audio_bytes = segment.tobytes()

        # NOTE(review): the header uses the module-level sample_rate,
        # num_channels and sample_width. This assumes the chunks are already
        # 16 kHz, mono, 16-bit - confirm against split.segment_audio_on_silence.
        with wave.open(temp_wav_path, 'wb') as audio_file:
            audio_file.setnchannels(num_channels)
            audio_file.setsampwidth(sample_width)
            audio_file.setframerate(sample_rate)

            # Write audio data
            audio_file.writeframes(audio_bytes)

        # Perform speech-to-text on the chunk that was just written
        output, inference_time = model.stt_from_file(temp_wav_path)

        print("output:" + output[0] + "\ninference_time: " + str(inference_time))
        # append, not extend: extend would add the text one character at a time
        transcriptions.append(output[0])
        predict_times += inference_time

    # Separate chunks with a space so words at chunk borders do not merge.
    combined_transcription = ' '.join(transcriptions)
    combined_transcription = textwrap.fill(combined_transcription, width=70)

    combined_transcription += '\nCompleted in ' + str(predict_times) + "\nChunk: " + str(chunks)
    with open(output_file_path, 'w', encoding="utf-8") as file:
        file.write(combined_transcription)

    print("Prediction Ended!")

def main_predict_demo(file_path, smalll=False):
    """Run the demo on one file.

    :param file_path: Name of the audio file, passed on unchanged.
    :param smalll: When truthy the file is handled by ``predict_small_file``,
        otherwise by ``predict_big_file``.
    """
    handler = predict_small_file if smalll else predict_big_file
    handler(file_path)


def main():
    """Transcribe the bundled sample recording with the small-file path."""
    sample_file = "VIVOSDEV14_116.wav"
    main_predict_demo(sample_file, smalll=True)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

VIVOSDEV14_116.wav
./audio_sample/VIVOSDEV14_116.wav is already in WAV format
output:mụt gấp cầu cá trình tàu sài gòn ảnh may qình
inference_time: 0.5721323000002485
Prediction Ended!
