In [3]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import speech_recognition as sr

def convert_mp3_to_text_with_time(mp3_path):
    # Load MP3 file
    audio = AudioSegment.from_mp3(mp3_path)

    # Split audio on silence
    segments = split_on_silence(audio, silence_thresh=-40)

    # Initialize recognizer
    recognizer = sr.Recognizer()

    # Recognize text from each segment along with timestamps
    result = []
    current_time = 0
    for segment in segments:
        # Convert segment to raw audio data
        segment_raw = segment.raw_data
        
        # Recognize speech using Google Web Speech API
        try:
            text = recognizer.recognize_google(segment_raw, show_all=False)
            duration = len(segment) / 1000.0  # Convert duration to seconds
            result.append({'text': text, 'start_time': current_time, 'end_time': current_time + duration})
            current_time += duration
        except sr.UnknownValueError:
            print("Google Web Speech API could not understand audio")
        except sr.RequestError as e:
            print(f"Could not request results from Google Web Speech API; {e}")

    return result

# Replace "your_file.mp3" with the path to your actual MP3 file
mp3_path = "audios_teste.wav"
result = convert_mp3_to_text_with_time(mp3_path)

# Display results
for segment in result:
    print(f"Start Time: {segment['start_time']:.2f}s, End Time: {segment['end_time']:.2f}s, Text: {segment['text']}")


ValueError: ``audio_data`` must be audio data

In [4]:
# came along this code in below:
# https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
# modified it and made it more usable



import speech_recognition as sr
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
from mp3_wav import mp3_wav
import shutil

# create a speech recognition object
r = sr.Recognizer()


def extract_text(path):
    """
    splitting the large audio file into chunks
    and apply speech recognition on each of these chunks
    then delete the chunks and only return the text
    """

    file_name = mp3_wav(path)
    # open the audio file using pydub
    sound = AudioSegment.from_wav(file_name)

    # split audio sound where silence is 700 miliseconds or more and get chunks
    chunks = split_on_silence(sound,
                              # experiment with this value for your target audio file
                              min_silence_len=500,
                              # adjust this per requirement
                              silence_thresh=sound.dBFS - 14,
                              # keep the silence for 1 second, adjustable as well
                              keep_silence=500,
                              )

    # create a directory to store the audio chunks
    folder_name = "dummy"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)

    whole_text = []

    # process each chunk
    for i, audio_chunk in enumerate(chunks, start=1):
        # export audio chunk and save it in
        # the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            # try converting it to text
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError:
                continue
            else:
                text = f"{text.capitalize()}."
                whole_text += text

    # deleting the temp folder
    shutil.rmtree("dummy")

    # return the text for all chunks detected
    return whole_text


def txt_file(file_path):
    """
    this function returns a text file containing
    extracted text string from 'extract_text' function
    and saves on user's desktop
    """

    output = os.path.join(os.path.expanduser('~'), 'Desktop', "Extracted Text.txt")

    # creating a .txt file
    file = open(output, "w")
    # extracting text from the desired file
    text = extract_text(path=file_path)
    # writing to the file
    file.writelines(text)
    # closing the file
    file.close()
    print("\033[1;31m==>", "\033[1;39mCheck out your Desktop!", "\033[1;31m<==")


path = input("On My3.mb.mp3")
txt_file(path)

ModuleNotFoundError: No module named 'mp3_wav'

In [5]:
pip install SpeechRecognition pydub


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

