In [1]:
#pip install SpeechRecognition moviepy
#pip install pafy
#pip install --upgrade youtube_dl

import speech_recognition as sr
import moviepy.editor as mp

# Manipulación de Audio 
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pytube import YouTube

# Otras librerías
import os, shutil

In [2]:
class SoundToText:
    """
    Transcribe the speech of a video into text.

    ``video_file`` is either a YouTube URL or the path of a local video file.
    Every artefact (downloaded video, extracted WAV, audio chunks and the
    transcript) is stored under ``<current working directory>/Converted results``.
    """

    def __init__(self, video_file):
        """
        Store the video source and derive the working paths.

        Args:
            video_file: YouTube URL or path of a local video file.
        """
        self.video_file = video_file
        self.current_directory = os.getcwd()
        # Filled in by download_from_youtube(); stays empty otherwise.
        self.video_description = ""
        # os.path.join keeps these paths valid on every OS (the result is
        # identical to the previous hand-built "\\" paths on Windows).
        self.current_folder = os.path.join(self.current_directory, "Converted results")
        self.converted_audio = os.path.join(self.current_folder, "Converted_audio.wav")

    def _is_youtube_url(self):
        """Return True when ``video_file`` looks like a YouTube link."""
        source = str(self.video_file)
        # An existing local file wins, even if its name contains "youtube".
        if os.path.isfile(source):
            return False
        return "youtube" in source or "youtu.be" in source

    def _output_paths(self):
        """
        Return ``(chunks_dir, transcript_path)`` derived from ``converted_audio``.

        Only the file extension is stripped; "wav" appearing elsewhere in the
        path is left untouched.
        """
        base, _extension = os.path.splitext(self.converted_audio)
        return base, base + ".txt"

    def _save_transcript(self, transcript_path, text):
        """Write ``text`` plus a trailing newline to ``transcript_path`` as UTF-8."""
        # Explicit UTF-8: video descriptions frequently contain characters
        # (e.g. emoji) that the platform default encoding cannot represent.
        with open(transcript_path, "w", encoding="utf-8") as fh:
            fh.write(text + "\n")

    def _extract_audio(self, video_path):
        """
        Write the audio track of ``video_path`` to ``converted_audio``.

        Returns:
            True when the audio was written, False when the video has no
            audio track.
        """
        os.makedirs(self.current_folder, exist_ok=True)
        clip = mp.VideoFileClip(str(video_path))
        try:
            if clip.audio is None:
                print(f"No audio track found in {video_path}")
                return False
            clip.audio.write_audiofile(self.converted_audio)
        finally:
            # Release the reader; an open handle keeps the .mp4 locked on
            # Windows, which prevents remove_filed() from deleting it.
            clip.close()
        print("Finished the convertion into audio...")
        return True

    def download_from_youtube(self):
        """
        Download the video at ``video_file`` from YouTube into ``current_folder``.

        Also stores the video description in ``video_description``.
        """
        yt_file = YouTube(self.video_file)
        self.video_description = yt_file.description

        stream = yt_file.streams.get_highest_resolution()
        os.makedirs(self.current_folder, exist_ok=True)
        stream.download(self.current_folder)

    def convert_to_audio(self):
        """
        Extract the audio of every ``.mp4`` file found in ``current_folder``.

        The audio is written to ``converted_audio``; when several videos are
        present each one overwrites the previous result.
        """
        for name in os.listdir(self.current_folder):
            if name.endswith(".mp4"):
                self._extract_audio(os.path.join(self.current_folder, name))

    def read_audio_file(self):
        """Create a speech_recognition ``AudioFile`` for ``converted_audio``."""
        # The object is not kept; get_audio_transcription() re-opens the audio.
        sr.AudioFile(self.converted_audio)
        print("Audio file readed...")

    def match_target_amplitude(self, aChunk, target_dBFS):
        """
        Normalize a chunk to a target amplitude.

        Args:
            aChunk: audio segment exposing ``dBFS`` and ``apply_gain``.
            target_dBFS: loudness the chunk should end up with.

        Returns:
            The result of ``aChunk.apply_gain`` with the required gain.
        """
        change_in_dBFS = target_dBFS - aChunk.dBFS
        return aChunk.apply_gain(change_in_dBFS)

    def create_audio_chunks(self, audio):
        """
        Split ``audio`` on silence and return the resulting chunks.

        A split happens where the sound stays below -40 dBFS for at least
        500 milliseconds; tune both values for the target audio.
        """
        chunks = split_on_silence(
            audio,
            # experiment with this value for your target audio file
            min_silence_len=500,
            # adjust this per requirement
            silence_thresh=-40,
            # optional: amount of silence (ms) to keep around each chunk
            #keep_silence=100,
        )
        return chunks

    def get_audio_transcription(self):
        """
        Split the converted audio into chunks and run speech recognition on each.

        The chunks are exported to a folder named after the audio file, and the
        full text (recognised speech followed by the video description) is
        written to a ``.txt`` file next to it and printed.

        Returns:
            The full text that was written to the transcript file.
        """
        r = sr.Recognizer()

        chunks_dir, transcript_path = self._output_paths()

        # open the audio file using pydub
        audio = AudioSegment.from_wav(self.converted_audio)

        # Create chunks of audio
        chunks = self.create_audio_chunks(audio)

        # Create a directory to store the audio chunks
        os.makedirs(chunks_dir, exist_ok=True)
        whole_text = ""

        # Padding added to the beginning and end of every chunk.
        silence_chunk = AudioSegment.silent(duration=500)

        # process each chunk
        for i, audio_chunk in enumerate(chunks):
            padded_chunk = silence_chunk + audio_chunk + silence_chunk

            # Normalize the entire chunk and export it to the chunks folder.
            chunk_filename = os.path.join(chunks_dir, f"chunk{i}.wav")
            normalized_chunk = self.match_target_amplitude(padded_chunk, -20.0)
            normalized_chunk.export(chunk_filename, bitrate="192k", format="wav")

            # recognize the chunk
            with sr.AudioFile(chunk_filename) as source:
                # Reset the threshold for every chunk because
                # adjust_for_ambient_noise() changes it.
                r.energy_threshold = 300
                r.adjust_for_ambient_noise(source, duration=0.5)

                audio_file = r.record(source)

                # try converting it to text
                try:
                    text = r.recognize_google(audio_file, language='es-MX')
                except sr.UnknownValueError as e:
                    # Nothing intelligible in this chunk; skip it.
                    print("Error:", str(e))
                else:
                    whole_text += f"{text} "

        # The video description (possibly empty) is appended to the text.
        video_description = "\nDescripción del Video: \n" + str(self.video_description)
        whole_text += video_description

        # TODO: optionally reject videos whose text lacks a keyword such as
        # "ingredientes" (i.e. the video is not a recipe).

        self._save_transcript(transcript_path, whole_text)
        print("\nFull text:", whole_text)
        return whole_text

    def convert_video_to_audio(self):
        """
        Run the whole pipeline for ``video_file``.

        YouTube URLs are downloaded and converted; an existing local video file
        is converted directly. Any other value falls back to transcribing an
        already existing ``converted_audio`` file.
        """
        source = str(self.video_file)
        if self._is_youtube_url():  # TODO: validate the URL in the frontend
            self.download_from_youtube()
            self.convert_to_audio()
        elif os.path.isfile(source) and not source.lower().endswith(".wav"):
            # Local video: extract its audio instead of reusing a stale WAV.
            self._extract_audio(source)
        self.read_audio_file()
        self.get_audio_transcription()
        #self.remove_filed() # TODO: re-enable once deleting the .mp4 is verified

    def remove_filed(self):
        """Delete every file, link and sub-folder inside ``current_folder``."""
        if not os.path.isdir(self.current_folder):
            # Nothing was generated yet, so there is nothing to clean up.
            return
        for filename in os.listdir(self.current_folder):
            file_path = os.path.join(self.current_folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

In [3]:
# YouTube video to transcribe. Alternative test videos:
#   https://www.youtube.com/watch?v=mY4R1uR4gYI
#   https://www.youtube.com/watch?v=qi3V_ArpDp4
videourl = "https://www.youtube.com/watch?v=TzbvfPaGHl4"
to_text = SoundToText(videourl)

In [4]:
# Run the pipeline: for a YouTube URL this downloads the video, extracts its
# audio and transcribes it; the transcript is printed and saved to disk.
to_text.convert_video_to_audio()

MoviePy - Writing audio in C:\Users\VanessaDuarte\Documents\Maestría\Seminario de Innovación\Backend\Converted results\Converted_audio.wav


                                                                                                                       

MoviePy - Done.
Finished the convertion into audio...
Audio file readed...
result2:
[]
Error: 
result2:
{   'alternative': [   {   'confidence': 0.95628816,
                           'transcript': 'hola qué tal amiguitos bienvenidos '
                                         'al canal'},
                       {   'transcript': 'hola que tal amiguitos bienvenidos '
                                         'al canal'}],
    'final': True}
result2:
{   'alternative': [   {   'confidence': 0.89967024,
                           'transcript': 'yo soy Patty Martínez'},
                       {'transcript': 'yo soy Paty Martínez'}],
    'final': True}
result2:
{   'alternative': [   {   'confidence': 0.95804888,
                           'transcript': 'y el día de hoy vamos a hacer unas '
                                         'deliciosas rajas con crema y elote'}],
    'final': True}
result2:
{   'alternative': [   {   'confidence': 0.958049,
                           'transcript': 'es

result2:
[]
Error: 
result2:
[]
Error: 
result2:
{   'alternative': [   {   'confidence': 0.95758694,
                           'transcript': 'miren así es como debe de quedar '
                                         'todo tatemadito por todos lados pero '
                                         'no quemado así se les va a despegar '
                                         'la piel muy muy fácil ahora lo vamos '
                                         'a meter en una bolsa de plástico '
                                         'para que suden y así poder '
                                         'despegarles bien la piel cerramos '
                                         'bien bien'},
                       {   'transcript': 'miren así es como debe de quedar '
                                         'todo tatemadito por todos lados pero '
                                         'no quemado así se les va a despegar '
                                         'la piel muy muy fá

result2:
{   'alternative': [   {   'confidence': 0.95804894,
                           'transcript': 'este vamos a poner una taza y media '
                                         'de crema'}],
    'final': True}
result2:
{   'alternative': [   {   'confidence': 0.95804894,
                           'transcript': 'a mí me gusta que salga bien bien '
                                         'cremosita'},
                       {   'transcript': 'a mí me gusta que salga bien bien '
                                         'cremosito'}],
    'final': True}
result2:
[]
Error: 
result2:
[]
Error: 
result2:
{   'alternative': [{'confidence': 0.958049, 'transcript': 'revolvemos bien'}],
    'final': True}
result2:
[]
Error: 
result2:
{   'alternative': [   {   'confidence': 0.92824531,
                           'transcript': 'y aquí lo vamos a dejar que hierva '
                                         'unos 5 minutos'},
                       {   'transcript': 'y aquí lo vamos a dejar q