SPEECH TO TEXT (TRANSCRIPTION) FROM "https://huggingface.co/datasets/charris/hubert_process_filter_spotify"

In [None]:
import os
from datasets import load_dataset
import torchaudio
import torch
import whisper
import speech_recognition as sr
from pathlib import Path

# Hugging Face dataset identifier that the audio clips are streamed from.
dataset_name = "charris/hubert_process_filter_spotify"
# Local folders for the exported WAV files and for the text transcriptions.
output_dir = "C:/Users/joey_/Desktop/Spotify/downloaded_audio"
transcriptions_dir = "C:/Users/joey_/Desktop/Spotify/transcriptions"

# Create both folders (and any missing parents) up front; existing folders
# are left untouched.
for _directory in (output_dir, transcriptions_dir):
    Path(_directory).mkdir(parents=True, exist_ok=True)

def download_audio_files(max_files=10):
    """Stream the dataset and save its first ``max_files`` clips as WAV files.

    The dataset named by ``dataset_name`` is opened in streaming mode, the
    first sample is printed so its structure can be inspected, and each of
    the first ``max_files`` samples is written to
    ``<output_dir>/audio_<index>.wav``. A failure on one sample is reported
    and does not stop the remaining samples from being processed.

    Args:
        max_files: Maximum number of samples to pull from the stream.
            Defaults to 10, the limit that used to be hard-coded.

    Returns:
        ``output_dir`` once the loop has finished, or ``None`` when the
        stream yielded no sample at all.
    """
    from itertools import islice

    print(f"Cargando dataset desde {dataset_name}...")
    dataset = load_dataset(dataset_name, split="train", streaming=True)

    # Check that the dataset actually yields data before doing any work.
    first_sample = next(iter(dataset), None)
    if first_sample is None:
        print("Error: El dataset está vacío o no se pudo cargar.")
        return None

    print("Primer muestra del dataset:", first_sample)  # Inspect the structure

    # islice stops after max_files items without fetching an extra sample
    # from the stream just to discover that the limit was reached.
    for i, sample in enumerate(islice(dataset, max_files)):
        # Fallback label so the error message below never refers to an
        # unbound name (or to the previous sample's path) when the lookup of
        # the real path is itself what failed.
        audio_path = f"muestra #{i}"
        try:
            audio_data = sample["audio"]
            audio_path = audio_data["path"]
            audio_array = audio_data["array"]
            sampling_rate = audio_data["sampling_rate"]

            waveform = torch.from_numpy(audio_array)
            # Not every torchaudio backend accepts float64 input, so
            # double-precision arrays are downcast before saving.
            if waveform.dtype == torch.float64:
                waveform = waveform.to(torch.float32)
            # torchaudio.save expects a 2-D tensor; a mono 1-D array gets a
            # leading channel axis. An array that is already 2-D is assumed
            # to be (channels, samples) -- TODO confirm for this dataset.
            if waveform.ndim == 1:
                waveform = waveform.unsqueeze(0)

            local_path = os.path.join(output_dir, f"audio_{i}.wav")
            torchaudio.save(local_path, waveform, sampling_rate)
            print(f"Descargado: {local_path}")
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"Error al procesar el archivo {audio_path}: {e}")

    return output_dir

def transcribe_with_whisper(audio_path, model):
    """Transcribe one audio file with the given model and return the text.

    Args:
        audio_path: Path of the audio file, passed straight to
            ``model.transcribe``.
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``"text"`` entry (the caller passes a loaded Whisper
            model).

    Returns:
        The value stored under ``"text"`` in the transcription result.
    """
    print(f"Transcribiendo {audio_path} con Whisper...")
    return model.transcribe(audio_path)["text"]

def transcribe_with_speech_recognition(audio_path):
    """Transcribe one audio file through ``Recognizer.recognize_google``.

    The whole file is read into memory with ``Recognizer.record`` and then
    handed to ``recognize_google``.

    Args:
        audio_path: Path of the audio file opened with ``sr.AudioFile``.

    Returns:
        The recognized text, or a Spanish-language message describing the
        failure when the audio could not be understood
        (``sr.UnknownValueError``) or the request failed
        (``sr.RequestError``).
    """
    print(f"Transcribiendo {audio_path} con SpeechRecognition...")
    engine = sr.Recognizer()
    with sr.AudioFile(audio_path) as wav_source:
        recorded = engine.record(wav_source)

    # Recognition failures are reported as text rather than raised, so the
    # caller always receives a string it can write to disk.
    try:
        text = engine.recognize_google(recorded)
    except sr.UnknownValueError:
        text = "No se pudo entender el audio"
    except sr.RequestError as err:
        text = f"Error en la solicitud: {err}"
    return text

def process_dataset():
    """Download the audio clips and transcribe each one with both engines.

    Every ``*.wav`` file in the directory returned by
    ``download_audio_files`` is transcribed with the Whisper ``"base"``
    model and with SpeechRecognition. The results are written as UTF-8 text
    to ``<transcriptions_dir>/<stem>_whisper.txt`` and
    ``<transcriptions_dir>/<stem>_sr.txt``.

    Returns:
        None. When the download step reports that no data was available,
        the function returns early without loading the Whisper model.
    """
    audio_dir = download_audio_files()
    if audio_dir is None:
        # download_audio_files() returns None for an empty dataset; passing
        # that to Path() would raise TypeError, so stop cleanly instead.
        print("No hay audio para transcribir; se omite la transcripción.")
        return

    whisper_model = whisper.load_model("base")
    # Sorted so files are processed in a deterministic order across runs.
    for audio_file in sorted(Path(audio_dir).glob("*.wav")):
        transcription_whisper = transcribe_with_whisper(str(audio_file), whisper_model)
        transcription_file_whisper = Path(transcriptions_dir) / f"{audio_file.stem}_whisper.txt"
        transcription_file_whisper.write_text(transcription_whisper, encoding="utf-8")
        print(f"Transcripción (Whisper) guardada en: {transcription_file_whisper}")

        transcription_sr = transcribe_with_speech_recognition(str(audio_file))
        transcription_file_sr = Path(transcriptions_dir) / f"{audio_file.stem}_sr.txt"
        transcription_file_sr.write_text(transcription_sr, encoding="utf-8")
        print(f"Transcripción (SpeechRecognition) guardada en: {transcription_file_sr}")

# Run the full download-and-transcribe pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    process_dataset()

Cargando dataset desde charris/hubert_process_filter_spotify...


: 

: 