SPEECH TO TEXT (TRANSCRIPTION) FROM "https://huggingface.co/datasets/charris/hubert_process_filter_spotify"

In [8]:
import os
import logging
import time
from typing import Optional, Dict, Any
from datasets import load_dataset
import torchaudio
import torch
import whisper
import speech_recognition as sr
from pathlib import Path
import numpy as np
from tqdm import tqdm
import librosa  # Re-added for audio loading

# Configuration: every tunable of the pipeline lives in this one mapping.
CONFIG = dict(
    # Dataset identifier handed to load_dataset().
    dataset_name="charris/hubert_process_filter_spotify",
    # Directory that receives the exported .wav files.
    output_dir=Path("C:/Users/joey_/Desktop/Spotify/downloaded_audio"),
    # Directory that receives one .txt file per (audio file, method) pair.
    transcriptions_dir=Path("C:/Users/joey_/Desktop/Spotify/transcriptions"),
    # Upper bound on the number of dataset samples exported per run.
    max_files=1,
    # Model name passed to whisper.load_model().
    whisper_model="base",
    # NOTE(review): not read anywhere in the visible code — confirm it is still needed.
    num_workers=2,
)

# Set up logging: each record is written both to 'transcription.log'
# (relative to the current working directory) and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler('transcription.log'),  # persistent run log
        logging.StreamHandler(),                   # live console/notebook output
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def setup_directories() -> None:
    """Ensure the audio and transcription output directories exist.

    Missing parent directories are created as well; directories that are
    already present are left untouched.
    """
    for config_key in ("output_dir", "transcriptions_dir"):
        CONFIG[config_key].mkdir(parents=True, exist_ok=True)

def _to_int16_pcm(samples: np.ndarray) -> torch.Tensor:
    """Scale a float waveform to int16 and add a leading channel dimension."""
    # assumes `samples` is a float array nominally in [-1.0, 1.0] — TODO confirm
    # against the dataset's audio decoding.
    waveform = torch.from_numpy(samples).unsqueeze(0)
    # Clamp before scaling: anything outside [-1, 1] would exceed the int16
    # range after multiplication and would not convert reliably.
    return (waveform.clamp(-1.0, 1.0) * 32767).to(torch.int16)


def download_audio_files() -> Optional[Path]:
    """Export the first ``CONFIG["max_files"]`` dataset samples as 16-bit WAV files.

    Each sample's ``"audio"`` entry is written to
    ``CONFIG["output_dir"] / "audio_<index>.wav"`` at the sample's own
    sampling rate. A failure on one sample is logged and skipped so the
    remaining samples are still exported.

    Returns:
        ``CONFIG["output_dir"]`` once the loop has finished (even if individual
        samples failed), or ``None`` when the dataset is empty or could not be
        loaded.
    """
    # Local import so this block does not depend on the file's import section.
    from itertools import islice

    logging.info(f"Loading dataset from {CONFIG['dataset_name']}...")
    try:
        dataset = load_dataset(CONFIG["dataset_name"], split="train", streaming=False)
        first_sample = next(iter(dataset), None)

        if first_sample is None:
            logging.error("Dataset is empty or could not be loaded.")
            return None

        logging.info("Processing audio files...")
        # A negative limit means "export nothing", matching the old loop that
        # stopped as soon as the index reached the limit.
        max_files = max(0, CONFIG["max_files"])
        # islice stops *before* fetching sample number max_files, so no extra
        # sample is pulled from the dataset just to hit a break statement.
        limited_samples = islice(dataset, max_files)
        for i, sample in tqdm(enumerate(limited_samples), total=max_files):
            try:
                audio_data = sample["audio"]
                local_path = CONFIG["output_dir"] / f"audio_{i}.wav"

                torchaudio.save(
                    str(local_path),
                    _to_int16_pcm(audio_data["array"]),
                    audio_data["sampling_rate"],
                    encoding='PCM_S',
                    bits_per_sample=16
                )
                # No sleep is needed here: the save call has returned, and
                # nothing else in this function waits on the file.
                logging.info(f"Downloaded and converted: {local_path}")
            except Exception as e:
                # Best effort per sample: log and move on to the next one.
                logging.error(f"Error processing file {i}: {str(e)}")
                continue

        return CONFIG["output_dir"]
    except Exception as e:
        logging.error(f"Failed to download audio files: {str(e)}")
        return None

def transcribe_with_whisper(audio_path: str, model: Any) -> str:
    """Return the Whisper transcription of ``audio_path``.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Object exposing ``transcribe(audio)`` whose result is indexable
            by ``"text"`` (the caller passes a loaded Whisper model).

    Returns:
        The transcribed text, or an empty string if anything goes wrong
        (missing/unreadable file, load failure, transcription failure). The
        failure is logged rather than raised.
    """
    transcript = ""
    try:
        logging.info(f"Transcribing {audio_path} with Whisper...")

        # Validate the path up front so the log states the exact problem.
        file_present = os.path.isfile(audio_path)
        if not file_present:
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        file_readable = os.access(audio_path, os.R_OK)
        if not file_readable:
            raise PermissionError(f"No read permission for file: {audio_path}")

        logging.info(f"File exists and is readable: {audio_path}")

        # Decode to an in-memory array resampled to 16 kHz and hand the array
        # (not the path) to the model.
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        logging.info(f"Audio loaded successfully: {audio.shape}, sample_rate={sample_rate}")

        result = model.transcribe(audio)
        logging.info("Whisper transcription completed")
        transcript = result["text"]
    except Exception as e:
        logging.error(f"Whisper transcription failed: {str(e)}")
    return transcript

def transcribe_with_speech_recognition(audio_path: str) -> str:
    """Return the transcription of ``audio_path`` from ``recognize_google``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The recognised text on success. On failure the return value depends on
        the error: ``"Audio could not be understood"`` for unintelligible
        audio, ``"Service error: <details>"`` for a request failure, and an
        empty string for any other error. Every failure is logged.
    """
    try:
        logging.info(f"Transcribing {audio_path} with SpeechRecognition...")
        if os.path.isfile(audio_path) is False:
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        recognizer = sr.Recognizer()
        # Read the whole file into memory; the file handle is released when
        # the with-block exits, before the recognition request is made.
        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        logging.warning(f"Could not understand audio: {audio_path}")
        return "Audio could not be understood"
    except sr.RequestError as e:
        logging.error(f"Speech Recognition service error: {str(e)}")
        return f"Service error: {str(e)}"
    except Exception as e:
        logging.error(f"Unexpected error in speech recognition: {str(e)}")
        return ""
    else:
        return text

def process_single_file(audio_file: Path, whisper_model: Any) -> Dict[str, str]:
    """Transcribe one audio file with both back ends.

    Args:
        audio_file: Path of the audio file to transcribe.
        whisper_model: Model object forwarded to ``transcribe_with_whisper``.

    Returns:
        A mapping with the keys ``"whisper"`` and ``"speech_recognition"``,
        each holding the string returned by the corresponding transcriber.
    """
    audio_path = str(audio_file)
    # Dict entries are evaluated top to bottom, so Whisper still runs first.
    return {
        "whisper": transcribe_with_whisper(audio_path, whisper_model),
        "speech_recognition": transcribe_with_speech_recognition(audio_path),
    }

def save_transcriptions(audio_file: Path, transcriptions: Dict[str, str]) -> None:
    """Write each transcription to its own UTF-8 text file.

    Files are named ``<audio stem>_<method>.txt`` inside
    ``CONFIG["transcriptions_dir"]``. A failed write is logged and does not
    stop the remaining transcriptions from being saved.

    Args:
        audio_file: Audio file the transcriptions belong to; only its stem is
            used, to build the output file names.
        transcriptions: Mapping of method name to transcribed text.
    """
    for method in transcriptions:
        text = transcriptions[method]
        file_name = f"{audio_file.stem}_{method}.txt"
        output_file = CONFIG["transcriptions_dir"] / file_name
        try:
            output_file.write_text(text, encoding="utf-8")
            logging.info(f"Transcription saved: {output_file}")
        except Exception as e:
            logging.error(f"Failed to save transcription: {str(e)}")

def process_dataset(max_wait_seconds: float = 30.0) -> None:
    """Run the whole pipeline: export audio, transcribe it, save the results.

    Steps: create the output directories, export dataset samples to WAV,
    load the Whisper model named in ``CONFIG["whisper_model"]``, then
    transcribe and save every ``*.wav`` file found in the audio directory.
    Note that the glob also picks up ``.wav`` files already present in that
    directory, not only the ones exported by this call.

    Args:
        max_wait_seconds: Longest time to wait for a single audio file to
            become readable before it is skipped. Bounding the wait prevents
            the run from hanging forever on a file that was removed or is
            permanently unreadable.
    """
    setup_directories()
    audio_dir = download_audio_files()

    if not audio_dir:
        logging.error("Failed to process dataset")
        return

    whisper_model = whisper.load_model(CONFIG["whisper_model"])
    # Sorted so files are processed in a stable, reproducible order.
    audio_files = sorted(Path(audio_dir).glob("*.wav"))

    for audio_file in audio_files:
        deadline = time.monotonic() + max_wait_seconds
        readable = os.path.exists(audio_file) and os.access(audio_file, os.R_OK)
        while not readable and time.monotonic() < deadline:
            logging.info(f"Waiting for {audio_file} to be fully written and readable...")
            time.sleep(1)
            readable = os.path.exists(audio_file) and os.access(audio_file, os.R_OK)

        if not readable:
            # Give up on this file only; the remaining files are still processed.
            logging.error(
                f"Skipping {audio_file}: not readable after {max_wait_seconds} seconds"
            )
            continue

        transcriptions = process_single_file(audio_file, whisper_model)
        save_transcriptions(audio_file, transcriptions)

if __name__ == "__main__":
    # Entry point: run the pipeline and report any failure through the log
    # instead of letting a traceback escape.
    try:
        process_dataset()
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
    except KeyboardInterrupt:
        # KeyboardInterrupt derives from BaseException, not Exception, so the
        # handler above never intercepts it; the order of the two is neutral.
        logging.info("Process interrupted by user")

2025-03-06 17:56:11,554 - INFO - Loading dataset from charris/hubert_process_filter_spotify...
2025-03-06 17:56:28,556 - INFO - Processing audio files...
  0%|          | 0/1 [00:00<?, ?it/s]2025-03-06 17:56:32,218 - INFO - Downloaded and converted: C:\Users\joey_\Desktop\Spotify\downloaded_audio\audio_0.wav
100%|██████████| 1/1 [00:04<00:00,  4.62s/it]
2025-03-06 17:56:37,395 - INFO - Transcribing C:\Users\joey_\Desktop\Spotify\downloaded_audio\audio_0.wav with Whisper...
2025-03-06 17:56:37,398 - INFO - File exists and is readable: C:\Users\joey_\Desktop\Spotify\downloaded_audio\audio_0.wav
2025-03-06 17:56:37,434 - INFO - Audio loaded successfully: (496000,), sample_rate=16000
2025-03-06 17:57:04,946 - INFO - Whisper transcription completed
2025-03-06 17:57:04,948 - INFO - Transcribing C:\Users\joey_\Desktop\Spotify\downloaded_audio\audio_0.wav with SpeechRecognition...
2025-03-06 17:57:19,374 - INFO - Transcription saved: C:\Users\joey_\Desktop\Spotify\transcriptions\audio_0_whispe