In [None]:
%pip install pydub

In [None]:
from pydub import AudioSegment
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from tqdm import tqdm
import time

def convert_audio_file(file_info) -> str:
    """Convert a single MP3 file to WAV and report the outcome as text.

    The WAV data is written to a sibling ``<output name>.part`` file first and
    renamed to ``output_file`` only after the export has completed. A failed
    or interrupted conversion therefore never leaves a truncated ``.wav``
    behind that a later "does the output already exist?" check would mistake
    for a finished conversion.

    Args:
        file_info: ``(file_path, output_file)`` pair holding the source MP3
            path and the destination WAV path.

    Returns:
        A status line starting with "✓" on success or "✗" on failure. The
        line includes the worker thread id and the source file name; failure
        lines also carry the error message. Exceptions derived from
        ``Exception`` are reported this way instead of being raised.
    """
    file_path, output_file = file_info
    thread_id = threading.current_thread().ident
    source_name = Path(file_path).name
    target = Path(output_file)
    partial = target.with_name(target.name + ".part")

    try:
        audio = AudioSegment.from_file(file_path, format="mp3")
        # export() returns the file object it wrote to; close it explicitly
        # so the data is flushed and the rename below is not blocked by an
        # open handle.
        audio.export(partial, format="wav").close()
        partial.replace(target)
    except Exception as e:
        # Best-effort cleanup: the partial file may never have been created.
        try:
            partial.unlink()
        except OSError:
            pass
        return f"✗ Thread {thread_id}: Error processing {source_name}: {e}"
    return f"✓ Thread {thread_id}: {source_name}"

def optimize_audio_conversion(
    input_dir="./common-voices-mozilla/cv-valid-train/cv-valid-train",
    output_dir="./common-voices-mozilla/cv-valid-train/wav-files",
    max_workers=3,
):
    """Convert every MP3 in ``input_dir`` to a WAV in ``output_dir``.

    Files whose ``<stem>.wav`` already exists in ``output_dir`` are skipped.
    The remaining files are handed to ``convert_audio_file`` on a thread
    pool, progress is shown with a tqdm bar, and a summary is printed at the
    end. Calling the function with no arguments uses the Common Voice paths
    and the thread count that were previously hard-coded.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.mp3``
            files; the extension match is case-insensitive.
        output_dir: Directory that receives the ``.wav`` files. It is created
            if missing.
        max_workers: Upper bound on worker threads. The default of 3 was
            chosen for a 2-core / 4-thread CPU, leaving one hardware thread
            for the rest of the system.

    Returns:
        None. All results are reported on standard output.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Fail with a readable message instead of a FileNotFoundError traceback
    # from iterdir() when the dataset has not been downloaded/extracted yet.
    if not input_path.is_dir():
        print(f"Input directory not found: {input_path}")
        return

    # Create output directory
    output_path.mkdir(parents=True, exist_ok=True)

    # Only regular files count; sorting makes the processing order
    # reproducible across runs.
    mp3_files = sorted(
        f for f in input_path.iterdir()
        if f.is_file() and f.suffix.lower() == '.mp3'
    )

    if not mp3_files:
        print("No MP3 files found in the input directory.")
        return

    print(f"Found {len(mp3_files)} MP3 files to process")

    # Build (source, destination) pairs for the files that still need work.
    file_info_list = []
    for mp3_file in mp3_files:
        output_file = output_path / f"{mp3_file.stem}.wav"

        # Skip if WAV file already exists
        if output_file.exists():
            print(f"Skipping {mp3_file.name} - WAV file already exists")
            continue

        file_info_list.append((mp3_file, output_file))

    if not file_info_list:
        print("All files have already been converted.")
        return

    print(f"Processing {len(file_info_list)} files...")

    # Never start more threads than there are files to convert.
    worker_count = min(max_workers, len(file_info_list))

    successful_conversions = 0
    failed_conversions = 0

    start_time = time.time()

    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [
            executor.submit(convert_audio_file, file_info)
            for file_info in file_info_list
        ]

        with tqdm(total=len(file_info_list), desc="Converting files",
                  unit="file", ncols=80) as pbar:
            for future in as_completed(futures):
                result = future.result()

                # convert_audio_file marks successful results with a
                # leading "✓"; anything else is treated as a failure.
                if result.startswith("✓"):
                    successful_conversions += 1
                else:
                    failed_conversions += 1
                    # tqdm.write prints above the bar without corrupting it,
                    # unlike a bare print() while the bar is active.
                    tqdm.write(result)

                pbar.update(1)

    processing_time = time.time() - start_time

    # Summary
    print(f"\n{'='*50}")
    print("CONVERSION SUMMARY")
    print(f"{'='*50}")
    print(f"Total files processed: {len(file_info_list)}")
    print(f"Successful conversions: {successful_conversions}")
    print(f"Failed conversions: {failed_conversions}")
    print(f"Total processing time: {processing_time:.2f} seconds")

    if successful_conversions > 0:
        # Wall-clock time averaged over every attempted file, failures
        # included.
        avg_time = processing_time / len(file_info_list)
        print(f"Average time per file: {avg_time:.2f} seconds")

# Entry point: start the batch MP3 -> WAV conversion with its default
# settings when this code is executed directly rather than imported.
if __name__ == "__main__":
    optimize_audio_conversion()