In [None]:
%pip install pydub

In [None]:
from pathlib import Path
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import os

# ==== CONFIGURATION ====
# Directory tree that is searched recursively for audio files to move.
INPUT_ROOT = r"E:\dev\thesis-testing\common-voices-mozilla\cv-valid-train\cv-valid-train"
# Single flat directory that receives every moved file (created if missing).
OUTPUT_DIR = r"E:\dev\thesis-testing\common-voices-mozilla\cv-valid-train\cv-valid-train-wav"   # Where to move all WAV files
# Extension to look for, written without the leading dot (it is inserted into a "*.<ext>" glob).
FILE_EXTENSION = "wav"  # File extension to look for

def find_audio_files(root_dir, extension):
    """Recursively find all files with the given extension under root_dir.

    Args:
        root_dir: Directory to search (str or Path).
        extension: File extension to match, with or without a leading dot
            (e.g. "wav" or ".wav").

    Returns:
        A list of Path objects for every regular file below root_dir whose
        name matches ``*.<extension>``. Directories that happen to match the
        pattern are skipped.
    """
    # Accept ".wav" as well as "wav"; a doubled dot would match nothing.
    suffix = extension.lstrip('.')
    # rglob also yields directories whose name matches, so keep files only.
    return [path for path in Path(root_dir).rglob(f'*.{suffix}') if path.is_file()]

def move_file(file_info):
    """Move a single file to the output directory under a parent-prefixed name.

    Args:
        file_info: Tuple ``(src_path, output_dir)`` where ``src_path`` is a
            Path to the file to move and ``output_dir`` is the destination
            directory (str or Path).

    Returns:
        A status string: it starts with "✓" when the file was moved and with
        "✗" when it was not (the message then carries the error). Errors are
        reported through the return value rather than raised.
    """
    src_path, output_dir = file_info
    # Prefix with the immediate parent folder name to reduce name collisions.
    unique_name = f"{src_path.parent.name}_{src_path.name}"
    dest_path = Path(output_dir) / unique_name

    try:
        # Two sources can still share "<parent>_<name>"; never overwrite an
        # already-moved file, report the clash and leave the source in place.
        # (Best-effort check: it is not atomic with the move below.)
        if dest_path.exists():
            raise FileExistsError(f"destination already exists: {dest_path}")
        shutil.move(str(src_path), str(dest_path))
        return f"✓ Moved: {src_path.name}"
    except Exception as e:
        return f"✗ Error moving {src_path.name}: {e}"

def move_all_wav_files():
    """Move every matching audio file under INPUT_ROOT into OUTPUT_DIR.

    Reads the module-level INPUT_ROOT, OUTPUT_DIR and FILE_EXTENSION settings,
    moves the files with a pool of at most 16 threads while showing a progress
    bar, and prints a summary (counts, elapsed time) at the end. Returns None.
    """
    # The destination must exist before any worker moves a file into it.
    destination = Path(OUTPUT_DIR)
    destination.mkdir(parents=True, exist_ok=True)

    sources = find_audio_files(INPUT_ROOT, FILE_EXTENSION)
    label = FILE_EXTENSION.upper()

    if not sources:
        print(f"No {label} files found in {INPUT_ROOT}")
        return

    total = len(sources)
    print(f"Found {total} {label} files to move")

    # Each job carries its own destination so move_file needs no globals.
    jobs = [(source, OUTPUT_DIR) for source in sources]
    worker_count = min(16, total)
    moved = 0
    failed = 0

    started = time.time()

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(move_file, job) for job in jobs]

        with tqdm(total=total, desc="Moving files", unit="file", ncols=80) as progress:
            for finished in as_completed(pending):
                outcome = finished.result()
                if not outcome.startswith("✓"):
                    # Surface failures immediately, on their own line.
                    failed += 1
                    print(f"\n{outcome}")
                else:
                    moved += 1
                progress.update(1)

    elapsed = time.time() - started

    # Final report
    rule = '=' * 50
    print(f"\n{rule}")
    print("FILE MOVING SUMMARY")
    print(rule)
    print(f"Total files processed: {total}")
    print(f"Successfully moved: {moved}")
    print(f"Failed to move: {failed}")
    print(f"Total processing time: {elapsed:.2f} seconds")

    if moved > 0:
        # total is non-zero here: the empty case returned early above.
        per_file = elapsed / total
        print(f"Average time per file: {per_file:.4f} seconds")
        print(f"\nAll files moved to: {destination.absolute()}")

# Run the script: start the move only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    move_all_wav_files()

In [None]:
from pydub import AudioSegment
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from tqdm import tqdm
import time
import os

# ==== CONFIGURATION ====
INPUT_TYPE = "mp3"  # File extension to search for, e.g., "flac", "mp3"
INPUT_FORMAT = "mp3"  # Format for pydub, usually same as INPUT_TYPE
# Directory tree that is searched recursively for INPUT_TYPE files.
INPUT_ROOT = r"E:\dev\thesis-testing\common-voices-mozilla\cv-valid-train\cv-valid-train"
# Root directory for the converted WAV files; the sub-folder layout of
# INPUT_ROOT is mirrored underneath it.
OUTPUT_ROOT = r"E:\dev\thesis-testing\common-voices-mozilla\cv-valid-train\cv-valid-train-wav2"   # Where the converted WAV files are written

def find_audio_files(root_dir, extension):
    """Recursively find all files with the given extension under root_dir.

    Args:
        root_dir: Directory to search (str or Path).
        extension: File extension to match, with or without a leading dot
            (e.g. "mp3" or ".mp3").

    Returns:
        A list of Path objects for every regular file below root_dir whose
        name matches ``*.<extension>``. Directories that happen to match the
        pattern are skipped.
    """
    # Accept ".mp3" as well as "mp3"; a doubled dot would match nothing.
    suffix = extension.lstrip('.')
    # rglob also yields directories whose name matches, so keep files only.
    return [path for path in Path(root_dir).rglob(f'*.{suffix}') if path.is_file()]

def get_output_path(input_file, input_root, output_root):
    """Map an input file to its WAV destination under output_root.

    The file's location relative to input_root is kept, and its suffix is
    replaced by ".wav". Raises ValueError (from Path.relative_to) when
    input_file does not live under input_root.
    """
    # Swap the suffix on the relative part first, then graft it onto the new root.
    wav_relative = input_file.relative_to(input_root).with_suffix('.wav')
    destination_root = Path(output_root)
    return destination_root.joinpath(wav_relative)

def convert_audio_file(file_info):
    """Convert a single audio file to WAV.

    The WAV data is first exported to a sibling "<name>.wav.part" file and
    only renamed to the final path once the export has completed, so a failed
    or interrupted conversion never leaves a truncated ".wav" behind that a
    later "already exists" check would mistake for a finished file.

    Args:
        file_info: Tuple ``(file_path, output_file)`` of Path objects: the
            source audio file and the WAV file to create.

    Returns:
        A status string: it starts with "✓" on success and with "✗" on
        failure (the message then carries the error). Errors are reported
        through the return value rather than raised.
    """
    file_path, output_file = file_info
    thread_id = threading.current_thread().ident
    partial_file = None
    try:
        audio = AudioSegment.from_file(file_path, format=INPUT_FORMAT)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        # Stage next to the final path so the rename stays on the same volume.
        partial_file = output_file.with_name(output_file.name + ".part")
        # export() hands back the open output handle; close it explicitly so
        # the file can be renamed (required on Windows).
        exported = audio.export(partial_file, format="wav")
        exported.close()
        os.replace(partial_file, output_file)
        return f"✓ Thread {thread_id}: {file_path.name}"
    except Exception as e:
        if partial_file is not None:
            try:
                partial_file.unlink()
            except OSError:
                # Best-effort cleanup: the partial file may not exist or may
                # still be locked; the original error is what gets reported.
                pass
        return f"✗ Thread {thread_id}: Error processing {file_path.name}: {e}"

def optimize_audio_conversion():
    """Convert every INPUT_TYPE file under INPUT_ROOT to WAV under OUTPUT_ROOT.

    Files whose WAV output already exists are reported and skipped. The rest
    are converted by a pool of at most 16 threads while a progress bar is
    shown, and a summary (counts, elapsed time) is printed. Returns None.
    """
    source_root = Path(INPUT_ROOT)
    target_root = Path(OUTPUT_ROOT)

    # Collect every candidate below the input root.
    candidates = find_audio_files(source_root, INPUT_TYPE)
    kind_label = INPUT_TYPE.upper()

    if not candidates:
        print(f"No {kind_label} files found in {source_root}")
        return

    print(f"Found {len(candidates)} {kind_label} files to process")

    # Queue only the files that have no WAV output yet.
    jobs = []
    for source in candidates:
        target = get_output_path(source, source_root, target_root)
        if not target.exists():
            jobs.append((source, target))
        else:
            print(f"Skipping {source} - WAV file already exists")

    job_count = len(jobs)
    if job_count == 0:
        print("All files have already been converted.")
        return

    print(f"Processing {job_count} files...")

    worker_count = min(16, job_count)
    converted = 0
    failed = 0

    started = time.time()

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(convert_audio_file, job) for job in jobs]
        with tqdm(total=job_count, desc="Converting files", unit="file", ncols=80) as progress:
            for finished in as_completed(pending):
                outcome = finished.result()
                if not outcome.startswith("✓"):
                    # Surface failures immediately, on their own line.
                    failed += 1
                    print(f"\n{outcome}")
                else:
                    converted += 1
                progress.update(1)

    elapsed = time.time() - started

    # Final report
    rule = '=' * 50
    print(f"\n{rule}")
    print("CONVERSION SUMMARY")
    print(rule)
    print(f"Total files processed: {job_count}")
    print(f"Successful conversions: {converted}")
    print(f"Failed conversions: {failed}")
    print(f"Total processing time: {elapsed:.2f} seconds")

    if converted > 0:
        per_file = elapsed / job_count
        print(f"Average time per file: {per_file:.2f} seconds")

# Run the optimization: start the conversion only when this code is executed
# directly (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    optimize_audio_conversion()

In [None]:
import hashlib
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import os

def get_file_hash(file_path, block_size=65536):
    """Compute the MD5 digest of a file, reading it in fixed-size blocks.

    Returns a ``(file_path, hex_digest)`` tuple so results gathered from
    worker threads can be matched back to their file.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        # Keep reading until read() returns an empty bytes object (end of file).
        for chunk in iter(lambda: stream.read(block_size), b''):
            digest.update(chunk)
    return file_path, digest.hexdigest()

def process_batch(files, max_workers=8):
    """Hash a collection of files concurrently.

    Each file is passed to get_file_hash on a thread pool of max_workers
    threads while a progress bar tracks completion. Files that fail to hash
    are reported with a printed message and left out of the result.

    Returns a dict mapping each file path to its hash.
    """
    hashes_by_path = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which file each future belongs to, for error reporting.
        pending = {}
        for candidate in files:
            pending[pool.submit(get_file_hash, candidate)] = candidate

        # Consume results in completion order rather than submission order.
        progress = tqdm(as_completed(pending), total=len(files), desc="Processing files")
        for finished in progress:
            try:
                path, digest = finished.result()
                hashes_by_path[path] = digest
            except Exception as exc:
                print(f"Error processing {pending[finished]}: {exc}")

    return hashes_by_path

def find_duplicates(directory, max_workers=8):
    """Locate WAV files with identical content below a directory.

    All ``*.wav`` files are collected recursively, hashed in parallel via
    process_batch, and grouped by hash.

    Returns a dict mapping each hash shared by two or more files to the list
    of those files' paths (as strings). Hashes seen only once are omitted.
    """
    root = Path(directory)

    # Gather every WAV file below the root, then hash them concurrently.
    wav_paths = list(root.rglob('*.wav'))
    digests = process_batch(wav_paths, max_workers)

    # Bucket the paths by their content hash.
    grouped = defaultdict(list)
    for path, digest in digests.items():
        grouped[digest].append(str(path))

    # Only buckets holding at least two paths are duplicates.
    return {digest: paths for digest, paths in grouped.items() if len(paths) >= 2}

# Usage: scan one directory for duplicate WAV files and list every duplicate set.
directory_path = r"E:\dev\thesis-testing\dataset-balanced\val\real"
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to 1 so the multiplication cannot raise TypeError.
duplicates = find_duplicates(directory_path, max_workers=((os.cpu_count() or 1) * 5))  # Adjust max_workers based on your CPU

# Print results
if duplicates:
    print(f"Found {len(duplicates)} sets of duplicate files:")
    for i, (hash_val, files) in enumerate(duplicates.items(), 1):
        print(f"\nDuplicate set {i} (hash: {hash_val}):")
        for file in files:
            print(f"  - {file}")
else:
    print("No duplicate files found.")