In [None]:
import os
from pathlib import Path
from collections import defaultdict

# Define the directory structures.
# Each dataset maps to the sub-directories to visit ('' means "the dataset
# directory itself") and the category folders/labels to count.
directories = {
    "for-2seconds": {
        "subdirs": ['training', 'validation', 'testing'],
        "categories": ['fake', 'real']
    },
    "release_in_the_wild": {
        "subdirs": [''],  # No subdirectories
        "categories": ['fake']
    },
    "generated_audio": {
        "subdirs": [''],  # Will handle fake subdirectories specially
        "categories": ['fake']
    },
    "common-voices-mozilla": {
        "subdirs": [''],  # Will handle subdirectories specially
        "categories": ['real']
    }
}

# File extensions counted as audio; names are lower-cased before matching.
AUDIO_EXTENSIONS = ('.wav', '.mp3', '.flac', '.ogg')


def is_audio_name(name):
    """Return True if the file name ends with one of AUDIO_EXTENSIONS."""
    return name.lower().endswith(AUDIO_EXTENSIONS)


def count_audio_files(directories, root="."):
    """Count audio files for every dataset described in *directories*.

    Args:
        directories: Mapping of dataset directory name to a dict with the
            keys "subdirs" and "categories" (see the module-level
            ``directories`` table).
        root: Directory that contains the dataset directories. Defaults to
            the current working directory.

    Returns:
        A nested defaultdict ``counts[dataset][subdir_key][category]`` of
        integer file counts. Datasets whose directories do not exist are
        simply absent from the result.
    """
    counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    root = Path(root)

    for base_dir, config in directories.items():
        base_path = root / base_dir
        for subdir in config["subdirs"]:
            for category in config["categories"]:
                if base_dir == "generated_audio" and category == "fake":
                    # Special handling for generated_audio/fake/<subdirectories>
                    fake_dir = base_path / "fake"
                    if not fake_dir.exists():
                        continue
                    for current, child_dirs, files in os.walk(fake_dir):
                        if child_dirs:
                            # Only leaf directories are counted; files that
                            # sit next to sub-directories are ignored.
                            continue
                        rel_path = os.path.relpath(current, base_path)
                        # Filter by extension like every other branch so that
                        # non-audio files do not inflate the "fake" total.
                        # Plain assignment keeps the result idempotent if the
                        # same tree is visited more than once.
                        counts[base_dir][rel_path][category] = sum(
                            1 for f in files
                            if is_audio_name(f)
                            and os.path.isfile(os.path.join(current, f))
                        )
                elif base_dir == "common-voices-mozilla":
                    # Special handling for common-voices-mozilla: audio files
                    # are counted at every depth, keyed by relative directory.
                    search_dir = base_path / subdir if subdir else base_path
                    if not search_dir.exists():
                        continue
                    for current, _, files in os.walk(search_dir):
                        found = sum(1 for f in files if is_audio_name(f))
                        if found:
                            rel_path = os.path.relpath(current, base_path)
                            counts[base_dir][rel_path][category] += found
                else:
                    # Standard directory structure: <dataset>/[<subdir>/]<category>
                    dir_path = (
                        base_path / subdir / category if subdir
                        else base_path / category
                    )
                    if not dir_path.is_dir():
                        continue
                    # Only count audio files (not directories or other files)
                    found = sum(
                        1 for f in os.listdir(dir_path)
                        if is_audio_name(f)
                        and os.path.isfile(os.path.join(dir_path, f))
                    )
                    dir_key = subdir if subdir else "root"
                    counts[base_dir][dir_key][category] += found

    return counts


def print_report(counts):
    """Print per-dataset counts followed by a summary.

    Args:
        counts: Nested mapping ``counts[dataset][subdir_key][category]`` as
            produced by ``count_audio_files``.

    Returns:
        A tuple ``(total_files, total_real, total_fake)``.
    """
    total_files = 0
    total_real = 0
    total_fake = 0

    for base_dir, subdirs in counts.items():
        print(f"\n=== {base_dir} ===")
        for subdir, categories in sorted(subdirs.items()):
            # Entries stored under "root" (or an empty key) are printed
            # directly beneath the dataset header, without a sub-heading.
            nested = bool(subdir) and subdir != "root"
            if nested:
                print(f"\n  {subdir}:")
            prefix = "    " if nested else "  "
            for category, count in categories.items():
                if count <= 0:  # Only show categories with files
                    continue
                print(f"{prefix}{category.capitalize()}: {count}")

                # Update totals
                if category == "fake":
                    total_fake += count
                elif category == "real":
                    total_real += count
                total_files += count

    print("\n=== Summary ===")
    print(f"Total audio files: {total_files}")
    print(f"Total fake audio files: {total_fake}")
    print(f"Total real audio files: {total_real}")

    return total_files, total_real, total_fake


# Dictionary to store counts, gathered relative to the current working directory
counts = count_audio_files(directories)

# Print results and keep the totals available at module level
total_files, total_real, total_fake = print_report(counts)