In [4]:
import os
import pandas as pd
from pathlib import Path

def check_dataset_structure(dataset_path):
    """Check the structure of your local NIH dataset"""
    
    dataset_path = Path(dataset_path)
    print(f"Checking dataset at: {dataset_path.absolute()}")
    print("=" * 50)
    
    # Check if path exists
    if not dataset_path.exists():
        print("❌ Dataset path does not exist!")
        return False
    
    # Look for common files and folders
    files_found = []
    folders_found = []
    
    for item in dataset_path.iterdir():
        if item.is_file():
            files_found.append(item.name)
        else:
            folders_found.append(item.name)
    
    print(f"📁 Folders found ({len(folders_found)}):")
    for folder in sorted(folders_found):
        print(f"   └── {folder}")
    
    print(f"\n📄 Files found ({len(files_found)}):")
    for file in sorted(files_found):
        print(f"   └── {file}")
    
    # Check for metadata file
    metadata_files = [f for f in files_found if 'Data_Entry' in f or 'metadata' in f.lower()]
    if metadata_files:
        print(f"\n✅ Metadata file found: {metadata_files}")
        return metadata_files
    else:
        print("\n⚠️  No metadata file found. Looking for CSV files...")
        csv_files = [f for f in files_found if f.endswith('.csv')]
        if csv_files:
            print(f"📊 CSV files: {csv_files}")
            return csv_files
        else:
            print("❌ No CSV metadata file found!")
            return None

# Usage - Update this path to your dataset location
dataset_path = r"D:\Projects\CLARITY\Model\Dataset\archive"  # Update this path!
metadata_file = check_dataset_structure(dataset_path)

Checking dataset at: D:\Projects\CLARITY\Model\Dataset\archive
📁 Folders found (12):
   └── images_001
   └── images_002
   └── images_003
   └── images_004
   └── images_005
   └── images_006
   └── images_007
   └── images_008
   └── images_009
   └── images_010
   └── images_011
   └── images_012

📄 Files found (8):
   └── ARXIV_V5_CHESTXRAY.pdf
   └── BBox_List_2017.csv
   └── Data_Entry_2017.csv
   └── FAQ_CHESTXRAY.pdf
   └── LOG_CHESTXRAY.pdf
   └── README_CHESTXRAY.pdf
   └── test_list.txt
   └── train_val_list.txt

✅ Metadata file found: ['Data_Entry_2017.csv']


In [14]:
import pandas as pd
import os
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

def explore_nested_dataset_structure(base_path):
    """Explore your nested dataset structure"""
    
    base_path = Path(base_path)
    print(f"🔍 Exploring dataset at: {base_path}")
    print("=" * 60)
    
    if not base_path.exists():
        print("❌ Path does not exist!")
        return None
    
    # Check for CSV file
    csv_files = list(base_path.glob("*.csv"))
    metadata_file = None
    
    if csv_files:
        metadata_file = csv_files[0]  # Select the first CSV file found
        print(f"📊 Metadata file found: {metadata_file.name}")
    else:
        print("❌ No CSV metadata file found!")
        return None

    
    # Explore nested structure
    total_images = 0
    folder_structure = {}
    
    print(f"\n📁 Folder structure analysis:")
    print("-" * 40)
    
    for main_folder in sorted(base_path.iterdir()):
        if main_folder.is_dir() and main_folder.name.startswith('images_'):
            print(f"\n📂 {main_folder.name}/")
            
            # Look for subfolders
            subfolders = [f for f in main_folder.iterdir() if f.is_dir()]
            
            if subfolders:
                for subfolder in subfolders:
                    print(f"   └── {subfolder.name}/")
                    
                    # Count images in this subfolder
                    image_extensions = ['*.png', '*.jpg', '*.jpeg']
                    subfolder_images = 0
                    
                    for ext in image_extensions:
                        subfolder_images += len(list(subfolder.glob(ext)))
                    
                    print(f"       └── {subfolder_images:,} images")
                    
                    folder_structure[f"{main_folder.name}/{subfolder.name}"] = {
                        'path': subfolder,
                        'count': subfolder_images
                    }
                    
                    total_images += subfolder_images
            else:
                print("   └── (no subfolders)")
    
    print("-" * 40)
    print(f"🖼️  TOTAL IMAGES: {total_images:,}")
    
    return metadata_file, folder_structure, total_images

# Run exploration
BASE_PATH = r"D:\Projects\CLARITY\Model\Dataset\archive"
metadata_file, folder_structure, total_images = explore_nested_dataset_structure(BASE_PATH)

🔍 Exploring dataset at: D:\Projects\CLARITY\Model\Dataset\archive
📊 Metadata file found: BBox_List_2017.csv

📁 Folder structure analysis:
----------------------------------------

📂 images_001/
   └── images/
       └── 4,999 images

📂 images_002/
   └── images/
       └── 10,000 images

📂 images_003/
   └── images/
       └── 10,000 images

📂 images_004/
   └── images/
       └── 10,000 images

📂 images_005/
   └── images/
       └── 10,000 images

📂 images_006/
   └── images/
       └── 10,000 images

📂 images_007/
   └── images/
       └── 10,000 images

📂 images_008/
   └── images/
       └── 10,000 images

📂 images_009/
   └── images/
       └── 10,000 images

📂 images_010/
   └── images/
       └── 10,000 images

📂 images_011/
   └── images/
       └── 10,000 images

📂 images_012/
   └── images/
       └── 7,121 images
----------------------------------------
🖼️  TOTAL IMAGES: 112,120


In [11]:
import pandas as pd
import os
from pathlib import Path
from collections import defaultdict
import hashlib

def analyze_dataset_duplicates(dataset_path):
    """Analyze the duplicate situation in your dataset"""
    
    dataset_path = Path(dataset_path)
    
    # Load metadata to get expected image names
    metadata_path = dataset_path / "Data_Entry_2017.csv"
    if not metadata_path.exists():
        print("❌ Data_Entry_2017.csv not found!")
        return
    
    df = pd.read_csv(metadata_path)
    expected_images = set(df['Image Index'].tolist())
    
    print(f"📊 Expected unique images from metadata: {len(expected_images):,}")
    
    # Find all actual images on disk
    found_images = defaultdict(list)  # image_name -> [list of paths]
    
    for folder in dataset_path.iterdir():
        if folder.is_dir() and folder.name.startswith('images_'):
            print(f"🔍 Scanning {folder.name}...")
            
            for img_file in folder.glob("*.png"):
                image_name = img_file.name
                found_images[image_name].append(img_file)
    
    print(f"📊 Actual unique image names found: {len(found_images):,}")
    print(f"📊 Total image files found: {sum(len(paths) for paths in found_images.values()):,}")
    
    # Analyze duplication patterns
    duplicates = {name: paths for name, paths in found_images.items() if len(paths) > 1}
    
    print(f"\n🔄 Duplicate Analysis:")
    print(f"   Images with duplicates: {len(duplicates):,}")
    print(f"   Images with single copy: {len(found_images) - len(duplicates):,}")
    
    if len(duplicates) > 0:
        duplicate_counts = defaultdict(int)
        for paths in duplicates.values():
            duplicate_counts[len(paths)] += 1
        
        print(f"\n   Duplication pattern:")
        for count, num_images in duplicate_counts.items():
            print(f"     {count} copies: {num_images:,} images")
    
    # Check for missing images
    missing_images = expected_images - set(found_images.keys())
    extra_images = set(found_images.keys()) - expected_images
    
    print(f"\n📋 Metadata Alignment:")
    print(f"   Images in metadata but missing from disk: {len(missing_images):,}")
    print(f"   Images on disk but not in metadata: {len(extra_images):,}")
    
    if len(missing_images) > 0:
        print(f"   First 5 missing: {list(missing_images)[:5]}")
    
    if len(extra_images) > 0:
        print(f"   First 5 extra: {list(extra_images)[:5]}")
    
    return found_images, duplicates, missing_images

# Run the analysis
dataset_path = r"D:\Projects\CLARITY\Model\Dataset\archive"  # Update your path!
found_images, duplicates, missing = analyze_dataset_duplicates(dataset_path)

📊 Expected unique images from metadata: 112,120
🔍 Scanning images_001...
🔍 Scanning images_002...
🔍 Scanning images_003...
🔍 Scanning images_004...
🔍 Scanning images_005...
🔍 Scanning images_006...
🔍 Scanning images_007...
🔍 Scanning images_008...
🔍 Scanning images_009...
🔍 Scanning images_010...
🔍 Scanning images_011...
🔍 Scanning images_012...
📊 Actual unique image names found: 0
📊 Total image files found: 0

🔄 Duplicate Analysis:
   Images with duplicates: 0
   Images with single copy: 0

📋 Metadata Alignment:
   Images in metadata but missing from disk: 112,120
   Images on disk but not in metadata: 0
   First 5 missing: ['00007352_002.png', '00010426_000.png', '00019313_012.png', '00014303_000.png', '00016134_013.png']


In [15]:
def create_nested_image_mapping(base_path, folder_structure):
    """Create mapping from image names to paths in nested structure"""
    
    print(f"\n🗺️  Creating image mapping...")
    
    image_mapping = {}
    
    for folder_key, folder_info in folder_structure.items():
        folder_path = folder_info['path']
        
        print(f"📂 Processing {folder_key}...")
        
        # Get all images from this nested folder
        for img_file in folder_path.glob("*.png"):
            image_name = img_file.name
            
            if image_name in image_mapping:
                # Duplicate found - this explains your double count!
                print(f"⚠️  Duplicate found: {image_name}")
                print(f"    Existing: {image_mapping[image_name]}")
                print(f"    New: {img_file}")
            else:
                image_mapping[image_name] = img_file
    
    print(f"✅ Image mapping complete: {len(image_mapping):,} unique images")
    return image_mapping

# Create the mapping
if folder_structure:
    image_mapping = create_nested_image_mapping(BASE_PATH, folder_structure)


🗺️  Creating image mapping...
📂 Processing images_001/images...
📂 Processing images_002/images...
📂 Processing images_003/images...
📂 Processing images_004/images...
📂 Processing images_005/images...
📂 Processing images_006/images...
📂 Processing images_007/images...
📂 Processing images_008/images...
📂 Processing images_009/images...
📂 Processing images_010/images...
📂 Processing images_011/images...
📂 Processing images_012/images...
✅ Image mapping complete: 112,120 unique images
