In [10]:
import os

# Dataset locations, relative to the notebook's working directory
images_dir = r'../dataset/compiled-new/images'
labels_dir = r'../dataset/compiled-new/labels'


def _count_visible_entries(directory):
    """Return how many names in `directory` do not start with a dot."""
    return sum(1 for entry in os.listdir(directory) if not entry.startswith('.'))


# Every non-hidden directory entry is counted, whatever its extension or type
num_images = _count_visible_entries(images_dir)
num_labels = _count_visible_entries(labels_dir)

print(f"Number of images: {num_images}")
print(f"Number of labels: {num_labels}")

Number of images: 9561
Number of labels: 9561


In [None]:
# Collect every image whose "<stem>.txt" counterpart is absent from labels_dir.
# Only names ending in .jpg / .jpeg / .png (any letter case) are treated as images.
missing_labels = [
    image_name
    for image_name in os.listdir(images_dir)
    if image_name.lower().endswith(('.jpg', '.jpeg', '.png'))
    and not os.path.isfile(
        os.path.join(labels_dir, os.path.splitext(image_name)[0] + '.txt')
    )
]

if not missing_labels:
    print("All images have corresponding label files.")
else:
    print("Images missing label files:")
    print("\n".join(missing_labels))

All images have corresponding label files.


In [8]:
# Check for label files without corresponding image files.
#
# Image stems are collected in a single directory scan using a case-insensitive
# extension match (the same rule the images-without-labels check applies), so
# an image stored as "photo.JPG" still counts as the partner of "photo.txt" on
# case-sensitive filesystems. Probing only the lowercase spellings would
# report such labels as orphaned.
image_extensions = ('.jpg', '.jpeg', '.png')

with os.scandir(images_dir) as image_entries:
    image_stems = {
        os.path.splitext(entry.name)[0]
        for entry in image_entries
        if entry.is_file() and entry.name.lower().endswith(image_extensions)
    }

missing_images = []

for label_file in os.listdir(labels_dir):
    if not label_file.lower().endswith('.txt'):
        continue
    base_name = os.path.splitext(label_file)[0]
    if base_name in image_stems:
        continue
    # Fallback: ask the filesystem directly, so that case-insensitive
    # filesystems keep accepting a stem that differs from the image only in
    # letter case. This runs only for labels not matched by the set lookup.
    has_image = any(
        os.path.isfile(os.path.join(images_dir, base_name + ext))
        for ext in image_extensions
    )
    if not has_image:
        missing_images.append(label_file)

if missing_images:
    print("Label files missing corresponding images:")
    for label in missing_images:
        print(label)
else:
    print("All label files have corresponding images.")

All label files have corresponding images.


In [9]:
import hashlib

# Remove duplicate images and their labels
def file_hash(filepath):
    """Return the hexadecimal MD5 digest of the file at `filepath`.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    its full contents are never held in memory at once.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # read() returns b"" at end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Remove byte-identical duplicate images together with their label files.
# WARNING: this cell deletes files from images_dir and labels_dir.
hashes = {}      # content hash -> name of the image that is kept
duplicates = []  # names of the images that get deleted

# os.listdir() returns names in arbitrary order. Sorting makes the surviving
# copy deterministic: among identical files the lexicographically first name
# is kept.
image_files = sorted(
    name
    for name in os.listdir(images_dir)
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Pass 1: classify every image as kept or duplicate without touching the disk.
for img_file in image_files:
    h = file_hash(os.path.join(images_dir, img_file))
    if h in hashes:
        duplicates.append(img_file)
    else:
        hashes[h] = img_file

# Labels are looked up by stem only, so images that differ just in extension
# (e.g. "a.jpg" and "a.jpeg") share one label file. Remember the stems of all
# surviving images so that a shared label is never removed.
kept_stems = {os.path.splitext(name)[0] for name in hashes.values()}

# Pass 2: delete the duplicates and any label no surviving image needs.
for img_file in duplicates:
    print(f"Deleting duplicate image: {img_file}")
    os.remove(os.path.join(images_dir, img_file))

    base_name = os.path.splitext(img_file)[0]
    if base_name in kept_stems:
        # The label still belongs to a kept image with the same stem.
        continue

    label_file = base_name + '.txt'
    label_path = os.path.join(labels_dir, label_file)
    if os.path.isfile(label_path):
        print(f"Deleting label: {label_file}")
        os.remove(label_path)

print(f"Deleted {len(duplicates)} duplicate images and their labels.")

Deleted 0 duplicate images and their labels.
