In [1]:
import os

# Dataset locations, followed by a quick size check of both folders.
# Entries whose name starts with '.' (hidden files) are left out of the counts.
images_dir = r'../dataset/compiled-new/images'
labels_dir = r'../dataset/compiled-new/labels'

num_images = sum(1 for entry in os.listdir(images_dir) if not entry.startswith('.'))
num_labels = sum(1 for entry in os.listdir(labels_dir) if not entry.startswith('.'))

print(f"Number of images: {num_images}")
print(f"Number of labels: {num_labels}")

Number of images: 9561
Number of labels: 9561


In [2]:
# Collect every image whose annotation file (<stem>.txt) is absent from labels_dir.
# Image files are recognised by extension, compared case-insensitively.
missing_labels = [
    img_file
    for img_file in os.listdir(images_dir)
    if img_file.lower().endswith(('.jpg', '.jpeg', '.png'))
    and not os.path.isfile(
        os.path.join(labels_dir, os.path.splitext(img_file)[0] + '.txt')
    )
]

if not missing_labels:
    print("All images have corresponding label files.")
else:
    print("Images missing label files:")
    for orphan in missing_labels:
        print(orphan)

All images have corresponding label files.


In [3]:
# Check for label files without corresponding image files.
#
# Image extensions are matched case-insensitively, the same rule the
# image-side loops in this notebook use. Probing for the exact names
# "<stem>.jpg" / ".jpeg" / ".png" would wrongly report "photo.txt" as
# orphaned on a case-sensitive filesystem when the image is "photo.JPG".
# Listing the image folder once also avoids several filesystem probes per label.
image_stems = {
    os.path.splitext(image_name)[0]
    for image_name in os.listdir(images_dir)
    if image_name.lower().endswith(('.jpg', '.jpeg', '.png'))
}

# Sorted so the report is stable from run to run (os.listdir order is arbitrary).
missing_images = sorted(
    label_name
    for label_name in os.listdir(labels_dir)
    if label_name.lower().endswith('.txt')
    and os.path.splitext(label_name)[0] not in image_stems
)

if missing_images:
    print("Label files missing corresponding images:")
    for label in missing_images:
        print(label)
else:
    print("All label files have corresponding images.")

All label files have corresponding images.


In [4]:
import hashlib

# Remove duplicate images and their labels
def file_hash(filepath):
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 4096-byte reads, so a
    large file never has to be held in memory in one piece.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # read() returns b"" at end of file
                break
            digest.update(block)
    return digest.hexdigest()

hashes = {}      # content hash -> name of the first image seen with that content (kept)
duplicates = []  # names of images deleted because their content was already seen

# Iterate in sorted order: os.listdir() returns entries in arbitrary order, so
# without sorting the copy that survives could differ between runs/machines.
for img_file in sorted(os.listdir(images_dir)):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(images_dir, img_file)
    h = file_hash(img_path)

    if h not in hashes:
        # First occurrence of this content: remember it and keep the file.
        hashes[h] = img_file
        continue

    # Duplicate found: delete image and its label
    print(f"Deleting duplicate image: {img_file}")
    os.remove(img_path)

    base_name = os.path.splitext(img_file)[0]
    label_file = base_name + '.txt'
    label_path = os.path.join(labels_dir, label_file)

    # If the kept image has the same stem (e.g. "a.jpg" kept, "a.png" removed),
    # "<stem>.txt" is also the kept image's label and must not be deleted.
    kept_base_name = os.path.splitext(hashes[h])[0]
    if base_name != kept_base_name and os.path.isfile(label_path):
        print(f"Deleting label: {label_file}")
        os.remove(label_path)

    duplicates.append(img_file)

print(f"Deleted {len(duplicates)} duplicate images and their labels.")

Deleted 0 duplicate images and their labels.


In [5]:
import os

# Check for empty .txt label files.
#
# A label counts as empty when it holds no non-whitespace bytes. Checking only
# for a size of 0 bytes would miss files that contain just a newline or
# spaces, which carry no annotation either.
empty_labels = []

for label_file in sorted(os.listdir(labels_dir)):
    if not label_file.lower().endswith('.txt'):
        continue
    label_path = os.path.join(labels_dir, label_file)
    # Read as bytes so the check does not depend on the file's text encoding.
    with open(label_path, 'rb') as f:
        if not f.read().strip():
            empty_labels.append(label_file)

if empty_labels:
    print("Empty label files found:")
    for label in empty_labels:
        print(label)
else:
    print("No empty label files found.")

No empty label files found.


In [6]:
from PIL import Image

# Remove corrupted images and their labels.
#
# WARNING: destructive - files are deleted from images_dir / labels_dir.
corrupted_images = []

for img_file in sorted(os.listdir(images_dir)):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(images_dir, img_file)
    try:
        # Pass 1: structural check. verify() does not decode the pixel data,
        # so a truncated file can still get through it.
        with Image.open(img_path) as img:
            img.verify()
        # Pass 2: full decode. The image object cannot be reused after
        # verify(), so the file is reopened; load() forces the pixel data to
        # be read and raises if it is truncated or undecodable.
        with Image.open(img_path) as img:
            img.load()
    except Exception as e:
        # Any failure to open/verify/decode is treated as "corrupted".
        print(f"Corrupted or unreadable image: {img_file} ({e})")
        os.remove(img_path)
        base_name = os.path.splitext(img_file)[0]
        label_file = base_name + '.txt'
        label_path = os.path.join(labels_dir, label_file)
        if os.path.isfile(label_path):
            print(f"Deleting label: {label_file}")
            os.remove(label_path)
        corrupted_images.append(img_file)

print(f"Deleted {len(corrupted_images)} corrupted images and their labels.")

Deleted 0 corrupted images and their labels.


In [7]:
import cv2
import os

def is_blurry(image_path, threshold=100):
    """Tell whether the image at ``image_path`` should be treated as blurry.

    The image is read as grayscale and the variance of its Laplacian is
    compared with ``threshold``: a variance strictly below the threshold
    means blurry. An image that OpenCV cannot read (``imread`` returns
    ``None``) is reported as blurry as well.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        return cv2.Laplacian(gray, cv2.CV_64F).var() < threshold
    return True

# Names of all images in images_dir that is_blurry() flags when called with
# its default threshold.
blurry_images = [
    img_file
    for img_file in os.listdir(images_dir)
    if img_file.lower().endswith(('.jpg', '.jpeg', '.png'))
    and is_blurry(os.path.join(images_dir, img_file))
]

print(f"Found {len(blurry_images)} blurry images.")


Found 8943 blurry images.


In [13]:
import os
import cv2
import shutil

# Source dataset folders
images_dir = r'../dataset/compiled-new/images'
labels_dir = r'../dataset/compiled-new/labels'
# Destination folders for the blur-filtered copy
output_images_dir = r'../dataset-allin/images'
output_labels_dir = r'../dataset-allin/labels'

# Create both destinations up front; folders that already exist are left alone.
for out_dir in (output_images_dir, output_labels_dir):
    os.makedirs(out_dir, exist_ok=True)

def is_blurry(image_path, threshold=80):
    """Classify an image as blurry using the variance of its Laplacian.

    Returns a truthy value when the grayscale image's Laplacian variance is
    strictly less than ``threshold``, or when OpenCV fails to read the file
    (``imread`` yields ``None``); otherwise a falsy value.
    """
    pixels = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    unreadable = pixels is None
    return True if unreadable else cv2.Laplacian(pixels, cv2.CV_64F).var() < threshold

kept_count = 0  # images copied to the output folder
for img_file in os.listdir(images_dir):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(images_dir, img_file)
    # The threshold is passed explicitly (100), so is_blurry's own default
    # value is not what decides here.
    if is_blurry(img_path, threshold=100):
        continue

    # Sharp enough: copy the image ...
    shutil.copy2(img_path, os.path.join(output_images_dir, img_file))

    # ... and its label, when one exists next to it in labels_dir.
    label_file = os.path.splitext(img_file)[0] + '.txt'
    label_path = os.path.join(labels_dir, label_file)
    if os.path.isfile(label_path):
        shutil.copy2(label_path, os.path.join(output_labels_dir, label_file))

    kept_count += 1

print(f"Copied {kept_count} non-blurry images and their labels to {output_images_dir} and {output_labels_dir}.")

Copied 618 non-blurry images and their labels to ../dataset-allin/images and ../dataset-allin/labels.


DATA PREP

In [9]:
import os
import shutil

# Sort images into one folder per class, using the class id that starts the
# first annotation line of each image's label file.
#
# NOTE: this rebinds images_dir / labels_dir to the Kaggle archive. Cells that
# delete files based on these names (duplicate / corrupted-image removal)
# would operate on this folder if they were re-run after this cell.
images_dir = r'../dataset-v2/kaggle-good/archive'
labels_dir = r'../dataset-v2/kaggle-good/archive'
output_base = r'../dataset-allin/sorted'

# Class ids as they appear in the label files (compared as strings).
class_ids = ('0', '1', '2')

# Make output folders for each class
for class_id in class_ids:
    os.makedirs(os.path.join(output_base, class_id), exist_ok=True)

moved = 0  # number of images copied (the source files are left in place)
for img_file in sorted(os.listdir(images_dir)):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    label_file = os.path.splitext(img_file)[0] + '.txt'
    label_path = os.path.join(labels_dir, label_file)
    if not os.path.isfile(label_path):
        continue

    # Take the first NON-BLANK line: reading only the first physical line
    # would silently drop an image whose label starts with an empty line.
    # The file is closed again before the copy starts.
    with open(label_path, 'r') as f:
        first_entry = next((line.split() for line in f if line.strip()), None)
    if not first_entry:
        continue  # label has no annotation lines at all

    class_id = first_entry[0]
    if class_id in class_ids:
        dest_dir = os.path.join(output_base, class_id)
        shutil.copy2(os.path.join(images_dir, img_file), os.path.join(dest_dir, img_file))
        moved += 1

print(f"Copied {moved} images into class folders 0, 1, 2 under {output_base}.")

Copied 1221 images into class folders 0, 1, 2 under ../dataset-allin/sorted.


In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Split each class folder into train / val / test subsets and copy the files.
#
# NOTE: files already present under output_base are not removed. Re-running
# with a different seed, ratio or input set can therefore leave the same image
# in more than one split - clear output_base first in that case.
base_dir = r'../dataset-allin/sorted'
output_base = r'../dataset-allin/split'
splits = ['train', 'val', 'test']
split_ratio = [0.7, 0.15, 0.15]  # 70% train, 15% val, 15% test

assert len(split_ratio) == len(splits), "one ratio per split is required"
assert abs(sum(split_ratio) - 1.0) < 1e-9, "split_ratio must sum to 1"

# Derive both split fractions from split_ratio so that editing the ratios
# above really changes the val/test proportions. The hold-out share is built
# from the val and test ratios rather than `1 - split_ratio[0]`, because
# 1 - 0.7 evaluates to 0.30000000000000004 and that excess can round the
# hold-out size up by one sample.
holdout_fraction = split_ratio[1] + split_ratio[2]
test_share_of_holdout = split_ratio[2] / holdout_fraction

for split in splits:
    for class_id in ['0', '1', '2']:
        os.makedirs(os.path.join(output_base, split, class_id), exist_ok=True)

for class_id in ['0', '1', '2']:
    class_dir = os.path.join(base_dir, class_id)
    # Sorted so the fixed random_state yields the same split on every machine.
    images = sorted(
        f for f in os.listdir(class_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    # Stage 1: train vs. hold-out; stage 2: hold-out into val vs. test.
    train_imgs, temp_imgs = train_test_split(images, test_size=holdout_fraction, random_state=42)
    val_imgs, test_imgs = train_test_split(temp_imgs, test_size=test_share_of_holdout, random_state=42)

    for split, split_imgs in zip(splits, (train_imgs, val_imgs, test_imgs)):
        for img in split_imgs:
            shutil.copy2(os.path.join(class_dir, img), os.path.join(output_base, split, class_id, img))

    print(f"Class {class_id}: {len(train_imgs)} train, {len(val_imgs)} val, {len(test_imgs)} test images copied.")

Class 0: 482 train, 103 val, 104 test images copied.
Class 1: 547 train, 117 val, 118 test images copied.
Class 2: 257 train, 55 val, 56 test images copied.
