In [1]:
import os
import random
import shutil
from tqdm import tqdm

def create_folder(folder_path):
    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

def split_dataset(source_folder, train_folder, test_folder, validation_folder):
    # Iterate over each class (subfolder) in the source_folder
    for class_folder in tqdm(os.listdir(source_folder), desc="Processing class folders"):
        class_path = os.path.join(source_folder, class_folder)

        # Check if it is a directory
        if os.path.isdir(class_path):
            # List all images in the class folder
            images = os.listdir(class_path)
            # Shuffle the list of images
            random.shuffle(images)

            # Calculate the number of images for each set
            total_images = len(images)
            train_split = int(0.8 * total_images)
            test_split = int(0.2 * total_images)

            # Split the images into train, test, and validation sets
            train_images = images[:train_split]
            test_images = images[train_split:train_split + test_split]

            # Create destination folders if they don't exist
            train_class_folder = os.path.join(train_folder, class_folder)
            test_class_folder = os.path.join(test_folder, class_folder)

            create_folder(train_class_folder)
            create_folder(test_class_folder)

            # Copy the images to their respective folders
            for image in train_images:
                source_path = os.path.join(class_path, image)
                destination_path = os.path.join(train_class_folder, image)
                shutil.copyfile(source_path, destination_path)

            for image in test_images:
                source_path = os.path.join(class_path, image)
                destination_path = os.path.join(test_class_folder, image)
                shutil.copyfile(source_path, destination_path)

# Replace these paths with your actual paths
source_folder = "snakes"


train_folder = "train"
test_folder = "test"

# Create master folders if they don't exist
create_folder(train_folder)
create_folder(test_folder)

# Call the function to perform the split
split_dataset(source_folder, train_folder, test_folder, validation_folder)

Processing class folders: 100%|██████████| 38/38 [00:05<00:00,  7.53it/s]


In [5]:
import os

def count_images(folder):
    total_images = 0
    # Iterate over each class (subfolder) in the specified folder
    for class_folder in os.listdir(folder):
        class_path = os.path.join(folder, class_folder)

        # Check if it is a directory
        if os.path.isdir(class_path):
            # Count the number of images in the class folder and its subfolders
            images = [f for f in os.listdir(class_path) if os.path.isfile(os.path.join(class_path, f))]
            total_images += len(images)

    return total_images

# Count the number of images in each set
train_images_count = count_images(train_folder)
test_images_count = count_images(test_folder)
# Display the results
print(f"Number of images in train set: {train_images_count}")
print(f"Number of images in test set: {test_images_count}")


Number of images in train set: 6195
Number of images in test set: 1538
