Splitting Database

In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Paths
original_dataset_path = r"C:\Users\kiezi\bloomsight-flowers\dataset\flowers"  # Path to your dataset
output_dataset_path = r"C:\Users\kiezi\bloomsight-flowers\second_output"  # Path to store the split dataset

# Splitting ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Ensure the output directories exist
for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(output_dataset_path, split), exist_ok=True)

# Iterate over each category (e.g., daisy, rose, etc.)
for category in os.listdir(original_dataset_path):
    category_path = os.path.join(original_dataset_path, category)
    
    if not os.path.isdir(category_path):  # Skip non-directory files
        continue
    
    # Get all files in the category (filter image files)
    files = [os.path.join(category_path, f) for f in os.listdir(category_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    # Skip categories with no files
    if not files:
        print(f"No files found in category: {category}")
        continue
    
    # Split files into train, val, and test
    train_files, temp_files = train_test_split(files, test_size=(val_ratio + test_ratio), random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=42)
    
    # Function to copy files to respective directories
    def copy_files(files, split):
        split_category_path = os.path.join(output_dataset_path, split, category)
        os.makedirs(split_category_path, exist_ok=True)
        for file in files:
            shutil.copy(file, split_category_path)
    
    # Copy files to respective directories
    copy_files(train_files, 'train')
    copy_files(val_files, 'val')
    copy_files(test_files, 'test')

print("Dataset successfully split into train, val, and test sets.")


Dataset successfully split into train, val, and test sets.


Data Preprocessing

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Path to the dataset
dataset_dir = r"C:\Users\kiezi\bloomsight-flowers\dataset\flowers"

# Create train, val, and test directories
train_dir = r"C:\Users\kiezi\bloomsight-flowers\second_ouput\train"
val_dir = r"C:\Users\kiezi\bloomsight-flowers\second_ouput\val"
test_dir = r"C:\Users\kiezi\bloomsight-flowers\second_ouput\test"

# Create the directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Get the list of class directories
class_dirs = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))]

# Split data for each class
for class_dir in class_dirs:
    class_path = os.path.join(dataset_dir, class_dir)
    
    # Get the list of image files in each class
    class_images = [f for f in os.listdir(class_path) if os.path.isfile(os.path.join(class_path, f))]
    
    # Split the data into training, validation, and test sets (80% train, 10% val, 10% test)
    train_images, temp_images = train_test_split(class_images, test_size=0.2, random_state=42)
    val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=42)
    
    # Define function to move images to the appropriate directories
    def move_images(image_list, target_dir):
        for image in image_list:
            # Create class directory if it doesn't exist
            class_dir_path = os.path.join(target_dir, class_dir)
            if not os.path.exists(class_dir_path):
                os.makedirs(class_dir_path)
            
            # Move image to the appropriate class folder
            shutil.move(os.path.join(class_path, image), os.path.join(class_dir_path, image))

    # Move the images to their respective directories
    move_images(train_images, train_dir)
    move_images(val_images, val_dir)
    move_images(test_images, test_dir)

print("Dataset successfully split and moved to train, val, and test directories.")


Dataset successfully split and moved to train, val, and test directories.


Merging Dataset

In [10]:
import os
import shutil
from pathlib import Path

# Paths to the two datasets to merge
dataset1_path = r"C:\Users\kiezi\bloomsight-flowers\final_dataset"
dataset2_path = r"C:\Users\kiezi\bloomsight-flowers\Flower_Classification"

# Path to the merged dataset
merged_dataset_path = r"C:\Users\kiezi\bloomsight-flowers\final_na_dataset"

# Create directories for the merged dataset (train, val, test)
splits = ['train', 'val', 'test']
for split in splits:
    split_dir = os.path.join(merged_dataset_path, split)
    os.makedirs(split_dir, exist_ok=True)

# Function to merge two directories
def merge_directories(source_dir, target_dir):
    if not os.path.exists(source_dir):
        print(f"Source directory {source_dir} does not exist, skipping.")
        return

    for class_name in os.listdir(source_dir):
        source_class_dir = os.path.join(source_dir, class_name)
        target_class_dir = os.path.join(target_dir, class_name)

        # Skip if not a directory
        if not os.path.isdir(source_class_dir):
            continue

        # Create class directory in target if it doesn't exist
        os.makedirs(target_class_dir, exist_ok=True)

        # Copy files from source to target
        for file_name in os.listdir(source_class_dir):
            source_file = os.path.join(source_class_dir, file_name)
            target_file = os.path.join(target_class_dir, file_name)

            # Avoid overwriting by renaming if a file with the same name exists
            if os.path.exists(target_file):
                file_stem = Path(file_name).stem
                file_ext = Path(file_name).suffix
                target_file = os.path.join(
                    target_class_dir, f"{file_stem}_dup{file_ext}"
                )

            shutil.copy2(source_file, target_file)

# Merge the datasets for each split
for split in splits:
    source_dir1 = os.path.join(dataset1_path, split)
    source_dir2 = os.path.join(dataset2_path, split)
    target_dir = os.path.join(merged_dataset_path, split)

    print(f"Merging {split} set...")
    merge_directories(source_dir1, target_dir)
    merge_directories(source_dir2, target_dir)

print("Datasets successfully merged!")


Merging train set...
Source directory C:\Users\kiezi\bloomsight-flowers\Flower_Classification\train does not exist, skipping.
Merging val set...
Source directory C:\Users\kiezi\bloomsight-flowers\Flower_Classification\val does not exist, skipping.
Merging test set...
Source directory C:\Users\kiezi\bloomsight-flowers\Flower_Classification\test does not exist, skipping.
Datasets successfully merged!


Label of Dataset

In [8]:
import os
import pandas as pd 

# Path to the dataset
dataset_path = r"C:\Users\kiezi\bloomsight-flowers\dataset"  # Replace with your dataset folder path
output_file = r"C:\Users\kiezi\bloomsight-flowers\flower_labels.csv" # File to save the labeled data

# List to store image paths and labels
data = []

# Iterate through each class folder
for class_folder in os.listdir(dataset_path):
    class_path = os.path.join(dataset_path, class_folder)
    if os.path.isdir(class_path):  # Ensure it's a directory
        for image_file in os.listdir(class_path):
            image_path = os.path.join(class_path, image_file)
            data.append({'Image': image_path, 'Label': class_folder})

# Create a DataFrame
df = pd.DataFrame(data)

# Save to a CSV file
df.to_csv(output_file, index=False)

print(f"Labeled dataset saved to {output_file}")

ModuleNotFoundError: No module named 'pandas'