# Dataset Preparation for YOLO after Annotation

## Copy all images and label text files with matching names to a new folder

In [1]:
import os
import shutil

# Helper: strip the extension from every filename and de-duplicate the results
def get_filenames_without_extension(files):
    """Return the set of base names (extension removed) for ``files``.

    Args:
        files: Iterable of filename strings.

    Returns:
        A ``set`` holding ``os.path.splitext(name)[0]`` for every name, so
        names that differ only by extension collapse into a single entry.
    """
    stems = {os.path.splitext(name)[0] for name in files}
    return stems

# Path to the folder containing the images and text files
source_folder = "C://Users//3286089//Desktop//219_All_Annotated_Files"  # Replace with your folder path
destination_folder = "C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24"  # Replace with your folder path

# Supported image extensions, in priority order: when one base name exists
# with several image extensions, the earliest extension listed here wins.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')

# Ensure the destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Get the list of all files in the source folder
all_files = os.listdir(source_folder)

# Separate image and text files (extension match is case-insensitive)
image_files = [f for f in all_files if f.lower().endswith(image_extensions)]
text_files = [f for f in all_files if f.lower().endswith('.txt')]

# Get the base filenames (without extension) for both images and text files
image_filenames = get_filenames_without_extension(image_files)
text_filenames = get_filenames_without_extension(text_files)

# Find the common filenames
common_filenames = image_filenames.intersection(text_filenames)

# Map every base name to the filename exactly as it appears on disk.
# Rebuilding the name as base + lowercase extension would miss files such as
# "IMG.JPG" or "IMG.TXT" on a case-sensitive filesystem, even though the
# case-insensitive filters above selected them.
image_by_stem = {}
for ext in image_extensions:  # outer loop keeps the extension priority
    for f in image_files:
        if f.lower().endswith(ext):
            image_by_stem.setdefault(os.path.splitext(f)[0], f)
text_by_stem = {os.path.splitext(f)[0]: f for f in text_files}

# Copy the matching image and text files to the destination folder
for filename in common_filenames:
    # Full paths of the real files belonging to this base name
    image_path = os.path.join(source_folder, image_by_stem[filename])
    text_path = os.path.join(source_folder, text_by_stem[filename])

    # Copy the image and its corresponding text file
    shutil.copy(image_path, destination_folder)
    shutil.copy(text_path, destination_folder)

print(f'Copied {len(common_filenames)} images and text files to {destination_folder}')


Copied 5297 images and text files to C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24


## Move the images and label text files into separate `images` and `labels` subfolders

In [2]:
import os
import shutil

# Root folder that currently holds the images and text files side by side
main_folder = 'C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24'  # Replace with your folder path

# Target subfolders; both are created if they are missing
images_folder = os.path.join(main_folder, 'images')
labels_folder = os.path.join(main_folder, 'labels')
for subfolder in (images_folder, labels_folder):
    os.makedirs(subfolder, exist_ok=True)

# List the main folder once, before anything is moved
all_files = os.listdir(main_folder)

# Route each entry by its case-insensitive extension; anything else stays put
for entry in all_files:
    lowered = entry.lower()
    if lowered.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
        target_folder = images_folder
    elif lowered.endswith('.txt'):
        target_folder = labels_folder
    else:
        continue
    shutil.move(os.path.join(main_folder, entry), target_folder)

print(f'Files have been organized into {images_folder} and {labels_folder}.')


Files have been organized into C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24\images and C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24\labels.


## Preparing the YOLO dataset (train, validation, test)

In [5]:
import os
import shutil
import random

# Paths
source_img_dir = "C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24//images"  # Directory where images are located
source_txt_dir = "C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24//labels"   # Directory where text files are located
train_dir = "C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24//train"  # Directory to store training set
test_dir = "C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24//test"    # Directory to store testing set
val_dir = "C://Users//3286089//Desktop//219_New_YOLO_training_dataset_5aug24//val"      # Directory to store validation set

# Create directories if they don't exist
for dir_path in [train_dir, val_dir, test_dir]:
    os.makedirs(os.path.join(dir_path, 'images'), exist_ok=True)
    os.makedirs(os.path.join(dir_path, 'labels'), exist_ok=True)

# Get list of image and text files
image_files = [f for f in os.listdir(source_img_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
text_files = [f for f in os.listdir(source_txt_dir) if f.lower().endswith('.txt')]

# Get filenames without extensions for matching
image_filenames = set(os.path.splitext(f)[0] for f in image_files)
text_filenames = set(os.path.splitext(f)[0] for f in text_files)

# Ensure we have matching image and text files
common_filenames = image_filenames.intersection(text_filenames)

# Filter the matching image and text files
image_files = [f for f in image_files if os.path.splitext(f)[0] in common_filenames]
text_files = [f for f in text_files if os.path.splitext(f)[0] in common_filenames]

# Sanity check: the dataset is expected to contain exactly this many
# image/label pairs (update the constant when the dataset changes)
expected_pair_count = 5327
assert len(image_files) == expected_pair_count and len(text_files) == expected_pair_count, (
    f"There should be {expected_pair_count} images and {expected_pair_count} matching text files, "
    f"but found {len(image_files)} images and {len(text_files)} text files."
)

# Pair every image with its own label by base name. The two lists come from
# separate os.listdir() calls, whose ordering is arbitrary, so zipping them
# positionally could put an image and its label into different splits.
image_by_stem = {os.path.splitext(f)[0]: f for f in image_files}
text_by_stem = {os.path.splitext(f)[0]: f for f in text_files}
combined = [(image_by_stem[stem], text_by_stem[stem]) for stem in sorted(common_filenames)]

# Shuffle the pairs with a fixed seed on a sorted list, so re-running the cell
# reproduces the same split instead of copying a different one on top of the
# previous run (which would leak samples between train/val/test)
split_seed = 42
random.Random(split_seed).shuffle(combined)
image_files, text_files = zip(*combined)

# Define counts
train_count = 3729
val_count = 1065
test_count = 533
assert train_count + val_count + test_count == len(combined), (
    f"Split sizes sum to {train_count + val_count + test_count}, "
    f"but there are {len(combined)} image/label pairs."
)

# Split files
train_images = image_files[:train_count]
train_texts = text_files[:train_count]

val_images = image_files[train_count:train_count + val_count]
val_texts = text_files[train_count:train_count + val_count]

# The test split takes everything that remains (test_count items, per the check above)
test_images = image_files[train_count + val_count:]
test_texts = text_files[train_count + val_count:]

# Copy a list of files into the matching sub-folder of a split directory
def copy_files(files, source_dir, destination_dir, file_type):
    """Copy ``files`` from ``source_dir`` into ``destination_dir/file_type``.

    Args:
        files: Iterable of filenames located in ``source_dir``.
        source_dir: Directory the files are read from.
        destination_dir: Root directory of the split.
        file_type: Name of the sub-folder inside ``destination_dir`` that
            receives the copies (called with 'images' or 'labels' below).

    The sub-folder is not created here. The originals stay in place because
    the files are copied with ``shutil.copy2`` rather than moved.
    """
    target_dir = os.path.join(destination_dir, file_type)
    for name in files:
        shutil.copy2(os.path.join(source_dir, name), os.path.join(target_dir, name))

# Copy each split in turn (train, then val, then test): images first, then labels
for split_dir, split_images, split_texts in (
    (train_dir, train_images, train_texts),
    (val_dir, val_images, val_texts),
    (test_dir, test_images, test_texts),
):
    copy_files(split_images, source_img_dir, split_dir, 'images')
    copy_files(split_texts, source_txt_dir, split_dir, 'labels')

print("Files have been successfully copied into train, validation, and test sets.")


Files have been successfully copied into train, validation, and test sets.
