In [None]:
# standard imports
import os  # used for interacting with the operating system, such as file and directory manipulation
import shutil  # used for high-level file operations like copying and removing files
import random  # used for generating random numbers and making random selections
import time  # used for time-related functions, such as adding delays

In [None]:
# project_code\
# │
# ├── project_comp6721_venv\  # Virtual environment
# │
# ├── data\
# │   ├── original\
# │   │   └── places365_standard\
# │   │       ├── train\  # Contains folders for each class
# │   │       │   ├── airplane_cabin\
# │   │       │   ├── arena-hockey\
# │   │       │   ├── movie_theater-indoor\
# │   │       │   ├── staircase\
# │   │       │   ├── supermarket\
# │   │       │   └── ...  # Other classes
# │   │       └── val\  # Validation images (not used in this script)
# │   │
# │   └── processed\  # New dataset to be created
# │       ├── train_val\
# │       │   ├── airplane_cabin\
# │       │   ├── hockey_arena\
# │       │   ├── movie_theater\
# │       │   ├── staircase\
# │       │   └── supermarket\
# │       │
# │       └── test\
# │           ├── airplane_cabin\
# │           ├── hockey_arena\
# │           ├── movie_theater\
# │           ├── staircase\
# │           └── supermarket\
# │
# └── notebooks\  # Python code for data processing, model training, etc.
#     ├── create_project_dataset.ipynb   
#     └── comp6721_project.ipynb 



In [None]:
# define constants
# --- sampling ---
RANDOM_SEED = 42  # fixed seed so the random selection can be repeated
NUM_IMAGES_PER_CLASS = 1000  # how many images are drawn from every class

# --- splitting ---
TEST_SPLIT_PERCENTAGE = 0.15  # fraction of the drawn images held out for testing
TRAIN_VAL_SPLIT_PERCENTAGE = 1 - TEST_SPLIT_PERCENTAGE  # everything else: training and validation

# --- copying ---
BATCH_SIZE = 50  # images copied per batch

# --- class folders ---
# Each pair is (folder name in the original dataset, folder name in the new dataset).
# The two lists are index-aligned: position i of one corresponds to position i of the other.
CLASSES_ORIGINAL, CLASSES_NEW = map(list, zip(
    ('airplane_cabin', 'airplane_cabin'),
    ('arena-hockey', 'hockey_arena'),
    ('movie_theater-indoor', 'movie_theater'),
    ('staircase', 'staircase'),
    ('supermarket', 'supermarket'),
))


In [None]:
# define paths of the original dataset and the new dataset
def resolve_dataset_path(env_var, default):
    """
    Returns the dataset path to use for one location.

    Inputs:
    - env_var: Name of the environment variable that may override the path.
    - default: Path used when the environment variable is unset or empty.

    Output: the value of the environment variable if it is set to a non-empty string, otherwise `default`.
    """
    return os.environ.get(env_var) or default


# The defaults below are absolute paths on the original author's machine.
# To run the notebook elsewhere, set COMP6721_ORIGINAL_DATASET_PATH and
# COMP6721_NEW_DATASET_PATH before starting the kernel instead of editing this cell.
ORIGINAL_DATASET_PATH = resolve_dataset_path(
    'COMP6721_ORIGINAL_DATASET_PATH',
    r'C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\original\places365_standard\train',
)
NEW_DATASET_PATH = resolve_dataset_path(
    'COMP6721_NEW_DATASET_PATH',
    r'C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\processed',
)


In [None]:
# create new dataset directory structure
def create_directory_structure(base_path=None, class_names=None):
    """
    Creates the directory structure for the new dataset: one 'train_val' and one 'test' directory,
    each containing one sub-directory per class.

    Inputs (both optional, so existing calls without arguments keep working):
    - base_path: Root directory under which the structure is created. Defaults to the global NEW_DATASET_PATH.
    - class_names: Iterable of class directory names. Defaults to the global CLASSES_NEW.

    No outputs. The function creates directories in the file system and prints, for each one,
    whether it was newly created or already existed. Calling it again on an existing structure is safe.
    """
    if base_path is None:
        base_path = NEW_DATASET_PATH
    if class_names is None:
        class_names = CLASSES_NEW

    for split in ['train_val', 'test']:
        for class_name in class_names:
            target_dir = os.path.join(base_path, split, class_name)
            # Check before creating so the message reflects what actually happened;
            # makedirs(exist_ok=True) itself gives no indication either way.
            already_present = os.path.isdir(target_dir)
            os.makedirs(target_dir, exist_ok=True)
            if already_present:
                print(f"Directory already exists: {target_dir}")
            else:
                print(f"Created directory: {target_dir}")



In [None]:
# function to copy images to new directories
def copy_images_in_batches(src_dir, dest_dir, file_list, batch_size, delay=0.5):
    """
    Copies images from the source directory to the destination directory in batches.

    Inputs:
    - src_dir: Source directory containing the original images.
    - dest_dir: Destination directory where images will be copied. It must already exist.
    - file_list: List of image filenames (relative to src_dir) to be copied.
    - batch_size: Number of images to copy in each batch. Must be at least 1.
    - delay: Seconds to pause between two consecutive batches (default 0.5). Use 0 to disable the pause.

    No outputs. The function copies files and prints one status line per copied file.

    Raises:
    - ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() fail with an unrelated-looking message and a negative
        # step yields an empty range, which would silently copy nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(file_list)
    for start in range(0, total, batch_size):
        for file_name in file_list[start:start + batch_size]:
            src_file = os.path.join(src_dir, file_name)
            dest_file = os.path.join(dest_dir, file_name)
            shutil.copyfile(src_file, dest_file)
            print(f"Copied {src_file} to {dest_file}")

        # Pause only when another batch follows; sleeping after the last batch
        # just delays the caller without relieving the file system.
        more_batches_follow = start + batch_size < total
        if more_batches_follow and delay > 0:
            time.sleep(delay)


In [None]:
# extract and distribute images
def distribute_images():
    """
    Extracts images from the original dataset, randomly selects a specified number of images per class,
    and distributes them into training/validation and testing directories.

    For each (original, new) class pair taken from CLASSES_ORIGINAL / CLASSES_NEW the function:
    1. lists the regular files of the original class folder in sorted order,
    2. draws NUM_IMAGES_PER_CLASS of them with a generator seeded by RANDOM_SEED,
    3. copies the first part of the draw to 'train_val' and the remainder to 'test',
       so the two splits never share a file.

    A class is reported and skipped when its source folder is missing or when it holds
    fewer than NUM_IMAGES_PER_CLASS files.

    No inputs. Uses global constants and paths to locate the original dataset and create the new dataset.

    No outputs. The function copies files and prints the status of the process.
    """
    # A dedicated generator gives the same draws as seeding the module-level one,
    # without altering the global random state.
    rng = random.Random(RANDOM_SEED)

    # The split sizes do not depend on the class, so compute them once.
    # round() avoids losing an image to float truncation (e.g. int(100 * 0.29) == 28).
    num_test_images = round(NUM_IMAGES_PER_CLASS * TEST_SPLIT_PERCENTAGE)
    num_train_val_images = NUM_IMAGES_PER_CLASS - num_test_images

    for original_class, new_class in zip(CLASSES_ORIGINAL, CLASSES_NEW):
        class_path = os.path.join(ORIGINAL_DATASET_PATH, original_class)
        if not os.path.isdir(class_path):
            print(f"Directory does not exist: {class_path}")
            continue

        # Directory listings come back in arbitrary, file-system dependent order; sorting is
        # what makes the seeded sample reproducible. Sub-directories are left out because
        # they cannot be copied as files.
        with os.scandir(class_path) as entries:
            all_images = sorted(entry.name for entry in entries if entry.is_file())
        if len(all_images) < NUM_IMAGES_PER_CLASS:
            print(f"Not enough images in {class_path}")
            continue

        # select images randomly from the entire range of the original dataset
        selected_images = rng.sample(all_images, NUM_IMAGES_PER_CLASS)

        # split the selected images into train_val and test sets
        train_val_images = selected_images[:num_train_val_images]
        test_images = selected_images[num_train_val_images:]

        # copy images to the new dataset structure
        copy_images_in_batches(class_path, os.path.join(NEW_DATASET_PATH, 'train_val', new_class), train_val_images, BATCH_SIZE)
        copy_images_in_batches(class_path, os.path.join(NEW_DATASET_PATH, 'test', new_class), test_images, BATCH_SIZE)


In [None]:
def main():
    """
    Entry point that builds the new dataset.

    Runs the two stages in order: the directory structure is created first,
    then the images are distributed into it. A confirmation line is printed at the end.
    """
    for stage in (create_directory_structure, distribute_images):
        stage()
    print("New dataset created successfully!")

In [None]:
# Run the whole pipeline: create the directory structure, then distribute the images into it.
main()