$$\Huge\textbf{Image Preprocessing}$$

# Imports

In [1]:
# standard library imports
import os  #  directory and file operations
import shutil  #  copying files
import  time  #  adding delays

# installed library imports
import numpy as np  #  numerical operations
from sklearn.model_selection import train_test_split  #  splitting datasets
from PIL import Image  #  image processing
import torchvision.transforms as transforms  #  data augmentation
from tqdm import tqdm  # Progress bar for visualization
from sklearn.model_selection import StratifiedKFold



# Global Constants

In [2]:
# global constants
IMAGE_SIZE = (256, 256)
NO_CV_VALIDATION_SPLIT = 15 / 85  # train_val is 85% of the total dataset, we want the validation set to be 15% of the total dataset
CLASSES = ['airplane_cabin', 'hockey_arena', 'movie_theater', 'staircase', 'supermarket']
RANDOM_SEED = 42
BATCH_SIZE = 100
NUM_AUG_IMG = 1  # number of augmentated image per image of the train set

In [3]:
# data directories
RAW_DATA_DIR = r'C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\raw_data'
RAW_TRAIN_VAL_DIR = os.path.join(RAW_DATA_DIR, 'train_val')
RAW_TEST_DIR = os.path.join(RAW_DATA_DIR, 'test')

# processed data directories: case without cross validation
PROCESSED_DATA_DIR = r'C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\processed_data'
NO_CV_TRAIN_DIR = os.path.join(PROCESSED_DATA_DIR, 'without_cross_validation','train')  # new directory to be created
NO_CV_VAL_DIR = os.path.join(PROCESSED_DATA_DIR, 'without_cross_validation', 'validation')  # new directory to be created
NO_CV_TEST_DIR = os.path.join(PROCESSED_DATA_DIR, 'without_cross_validation', 'test')  # new directory to be created
CV_DIR = os.path.join(PROCESSED_DATA_DIR, 'with_cross_validation')# new directory to be created

# Dataset Structure

We following cell illustrate the complete structure of how we organised our dataset for this project.

This notebook only deals with the creation of the directories and images inside of `\data\project_dataset\processed_data\`.

For the creation of the directories and images inside of `\data\project_dataset\raw_data\`, please refer to the `project_raw_dataset_creation.ipynb` notebook.

In [4]:
# project_code\
# │
# ├── project_comp6721_venv\  # Virtual environment
# │
# ├── data\
# │   ├── original\
# │   │   └── places365_standard\
# │   │       ├── train\  # Contains folders for each class
# │   │       │   ├── airplane_cabin\
# │   │       │   ├── arena-hockey\
# │   │       │   ├── movie_theater-indoor\
# │   │       │   ├── staircase\
# │   │       │   └── supermarket\
# │   │       │   └── ...  # other classes
# │   │       └── val\  # validation images (not used in the project)
# │   │
# │   └── project_dataset\  # New subdirectory to be created
# |       ├── raw_data
# |       |   ├── train_val\
# |       |   |   ├── airplane_cabin\
# |       |   |   ├── hockey_arena\
# |       |   |   ├── movie_theater\
# |       |   |   ├── staircase\
# |       |   |   └── supermarket\
# |       |   |
# |       |   └── test\
# |       |       ├── airplane_cabin\
# |       |       ├── hockey_arena\
# |       |       ├── movie_theater\
# |       |       ├── staircase\
# |       |       └── supermarket\
# |       |
# |       └── preprocessed_data\
# |           ├── without_cross_validation\
# |           |   ├── train\----------------------> augmented
# |           |   |   ├── airplane_cabin\
# |           |   |   ├── hockey_arena\
# |           |   |   ├── movie_theater\
# |           |   |   ├── staircase\
# |           |   |   └── supermarket\
# |           |   |   
# |           |   ├── validation\-----------------> 
# |           |   |   ├── airplane_cabin\
# |           |   |   ├── hockey_arena\
# |           |   |   ├── movie_theater\
# |           |   |   ├── staircase\
# |           |   |   └── supermarket\
# |           |   |   
# |           |   └── test\-----------------------> 
# |           |       ├── airplane_cabin\
# |           |       ├── hockey_arena\
# |           |       ├── movie_theater\
# |           |       ├── staircase\
# |           |       └── supermarket\
# |           |   
# |           └── with_cross_validation\
# |               ├── fold_1
# |               |   ├── train\----------------------> 
# |               |   |   ├── airplane_cabin\
# |               |   |   ├── hockey_arena\
# |               |   |   ├── movie_theater\
# |               |   |   ├── staircase\
# |               |   |   └── supermarket\
# |               |   |   
# |               |   ├── validation\-----------------> 
# |               |       ├── airplane_cabin\
# |               |       ├── hockey_arena\
# |               |       ├── movie_theater\
# |               |       ├── staircase\
# |               |       └── supermarket\
# |               |  
# |               ├── fold_2 (similar structure for fold 1)
# |               ├── fold_3 (similar structure for fold 1)
# |               ├── fold_4 (similar structure for fold 1)
# |               └── fold_5 (similar structure for fold 1)
# │
# └── notebooks\  # Python code for data processing, model training, etc.
#     ├── project_dataset_creation.ipynb 
#     ├── decision_tree_models.ipynb 
#     └── image_preprocessing.ipynb 

# 1. Data Preprocessing

## 1.1. Case Without Cross-Validation

### 1.1.1. Creation of the Empty Training, Validation and Testing Directories

In [5]:
# create the empty train, val, test directories and the subdirectories with the name of the classes
def create_empty_train_val_test_dir():
    """
    Create empty directories inside of \project_dataset\preprocessed_data\without_cross_validation\.
    
    No inputs or outputs. 
    """
    os.makedirs(NO_CV_TRAIN_DIR, exist_ok=True)
    os.makedirs(NO_CV_VAL_DIR, exist_ok=True)
    os.makedirs(NO_CV_TEST_DIR, exist_ok=True)
    for class_name in CLASSES:
        os.makedirs(os.path.join(NO_CV_TRAIN_DIR, class_name), exist_ok=True)
        os.makedirs(os.path.join(NO_CV_VAL_DIR, class_name), exist_ok=True)
        os.makedirs(os.path.join(NO_CV_TEST_DIR, class_name), exist_ok=True)

### 1.1.2. Copy Images in Batches

In [6]:
# copy images from the source directory to the destination directory
def copy_images_in_batches(src_dir, dest_dir, file_list, batch_size):
    """
    Copy images from the source directory to the destination directory in batches.
    Include a short delay to prevent overloading the file system.

    Inputs:
    - src_dir: Source directory containing the original images.
    - dest_dir: Destination directory where images will be copied.
    - file_list: List of image filenames to be copied.
    - batch_size: Number of images to copy in each batch.

    No outputs. The function copies files and prints the status of each batch.
    """
    for i in range(0, len(file_list), batch_size):
        batch = file_list[i:i + batch_size]
        for file_name in batch:
            src_file = os.path.join(src_dir, file_name)
            dest_file = os.path.join(dest_dir, file_name)
            shutil.copyfile(src_file, dest_file)
            print(f"Copied {src_file} to {dest_file}")
        time.sleep(0.5)  # add a short delay to prevent overloading the file system


### 1.1.3. Image Distribution in Training and Validation Sets

In [7]:
# distribute the images in train and validation sets
def split_train_val():
    """
    Split the data in train_val directory into training and validation sets.

    No inputs or outputs. This function splits the images and moves them to their respective directories.
    """
    for class_name in CLASSES:
        class_dir = os.path.join(RAW_TRAIN_VAL_DIR, class_name)
        images = os.listdir(class_dir)
        train_images, val_images = train_test_split(images, test_size=NO_CV_VALIDATION_SPLIT, random_state=RANDOM_SEED)

        # copy training images in batches
        copy_images_in_batches(class_dir, os.path.join(NO_CV_TRAIN_DIR, class_name), train_images, BATCH_SIZE)

        # copy validation images in batches
        copy_images_in_batches(class_dir, os.path.join(NO_CV_VAL_DIR, class_name), val_images, BATCH_SIZE)


### 1.1.4. Copy All the Images of the Raw Test Set

In [8]:
def copy_test_set():
    """
    Copy all images from \project_data\raw_data\test\ directory to new subfolders in the target \preprocessed_data\without_cross_validation\test\ directory.

    No inputs or outputs. This function copies all images to their respective subfolders.
    """
    for class_name in CLASSES:
        class_dir = os.path.join(RAW_TEST_DIR, class_name)
        images = os.listdir(class_dir)

        # Get the target directory for this class (modify if needed)
        target_dir = os.path.join(NO_CV_TEST_DIR, class_name)  # Replace TARGET_DIR with your desired location

        # Create the target directory if it doesn't exist
        os.makedirs(target_dir, exist_ok=True)

        # Copy all images in batches
        copy_images_in_batches(class_dir, target_dir, images, BATCH_SIZE)


## 1.2. Case With Cross-Validation

### 1.2.1 Creation of the Empty Fold Directories for CV

In [None]:
def create_empty_fold_dir(n_splits=5):
    """
    Create directory of \project_dataset\preprocessed_data\with_cross_validation\.
    
    Create fold directories inside of \project_dataset\preprocessed_data\with_cross_validation\.

    Inputs:
    - default: n_splits=5 : 5 folds.

    No outputs. 
    """
    os.makedirs(CV_DIR, exist_ok=True)
    for i in range(1, n_splits+1):
        fold_dir = os.path.join(CV_DIR, f'fold_{i}')
        os.makedirs(fold_dir, exist_ok=True)
        train_dir = os.path.join(fold_dir, 'train')
        val_dir = os.path.join(fold_dir, 'val')
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)

### 1.2.2 Split training and validation images for each fold

In [None]:
def prepare_cross_validation_data(base_dir, output_dir, n_splits=5):
    """
    Prepares image data for each fold.

    Parameters:
    - base_dir: Path to the directory containing the original unnormalized, unaugmented data.
      - RAW_TRAIN_VAL_DIR
    - output_dir: Path to the directory where the cross-validation splits will be stored.
      - CV_DIR
    - n_splits: Number of cross-validation splits.
    """
    # Read all images and their labels
    images, labels = [], []
    class_folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
    for folder in class_folders:
        folder_path = os.path.join(base_dir, folder)
        for filename in os.listdir(folder_path):
            if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                img_path = os.path.join(folder, filename)
                images.append(img_path)
                labels.append(folder)

    # Prepare stratified splits
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    images = np.array(images)
    labels = np.array(labels)

    # Generate splits and directories
    for fold, (train_idx, val_idx) in enumerate(skf.split(images, labels), 1):
        fold_dir = os.path.join(output_dir, f'fold_{fold}')
        os.makedirs(fold_dir, exist_ok=True)

        # Directories for training and validation inside each fold
        train_dir = os.path.join(fold_dir, 'train')
        val_dir = os.path.join(fold_dir, 'val')
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)

        # Copy training images
        for img in images[train_idx]:
            src = os.path.join(base_dir, img)
            dst = os.path.join(train_dir, img)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy(src, dst)

        # Copy validation images
        for img in images[val_idx]:
            src = os.path.join(base_dir, img)
            dst = os.path.join(val_dir, img)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy(src, dst)

# 2. Training, Validation, and Testing Data Normalization

## 2.1. Calculate Mean and Standard Deviation

In [9]:
def calculate_mean_std(directory):
    """
    Calculate the mean and standard deviation of images in a directory.
    The mean and standard deviation are computed for each channel (R, G, B).

    Inputs:
    - directory (str): Path to the directory containing image subdirectories for each class.

    Outputs:
    - mean (np.ndarray): Mean values for each channel (R, G, B).
    - std (np.ndarray): Standard deviation values for each channel (R, G, B).
    """
    # Define a transformation pipeline to resize images and convert them to tensors
    transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),  # Converts image to PyTorch tensor and scales pixel values to [0, 1]
    ])

    # Initialize variables to accumulate the mean and standard deviation
    mean = np.zeros(3)
    std = np.zeros(3)
    num_images = 0

    # Loop over each class directory
    for class_name in CLASSES:
        class_dir = os.path.join(directory, class_name)
        
        # Loop over each image file in the class directory
        for img_name in tqdm(os.listdir(class_dir), desc=f'Processing {class_name}'):
            img_path = os.path.join(class_dir, img_name)  # Get the full path of the image file
            
            image = Image.open(img_path).convert('RGB')  # Open the image and convert it to RGB
            image = transform(image)  # Apply the transformation (resize and convert to tensor)
            
            # Accumulate the mean and standard deviation for each channel
            mean += image.mean(dim=[1, 2]).numpy()
            std += image.std(dim=[1, 2]).numpy()
            num_images += 1  # Increment the image count

    # Calculate the average mean and standard deviation across all images
    mean /= num_images
    std /= num_images

    return mean, std


## 2.2. Normalization of an Image

In [10]:
def normalize_image(image, mean, std):
    """
    Normalize a PIL image using the provided mean and standard deviation.

    Inputs:
        - image (PIL Image): The image to normalize.
        - mean (tuple): The mean values for each color channel.
        - std (tuple): The standard deviation values for each color channel.

    Output:
        - normalized_image (torch.tensor): The normalized image as a PyTorch tensor.
    """

    transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),  # convert to PyTorch tensor (default range [0, 1])
        transforms.Normalize(mean=mean, std=std),  # normalize using calculated mean and std of the train set
    ])
    return transform(image)

## 2.3. Normalization of a Directory of Images

In [11]:
def normalize_images_in_directory(directory, mean, std):
    """
    Normalize all images in the specified directory.
    Normalized images are converted back to PIL images and overwrite the original ones.
    
    Inputs:
        - directory (str): Path to the directory containing images.
        - mean (tuple): The mean values for each color channel.
        - std (tuple): The standard deviation values for each color channel.

    No outputs. This function normalizes images and saves them as PyTorch tensors.
    """
    for class_name in CLASSES:
        class_dir = os.path.join(directory, class_name)
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            image = Image.open(img_path)
            normalized_image = normalize_image(image, mean, std)
            normalized_image_pil = transforms.ToPILImage()(normalized_image)
            normalized_image_pil.save(img_path)  # overwrite the original image


# 3. Training Data Augmentation

In [12]:
# only augment training images
def augment_image(image):
    """
    Apply combined augmentation techniques (rotation, flipping, brightness enhancement) to the image using PyTorch transforms.
    Each original image generates three additional augmented images, increasing the dataset size.
    
    Inputs:
    - image: PIL Image object.
    
    Outputs:
    - List of augmented images as PIL Image objects.
    """
    # define the transforms
    transform = transforms.Compose([
        transforms.RandomRotation(15),  # randomly rotate the image by up to 15 degrees
        transforms.RandomHorizontalFlip(),  # randomly flip the image horizontally with a probability of 0.5
        transforms.ColorJitter(brightness=1.5)  # randomly change the brightness of the image
    ])
    
    # apply the transformations directly to the PIL image
    augmented_images = [transform(image) for _ in range(NUM_AUG_IMG)]  # give a certain number of augmented images
    return augmented_images



In [13]:
def augment_images_in_directory(directory):
    """
    Apply augmentation to all images in the specified (training) directory.
    Augmented images are directly saved into the training directory.
    
    Inputs:
    - directory: Path to the directory containing images.
    
    No outputs. This function augments images and saves them in the same directory.
    """
    for class_name in CLASSES:
        class_dir = os.path.join(directory, class_name)
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            image = Image.open(img_path)
            augmented_images = augment_image(image)
            for i, aug_image in enumerate(augmented_images):
                aug_image_path = os.path.join(class_dir, f"{os.path.splitext(img_name)[0]}_aug_{i}.jpg")
                aug_image.save(aug_image_path)


# 4. Code Execution

## 4.1. Case Without Cross-Validation

In [14]:
def preprocess_no_cv_dataset():
    # create training, validation, test directories
    create_empty_train_val_test_dir()
    print("train, validation, and test directories created sucessfully!")

    # distribute images into training and validation directories, no cross-validation case
    split_train_val()
    print("train and validation data split successfully!")

    # copy the test set (so that the test images in the raw directory are not altered)
    copy_test_set()

    # augment the training images
    augment_images_in_directory(NO_CV_TRAIN_DIR)
    print("training data augmented successfully!")

    # mean and standard deviation of train set (after augmentation)
    #mean_no_cv_train, std_no_cv_train = calculate_mean_std(NO_CV_TRAIN_DIR)
    #print("mean and std of train set:", mean_no_cv_train, std_no_cv_train)

    # No need to normalize for Decision Tree
    # normalize training set
    #normalize_images_in_directory(NO_CV_TRAIN_DIR, mean_no_cv_train, std_no_cv_train)
    #print("training data normalized successfully!")

    # normalize validation set
    #normalize_images_in_directory(NO_CV_VAL_DIR, mean_no_cv_train, std_no_cv_train)
    #print("validation data normalized successfully!")

    # normalize testing set
    #normalize_images_in_directory(NO_CV_TEST_DIR, mean_no_cv_train, std_no_cv_train)
    #print("testing data normalized successfully!")


In [15]:
preprocess_no_cv_dataset()

train, validation, and test directories created sucessfully!
Copied C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\raw_data\train_val\airplane_cabin\00001208.jpg to C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\processed_data\without_cross_validation\train\airplane_cabin\00001208.jpg
Copied C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\raw_data\train_val\airplane_cabin\00002068.jpg to C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\processed_data\without_cross_validation\train\airplane_cabin\00002068.jpg
Copied C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\raw_data\train_val\airplane_cabin\00003114.jpg to C:\Users\helen\Documents\Concordia University\summer 2024\COMP 6721\project_code\data\project_dataset\processe

In [None]:
# generate the directories and image datas for cross-validation
def preprocess_cv_dataset():
    # create the with_cross_validation/fold_n
    create_empty_fold_dir()
    # create the specific training and validation directories in the fold_n 
    # distribute images into training and validation directories, with cross-validation case
    prepare_cross_validation_data(RAW_TRAIN_VAL_DIR, CV_DIR)
    