# Data Pre-Processing for Kaggle dataset (original images)
---


# Mount drive and import dependencies

In [None]:
# Google Drive has to be mounted before any of the Drive-hosted data
# directories used later in this notebook can be read or written.
from google.colab import drive

# Location inside the Colab VM where Drive is exposed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from skimage import io, exposure, img_as_float
from skimage.color import rgb2gray
from skimage.util import img_as_ubyte
from glob import glob
import os

# This section defines the directory mapping
- **IMPORTANT NOTE:** If your data is in a different directory structure, update the paths in `dir_map`.
---


In [None]:
# Logical name -> directory used by this notebook.
#   KAGGLE_RAW_IMAGES_DIR    : input folder; must already contain the raw .png images.
#   KAGGLE_IMAGES_OUTPUT_DIR : output folder for the processed images.
dir_map = {
    'KAGGLE_RAW_IMAGES_DIR':'/content/drive/Shared drives/Grain Detection Project Trello/Ben/Grain Boundary Detection/Stardist/data/kaggleData/raw_images',
    'KAGGLE_IMAGES_OUTPUT_DIR':'/content/drive/Shared drives/Grain Detection Project Trello/Ben/Grain Boundary Detection/Stardist/data/kaggleData/train/images',
}

# Make sure every mapped directory exists. This matters mostly for output
# directories (which usually do not exist yet); raw-image folders are expected
# to already exist and contain the images to process.
for path in dir_map.values():
    # Resolve relative entries against the current working directory while
    # leaving absolute entries untouched. Blindly prefixing os.getcwd() turned
    # '/content/drive/...' into '/content//content/drive/...' and created the
    # folders in the wrong place.
    target_dir = os.path.abspath(path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print(f'Directory "{target_dir}" created')
    else:
        print(f'Directory "{target_dir}" already exists')

Directory "/content//content/drive/Shared drives/Grain Detection Project Trello/Ben/Grain Boundary Detection/Stardist/data/kaggleData/raw_images" already exists
Directory "/content//content/drive/Shared drives/Grain Detection Project Trello/Ben/Grain Boundary Detection/Stardist/data/kaggleData/train/images" created


# Process Kaggle Original Images and save to kaggleData directory
- Create equalized images for all Kaggle images
- Rename them and save them in the specified directory
- Convert each image to 2D (grayscale)


In [None]:
# Collect the raw Kaggle .png paths. Sorting keeps the numbering of the output
# files (kaggle_original1.tif, kaggle_original2.tif, ...) stable across runs.
raw_images_dir = dir_map['KAGGLE_RAW_IMAGES_DIR']
KAGGLE_image_paths = sorted(glob(os.path.join(raw_images_dir, '*.png')))

# Directory where the processed images will be saved. Create it here as well so
# that saving cannot fail just because the folder is missing.
output_images_dir = dir_map['KAGGLE_IMAGES_OUTPUT_DIR']
os.makedirs(output_images_dir, exist_ok=True)

if not KAGGLE_image_paths:
    # Nothing matched: say so instead of claiming that images were processed.
    print(f'WARNING: no .png files found in "{raw_images_dir}"; nothing was processed.')
else:
    # Process each Kaggle image: grayscale -> histogram equalization -> 8-bit TIFF
    for index, path in enumerate(KAGGLE_image_paths):
        # Read image
        KAGGLE_original_img = io.imread(path)

        # Convert the image to 2D grayscale.
        if KAGGLE_original_img.ndim == 3:
            # Keep only the first three channels so that RGBA files are accepted
            # too; rgb2gray requires exactly three channels in current
            # scikit-image. NOTE(review): this ignores the alpha channel, which
            # is only equivalent to alpha blending for fully opaque images.
            KAGGLE_original_img_2D = rgb2gray(KAGGLE_original_img[..., :3])
        else:
            # Already single-channel: rgb2gray would reject a 2D array.
            KAGGLE_original_img_2D = KAGGLE_original_img

        # Convert the grayscale image to float (no-op for rgb2gray output,
        # required for integer-typed images that were already grayscale)
        img_float = img_as_float(KAGGLE_original_img_2D)

        # Perform histogram equalization
        equalized_img = exposure.equalize_hist(img_float)

        # Convert the processed image back to 8-bit unsigned integers
        equalized_img_ubyte = img_as_ubyte(equalized_img)

        # Save the processed image under a 1-based sequential name
        custom_name = f"kaggle_original{index + 1}.tif"
        save_path = os.path.join(output_images_dir, custom_name)
        io.imsave(save_path, equalized_img_ubyte)

    print("All images for KAGGLE_image_paths have been processed and saved.")

All images for KAGGLE_image_paths have been processed and saved.


In [None]:
# Report the shape of the most recently processed image. The variable holds the
# 2D grayscale version of the LAST image handled by the processing loop, and it
# only exists if at least one image was processed, so guard against an empty
# run instead of failing with a NameError.
if KAGGLE_image_paths:
    print("Shape of original Kaggle image:", KAGGLE_original_img_2D.shape)
else:
    print("No Kaggle images were processed, so there is no image shape to report.")

print("Number of Kaggle images:", len(KAGGLE_image_paths))