In [38]:
import os
import cv2
import random
from shutil import copyfile
from sklearn.model_selection import train_test_split
import torch
import tensorflow as tf
from pathlib import Path
import numpy as np


In [16]:
# Define the parent directory for the preprocessed data (path to final project)
parent_dir = "/Users/jeeviscarozza/Documents/Spring2024Physics/AICourse/FinalProject"

# Define paths
images_dir = "./preprocessed_data/Images"
masks_dir = "./preprocessed_data/Masks"

In [None]:
# Create a new directory for preprocessed data within the same parent directory
preprocessed_dir = os.path.join(parent_dir, "preprocessed_data")
os.makedirs(preprocessed_dir, exist_ok=True)

# Create directories for train, val, and test sets within the preprocessed data folder
train_dir = os.path.join(preprocessed_dir, "train")
val_dir = os.path.join(preprocessed_dir, "val")
test_dir = os.path.join(preprocessed_dir, "test")
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Get list of image file paths
image_files = os.listdir(images_dir)
# Shuffle the list for randomness
random.shuffle(image_files)  

# Split dataset into train, val, and test sets
train_files, test_val_files = train_test_split(image_files, test_size=0.2, random_state=42)
val_files, test_files = train_test_split(test_val_files, test_size=0.5, random_state=42)

# Define function to copy files to respective directories
def copy_files(file_list, source_dir, dest_dir):
    for file in file_list:
        image_path = os.path.join(source_dir, file)
        mask_path = os.path.join(masks_dir, file) 
        dest_image_path = os.path.join(dest_dir, file)
        dest_mask_path = os.path.join(dest_dir, f"{os.path.splitext(file)[0]}_mask.png")
        copyfile(image_path, dest_image_path)
        copyfile(mask_path, dest_mask_path)

# Copy files to train, val, and test directories within the preprocessed data folder
copy_files(train_files, images_dir, train_dir)
copy_files(val_files, images_dir, val_dir)
copy_files(test_files, images_dir, test_dir)

# Display how many images are in each set
print(f"Train set: {len(train_files)} images")
print(f"Validation set: {len(val_files)} images")
print(f"Test set: {len(test_files)} images")

Train set: 5052 images
Validation set: 631 images
Test set: 632 images


In [33]:
# define the paths to the train, val, and test directories
train_path = Path(train_dir)
val_path = Path(val_dir)
test_path = Path(test_dir)

In [46]:
def image_masks(data_dir):
    data_dir = Path(data_dir)  # Convert to Path object
    images = []
    masks = []

    for filename in data_dir.glob("*.png"):
        
        # Check if the current file is a mask
        if "_mask" in filename.stem:
            continue  # Skip processing if it's a mask file

        img = cv2.imread(str(filename))
        images.append(img)

        mask_filename = filename.stem + "_mask.png"
        mask_path = filename.parent / mask_filename
        mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
        masks.append(mask)
            
    print(f"Number of images: {len(images)}")  
    print(f"Number of masks: {len(masks)}")  
    return np.array(images), np.expand_dims(np.array(masks), axis=-1)  

In [48]:
train_images, train_masks = image_masks(train_dir)

KeyboardInterrupt: 

In [49]:
val_images, val_masks = image_masks(val_dir)

Number of images: 631
Number of masks: 631


In [50]:
test_images, test_masks = image_masks(test_dir)

Number of images: 632
Number of masks: 632


In [53]:
#print shape of images and masks
print(f"Train images shape: {train_images.shape}, Train masks shape: {train_masks.shape}")
print(f"Validation images shape: {val_images.shape}, Validation masks shape: {val_masks.shape}")
print(f"Test images shape: {test_images.shape}, Test masks shape: {test_masks.shape}")

Train images shape: (5052, 400, 400, 3), Train masks shape: (5052, 400, 400, 1)
Validation images shape: (631, 400, 400, 3), Validation masks shape: (631, 400, 400, 1)
Test images shape: (632, 400, 400, 3), Test masks shape: (632, 400, 400, 1)


In [55]:
# Display an image and its mask
def display_img_mask(img, mask):
    cv2.imshow("Image", img)
    cv2.imshow("Mask", mask)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [None]:
# The following code is used for the class weights calculation, to address the class imbalance problem
labels_flat = train_masks.flatten()
print(labels_flat[0:200])

n_samples = labels_flat.shape[0]
weight_0 = n_samples / (2 * np.sum(labels_flat == 0))
weight_1 = n_samples / (2 * np.sum(labels_flat == 1))
print(n_samples, weight_0, weight_1)

[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
808320000 0.5238804478064116 85.2202101955135
