### Data processing

1. Dataset loading: images and masks
2. Dataset splitting:
   1. Training set
   2. Validation set
   3. Test set
3. Save the dataset

In [1]:
import os
import numpy as np
import cv2
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

### Dataset loading: images and masks

In [10]:
def load_dataset(path):
    """
    Collect the image and mask file paths of a segmentation dataset.

    The files are looked up in two sub-folders of ``path``: ``images/`` and
    ``masks/``, each searched for ``*.png`` files. Both listings are sorted,
    so the i-th image is paired with the i-th mask purely by sort order.

    Args:
        path: Root directory of the dataset.

    Returns:
        A tuple ``(images, masks)`` of two equally long, sorted lists of
        file paths.

    Raises:
        ValueError: If either folder contains no PNG files, or if the number
            of images differs from the number of masks.
    """
    images = sorted(glob(os.path.join(path, 'images', '*.png')))
    masks = sorted(glob(os.path.join(path, 'masks', '*.png')))

    if not images or not masks:
        raise ValueError("No images or masks found. Check your dataset paths.")

    # Pairing is positional, so a count mismatch means at least one sample
    # would end up without a partner (or paired with the wrong file).
    if len(images) != len(masks):
        raise ValueError(
            f"Found {len(images)} images but {len(masks)} masks; "
            "every image needs exactly one mask."
        )

    return images, masks

### Dataset split

In [11]:
def split_dataset(images, masks, split=0.2, random_state=42):
    """
    Split dataset into training, validation, and test sets.

    The split is done in two passes of ``train_test_split``:

    1. ``split`` is held out of the *whole* dataset as the validation set.
    2. ``split`` is held out of the *remaining* samples as the test set.

    Because the second pass works on the remainder, a fractional ``split``
    yields a test set that is smaller than the validation set (roughly
    ``split * (1 - split)`` of the full dataset).

    Args:
        images: Sequence of images (file paths in this notebook).
        masks: Sequence of masks, aligned index-by-index with ``images``.
        split: Value forwarded as ``test_size`` to both passes.
        random_state: Seed forwarded to both passes so the split is
            reproducible; defaults to the previously hard-coded 42.

    Returns:
        ``((train_x, train_y), (valid_x, valid_y), (test_x, test_y))``.
    """
    # First pass: carve the validation set out of the full dataset.
    rest_x, valid_x, rest_y, valid_y = train_test_split(
        images, masks, test_size=split, random_state=random_state
    )
    # Second pass: carve the test set out of what is left.
    train_x, test_x, train_y, test_y = train_test_split(
        rest_x, rest_y, test_size=split, random_state=random_state
    )
    return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)

### Save the dataset


#### Creating folders

In [12]:
def create_dirs(path):
    """
    Make sure the directory ``path`` exists.

    Missing parent directories are created along the way, and a directory
    that is already present is left as-is instead of raising an error.
    """
    os.makedirs(name=path, exist_ok=True)

In [13]:
def save_dataset(images, masks, save_dir):
    """
    Re-encode every image/mask pair and write it below ``save_dir``.

    Each image is read in colour and each mask in grayscale, then written to
    ``<save_dir>/images/<name>`` and ``<save_dir>/masks/<name>``. ``<name>``
    is the file name of the *image*, so a mask is always stored under the
    name of the image it is paired with.

    Pairs that cannot be read are skipped with a warning, and files that
    cannot be written are reported with a warning as well.

    Args:
        images: Paths of the source images.
        masks: Paths of the source masks, aligned index-by-index with
            ``images`` (pairs are formed with ``zip``, so extra entries in
            the longer sequence are ignored).
        save_dir: Destination directory; the ``images`` and ``masks``
            sub-folders are created if they do not exist yet.
    """
    image_dir = os.path.join(save_dir, 'images')
    mask_dir = os.path.join(save_dir, 'masks')
    # Create the target folders up front so that writing does not depend on
    # another cell having prepared them.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)

    for x, y in tqdm(zip(images, masks), total=len(images), desc=f"Saving dataset in {save_dir}"):
        name = os.path.basename(x)
        img = cv2.imread(x, cv2.IMREAD_COLOR)  # Keep images in color
        mask = cv2.imread(y, cv2.IMREAD_GRAYSCALE)  # Read mask as grayscale

        # cv2.imread signals an unreadable file by returning None.
        if img is None or mask is None:
            print(f"Warning: Issue loading {x} or {y}, skipping...")
            continue

        # cv2.imwrite reports success through its boolean return value, so
        # check it instead of assuming the file was written.
        image_out = os.path.join(image_dir, name)
        mask_out = os.path.join(mask_dir, name)
        if not cv2.imwrite(image_out, img):
            print(f"Warning: Failed to write {image_out}")
        if not cv2.imwrite(mask_out, mask):
            print(f"Warning: Failed to write {mask_out}")

In [6]:
# Sample image path used for the quick file-name extraction check below.
s = (
    "/home/ahsan/University/Thesis/UNet_Directory/Datasets"
    "/segmentation_dataset_path/images/1.png"
)

In [7]:
# Take the last path component (the file name) of the sample path.
# os.path.basename is used instead of a manual split on "/" so this check
# matches how save_dataset derives the output file name.
name = os.path.basename(s)
print(name)

1.png


### Executing the program

#### Loading the dataset

In [14]:
# Root folder of the raw dataset; load_dataset looks for its images/ and
# masks/ sub-folders.
dataset_path = (
    '/home/ahsan/University/Thesis/UNet_Directory/Datasets'
    '/second_phase/working_dataset'
)

In [15]:
# Gather the sorted image and mask file paths found under dataset_path.
images, masks = load_dataset(dataset_path)

### Dataset splitting execution


In [16]:
# Split the image/mask path lists into train, validation, and test subsets,
# relying on split_dataset's default `split` value.
(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = split_dataset(images, masks)

In [None]:
# Optional sanity check (disabled): uncomment to print how many images and
# masks ended up in each subset.
# print(f"Train: {len(train_x)} images and {len(train_y)} masks")
# print(f"Validation: {len(valid_x)} images and {len(valid_y)} masks")
# print(f"Test: {len(test_x)} images and {len(test_y)} masks")

## Saving the dataset: execution

### 1. Creating folders

In [17]:
# Prepare the output layout: <save_path>/non-aug/<split>/{images,masks}
save_path = (
    '/home/ahsan/University/Thesis/UNet_Directory/Datasets'
    '/second_phase/processed_dataset'
)
processed_dir = os.path.join(save_path, 'non-aug')
for split in ('train', 'valid', 'test'):
    for sub_dir in ('images', 'masks'):
        create_dirs(os.path.join(processed_dir, split, sub_dir))

In [18]:
# Write each subset into its own folder below processed_dir.
for subset_name, subset_x, subset_y in (
    ('train', train_x, train_y),
    ('valid', valid_x, valid_y),
    ('test', test_x, test_y),
):
    save_dataset(subset_x, subset_y, os.path.join(processed_dir, subset_name))

Saving dataset in /home/ahsan/University/Thesis/UNet_Directory/Datasets/second_phase/processed_dataset/non-aug/train: 100%|██████████| 74/74 [00:01<00:00, 63.72it/s]
Saving dataset in /home/ahsan/University/Thesis/UNet_Directory/Datasets/second_phase/processed_dataset/non-aug/valid: 100%|██████████| 24/24 [00:00<00:00, 68.78it/s]
Saving dataset in /home/ahsan/University/Thesis/UNet_Directory/Datasets/second_phase/processed_dataset/non-aug/test: 100%|██████████| 19/19 [00:00<00:00, 69.97it/s]
