### Data processing

1. Dataset loading: images and masks
2. Dataset splitting:
   1. Training set
   2. Validation set
   3. Test set
3. Save the dataset

In [2]:
import os
import numpy as np
import cv2
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

### Dataset loading: images and masks

In [None]:
def load_dataset(
    path,
    image_dir='/home/ahsan/University/Thesis/UNet_Directory/Datasets/face_cleaned_images',
    mask_dir='/home/ahsan/University/Thesis/UNet_Directory/Datasets/mask_images',
):
    """Collect the sorted file paths of the images and of their masks.

    Args:
        path: Base directory that ``image_dir`` and ``mask_dir`` are joined to.
        image_dir: Directory holding the images. A relative value is resolved
            against ``path``; an absolute value (such as the default) replaces
            ``path`` entirely, because ``os.path.join`` drops every component
            that precedes an absolute one.
        mask_dir: Directory holding the masks; resolved like ``image_dir``.

    Returns:
        A tuple ``(images, masks)`` of two lists of file paths, each sorted
        lexicographically. A directory that does not exist or is empty yields
        an empty list.
    """
    # NOTE: with the default (absolute) directories `path` has no effect; pass
    # relative directory names to actually load from beneath `path`.
    images = sorted(glob(os.path.join(path, image_dir, '*')))
    masks = sorted(glob(os.path.join(path, mask_dir, '*')))
    return images, masks

### Dataset split

In [None]:
def split_dataset(images, masks, split=0.2):
    """Split paired images and masks into train, validation and test subsets.

    The validation and the test subset each receive
    ``int(len(images) * split)`` samples; the remainder forms the training
    subset.

    Args:
        images: Sequence of image samples (e.g. file paths).
        masks: Sequence of mask samples, aligned index-by-index with ``images``.
        split: Fraction of the full dataset used for the validation subset
            and, separately, for the test subset.

    Returns:
        ``(train_x, train_y), (valid_x, valid_y), (test_x, test_y)`` where the
        ``x`` entries are images and the ``y`` entries are the matching masks.

    Raises:
        ValueError: If ``images`` and ``masks`` differ in length.
    """
    if len(images) != len(masks):
        raise ValueError(
            f"images and masks must have the same length, "
            f"got {len(images)} and {len(masks)}"
        )

    split_size = int(len(images) * split)

    # Splitting both sequences in a single call applies the same shuffled
    # indices to each, so every image stays paired with its own mask.
    train_x, valid_x, train_y, valid_y = train_test_split(
        images, masks, test_size=split_size, random_state=42
    )

    # Carve the test subset out of what is left after removing validation.
    train_x, test_x, train_y, test_y = train_test_split(
        train_x, train_y, test_size=split_size, random_state=42
    )

    return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)

### Save the dataset


#### Creating folders

In [None]:
def create_dirs(path):
    """Create the directory ``path``, including any missing parent directories.

    Does nothing when the directory already exists.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first.
    os.makedirs(path, exist_ok=True)

In [None]:
def save_dataset(images, masks, save_dir):
    """Copy each image/mask pair into ``save_dir/images`` and ``save_dir/masks``.

    Both files of a pair are read with ``cv2.IMREAD_COLOR`` and written under
    the image's file name, so an image and its mask share the same name in
    their respective output folders.

    Args:
        images: Paths of the source images.
        masks: Paths of the source masks, aligned index-by-index with
            ``images``.
        save_dir: Destination directory; its ``images`` and ``masks``
            sub-folders are created when missing.

    Raises:
        OSError: If a source file cannot be read or an output file cannot be
            written.
    """
    image_dir = os.path.join(save_dir, "images")
    mask_dir = os.path.join(save_dir, "masks")
    # cv2.imwrite does not create missing folders, so make sure they exist.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)

    for image_path, mask_path in tqdm(zip(images, masks), total=len(images)):
        # os.path.basename handles the platform's path separator, unlike
        # splitting on "/".
        name = os.path.basename(image_path)

        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        mask = cv2.imread(mask_path, cv2.IMREAD_COLOR)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise OSError(f"Could not read image: {image_path}")
        if mask is None:
            raise OSError(f"Could not read mask: {mask_path}")

        save_image_path = os.path.join(image_dir, name)
        save_mask_path = os.path.join(mask_dir, name)

        # cv2.imwrite reports failure through its boolean return value.
        if not cv2.imwrite(save_image_path, image):
            raise OSError(f"Could not write image: {save_image_path}")
        if not cv2.imwrite(save_mask_path, mask):
            raise OSError(f"Could not write mask: {save_mask_path}")

In [1]:
# Sample path used to try out the file-name extraction that save_dataset performs.
s = "/home/ahsan/University/Thesis/UNet_Directory/Datasets/segmentation_dataset_path/images/1.png"

In [2]:
# Keep only the text after the last "/" — the file name of the sample path.
name = s.rsplit("/", 1)[-1]
print(name)

1.png


### Executing the program

#### Loading the dataset

In [None]:
# Root directory of the segmentation dataset; the split output is saved beneath it.
dataset_path = '/home/ahsan/University/Thesis/UNet_Directory/Datasets/segmentation_dataset_path'

In [None]:
# Gather the image and mask paths and report how many of each were found.
images, masks = load_dataset(path=dataset_path)
n_images, n_masks = len(images), len(masks)
print(f"Loaded {n_images} images and {n_masks} masks")

In [None]:
# Show the first image/mask pair (if any) as a quick pairing sanity check.
first_pair = next(zip(images, masks), None)
if first_pair is not None:
    print(*first_pair)


### Visualize the images and masks


In [None]:
from imshow import imshow

# For each of the first six pairs, read the image and its mask, convert both
# from OpenCV's BGR order to RGB, and join them along axis 1 into one panel.
cat = [
    np.concatenate(
        [
            cv2.cvtColor(cv2.imread(sample_path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
            for sample_path in pair
        ],
        axis=1,
    )
    for pair in zip(images[:6], masks[:6])
]
imshow(*cat, size=(20, 10), columns=3)

### Dataset splitting execution


In [None]:
# Split the paired paths into train / validation / test subsets, using 20% of
# the dataset for validation and 20% for test.
train_pair, valid_pair, test_pair = split_dataset(images, masks, split=0.2)
train_x, train_y = train_pair
valid_x, valid_y = valid_pair
test_x, test_y = test_pair

In [None]:
# Report how many images and masks ended up in each subset.
for label, subset_x, subset_y in (
    ("Train", train_x, train_y),
    ("Validation", valid_x, valid_y),
    ("Test", test_x, test_y),
):
    print(f"{label}: {len(subset_x)} images and {len(subset_y)} masks")

## Saving the dataset: execution

### 1. Creating folders

In [None]:
# The un-augmented split is stored under <dataset_path>/non-aug, with an
# "images" and a "masks" folder for each subset.
save_dir = os.path.join(dataset_path, 'non-aug')

for subset in ("train", "valid", "test"):
    for kind in ("images", "masks"):
        create_dirs(os.path.join(save_dir, subset, kind))


### 2. Saving the training dataset

In [None]:
# Write the training pairs into <save_dir>/train.
save_dataset(
    images=train_x,
    masks=train_y,
    save_dir=os.path.join(save_dir, "train"),
)

### 3. Saving the validation dataset

In [None]:
# Write the validation pairs into <save_dir>/valid.
save_dataset(
    images=valid_x,
    masks=valid_y,
    save_dir=os.path.join(save_dir, "valid"),
)

### 4. Saving the test dataset

In [None]:
# Write the test pairs into <save_dir>/test.
save_dataset(
    images=test_x,
    masks=test_y,
    save_dir=os.path.join(save_dir, "test"),
)