# Data Preparation Script

In [1]:
# Prerequisites
import os
import re       # for regex operations
from pathlib import Path  # for handling file paths
import numpy as np
from patchify import patchify  # for creating patches from images
from PIL import Image

### Check if Running in Colab or Locally

In [2]:
# Detect the execution environment: a successful import of `google.colab`
# is used as the marker for running inside Colab.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

print(f'Running in Colab: {IN_COLAB}')

Running in Colab: False


### Download Raw Data from Kaggle

In [6]:
import kagglehub

# Download the latest version of the dataset and remember where it was stored.
# The download runs in every environment (Colab and local): the following
# cells read `raw_data_path`, so it must always be defined here.
raw_data_path = kagglehub.dataset_download("humansintheloop/semantic-segmentation-of-aerial-imagery")

print("Path to dataset files:", raw_data_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/humansintheloop/semantic-segmentation-of-aerial-imagery?dataset_version_number=1...


100%|██████████| 29.6M/29.6M [00:02<00:00, 11.6MB/s]

Extracting files...





Path to dataset files: C:\Users\Jari\.cache\kagglehub\datasets\humansintheloop\semantic-segmentation-of-aerial-imagery\versions\1


### Copy Raw Data

In [7]:
import shutil

# Mirror the downloaded dataset into the project-local raw data folder.
# With dirs_exist_ok=True an already existing destination is not an error.
src_dir, dst_dir = raw_data_path, "data/raw"
shutil.copytree(src=src_dir, dst=dst_dir, dirs_exist_ok=True)

# Remove the download directory once its content has been copied;
# any error during removal is ignored.
shutil.rmtree(path=raw_data_path, ignore_errors=True)

### Create Folders 'train', 'val', 'test'

In [8]:
def create_folders(root_dir='.'):
    """Create the ``images`` and ``masks`` folders for every dataset split.

    For each of the splits ``train``, ``val`` and ``test`` the directories
    ``<root_dir>/<split>/images`` and ``<root_dir>/<split>/masks`` are
    created, including missing parent directories. A directory that already
    exists is left untouched and reported with a short message.

    Existence is checked on the full target path below ``root_dir``; a
    same-named folder in the current working directory has no influence.

    Args:
        root_dir: Directory below which the split folders are created.
    """
    FOLDER_NAMES = ["train", "val", "test"]
    SUBFOLDER_NAMES = ["images", "masks"]

    for folder_name in FOLDER_NAMES:
        for subfolder_name in SUBFOLDER_NAMES:
            folder = f"{root_dir}/{folder_name}/{subfolder_name}"
            if os.path.exists(folder):
                print(f'{subfolder_name} folder already exists')
            else:
                # exist_ok guards against the folder appearing between the
                # check above and the creation.
                os.makedirs(folder, exist_ok=True)

# The patching step of this notebook writes below "data/processed" in every
# environment, so the split folders are created there both in Colab and
# locally (creating them in "." on Colab would leave the real targets missing).
create_folders("data/processed")

### Create Image Patches

In [9]:
def create_patches(src, dest_path, patch_size=320, step=300):
    """Cut an image into square patches and save every patch as a PNG file.

    Only images whose array has a channel axis (more than two dimensions) are
    processed; other images are skipped without writing anything. Patches
    cover the first three channels and are taken with a
    ``patch_size`` x ``patch_size`` window that is moved ``step`` pixels at a
    time (with the defaults, neighbouring patches overlap by 20 pixels).

    Each patch is saved as
    ``<dest_path>/<source stem>_tile_<tile>_patch_<n>.png`` where ``<n>`` is
    the row-major index of the patch in the patch grid.

    Args:
        src: Path of the source image. The directory part of the path must
            contain the tile number (digits).
        dest_path: Directory the patch files are written to; it is created
            if it does not exist.
        patch_size: Edge length of the square patches in pixels.
        step: Distance in pixels between the origins of neighbouring patches.

    Raises:
        ValueError: If the directory part of ``src`` contains no digits, so
            that no tile number can be determined.
    """
    # The tile number is taken from the directory part of the path. The last
    # digit run is used so that digits in parent folders (for example a
    # version directory) are not mistaken for the tile number.
    src_dir = os.path.dirname(src)
    digit_runs = re.findall(r'\d+', src_dir)
    if not digit_runs:
        raise ValueError(f"No tile number found in the directory of {src!r}")
    tile_num = digit_runs[-1]

    # Read the pixel data while the file is open, then release the handle.
    with Image.open(src) as img:
        image = np.asarray(img)  # convert to numpy array

    if image.ndim <= 2:  # need color channels
        return

    patches = patchify(image, (patch_size, patch_size, 3), step=step)
    file_name_wo_ext = Path(src).stem  # file name without extension
    os.makedirs(dest_path, exist_ok=True)

    n_rows, n_cols = patches.shape[:2]
    for i in range(n_rows):
        for j in range(n_cols):
            # Index 0 selects the single window along the channel axis.
            patch = Image.fromarray(patches[i, j, 0])  # convert back to image
            num = i * n_cols + j
            patch.save(f"{dest_path}/{file_name_wo_ext}_tile_{tile_num}_patch_{num}.png")

### Create patches from the dataset files in 'train', 'val', and 'test'

In [10]:
raw_data_root = "data/raw/Semantic segmentation dataset"
processed_data_root = "data/processed"

# Tile number -> split folder. Tile 2 is intentionally not listed: it has
# issues with its color dimension and is skipped entirely. Any tile that is
# not listed here is skipped as well instead of silently inheriting the
# split of the previously visited tile.
tile_to_split = {
    '1': 'test',
    '3': 'val',
    '4': 'train',
    '5': 'train',
    '6': 'train',
    '7': 'train',
    '8': 'train',
}

for path_name, _, file_names in os.walk(raw_data_root):  # walk through all files and folders in dataset folder
    # Split "<tile folder>/<images|masks>" into the tile folder and the type.
    tile_dir, img_type = os.path.split(path_name)

    # Resolve the split once per folder; the last digit run of the tile
    # folder path is taken as the tile number.
    digit_runs = re.findall(r'\d+', tile_dir)
    split_name = tile_to_split.get(digit_runs[-1]) if digit_runs else None

    for f in file_names:
        print(f)

        # classes.json and anything outside an 'images'/'masks' folder is not patched.
        if f == 'classes.json' or img_type not in ('images', 'masks'):
            continue
        # Tile without a split assignment (e.g. tile 2) or folder without a tile number.
        if split_name is None:
            continue

        src = os.path.join(path_name, f)
        file_name_wo_ext = Path(src).stem  # get file name without extension

        # Only process files that exist as an image/mask pair.
        img_file = f"{tile_dir}/images/{file_name_wo_ext}.jpg"  # image files are .jpg
        mask_file = f"{tile_dir}/masks/{file_name_wo_ext}.png"  # mask files are .png
        if os.path.exists(img_file) and os.path.exists(mask_file):
            dest = os.path.join(f"{processed_data_root}/{split_name}", img_type)
            create_patches(src=src, dest_path=dest)
   

classes.json
image_part_001.jpg
image_part_002.jpg
image_part_003.jpg
image_part_004.jpg
image_part_005.jpg
image_part_006.jpg
image_part_007.jpg
image_part_008.jpg
image_part_009.jpg
image_part_001.png
image_part_002.png
image_part_003.png
image_part_004.png
image_part_005.png
image_part_006.png
image_part_007.png
image_part_008.png
image_part_009.png
image_part_001.jpg
image_part_002.jpg
image_part_003.jpg
image_part_004.jpg
image_part_005.jpg
image_part_006.jpg
image_part_007.jpg
image_part_008.jpg
image_part_009.jpg
image_part_001.png
image_part_002.png
image_part_003.png
image_part_004.png
image_part_005.png
image_part_006.png
image_part_007.png
image_part_008.png
image_part_009.png
image_part_001.jpg
image_part_002.jpg
image_part_003.jpg
image_part_004.jpg
image_part_005.jpg
image_part_006.jpg
image_part_007.jpg
image_part_008.jpg
image_part_009.jpg
image_part_001.png
image_part_002.png
image_part_003.png
image_part_004.png
image_part_005.png
image_part_006.png
image_part_007.png