# Notebook 02: Preprocessing and Training Data Development
The purpose of this notebook is to run through the steps of preparing data to use for fitting models in the next step.

The two datasets that will initially be used for semantic segmentation of drone footage are the following static image datasets:
1. Varied Drone Dataset for Semantic Segmentation
2. Semantic Drone Dataset

The code below refers to these datasets by the following abbreviations:
1. Varied Drone Dataset (VDD)
2. Semantic Drone Dataset (SDD)

### Imports

In [4]:
import os
import cv2
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from collections import Counter
from PIL import Image

### Functions

In [3]:
# function to load image data into a list
def load_images_from_folder(folder_path, flags=cv2.IMREAD_COLOR, target_size=(192, 128),
                            interpolation=cv2.INTER_NEAREST):
    """Load every image in a folder, record its original geometry and return resized copies.

    Parameters
    ----------
    folder_path : str
        Directory containing the image files. A trailing path separator is optional.
    flags : int, optional
        Read mode forwarded to ``cv2.imread``. Defaults to ``cv2.IMREAD_COLOR``.
    target_size : tuple of int, optional
        ``(width, height)`` forwarded to ``cv2.resize`` as ``dsize``.
        Defaults to ``(192, 128)``; downsizing saves memory and time.
    interpolation : int, optional
        Interpolation flag forwarded to ``cv2.resize``. Defaults to
        ``cv2.INTER_NEAREST`` so that resizing never blends neighbouring pixels
        into new colours (which would corrupt colour-coded label images).

    Returns
    -------
    image_dataset : list
        The resized images, in sorted file-name order. Images with three
        dimensions are converted from BGR to RGB channel order.
    image_shape : list of tuple
        ``.shape`` of each image as read from disk, before resizing.
    image_aspect_ratio : list of float
        Width divided by height of each image, before resizing.

    Notes
    -----
    Directory entries that ``cv2.imread`` cannot decode (it returns ``None`` for
    them) are skipped with a warning instead of raising ``AttributeError``.
    """
    import warnings  # local import keeps this cell self-contained

    image_dataset = []
    image_shape = []
    image_aspect_ratio = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # deterministic so that source and ground-truth folders with matching file
    # names are returned in the same order.
    for image_file in tqdm(sorted(os.listdir(folder_path))):
        image_path = os.path.join(folder_path, image_file)
        image_data = cv2.imread(image_path, flags=flags)
        if image_data is None:
            # Not a decodable image (hidden file, sub-directory, corrupt file, ...).
            warnings.warn(f"Skipping '{image_path}': cv2.imread could not decode it as an image")
            continue
        image_shape.append(image_data.shape)
        # shape is (height, width[, channels]); true division already yields a float
        image_aspect_ratio.append(image_data.shape[1] / image_data.shape[0])
        image_data = cv2.resize(image_data, target_size, interpolation=interpolation)
        if len(image_data.shape) == 3:
            # OpenCV loads colour channels as BGR; convert to RGB to match the mask label colours
            image_data = cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
        image_dataset.append(image_data)
    return image_dataset, image_shape, image_aspect_ratio

### Define data paths

#### Varied Drone Dataset

In [15]:
# Varied Drone Dataset (VDD) folders, grouped by split.
# Each pair is (source images, ground truth images).

# VDD train split
file_path_VDD_train_src, file_path_VDD_train_gt = (
    '../data/VDD/train/src/',
    '../data/VDD/train/gt/',
)

# VDD validation split
file_path_VDD_val_src, file_path_VDD_val_gt = (
    '../data/VDD/val/src/',
    '../data/VDD/val/gt/',
)

# VDD test split
file_path_VDD_test_src, file_path_VDD_test_gt = (
    '../data/VDD/test/src/',
    '../data/VDD/test/gt/',
)

#### Semantic Drone Dataset

In [16]:
# Semantic Drone Dataset (SDD) folders: (source images, ground truth label images)
file_path_SDD_src, file_path_SDD_gt = (
    '../data/semantic_drone_dataset/training_set/images/',
    '../data/semantic_drone_dataset/training_set/gt/semantic/label_images/',
)

### Load Images

#### Varied Drone Dataset

In [None]:
# Load the VDD images split by split (train, val, test), source folder first and
# ground truth second. Every call yields three lists: the resized images, the
# shapes of the images as read from disk, and their width/height aspect ratios.
(VDD_train_src, VDD_train_src_shape, VDD_train_src_aspect_ratio) = load_images_from_folder(
    file_path_VDD_train_src
)
(VDD_train_gt, VDD_train_gt_shape, VDD_train_gt_aspect_ratio) = load_images_from_folder(
    file_path_VDD_train_gt
)
(VDD_val_src, VDD_val_src_shape, VDD_val_src_aspect_ratio) = load_images_from_folder(
    file_path_VDD_val_src
)
(VDD_val_gt, VDD_val_gt_shape, VDD_val_gt_aspect_ratio) = load_images_from_folder(
    file_path_VDD_val_gt
)
(VDD_test_src, VDD_test_src_shape, VDD_test_src_aspect_ratio) = load_images_from_folder(
    file_path_VDD_test_src
)
(VDD_test_gt, VDD_test_gt_shape, VDD_test_gt_aspect_ratio) = load_images_from_folder(
    file_path_VDD_test_gt
)

 16%|█████████████▎                                                                   | 46/280 [00:04<00:23,  9.77it/s]

#### Semantic Drone Dataset

In [13]:
# import images
# Each call returns the resized images plus the original shape and the
# width/height aspect ratio of every file in the folder.
SDD_src, SDD_src_shape, SDD_src_aspect_ratio = load_images_from_folder(file_path_SDD_src)
# The ground truth must be read from the SDD label folder. The previous
# `file_path_VDD_gt` is not defined in the cells above, so it raised NameError
# on a fresh kernel (or silently reused a stale variable from an old session).
SDD_gt,  SDD_gt_shape,  SDD_gt_aspect_ratio  = load_images_from_folder(file_path_SDD_gt)

# Backward-compatible aliases: these SDD results used to be stored under
# VDD_-prefixed names. Kept in case later cells still use the old names;
# remove once those cells are updated.
VDD_src_shape, VDD_src_aspect_ratio = SDD_src_shape, SDD_src_aspect_ratio
VDD_gt_shape,  VDD_gt_aspect_ratio  = SDD_gt_shape,  SDD_gt_aspect_ratio

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [01:24<00:00,  4.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [01:00<00:00,  6.56it/s]
