In [None]:
import numpy as np
import os
import pandas as pd
import pickle

from bs4 import BeautifulSoup as bs
from skimage import io
from tqdm import tqdm
from typing import Tuple, List

In [None]:
def parse_xml(xml) -> pd.DataFrame:
    """Build a dataframe of bounding boxes from a parsed annotation document.

    Every ``name``, ``xmin``, ``ymin``, ``xmax`` and ``ymax`` tag in the document
    is collected, and the i-th tag of each kind becomes the i-th row.

    Args:
        xml (Beautiful soup object): Parsed XML exposing ``find_all(tag_name)``;
            each result must carry its content in a ``.text`` attribute.

    Returns:
        pd.DataFrame: One row per bounding box with the columns ``label`` (str)
            and ``xmin``, ``ymin``, ``xmax``, ``ymax`` (int).

    Raises:
        ValueError: If the text of a coordinate tag is not an integer literal.
    """
    # Dataframe column -> name of the XML tag that feeds it
    column_tags = {
        "label": "name",
        "xmin": "xmin",
        "ymin": "ymin",
        "xmax": "xmax",
        "ymax": "ymax",
    }
    found = {column: xml.find_all(tag) for column, tag in column_tags.items()}

    # One bounding box is missing an xmin value, so the tag lists can differ in
    # length. Truncate every list (labels included) to the shortest one so the
    # columns always line up and indexing can never run past the end of a list.
    # NOTE(review): truncation equalises lengths but does not realign values; if
    # the incomplete box is not the last one, later rows may combine coordinates
    # from different boxes - confirm against the annotation files.
    box_count = min(len(tags) for tags in found.values())

    # Extract the text of each tag and assemble the dataframe
    df = pd.DataFrame(
        {
            column: [tag.text for tag in tags[:box_count]]
            for column, tags in found.items()
        }
    )

    # Coordinates arrive as strings; cast them to int
    coordinate_columns = ["xmin", "ymin", "xmax", "ymax"]
    df = df.astype({column: int for column in coordinate_columns})

    return df


def get_xml_dataframe(bounding_box_info_path:str) -> pd.DataFrame:
    """Read an XML annotation file and return its bounding boxes as a dataframe.

    Args:
        bounding_box_info_path (str): Path to the XML file.

    Returns:
        pd.DataFrame: Bounding box information as produced by ``parse_xml``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read the whole file; the context manager closes the handle even when
    # reading raises, which the manual open()/close() pair did not guarantee
    with open(bounding_box_info_path) as raw_bounding_box_data:
        bounding_box_file_content = raw_bounding_box_data.read()

    # Create bs object
    xml_data = bs(bounding_box_file_content, features='xml')

    # Create DataFrame
    return parse_xml(xml_data)


def get_intersecting_bounding_boxes(bb_df:pd.DataFrame, sub_image_xmin:int, sub_image_ymin:int, sub_image_xmax:int, sub_image_ymax:int, size:int) -> pd.DataFrame:
    """Select the bounding boxes that overlap a sub-image and crop them to it.

    Args:
        bb_df (pd.DataFrame): All bounding boxes, with the columns
            ``xmin``, ``ymin``, ``xmax`` and ``ymax``
        Sub-image coordinates (in the same coordinate system as ``bb_df``):
            sub_image_xmin (int)
            sub_image_ymin (int)
            sub_image_xmax (int)
            sub_image_ymax (int)
        size (int): Width/Height of sub-image

    Returns:
        pd.DataFrame: Copy of the overlapping rows. When at least one box
            overlaps, the coordinates are expressed relative to the sub-image
            origin, limited to the range 0..size, and a ``percent`` column holds
            the fraction of each box's original area that remains after cropping.
            When nothing overlaps, the empty frame is returned without ``percent``.
    """
    # A box misses the sub-image when it lies entirely beyond one of its four edges
    misses_sub_image = (
        (bb_df["xmin"] >= sub_image_xmax)
        | (bb_df["ymin"] >= sub_image_ymax)
        | (bb_df["xmax"] <= sub_image_xmin)
        | (bb_df["ymax"] <= sub_image_ymin)
    )
    overlapping = bb_df[~misses_sub_image].copy()

    # Nothing to crop
    if len(overlapping) == 0:
        return overlapping

    # Area of each box before cropping
    full_area = (overlapping["xmax"] - overlapping["xmin"]) * (overlapping["ymax"] - overlapping["ymin"])

    # Shift into sub-image coordinates and cut off whatever sticks out
    overlapping["xmin"] = (overlapping["xmin"] - sub_image_xmin).clip(lower=0)
    overlapping["ymin"] = (overlapping["ymin"] - sub_image_ymin).clip(lower=0)
    overlapping["xmax"] = (overlapping["xmax"] - sub_image_xmin).clip(upper=size)
    overlapping["ymax"] = (overlapping["ymax"] - sub_image_ymin).clip(upper=size)

    # Fraction of the original box (seal) that is visible in the sub-image
    visible_area = (overlapping["xmax"] - overlapping["xmin"]) * (overlapping["ymax"] - overlapping["ymin"])
    overlapping["percent"] = visible_area / full_area

    return overlapping


def get_sub_images(image_path:str, xml_df:pd.DataFrame, size:int=150, step:int=75) -> Tuple[List, List]:
    """Gets sub-images and their bounding box information from an image.

    A ``size`` x ``size`` window is moved over the image like reading a book:
    left to right in increments of ``step``, then down by ``step``. When a window
    would cross the right or bottom edge it is shifted back so that it ends
    exactly on that edge. Only windows that intersect at least one bounding box
    are kept.

    Args:
        image_path (str): path to image
        xml_df (pd.DataFrame): DataFrame containing bounding box information
        size (int, optional): Width/Height of sub-image. Defaults to 150.
        step (int, optional): Number of pixels to iterate. Defaults to 75.

    Returns:
        Tuple[List, List]: List containing sub-images and list containing the
            bounding box DataFrame of each sub-image (same order, same length)

    Raises:
        ValueError: If ``step`` is not positive (the window would never advance).
    """
    if step <= 0:
        raise ValueError(f"step must be a positive number of pixels, got {step}")

    sub_images = []
    bounding_boxes = []

    img = io.imread(image_path)

    # Max y and x dimensions for boundaries
    max_image_y = img.shape[0]
    max_image_x = img.shape[1]

    # NOTE(review): if the image is smaller than `size` in a dimension, the
    # edge clamp below yields a negative window start and the slice no longer
    # covers the intended region - confirm all inputs are at least size x size.

    # Start one step before the image so the first iteration lands on 0
    ymin = -1 * step
    ymax = ymin + size

    while ymax < max_image_y:

        # Update y values
        ymin += step
        ymax = ymin + size

        # If the y edge has been reached, pin the window to the bottom edge.
        # This must use the image HEIGHT; clamping to the width produced wrong
        # crops on non-square images and could keep the loop from terminating.
        if ymax > max_image_y:
            ymax = max_image_y
            ymin = ymax - size

        # Start one step before the image so the first iteration lands on 0
        xmin = -1 * step
        xmax = xmin + size

        while xmax < max_image_x:

            # Update x values
            xmin += step
            xmax = xmin + size

            # If the x edge has been reached, pin the window to the right edge
            if xmax > max_image_x:
                xmax = max_image_x
                xmin = xmax - size

            # Calculate intersecting bounding boxes
            intersecting_bb = get_intersecting_bounding_boxes(xml_df, xmin, ymin, xmax, ymax, size)

            # If there are intersecting bounding boxes, save information
            if intersecting_bb.shape[0] > 0:
                # Slice only the two spatial axes so that images without a
                # channel axis work as well as multi-channel ones
                sub_image = img[ymin:ymax, xmin:xmax]
                sub_images.append(sub_image)
                bounding_boxes.append(intersecting_bb)

    return sub_images, bounding_boxes


def rotate_90(sub_image:np.ndarray, bb_df:pd.DataFrame, size:int) -> Tuple[np.ndarray, pd.DataFrame]:
    """Rotate a sub-image and its bounding boxes by 90 degrees.

    The image is rotated with ``np.rot90`` (counter-clockwise with its default
    arguments) and every bounding box is moved to the matching position.

    Args:
        sub_image (np.ndarray): Sub-image
        bb_df (pd.DataFrame): DataFrame containing bounding box information with
            the columns ``label``, ``xmin``, ``ymin``, ``xmax``, ``ymax``, ``percent``
        size (int): Width/Height of the sub-image

    Returns:
        Tuple[np.ndarray, pd.DataFrame]: Rotated sub-image and the corresponding
            bounding box information (new DataFrame with a fresh 0..n-1 index;
            ``bb_df`` is not modified)
    """
    rotated_sub_image = np.rot90(sub_image)

    # Under this rotation the old y axis becomes the new x axis, and the old
    # x axis becomes the new y axis measured from the opposite side - which is
    # why the old xmax gives the new ymin and the old xmin gives the new ymax.
    # Whole columns are transformed at once instead of looping over rows; plain
    # arrays are used so the result gets a fresh index.
    new_bb_df = pd.DataFrame(
        {
            "label": bb_df["label"].to_numpy(),
            "xmin": bb_df["ymin"].to_numpy(),
            "ymin": size - bb_df["xmax"].to_numpy(),
            "xmax": bb_df["ymax"].to_numpy(),
            "ymax": size - bb_df["xmin"].to_numpy(),
            "percent": bb_df["percent"].to_numpy(),
        }
    )

    return rotated_sub_image, new_bb_df


def apply_transformations(sub_images:List[np.array], bb_data:List[pd.DataFrame], size:int) -> Tuple[List, List]:
    """Augment sub-images with their 90, 180 and 270 degree rotations (4x data).

    Args:
        sub_images (List[np.array]): List of sub-images
        bb_data (List[pd.DataFrame]): Bounding box DataFrames; ``bb_data[i]``
            belongs to ``sub_images[i]``
        size (int): Width/Height of sub-image

    Returns:
        Tuple[List, List]: Sub-images and matching bounding box information.
            Each input contributes four consecutive entries: the original
            followed by three successive ``rotate_90`` results.
    """
    augmented_images = []
    augmented_boxes = []

    for position, image in enumerate(sub_images):
        boxes = bb_data[position]

        # Keep the untouched original first
        augmented_images.append(image)
        augmented_boxes.append(boxes)

        # Each pass rotates the previous result by a further 90 degrees,
        # producing the 90, 180 and 270 degree versions in that order
        for _ in range(3):
            image, boxes = rotate_90(image, boxes, size)
            augmented_images.append(image)
            augmented_boxes.append(boxes)

    return augmented_images, augmented_boxes


def generate_data_for_image(path:str, file_name:str, transformation:bool=True, step:int=75, size:int=150) -> Tuple[List, List]:
    """Generate the sub-images of one image together with their bounding boxes.

    The image is read from ``<path>/<file_name>.JPG`` and its annotations from
    ``<path>/<file_name>.xml``. Only sub-images that intersect at least one
    bounding box are returned. Rotated copies can be added to get more data.

    Args:
        path (str): Directory containing the image and its XML file
        file_name (str): Name of the image without extension
        transformation (bool, optional): Whether rotated copies should be added. Defaults to True.
        step (int, optional): How many pixels to iterate. Defaults to 75.
        size (int, optional): Size of a sub-image. Defaults to 150.

    Returns:
        Tuple[List, List]: List of sub-images and list of the bounding box
            DataFrames belonging to them
    """
    # Image and annotation share everything but the extension
    path_without_extension = path + f"/{file_name}"

    annotations = get_xml_dataframe(path_without_extension + ".xml")
    crops, crop_boxes = get_sub_images(path_without_extension + ".JPG", annotations, size, step)

    if not transformation:
        return crops, crop_boxes

    return apply_transformations(crops, crop_boxes, size)

In [None]:
# Generate Data
training_data = {}

path = "../Training, Val, and Test Images/Training Images"
transformation = True
step = 50
size = 150

# Get names of images.
# Every image is expected to come as a <name>.JPG / <name>.xml pair, so the name
# is the file name without its final extension. Entries that are neither an
# image nor an annotation file (hidden files, checkpoint folders, ...) are
# ignored instead of being turned into a bogus name. Sorting keeps the
# processing order - and therefore the order of training_data - reproducible.
image_names = sorted(
    {
        stem
        for stem, extension in map(os.path.splitext, os.listdir(path))
        if extension.lower() in {".jpg", ".xml"}
    }
)

# Get sub-image and bounding box information
for image_name in tqdm(image_names):

    training_data[image_name] = generate_data_for_image(
        path,
        image_name,
        transformation=transformation,
        step=step,
        size=size
    )

In [None]:
# Save data
write_path = "../seal_counter"
file_name = f"rcnn_training_data_transformation_{transformation}_step_{step}_sub_image_size_{size}"

# Create the output folder if needed. exist_ok avoids the separate
# "is it already listed?" check, which is racy and fails on case-insensitive
# file systems when the folder exists under a different capitalisation.
data_dir = f"{write_path}/Data"
os.makedirs(data_dir, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file back from a location you trust.
with open(f"{data_dir}/{file_name}.pkl", "wb") as f:
    pickle.dump(training_data, f)
