In [None]:
import numpy as np
import os
import pandas as pd
import pickle
import sys

from bs4 import BeautifulSoup as bs
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from typing import List, Tuple, Dict

sys.path.append("../RCNN Notebooks")
from rcnn_utils import decode_prediction, get_bb, parse_xml

In [None]:
# Directory (relative to the notebook's working directory) where the grid search CSVs are written
grid_search_df_directory = "Grid Search DataFrames"

# exist_ok=True keeps the cell safe to re-run without listing the working directory first.
# If a regular file with this name already exists, FileExistsError is raised here instead of
# failing later when the CSVs are written.
os.makedirs(grid_search_df_directory, exist_ok=True)

## Score and NMS Threshold Grid Search

In [None]:
def get_actual_count(path:str, file_name:str) -> int:
    """Look up how many seals are annotated for a single image

    Args:
        path (str): Directory that holds the images and their XML files
        file_name (str): Name of the image's XML file without the ".xml" extension

    Returns:
        int: Number of rows in the bounding box data returned by ``get_bb``
    """
    annotation_files = [file_name + ".xml"]
    bounding_boxes = get_bb(path, annotation_files)
    return bounding_boxes.shape[0]


def get_image_score_results(image_predictions:List, nms_threshold:float, actual_count:int) -> Tuple[List[float], List[int], List[int]]:
    """Sweep the score threshold for one image and record the predicted seal count at every value

    For each score threshold in 0.00, 0.05, ..., 0.95 every sub-image prediction is decoded with
    ``decode_prediction`` and the number of returned boxes is summed over all sub-images.

    Args:
        image_predictions (List): List of RCNN predictions made on sub-images for an image
        nms_threshold (float): NMS threshold to use on bounding box predictions
        actual_count (int): Actual number of seals within an image

    Returns:
        Tuple[List[float], List[int], List[int]]: Three parallel lists with one entry per score threshold:
            (score thresholds, predicted number of seals in the image,
            absolute difference between the actual and the predicted number of seals)
    """
    # round() strips the floating point drift that np.arange accumulates (e.g. 0.15000000000000002)
    potential_scores = [round(x, 2) for x in np.arange(0.0, 1.0, 0.05)]

    count_predictions = []
    for score in potential_scores:
        predicted_count = 0
        for prediction in image_predictions:
            # Only the boxes are counted; the other two outputs of decode_prediction are not needed
            boxes, _, _ = decode_prediction(prediction, score, nms_threshold)
            predicted_count += len(boxes)
        count_predictions.append(predicted_count)

    count_differences = [abs(actual_count - predicted_count) for predicted_count in count_predictions]

    return potential_scores, count_predictions, count_differences


def conduct_grid_search(dataset_predictions:Dict, dataset_image_path:str, write_path:str=None, name:str="") -> pd.DataFrame:
    """Conduct a grid search where multiple score and nms thresholds are tested for all images in the dataset

    Args:
        dataset_predictions (Dict): Dictionary mapping image name to the list of RCNN predictions
            made on that image's sub-images
        dataset_image_path (str): path to directory containing Images/XML files for the dataset
        write_path (str, optional): Path to the location where the dataframe should be saved.
            Nothing is written when None. Defaults to None.
        name (str, optional): Name that the dataframe will be saved with; the file is called
            "grid_search_<name>.csv", or "grid_search.csv" when empty. Defaults to "".

    Returns:
        pd.DataFrame: Data frame with one row per (image, NMS threshold, score threshold) combination
    """
    # Column name -> list of values; all lists grow in lock-step
    data_frame_dict = {
        "File Name": [],
        "Score": [],
        "IOU Threshold": [],
        "Predicted Counts": [],
        "Actual Count": [],
        "Count Difference": []
    }

    # NMS thresholds 0.00, 0.05, ..., 0.95; round() removes np.arange floating point drift
    nms_thresholds = [round(x, 2) for x in np.arange(0.0, 1.0, 0.05)]

    for image_name, image_predictions in tqdm(dataset_predictions.items()):
        # The ground-truth count does not depend on the thresholds, so look it up once per image
        actual_image_counts = get_actual_count(dataset_image_path, image_name)

        for nms_thresh in nms_thresholds:

            # Calculate predicted count of all scores for an image
            score_values, count_predictions, count_differences = get_image_score_results(image_predictions, nms_thresh, actual_image_counts)
            num_observations = len(score_values)

            # Per-image / per-threshold constants are repeated so every column keeps the same length
            data_frame_dict["File Name"] += [image_name] * num_observations
            data_frame_dict["Score"] += score_values
            data_frame_dict["IOU Threshold"] += [nms_thresh] * num_observations
            data_frame_dict["Predicted Counts"] += count_predictions
            data_frame_dict["Actual Count"] += [actual_image_counts] * num_observations
            data_frame_dict["Count Difference"] += count_differences

    grid_search_df = pd.DataFrame(data_frame_dict)

    # Save CSV (the row index is written as the first column, which is the to_csv default)
    if write_path is not None:
        suffix = "_" + name if name != "" else ""
        grid_search_df.to_csv(os.path.join(write_path, "grid_search{}.csv".format(suffix)))

    return grid_search_df
            

In [None]:
# Models whose saved predictions are evaluated by the grid searches
models = ["unfrozen", "frozen_v1", "frozen_v2"]

# Dataset split -> name of the directory that holds the split's images
dataset_image_paths = dict(
    training="Training Images",
    validation="Validation Images",
    testing="Test Images",
)

# Split names, in the insertion order of the mapping above
dataset_types = [*dataset_image_paths]

# All grid search CSVs are written into the directory created earlier in the notebook
write_path = grid_search_df_directory

In [None]:
# Run the score / NMS threshold grid search for every (model, dataset split) pair.
# conduct_grid_search writes one CSV per pair into write_path.
for model_name in models:
    
    print("Generating Predictions for:", model_name)
    
    for dataset_type in dataset_types:
        
        print("\tUsing dataset:", dataset_type)
        
        # Suffix of the CSV written for this pair: grid_search_<model>_<split>.csv
        write_name = f"{model_name}_{dataset_type}"

        # Directory with the images / XML annotations of this split
        dataset_image_path = f"../../../Training, Val, and Test Images/{dataset_image_paths[dataset_type]}/"
        
        # Load the pickled RCNN predictions for this model and split.
        # NOTE: pickle.load can execute arbitrary code; only load files generated by this project.
        dataset_path = f"../../Generated Data/{model_name}_{dataset_type}_predictions.pkl"
        with open(dataset_path, "rb") as fp:
            rcnn_predictions = pickle.load(fp)
        
        # grid_search_df is overwritten on every iteration; the CSV on disk is the persistent result
        grid_search_df = conduct_grid_search(rcnn_predictions, dataset_image_path, write_path, write_name)


## Cluster Grid Search

In [None]:
def calculate_centriod_metrics(image_centriods:List[Tuple[float, float]], eps:float) -> Tuple[int, int, int, int]:
    """Cluster the centroids of one image with DBSCAN and summarise the clusters

       Reported metrics, in order:
       - Number of sub-images in the image that contain a seal (centroids assigned to a cluster)
       - Number of clusters
       - Number of sub-images in the largest cluster
       - Number of sub-images in the smallest cluster

    Args:
        image_centriods (List[Tuple[float, float]]): (x, y) coordinates of the centroids
        eps (float): Epsilon value passed to DBSCAN

    Returns:
        Tuple[int, int, int, int]: The four metrics listed above; all zeros when there are no centroids
    """
    # No seals in the image: there is nothing to cluster
    if len(image_centriods) == 0:
        return 0, 0, 0, 0

    # min_samples=1 lets a single centroid form its own cluster
    fitted_model = DBSCAN(eps=eps, min_samples=1).fit(image_centriods)
    assignments = pd.Series(fitted_model.labels_)

    # Keep only valid cluster labels (> -1) and count the members of each cluster
    cluster_sizes = assignments[assignments > -1].value_counts()

    return cluster_sizes.sum(), len(cluster_sizes), cluster_sizes.max(), cluster_sizes.min()
    

def conduct_centriod_grid_search(centriods:Dict, epsilon_values:List[int], write_path:str=None, write_name:str=None) -> pd.DataFrame:
    """Calculates cluster metrics for each specified epsilon value for all images in the dataset

    Args:
        centriods (Dict[str:List]): Dictionary mapping image name to list of centroids for a dataset
        epsilon_values (List[int]): Epsilon values to be used with DBSCAN
        write_path (str, optional): Path to location to save dataframe. Nothing is written when None.
            Defaults to None.
        write_name (str, optional): Name of saved file; the file is called "centriod_info_<write_name>.csv",
            or "centriod_info.csv" when None. Defaults to None.

    Returns:
        pd.DataFrame: DataFrame with one row per (image, epsilon value) combination
    """
    # Column name -> list of values; one entry is appended to every list per (image, epsilon) pair
    data_frame_dict = {
        "Image Name": [],
        "Epsilon Value": [],
        "Sub-Images with Seals": [],
        "Number of Clusters": [],
        "Largest Cluster Size": [],
        "Smallest Cluster Size": [],
    }

    # Iterate through each image
    for image_name, image_centriods in tqdm(centriods.items()):

        # Iterate through each epsilon value
        for epsilon_value in epsilon_values:

            # Calculate cluster metrics
            seal_sub_image_number, cluster_number, largest_cluster, smallest_cluster = calculate_centriod_metrics(image_centriods, epsilon_value)

            # Save metrics in dictionary
            data_frame_dict["Image Name"].append(image_name)
            data_frame_dict["Epsilon Value"].append(epsilon_value)
            data_frame_dict["Sub-Images with Seals"].append(seal_sub_image_number)
            data_frame_dict["Number of Clusters"].append(cluster_number)
            data_frame_dict["Largest Cluster Size"].append(largest_cluster)
            data_frame_dict["Smallest Cluster Size"].append(smallest_cluster)

    # Convert to DataFrame
    centriod_info_df = pd.DataFrame(data_frame_dict)

    # Save the dataframe (without the row index)
    if write_path is not None:
        suffix = "" if write_name is None else f"_{write_name}"
        centriod_info_df.to_csv(os.path.join(write_path, f"centriod_info{suffix}.csv"), index=False)

    return centriod_info_df

In [None]:
# Epsilon values (DBSCAN eps) to conduct the grid search over
epsilon_values = [150, 300, 450]

# Run the cluster grid search for every (model, dataset split) pair; one CSV is written per pair
# NOTE(review): the centroid file path below depends only on dataset_type, not on model_name,
# so the CSVs written for different models of the same split appear to contain identical data.
# Confirm whether per-model centroid files were intended.
for model_name in models:
    print("Generating Grid Search for model:", model_name)
    
    # Iterate through datasets
    for dataset_type in dataset_types:
        print("\tUsing Dataset:", dataset_type)

        # Suffix of the CSV written for this pair: centriod_info_<model>_<split>.csv
        write_name = f"{model_name}_{dataset_type}"

        # Load the pickled centroids for this dataset split
        # NOTE: pickle.load can execute arbitrary code; only load files generated by this project.
        centriods_path = f"Centroids/seals_centroids_{dataset_type}.pkl"
        with open(centriods_path, "rb") as fp:
                    centriods = pickle.load(fp)
        
        # Conduct Grid Search (the returned DataFrame is discarded; the CSV is the result)
        conduct_centriod_grid_search(
               centriods,
               epsilon_values,
               write_path, 
               write_name
        )