In [12]:
import numpy as np
import pandas as pd
import pickle
import torch
import sys

from torchvision import ops
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from sklearn.cluster import DBSCAN

In [13]:
# Make the sibling "RCNN Notebooks" folder importable so rcnn_utils can be found.
# A raw string is used so the backslash is taken literally ("\R" is not a valid
# escape sequence and triggers a warning on newer Python versions).
sys.path.append(r"..\RCNN Notebooks")
from rcnn_utils import decode_prediction

## Score and NMS Threshold Grid Search

In [14]:
def get_bb(in_path, xml):
    """Parse a list of annotation XML files into one bounding-box DataFrame.

    Parameters
    ----------
    in_path : str
        Prefix that is concatenated directly with each file name (no path
        separator is inserted, so it must already end with one).
    xml : iterable of str
        Names of the XML files to read.

    Returns
    -------
    pandas.DataFrame
        The rows produced by ``parse_xml`` for every file, with a leading
        ``"file_num"`` column holding the zero-padded (4 digit) position of
        the file within ``xml``. An empty DataFrame when ``xml`` is empty.
    """
    frames = []
    for position, file_name in enumerate(xml):
        # The context manager closes the file even if parsing raises.
        with open(in_path + file_name) as handle:
            parsed = bs(handle.read(), "lxml")
        frame = parse_xml(parsed)
        frame.insert(0, "file_num", str(position).zfill(4))
        frames.append(frame)

    # pd.concat rejects an empty list, so keep the original empty-input result.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames)

def parse_xml(xml):
    """Extract labels and box corners from a parsed annotation document.

    ``xml`` must offer ``find_all(tag_name)`` returning a sequence of elements
    that expose a ``.text`` attribute. The tags ``name``, ``xmin``, ``ymin``,
    ``xmax`` and ``ymax`` are looked up; every column is truncated to the
    length of the shortest lookup so the columns stay aligned.

    Returns a DataFrame with the columns ``label``, ``xmin``, ``ymin``,
    ``xmax`` and ``ymax`` holding the elements' text.
    """
    # DataFrame column -> tag searched for in the document.
    tag_for_column = {
        "label": "name",
        "xmin": "xmin",
        "ymin": "ymin",
        "xmax": "xmax",
        "ymax": "ymax",
    }
    found = {column: xml.find_all(tag) for column, tag in tag_for_column.items()}
    usable = min(len(elements) for elements in found.values())

    columns = {
        column: [element.text for element in elements[:usable]]
        for column, elements in found.items()
    }
    return pd.DataFrame(columns)

def get_actual_count(path, file_name):
    """Return the number of annotation rows found in ``<file_name>.xml``.

    The XML file name is built by appending ``".xml"`` to ``file_name``; the
    count is the row count of the DataFrame ``get_bb`` builds for that file.
    """
    annotations = get_bb(path, [file_name + ".xml"])
    row_count, _ = annotations.shape
    return row_count

In [15]:
def get_image_score_results(image_predictions, nms_threshold, actual_count):
    """Sweep score thresholds for one image and compare counts to the truth.

    For every score threshold in 0.00, 0.05, ..., 0.95 each entry of
    ``image_predictions`` is passed to ``decode_prediction`` together with the
    threshold and ``nms_threshold``; the number of returned boxes is summed
    over all entries.

    Returns three parallel lists: the score thresholds, the summed predicted
    counts, and ``abs(actual_count - predicted_count)`` per threshold.
    """
    def _boxes_kept(threshold):
        # Total boxes kept across every prediction entry at this threshold.
        total = 0
        for raw_prediction in image_predictions:
            kept_boxes, _, _ = decode_prediction(raw_prediction, threshold, nms_threshold)
            total += len(kept_boxes)
        return total

    score_values = [round(step, 2) for step in np.arange(0.0, 1.0, 0.05)]
    count_predictions = [_boxes_kept(threshold) for threshold in score_values]
    count_differences = [abs(actual_count - count) for count in count_predictions]

    return score_values, count_predictions, count_differences

def conduct_grid_search(dataset_predictions, dataset_image_path, write_path=None, name=""):
    """Grid-search score and NMS thresholds over every image of a dataset.

    Parameters
    ----------
    dataset_predictions : dict
        Maps an image name to the predictions handed to
        ``get_image_score_results``.
    dataset_image_path : str
        Location passed to ``get_actual_count`` to look up the true count.
    write_path : str or None
        When given, the result is saved to
        ``<write_path>/grid_search[_<name>].csv``.
    name : str
        Optional suffix for the CSV file name.

    Returns
    -------
    pandas.DataFrame
        One row per (image, NMS threshold, score threshold) combination.
    """
    column_names = (
        "File Name",
        "Score",
        "IOU Threshold",
        "Predicted Counts",
        "Actual Count",
        "Count Difference",
    )
    columns = {column: [] for column in column_names}

    # NMS thresholds 0.00, 0.05, ..., 0.95
    iou_grid = [round(step, 2) for step in np.arange(0.0, 1.0, 0.05)]

    for image_name in tqdm(dataset_predictions.keys()):
        true_count = get_actual_count(dataset_image_path, image_name)
        per_image_predictions = dataset_predictions[image_name]

        for iou in iou_grid:
            # One entry per score threshold for this image / NMS threshold pair.
            scores, predicted, differences = get_image_score_results(per_image_predictions, iou, true_count)
            row_total = len(scores)

            columns["File Name"].extend([image_name] * row_total)
            columns["Score"].extend(scores)
            columns["IOU Threshold"].extend([iou] * row_total)
            columns["Predicted Counts"].extend(predicted)
            columns["Actual Count"].extend([true_count] * row_total)
            columns["Count Difference"].extend(differences)

    grid_search_df = pd.DataFrame(columns)

    if write_path is not None:
        suffix = name if name == "" else "_" + name
        # NOTE(review): the index column is written here, unlike the other CSV
        # writers in this notebook, which pass index=False - confirm intent.
        grid_search_df.to_csv("{}/grid_search{}.csv".format(write_path, suffix))

    return grid_search_df
            

In [22]:
def generate_grid_search(preds, path, write_path = None, name = ""):
    """Build a threshold grid-search table for every file in ``preds``.

    For each key of ``preds`` the true count is obtained through
    ``get_actual_count(path, file_name)``; for each IOU threshold in
    0.00, 0.05, ..., 0.95 the six sequences returned by ``get_scores`` are
    appended to the matching output columns.

    When ``write_path`` is given the table is saved (without the index) to
    ``<write_path>/grid_search[_<name>].csv``. The DataFrame is returned.
    """
    # NOTE(review): get_scores is not defined in the visible part of this
    # notebook - confirm it exists before relying on this function.
    columns = {
        "File Name": [],
        "Score": [],
        "IOU Threshold": [],
        "Predicted Counts": [],
        "Actual Count": [],
        "Count Difference": [],
    }

    thresholds = [round(step, 2) for step in np.arange(0.0, 1.0, 0.05)]
    for file_name in tqdm(preds.keys()):
        actual_count = get_actual_count(path, file_name)
        for threshold in thresholds:
            names, scores, used_thresholds, predicted, actuals, differences = get_scores(preds, threshold, file_name, actual_count)
            columns["File Name"].extend(names)
            columns["Score"].extend(scores)
            columns["IOU Threshold"].extend(used_thresholds)
            columns["Predicted Counts"].extend(predicted)
            columns["Actual Count"].extend(actuals)
            columns["Count Difference"].extend(differences)

    df = pd.DataFrame(columns)

    if write_path is not None:
        suffix = name if name == "" else "_" + name
        df.to_csv("{}/grid_search{}.csv".format(write_path, suffix), index=False)
    return df

In [17]:
# Model variants that the driver cells iterate over.
models = [
    "unfrozen",
    "frozen_v1",
    "frozen_v2",
]

# Dataset split -> name of the folder used for that split.
dataset_image_paths = dict(
    training="Training Images",
    validation="Validation Images",
    testing="Test Images",
)

# Split names, in the insertion order of the mapping above.
dataset_types = [*dataset_image_paths]

# Folder that receives the generated CSV files.
write_path = r"..\MetaData"

In [18]:
# Run the score / NMS threshold grid search for every (model, dataset split) pair.
# NOTE(review): the log message says "Generating Predictions", but this loop only
# loads already-pickled predictions and runs the grid search on them.
for model_name in models:
    print("Generating Predictions for:", model_name)
    for dataset_type in dataset_types:
        print("\tUsing dataset:", dataset_type)
        # Suffix used in the name of the CSV written by conduct_grid_search.
        write_name = f"{model_name}_{dataset_type}"

        # NOTE(review): hard-coded absolute, user-specific Windows path; this cell
        # will not run on another machine without editing it.
        dataset_image_path = r"C:\Users\kaanan\Desktop\Training, Val, and Test Images\{}/".format(dataset_image_paths[dataset_type])
        
        # Pickled predictions for this model / split.
        # NOTE(review): pickle.load executes arbitrary code from the file - only
        # load files from a trusted source.
        dataset_path = r"..\MetaData\{}_{}_predictions.pkl".format(model_name, dataset_type)
        with open(dataset_path, "rb") as fp:
            rcnn_predictions = pickle.load(fp)
        
        # Writes <write_path>/grid_search_<write_name>.csv; only the DataFrame of
        # the last iteration stays bound to grid_search_df.
        grid_search_df = conduct_grid_search(rcnn_predictions, dataset_image_path, write_path, write_name)


Generating Predictions for: unfrozen
	Using dataset: training


100%|██████████| 50/50 [00:58<00:00,  1.18s/it]


	Using dataset: validation


100%|██████████| 16/16 [00:29<00:00,  1.86s/it]


	Using dataset: testing


100%|██████████| 13/13 [00:09<00:00,  1.40it/s]


Generating Predictions for: frozen_v1
	Using dataset: training


100%|██████████| 50/50 [01:08<00:00,  1.36s/it]


	Using dataset: validation


100%|██████████| 16/16 [00:34<00:00,  2.13s/it]


	Using dataset: testing


100%|██████████| 13/13 [00:11<00:00,  1.18it/s]


Generating Predictions for: frozen_v2
	Using dataset: training


100%|██████████| 50/50 [01:16<00:00,  1.53s/it]


	Using dataset: validation


100%|██████████| 16/16 [00:37<00:00,  2.37s/it]


	Using dataset: testing


100%|██████████| 13/13 [00:12<00:00,  1.07it/s]


## Cluster Grid Search

In [19]:
def calculate_centriod_metrics(image_centriods, eps):
    """Cluster an image's centroids with DBSCAN and summarise the clusters.

    Parameters
    ----------
    image_centriods : sequence
        Points handed to ``DBSCAN.fit``; an empty sequence short-circuits.
    eps : float
        ``eps`` value passed to DBSCAN (``min_samples`` is fixed at 1).

    Returns
    -------
    tuple
        ``(points in valid clusters, number of clusters, largest cluster
        size, smallest cluster size)``, or ``(0, 0, 0, 0)`` for empty input.
    """
    # Nothing to cluster.
    if len(image_centriods) == 0:
        return 0, 0, 0, 0

    fitted = DBSCAN(eps=eps, min_samples=1).fit(image_centriods)
    assignments = pd.Series(fitted.labels_)

    # Keep only labels above -1, then count the members of each cluster.
    cluster_sizes = assignments[assignments > -1].value_counts()

    return (
        cluster_sizes.sum(),
        len(cluster_sizes),
        cluster_sizes.max(),
        cluster_sizes.min(),
    )

In [20]:
def conduct_centriod_grid_search(centriods, write_path=None, write_name=None, epsilon_values=None):
    """Evaluate DBSCAN cluster metrics for every image over a grid of epsilons.

    Parameters
    ----------
    centriods : dict
        Maps an image name to the centroids passed to
        ``calculate_centriod_metrics``.
    write_path : str or None
        When given, the table is saved (without the index) to
        ``<write_path>/centriod_info_<write_name>.csv``, or to
        ``<write_path>/centriod_info.csv`` when ``write_name`` is None.
    write_name : str or None
        Optional suffix for the CSV file name.
    epsilon_values : iterable of numbers or None
        Epsilon values to evaluate. Defaults to ``(150, 300, 450)``, the grid
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per (image, epsilon) combination.
    """
    if epsilon_values is None:
        epsilon_values = (150, 300, 450)
    else:
        # Materialise so a one-shot iterator can be reused for every image.
        epsilon_values = tuple(epsilon_values)

    data_frame_dict = {
        "Image Name": [],
        "Epsilon Value": [],
        "Sub-Images with Seals": [],
        "Number of Clusters": [],
        "Largest Cluster Size": [],
        "Smallest Cluster Size": [],
    }

    for image_name in tqdm(centriods.keys()):
        image_centriods = centriods[image_name]

        for epsilon_value in epsilon_values:
            seal_sub_image_number, cluster_number, largest_cluster, smallest_cluster = calculate_centriod_metrics(image_centriods, epsilon_value)

            data_frame_dict["Image Name"].append(image_name)
            data_frame_dict["Epsilon Value"].append(epsilon_value)
            data_frame_dict["Sub-Images with Seals"].append(seal_sub_image_number)
            data_frame_dict["Number of Clusters"].append(cluster_number)
            data_frame_dict["Largest Cluster Size"].append(largest_cluster)
            data_frame_dict["Smallest Cluster Size"].append(smallest_cluster)

    centriod_info_df = pd.DataFrame(data_frame_dict)

    if write_path is not None:
        if write_name is not None:
            centriod_info_df.to_csv(f"{write_path}/centriod_info_{write_name}.csv", index=False)
        else:
            centriod_info_df.to_csv(f"{write_path}/centriod_info.csv", index=False)

    return centriod_info_df


In [21]:
# Run the DBSCAN epsilon grid search and write one CSV per (model, dataset split).
# NOTE(review): the centroid pickle path below depends only on dataset_type, so
# every model processes the same input and differs only in the output file
# name - confirm whether per-model centroid files were intended.
for model_name in models:
    print("Generating Grid Search for model:", model_name)
    for dataset_type in dataset_types:
        print("\tUsing Dataset:", dataset_type)

        # Suffix used in the name of the CSV written by conduct_centriod_grid_search.
        write_name = f"{model_name}_{dataset_type}"

        # NOTE(review): pickle.load executes arbitrary code from the file - only
        # load files from a trusted source.
        centriods_path = r"..\MetaData\seals_centroids_{}.pkl".format(dataset_type)
        with open(centriods_path, "rb") as fp:
                    centriods = pickle.load(fp)
        
        # The returned DataFrame is discarded; only the CSV side effect is kept.
        conduct_centriod_grid_search(centriods, write_path, write_name)

Generating Grid Search for model: unfrozen
	Using Dataset: training


100%|██████████| 50/50 [00:00<00:00, 84.64it/s]


	Using Dataset: validation


100%|██████████| 16/16 [00:00<00:00, 134.13it/s]


	Using Dataset: testing


100%|██████████| 13/13 [00:00<00:00, 85.64it/s]


Generating Grid Search for model: frozen_v1
	Using Dataset: training


100%|██████████| 50/50 [00:00<00:00, 88.49it/s]


	Using Dataset: validation


100%|██████████| 16/16 [00:00<00:00, 122.99it/s]


	Using Dataset: testing


100%|██████████| 13/13 [00:00<00:00, 95.45it/s]


Generating Grid Search for model: frozen_v2
	Using Dataset: training


100%|██████████| 50/50 [00:00<00:00, 87.56it/s] 


	Using Dataset: validation


100%|██████████| 16/16 [00:00<00:00, 138.97it/s]


	Using Dataset: testing


100%|██████████| 13/13 [00:00<00:00, 99.48it/s]
