In [12]:
# Supporting Libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pickle
from torchvision import ops
from tqdm import tqdm

# Models
from sklearn.ensemble import RandomForestRegressor

In [13]:
import sys

# Silence every Python warning for the rest of the session, unless the
# interpreter was started with explicit -W options (sys.warnoptions is then
# non-empty and the user's choice is respected).
# NOTE(review): a blanket "ignore" also hides warnings raised by pandas,
# numpy and scikit-learn in the cells below (e.g. divide-by-zero in the
# metric helpers) -- consider narrowing the filter once the notebook is stable.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [14]:
def decode_prediction(prediction, 
                      score_threshold = 0.8, 
                      nms_iou_threshold = 0.2):
    """
    Filter one detection result by score and by non-maximum suppression.

    Inputs
        prediction: dict with "boxes", "scores" and "labels" entries. The
            values must support boolean/index selection and .numpy()
            (presumably torch tensors from a torchvision detection model).
        score_threshold: float or None. Only detections whose score is
            strictly greater than this value are kept; None skips the filter.
        nms_iou_threshold: float or None. IoU threshold handed to
            torchvision.ops.nms; None skips NMS.
    Returns
        prediction: tuple (boxes, labels, scores) of numpy arrays.
            Note the order: labels come before scores.
    """
    boxes = prediction["boxes"]
    scores = prediction["scores"]
    labels = prediction["labels"]
    # Remove any low-score predictions.
    if score_threshold is not None:
        want = scores > score_threshold
        boxes = boxes[want]
        scores = scores[want]
        labels = labels[want]
    # Remove any overlapping bounding boxes using NMS.
    if nms_iou_threshold is not None:
        want = ops.nms(boxes = boxes, scores = scores, iou_threshold = nms_iou_threshold)
        boxes = boxes[want]
        scores = scores[want]
        labels = labels[want]
    # Return unconditionally. Previously the return lived inside the NMS
    # branch, so calling with nms_iou_threshold=None silently returned None.
    return (boxes.numpy(), labels.numpy(), scores.numpy())

def get_best_counts(df, preds, thresh):
    """
    Count the boxes each image keeps when decoded at its own "Score" value.

    Inputs
        df: DataFrame with "File Name" and "Score" columns.
        preds: mapping from file name to an iterable of prediction dicts
            accepted by decode_prediction.
        thresh: forwarded to decode_prediction as its NMS IoU threshold.
    Returns
        best_possible_count: list of int, one entry per row of df in row order.
    """
    # Build the name -> score lookup once instead of re-filtering the whole
    # frame for every row. setdefault keeps the first occurrence of a name,
    # which is what the former `df[df["File Name"] == name]["Score"].values[0]`
    # lookup selected when a file name appeared more than once.
    score_by_name = {}
    for name, score in zip(df["File Name"], df["Score"].values):
        score_by_name.setdefault(name, score)

    best_possible_count = []
    for name in df["File Name"]:
        opt_score = score_by_name[name]
        b_count = 0
        for pred in preds[name]:
            # decode_prediction returns (boxes, labels, scores); only the
            # boxes are needed here.
            boxes = decode_prediction(pred, opt_score, thresh)[0]
            b_count += len(boxes)
        best_possible_count.append(b_count)
    return best_possible_count

def get_counts(df, preds, pred_scores, thresh):
    """
    Count the boxes kept for every image at a per-image score threshold.

    Inputs
        df: DataFrame with a "File Name" column.
        preds: mapping from file name to an iterable of prediction dicts
            accepted by decode_prediction.
        pred_scores: sequence indexed by row position; entry i is the score
            threshold used for row i of df.
        thresh: forwarded to decode_prediction as its NMS IoU threshold.
    Returns
        list of int, one box count per row of df in row order.
    """
    counts_per_image = []
    for position, image_name in enumerate(df["File Name"]):
        threshold = pred_scores[position]
        n_boxes = 0
        for prediction in preds[image_name]:
            # Only the first element of the decoded tuple (the boxes) is used.
            kept_boxes, _, _ = decode_prediction(prediction, threshold, thresh)
            n_boxes += len(kept_boxes)
        counts_per_image.append(n_boxes)
    return counts_per_image

def get_metrics(actual_count, pred_count):
    """
    Summarise absolute and relative error between two sets of counts.

    Inputs
        actual_count: array-like of reference counts.
        pred_count: array-like of predicted counts, same length.
    Returns
        tuple (average_error, average_percent_error, median_error,
        median_percent_error). The "percent" errors are fractions
        |actual - pred| / actual (not multiplied by 100). Wherever that ratio
        is not finite (e.g. the actual count is 0) the predicted count itself
        is used as the error for that entry.
    """
    ac = np.array(actual_count)
    pc = np.array(pred_count)
    abs_error = abs(ac - pc)
    # Division by a zero actual count yields NaN (0/0) or inf (k/0). The old
    # nan_to_num(..., nan=pc) call only substituted the NaN case and turned
    # inf into ~1.8e308, which wrecked the mean. Replace both with pc.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = abs_error / ac
    percent_error = np.where(np.isfinite(ratio), ratio, pc)
    average_error = abs_error.mean()
    average_percent_error = percent_error.mean()
    median_error = np.median(abs_error)
    median_percent_error = np.median(percent_error)
    return average_error, average_percent_error, median_error, median_percent_error

def eval_predictor(df, pred_info, pred_score, thresh, actual):
    """
    Score a set of predicted thresholds against a reference count column.

    Inputs
        df: DataFrame holding "File Name" plus the reference column.
        pred_info: mapping from file name to predictions (see get_counts).
        pred_score: per-row score thresholds (see get_counts).
        thresh: forwarded to get_counts.
        actual: truthy -> compare with "Actual Count";
                falsy  -> compare with "Best Possible Counts".
    Returns
        the tuple produced by get_metrics.
    """
    predicted = get_counts(df, pred_info, pred_score, thresh)
    reference_column = "Actual Count" if actual else "Best Possible Counts"
    return get_metrics(df[reference_column], predicted)
    
def sci_get_metrics(model, x_train, y_train, x_val, df_training, df_validation, training_preds, val_preds, thresh, actual):
    """
    Fit a model and evaluate its predicted thresholds on both data splits.

    Inputs
        model: estimator exposing fit(X, y) and predict(X).
        x_train, y_train: training features and target used for fitting.
        x_val: validation features.
        df_training, df_validation: frames passed to eval_predictor.
        training_preds, val_preds: prediction mappings passed to eval_predictor.
        thresh, actual: forwarded unchanged to eval_predictor.
    Returns
        tuple in this order:
        (train average error, train average percent error,
         val average error,   val average percent error,
         train median error,  train median percent error,
         val median error,    val median percent error)
    """
    model.fit(x_train, y_train)

    train_scores = model.predict(x_train)
    tr_mean_abs, tr_mean_pct, tr_med_abs, tr_med_pct = eval_predictor(
        df_training, training_preds, train_scores, thresh, actual
    )

    val_scores = model.predict(x_val)
    va_mean_abs, va_mean_pct, va_med_abs, va_med_pct = eval_predictor(
        df_validation, val_preds, val_scores, thresh, actual
    )

    return (
        tr_mean_abs, tr_mean_pct,
        va_mean_abs, va_mean_pct,
        tr_med_abs, tr_med_pct,
        va_med_abs, va_med_pct,
    )

def random_forest_grid_search(tree_sizes, tree_num, data_percentage, df_training, df_validation, training_preds, val_preds, thresh, actual = False):
    """
    Evaluate a RandomForestRegressor for every combination of three settings.

    Inputs
        tree_sizes: iterable of values used as max_features.
        tree_num: iterable of values used as n_estimators.
        data_percentage: iterable of values used as max_samples.
        df_training, df_validation: frames containing the columns "Score",
            "File Name", "Best Possible Counts", "Unnamed: 0" and
            "Actual Count"; every other column is used as a model feature.
            The regression target is the training frame's "Score" column.
        training_preds, val_preds: prediction mappings for sci_get_metrics.
        thresh, actual: forwarded unchanged to sci_get_metrics.
    Returns
        (mean_frame, median_frame): two DataFrames with one row per setting
        combination, holding the average-based and median-based errors.
    """
    setting_keys = ["Features", "Number of Trees", "Percentage Sampled"]
    mean_keys = setting_keys + ["Training Average Error", "Training Average Percent Error", "Validation Average Error", "Validation Average Percent Error"]
    median_keys = setting_keys + ["Training Median Error", "Training Median Percent Error", "Validation Median Error", "Validation Median Percent Error"]
    mean_table = {key: [] for key in mean_keys}
    median_table = {key: [] for key in median_keys}

    # Columns that are targets or bookkeeping rather than model inputs.
    bookkeeping = ["Score", "File Name", "Best Possible Counts", "Unnamed: 0", "Actual Count"]
    y_train = df_training["Score"]
    x_train = df_training.drop(columns = bookkeeping)
    x_val = df_validation.drop(columns = bookkeeping)

    for max_feats in tqdm(tree_sizes):
        for n_trees in tree_num:
            for sample_frac in data_percentage:
                forest = RandomForestRegressor(n_estimators=n_trees, max_features=max_feats, max_samples=sample_frac, random_state=0)
                results = sci_get_metrics(forest, x_train, y_train, x_val, df_training, df_validation, training_preds, val_preds, thresh, actual)
                train_ae, train_ape, val_ae, val_ape, train_me, train_mpe, val_me, val_mpe = results

                setting = [max_feats, n_trees, sample_frac]
                for key, value in zip(mean_keys, setting + [train_ae, train_ape, val_ae, val_ape]):
                    mean_table[key].append(value)
                for key, value in zip(median_keys, setting + [train_me, train_mpe, val_me, val_mpe]):
                    median_table[key].append(value)

    return pd.DataFrame(mean_table), pd.DataFrame(median_table)

def get_total_boxes(df, preds, thresh):
    """
    Count the boxes each image keeps when decoded with a score threshold of 0.

    Inputs
        df: DataFrame with a "File Name" column.
        preds: mapping from file name to an iterable of prediction dicts
            accepted by decode_prediction.
        thresh: forwarded to decode_prediction as its NMS IoU threshold.
    Returns
        list of int, one box total per row of df in row order.
    """
    totals = []
    for file_name in df["File Name"]:
        per_prediction = []
        for prediction in preds[file_name]:
            # Only the first element of the decoded tuple (the boxes) is used.
            kept_boxes, _, _ = decode_prediction(prediction, 0, thresh)
            per_prediction.append(len(kept_boxes))
        totals.append(sum(per_prediction))
    return totals

In [15]:
# Per-image feature tables, read relative to the notebook's working directory.
df_training = pd.read_csv("../Data/training_total_0.05_300_V2")
df_validation = pd.read_csv("../Data/validation_total_0.05_300_V2")
# NMS IoU threshold that the counting helpers forward to decode_prediction.
thresh = .05

# Machine-specific project root. Kept in one place so the notebook can be
# repointed by editing a single line; the resulting paths are unchanged.
rcnn_root = r"C:\Users\kaanan\Desktop\RCNN"
training_pred_path = rcnn_root + r"\MetaData\training_preds_total_V2"
val_pred_path = rcnn_root + r"\MetaData\validation_preds_total_V2"
grid_search_training_path = rcnn_root + r"\Data\grid_seach_training_total_V2.csv"
grid_search_val_path = rcnn_root + r"\Data\grid_seach_validation_total_V2.csv"

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load prediction dumps that were produced by a trusted run.
with open(training_pred_path, "rb") as fp:
    training_preds = pickle.load(fp)

with open(val_pred_path, "rb") as fp:
    val_preds = pickle.load(fp)

# NOTE(review): grid_train / grid_val are not referenced by the cells visible
# in this section -- confirm they are still needed further down.
grid_train = pd.read_csv(grid_search_training_path)
grid_val = pd.read_csv(grid_search_val_path)

In [16]:
# Per-image box total after NMS, keeping every box whose score is above 0
# (get_total_boxes decodes with a score threshold of 0). The column is added
# in place; random_forest_grid_search does not drop "Total Boxes", so it is
# used as a model feature for as long as it stays in the frame.
df_training["Total Boxes"] = get_total_boxes(df_training, training_preds, thresh)
df_validation["Total Boxes"] = get_total_boxes(df_validation, val_preds, thresh)

In [17]:
# Box count each image gets when decoded at the value in its own "Score"
# column. This column must exist before the grid search runs:
# random_forest_grid_search drops it from the feature matrix and, with
# actual=False, eval_predictor uses it as the reference count.
df_training["Best Possible Counts"] = get_best_counts(df_training, training_preds, thresh)
df_validation["Best Possible Counts"] = get_best_counts(df_validation, val_preds, thresh)

In [18]:
# Search grid (4 x 7 x 11 = 308 combinations).
# tree_sizes is passed to RandomForestRegressor as max_features.
tree_sizes = [2, 3, 4, 5]
# tree_num is passed as n_estimators.
tree_num = [200, 250, 300, 350, 400, 450, 500]
# percentage is passed as max_samples.
percentage = [.2, .25, .3, .35, .4, .45, .5, .55, .6,.65, .7]

# `actual` is left at its default (False), so errors are measured against the
# "Best Possible Counts" column rather than "Actual Count".
df_mean, df_median = random_forest_grid_search(tree_sizes, tree_num, percentage, df_training, df_validation, training_preds, val_preds, thresh)

100%|██████████| 4/4 [03:29<00:00, 52.44s/it]


In [19]:
# Rank the grid settings by validation average percent error, lowest first.
df_mean.sort_values(by="Validation Average Percent Error")

Unnamed: 0,Features,Number of Trees,Percentage Sampled,Training Average Error,Training Average Percent Error,Validation Average Error,Validation Average Percent Error
82,3,200,0.45,13.435644,0.234502,22.545455,0.298214
93,3,250,0.45,13.514851,0.242900,22.863636,0.301230
115,3,350,0.45,13.138614,0.237066,23.272727,0.302929
104,3,300,0.45,13.425743,0.239180,23.181818,0.303826
197,4,350,0.70,10.554455,0.228289,23.727273,0.306640
...,...,...,...,...,...,...,...
287,5,450,0.25,16.435644,0.275249,22.727273,0.402982
32,2,300,0.70,10.970297,0.224885,24.181818,0.403302
55,2,450,0.20,17.366337,0.276712,22.272727,0.406678
132,3,450,0.20,16.831683,0.274931,22.454545,0.413471


In [20]:
# No RGB values
# Keep only the box/cluster features plus the bookkeeping columns that
# random_forest_grid_search expects to drop (presumably the columns left out
# are the RGB features -- verify against the CSV header). Note that this
# selection also leaves out "Total Boxes", which an earlier cell added.
# The frames and df_mean / df_median are overwritten, so the results of the
# previous grid search are no longer available after this cell runs.
no_rgb_columns = ["File Name", "Box Num", "Score", "Actual Count", "Cluster Num", "Biggest Cluster", "Smallest Cluster", 'Unnamed: 0', 'Best Possible Counts']
df_training = df_training[no_rgb_columns]
df_validation = df_validation[no_rgb_columns]

# Use the notebook-wide `thresh` instead of repeating the literal .05, so this
# search always uses the same NMS threshold that produced "Best Possible Counts".
df_mean, df_median = random_forest_grid_search(tree_sizes, tree_num, percentage, df_training, df_validation, training_preds, val_preds, thresh)

100%|██████████| 4/4 [03:28<00:00, 52.22s/it]


In [21]:
# Rank the reduced-feature grid settings by validation average percent error,
# lowest first.
df_mean.sort_values(by="Validation Average Percent Error")

Unnamed: 0,Features,Number of Trees,Percentage Sampled,Training Average Error,Training Average Percent Error,Validation Average Error,Validation Average Percent Error
263,5,300,0.70,10.712871,0.241739,26.136364,0.281432
186,4,300,0.70,10.712871,0.241739,26.136364,0.281432
307,5,500,0.70,10.693069,0.239018,26.454545,0.282554
230,4,500,0.70,10.693069,0.239018,26.454545,0.282554
197,4,350,0.70,10.643564,0.240923,26.636364,0.283049
...,...,...,...,...,...,...,...
58,2,450,0.35,12.871287,0.259148,24.818182,0.336074
73,2,500,0.55,11.524752,0.251331,26.681818,0.336680
47,2,400,0.35,12.940594,0.258485,24.818182,0.339249
25,2,300,0.35,13.069307,0.258727,25.136364,0.340474
