In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil 
import glob
import json
import re
import cv2
from PIL import Image
from sklearn.metrics import mean_absolute_percentage_error
import seaborn as sns
%matplotlib inline

In [2]:
# Every path used in this notebook is resolved relative to ROOT.
ROOT = "../"
IMG_DIR = os.path.join(
    ROOT,
    "species_labelling",
    "export_annotated_data",
)
CODE_DIR = os.path.join(ROOT, 'code')

## Load Ground Truth

In [None]:
def load_ground_truth(foldername=os.path.join(ROOT,"data/") , filename="test_labels4-1.csv"): 
    """
    Pkg dependencies: os, pandas
    Purpose: Read the ground-truth labels CSV and discard its first column.
    Inputs:
        foldername: directory that contains the labels CSV
        filename: name of the labels CSV inside `foldername`
    Outputs: pd.DataFrame holding every column of the CSV except the first
    """
    # os.path.join (rather than string concatenation) so `foldername` works
    # with or without a trailing path separator.
    ground_truth = pd.read_csv(os.path.join(foldername, filename))
    # The first column is dropped positionally
    # (presumably a saved index column -- TODO confirm against the CSV).
    return ground_truth.iloc[:, 1:]

In [3]:
def load_megadetector_output(foldername="src/", filename='model_output_11202021_4.csv'):
    """
    Pkg dependencies: os, pandas
    Purpose: Read the model-output CSV, keep the rows with model_id == 5 and
             reshape them to one row per image (three image slots per CSV row).
    Inputs:
        foldername: directory, relative to ROOT, that contains the CSV
        filename: name of the model-output CSV
    Outputs: pd.DataFrame with the columns
        event_id            - the row's image_group_id
        image_id            - event_id concatenated with the slot's image id
        detections          - list of {'bbox': str, 'conf': str} dicts
        yolo                - list of the bbox strings
        count               - number of bboxes
        all_conf            - list of the confidence strings
        max_detection_conf  - numerically largest confidence string, or 0 when
                              there are no detections
        all_class_pred      - list of 1s, one per bbox
    """

    def split_cell(raw):
        # An empty CSV cell is read by pandas as NaN. Anything else is a
        # ';'-separated list; str() also covers a cell pandas parsed as a number.
        return [] if pd.isna(raw) else str(raw).split(';')

    output_file = pd.read_csv(os.path.join(ROOT,foldername, filename))
    megadetector = output_file[output_file['model_id'] == 5]

    records = []
    for _, value in megadetector.iterrows():
        for image in range(1,4):
            bboxes = split_cell(value['image_id_{}_bbox'.format(image)])
            confs = split_cell(value['image_id_{}_conf'.format(image)])
            records.append({
                'event_id': value['image_group_id'],
                'image_id': value['image_id_{}'.format(image)],
                # BBOXES and CONF paired up as a list of dicts
                'detections': [{'bbox': bbox, 'conf': conf}
                               for bbox, conf in zip(bboxes, confs)],
            })

    megadetector_df = pd.DataFrame(records, columns=['event_id', 'image_id', 'detections'])

    #Remove rows where there was no image for the event.
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice (SettingWithCopyWarning).
    megadetector_df = megadetector_df[~megadetector_df['image_id'].isnull()].copy()

    megadetector_df['yolo'] = megadetector_df['detections'].apply(
        lambda detections: [d['bbox'] for d in detections])
    megadetector_df['count'] = megadetector_df['yolo'].apply(len)
    megadetector_df['all_conf'] = megadetector_df['detections'].apply(
        lambda detections: [d['conf'] for d in detections])
    # Confidences are strings, so compare them by numeric value; a plain max()
    # would order them lexicographically (e.g. '9e-05' > '0.5').
    megadetector_df['max_detection_conf'] = megadetector_df['all_conf'].apply(
        lambda confs: max(confs, key=float) if len(confs) > 0 else 0)
    megadetector_df['all_class_pred'] = megadetector_df['count'].apply(lambda n: [1]*n)
    megadetector_df['image_id'] = megadetector_df['event_id'] + megadetector_df['image_id']

    return megadetector_df

  

In [4]:
# Load the per-image MegaDetector rows from the function's default CSV location.
megadetector = load_megadetector_output()

In [5]:
def load_yolo_output(foldername="src/", filename='model_output_11202021_4.csv'):
    """
    Pkg dependencies: os, pandas
    Purpose: Read the model-output CSV, keep the rows with model_id == 3 and
             reshape them to one row per image (three image slots per CSV row).
    Inputs:
        foldername: directory, relative to ROOT, that contains the CSV
        filename: name of the model-output CSV
    Outputs: pd.DataFrame with the columns
        event_id    - the row's image_group_id
        image_id    - event_id concatenated with the slot's image id
        yolo_bbox   - list of bbox strings, one per detection
        yolo_count  - number of bboxes
    """

    output_file = pd.read_csv(os.path.join(ROOT,foldername, filename))
    yolo = output_file[output_file['model_id'] == 3]

    image_group_ids = []
    image_ids = []
    detection_list = []

    for _, value in yolo.iterrows():
        for image in range(1,4):
            image_group_ids.append(value['image_group_id'])
            image_ids.append(value['image_id_{}'.format(image)])

            #BBOXES
            detection_int_string = value['image_id_{}_bbox'.format(image)]
            if pd.isna(detection_int_string):
                # An empty CSV cell is read by pandas as NaN: no detections.
                detection_list.append([])
            else:
                # One entry per bbox. The previous loop appended the whole
                # split list once per bbox, nesting n copies of all n boxes.
                detection_list.append(str(detection_int_string).split(';'))

    yolov5 = pd.DataFrame({'event_id': image_group_ids,
                 'image_id': image_ids,
                 'yolo_bbox':detection_list})

    yolov5['yolo_count'] = yolov5['yolo_bbox'].apply(len)
    yolov5['image_id'] = yolov5['event_id'] + yolov5['image_id']

    return yolov5
        


In [6]:
# Load the per-image YOLO rows from the function's default CSV location.
yolov5 = load_yolo_output()

## Merge MD and Yolo and apply logic

Use the YOLO prediction unless MegaDetector (MD) has fewer boxes.

In [296]:
def merge_md_yolo(yolo_df, megadetector_df):
    """
    Pkg dependencies: pandas
    Purpose: Join the per-image MegaDetector and YOLO frames on image_id and
             collapse them to one row per event.
    Inputs:
        yolo_df: pd.DataFrame with columns image_id, yolo_bbox, yolo_count
        megadetector_df: pd.DataFrame with columns event_id, image_id, yolo, count
    Outputs: pd.DataFrame, one row per event_id (sorted by event_id), with columns
        event_id
        md_count_max    - largest MegaDetector count over the event's images
        yolo_count_max  - largest YOLO count over the event's images
        md_bbox         - dict {image suffix: MegaDetector bboxes}
        yolo_bbox       - dict {image suffix: YOLO bboxes}
    """

    # Merge megadetector to YOLO by "image_id"
    megadetector_df = megadetector_df.rename(columns = {'yolo': 'md_bbox',
                                                   'count': 'md_count'})

    merged_raw = megadetector_df[['event_id','image_id','md_bbox', 'md_count']].merge(
        yolo_df[['image_id', 'yolo_bbox', 'yolo_count']], left_on="image_id", right_on="image_id")

    # The dict keys are the part of image_id that follows the event_id prefix.
    # This is computed here from the function's own inputs; the earlier version
    # read an 'image_id_appendix' column from a notebook global instead.
    merged_raw['image_id_appendix'] = [
        str(image_id)[len(str(event_id)):]
        for event_id, image_id in zip(merged_raw['event_id'], merged_raw['image_id'])
    ]

    # Group by event (there should be 3 images each), take the max count across
    # the images that compose the event and collect their bboxes per image.
    records = []
    for event_id, values in merged_raw.groupby('event_id'):
        records.append({
            'event_id': event_id,
            'md_count_max': values['md_count'].max(),
            'yolo_count_max': values['yolo_count'].max(),
            'md_bbox': dict(zip(values['image_id_appendix'], values['md_bbox'])),
            'yolo_bbox': dict(zip(values['image_id_appendix'], values['yolo_bbox'])),
        })

    final_counts_bboxes = pd.DataFrame(
        records,
        columns=['event_id', 'md_count_max', 'yolo_count_max', 'md_bbox', 'yolo_bbox'])

    #Now implementing final ensemble logic in full_ensemble.py

    return final_counts_bboxes



In [297]:
# Merge the YOLO and MegaDetector frames loaded above.
counts_bboxes = merge_md_yolo(yolov5, megadetector)

In [298]:
# Display the merged frame.
counts_bboxes

Unnamed: 0,event_id,md_count_max,yolo_count_max,md_bbox,yolo_bbox
0,2008329,0,1,"{'A': [], 'C': [], 'B': []}","{'A': [[0.5517, 0.3845, 0.0638, 0.1064]], 'C':..."
1,2009625,0,0,"{'C': [], 'A': [], 'B': []}","{'C': [], 'A': [], 'B': []}"
2,2010025,0,0,"{'A': [], 'B': [], 'C': []}","{'A': [], 'B': [], 'C': []}"
3,2010778,0,0,"{'C': [], 'B': [], 'A': []}","{'C': [], 'B': [], 'A': []}"
4,2010824,0,0,"{'B': [], 'A': [], 'C': []}","{'B': [[0.8921, 0.4316, 0.1064, 0.079]], 'A': ..."
...,...,...,...,...,...
1649,SSWI000000023457314,2,1,"{'A': [[0.2722, 0.9346, 0.1989, 0.125]], 'C': ...","{'A': [[0.2751, 0.9362, 0.1915, 0.1277]], 'C':..."
1650,SSWI000000023457319,1,1,"{'A': [[0.4269, 0.9028, 0.2354, 0.1802]], 'B':...","{'A': [[0.4179, 0.9164, 0.2401, 0.1672]], 'B':..."
1651,SSWI000000023494506,1,1,"{'A': [[0.7697, 0.7392, 0.1879, 0.1302]], 'B':...","{'A': [[0.772, 0.7416, 0.1824, 0.1277]], 'B': ..."
1652,SSWI000000023514111,1,1,"{'A': [[0.8752, 0.7003, 0.2144, 0.2322]], 'C':...","{'A': [[0.8723, 0.266, 0.2553, 0.5076], [0.873..."


In [290]:

# Scratch version of the per-event grouping loop that also appears inside merge_md_yolo.
# It fills five parallel lists: one entry per (event_id, md_count_max, yolo_count_max) group.
# NOTE(review): this reads an 'image_id_appendix' column from counts_bboxes, but the
# counts_bboxes returned by merge_md_yolo in this notebook has only the columns event_id,
# md_count_max, yolo_count_max, md_bbox and yolo_bbox. The cell appears to rely on an
# earlier kernel state (its execution count, 290, precedes the cell that assigns
# counts_bboxes) -- confirm before re-running in order.
event_id_group = []
md_count_max_group = []
yolo_count_max_group = []

md_bbox_group = []
yolo_bbox_group = []

for group, values in counts_bboxes.groupby(['event_id', 'md_count_max', 'yolo_count_max']):

    # group is the (event_id, md_count_max, yolo_count_max) key tuple
    event_id_group.append(group[0])
    md_count_max_group.append(group[1])
    yolo_count_max_group.append(group[2])
    
    md_bbox_dict = {}
    yolo_bbox_dict = {}
    
    # Map each image suffix in the group to that image's bboxes, per detector
    for image, md, yolo in zip(list(values['image_id_appendix']), list(values['md_bbox']), list(values['yolo_bbox'])):
        md_bbox_dict[image] = md
        yolo_bbox_dict[image] = yolo
        
       
    md_bbox_group.append(md_bbox_dict)
    yolo_bbox_group.append(yolo_bbox_dict)

In [295]:
# Assemble the five parallel per-event lists into a frame and display it (not assigned).
pd.DataFrame({'event_id': event_id_group,
               'md_count_max': md_count_max_group,
              'yolo_count_max': yolo_count_max_group,
              'md_bbox': md_bbox_group,
             'yolo_bbox': yolo_bbox_group})

Unnamed: 0,event_id,md_count_max,yolo_count_max,md_bbox,yolo_bbox
0,2008329,0,1,"{'A': [], 'C': [], 'B': []}","{'A': [[0.5517, 0.3845, 0.0638, 0.1064]], 'C':..."
1,2009625,0,0,"{'C': [], 'A': [], 'B': []}","{'C': [], 'A': [], 'B': []}"
2,2010025,0,0,"{'A': [], 'B': [], 'C': []}","{'A': [], 'B': [], 'C': []}"
3,2010778,0,0,"{'C': [], 'B': [], 'A': []}","{'C': [], 'B': [], 'A': []}"
4,2010824,0,0,"{'B': [], 'A': [], 'C': []}","{'B': [[0.8921, 0.4316, 0.1064, 0.079]], 'A': ..."
...,...,...,...,...,...
1649,SSWI000000023457314,2,1,"{'A': [[0.2722, 0.9346, 0.1989, 0.125]], 'C': ...","{'A': [[0.2751, 0.9362, 0.1915, 0.1277]], 'C':..."
1650,SSWI000000023457319,1,1,"{'A': [[0.4269, 0.9028, 0.2354, 0.1802]], 'B':...","{'A': [[0.4179, 0.9164, 0.2401, 0.1672]], 'B':..."
1651,SSWI000000023494506,1,1,"{'A': [[0.7697, 0.7392, 0.1879, 0.1302]], 'B':...","{'A': [[0.772, 0.7416, 0.1824, 0.1277]], 'B': ..."
1652,SSWI000000023514111,1,1,"{'A': [[0.8752, 0.7003, 0.2144, 0.2322]], 'C':...","{'A': [[0.8723, 0.266, 0.2553, 0.5076], [0.873..."


In [None]:
# Per-group maximum of 'final_count', with the group key restored as a column.
# NOTE(review): the counts_bboxes returned by merge_md_yolo has neither an
# 'image_group_id' nor a 'final_count' column, and this cell has no execution count --
# presumably left over from an earlier version of the frame; confirm before running.
counts_bboxes.groupby('image_group_id').max()[['final_count']].reset_index()

## Merge YOLO, Megadetector and ground truth dataframes

In [None]:
def merge_all(yolo_df, megadetector_df, ground_truth_df): 
    """
    Pkg dependencies: pandas 
    Purpose: Combine the per-image YOLO and MegaDetector frames with the ground
             truth, once at image level and once at event level.
    Inputs: YOLO pd.DataFrame, Megadetector pd.DataFrame, ground truth pd.DataFrame
        yolo_df: columns event_id, image_id, yolo_bbox, yolo_count
        megadetector_df: columns event_id, image_id, yolo, count, all_conf,
                         max_detection_conf, all_class_pred
        ground_truth_df: columns TRIGGER_ID, CLASS_SPECIES, CLASS_SPECIES_RESTATED, Total
    Outputs: Merged pd.DataFrame of YOLO, Megadetector and ground truth
        final_raw: one row per (ground-truth row, image), sorted by event_id
                   with a fresh 0..n-1 index
        merged_final: one row per ground-truth row, carrying the event's max
                      yolo_count / md_count and their pairwise differences,
                      sorted by TRIGGER_ID
    """

    # Per-image join of the two detectors. Both frames carry 'event_id', so
    # pandas suffixes them; only the MegaDetector copy is kept.
    per_image = (
        megadetector_df.merge(yolo_df, left_on="image_id", right_on="image_id")
        .drop(columns=['event_id_y'])
        .rename(columns={'event_id_x': 'event_id'})
    )

    ### FIRST OUTPUT ###
    # Merge all - each event repeats once per image (there should be 3)
    final_raw = (
        ground_truth_df.merge(per_image, left_on="TRIGGER_ID", right_on="event_id")
        .rename(columns={'count':'md_count', 'Total':'ground_truth_count', 'yolo':'md_bbox',
                         'all_class_pred':'md_all_class_pred', 'all_conf':'md_all_conf',
                         'max_detection_conf': 'md_max_detection_conf'})
        # The sorted frame is kept; previously the result of sort_values/reset_index
        # was discarded, so the sort had no effect.
        .sort_values(by="event_id", kind="mergesort")
        .reset_index(drop=True)
    )
    final_raw = final_raw[['event_id','CLASS_SPECIES','CLASS_SPECIES_RESTATED','md_all_class_pred','md_all_conf','md_max_detection_conf', \
               'md_bbox','yolo_bbox','ground_truth_count', 'md_count','yolo_count']]

    ### SECOND OUTPUT ###
    # Take the max count across the images that compose each event. Only the two
    # count columns are aggregated: the other columns hold lists or mixed
    # str/int values, for which max is not meaningful.
    # Grouping uses event_id directly instead of image_id minus its last
    # character, which is the same key whenever the image suffix is one character.
    event_counts = (
        per_image.groupby('event_id', as_index=False)[['count', 'yolo_count']].max()
        .rename(columns={'count': 'md_count'})
    )

    # Merge ground truth to the per-event counts in a single step. Merging it to
    # each detector separately and then joining the two results on TRIGGER_ID
    # multiplies rows whenever a TRIGGER_ID occurs more than once.
    merged_final = (
        ground_truth_df.merge(event_counts, left_on="TRIGGER_ID", right_on="event_id")
        .rename(columns={"Total":"ground_truth_count"})
    )
    merged_final = merged_final[['TRIGGER_ID', 'CLASS_SPECIES', "CLASS_SPECIES_RESTATED",
                                 "ground_truth_count", "yolo_count", "md_count"]]
    merged_final = merged_final.sort_values(by="TRIGGER_ID", kind="mergesort")

    # Differences across each of Choose 2 of 3
    merged_final = merged_final.assign(
        md_gt_diff=merged_final['md_count'] - merged_final['ground_truth_count'],
        yolo_gt_diff=merged_final['yolo_count'] - merged_final['ground_truth_count'],
        md_yolo_diff=merged_final['md_count'] - merged_final['yolo_count'],
    )

    return final_raw, merged_final

# Duplicates
# print(merged_final[merged_final.duplicated()].shape)

# ground_truth[ground_truth.duplicated()]
# ground_truth[ground_truth.TRIGGER_ID.duplicated(keep=False)].shape
# merged_final['Total'].sum()
# ground_truth[ground_truth.TRIGGER_ID.duplicated(keep=False)]

In [None]:
# ground_truth is not assigned by any earlier cell, so load it here; otherwise
# this cell raises NameError on a fresh kernel.
ground_truth = load_ground_truth()
final_raw, merged_final = merge_all(yolov5, megadetector, ground_truth)

In [None]:
# (rows, columns) of the image-level merge.
final_raw.shape

In [None]:
# (rows, columns) of the event-level merge.
merged_final.shape

In [None]:
# Display the image-level merge.
final_raw