In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil 
import glob
import json
import re
import cv2
from PIL import Image
from sklearn.metrics import mean_absolute_percentage_error
import seaborn as sns
%matplotlib inline

In [2]:
ROOT = '../'
IMG_DIR = os.path.join(ROOT, 'species_labelling', 'export_annotated_data')
CODE_DIR = os.path.join(ROOT, "code")

## Load Ground Truth

In [3]:
def load_ground_truth(foldername=os.path.join(ROOT,"data/") , filename="test_labels4-1.csv"): 
    ground_truth = pd.read_csv(foldername + filename)
    ground_truth = ground_truth.iloc[:,1:]
    return ground_truth

In [5]:
def load_megadetector_output(foldername="results/JSON_txt_outputs/", filename='phase2_megadetector_classifications_yolosplits_4-1_YOLO.json'):#filename="phase2_megadetector_output_YOLO.json"): 
    """
    Pkg dependencies: os, glob, re, pandas
    Purpose: 
    Inputs: 
    Outputs: 
    """

    with open(os.path.join(ROOT,foldername, filename), 'r') as fin: 
        fobj = fin.read()
        megadetector = json.loads(fobj)

    event_list = []
    img_list = []
    detection_list = []

    for event, image_set in megadetector['phase2_classification_results'].items():
        for image in image_set:
            event_list.append(image['event_id'])
            img_list.append(image['img_id'])
            detection_list.append(image['detections'])

    megadetector_df = pd.DataFrame({'event_id': event_list,
                  'image_id':img_list, 
                  'detections':detection_list})

    def extract_yolo(list_of_detections):
        yolo_list = []

        for i in list_of_detections:
            yolo_list.append(i['bbox'])
        return yolo_list

    megadetector_df['yolo'] = megadetector_df['detections'].apply(lambda x: extract_yolo(x))
    megadetector_df['count'] = megadetector_df['yolo'].apply(lambda x: len(x))

    def extract_conf(list_of_detections):
        conf_list = []

        for i in list_of_detections:
            conf_list.append(i['conf'])
        return conf_list

    megadetector_df['all_conf'] = megadetector_df['detections'].apply(lambda x: extract_conf(x))
    megadetector_df['max_detection_conf'] = megadetector_df['all_conf'].apply(lambda x:  max(x) if len(x) > 0 else 0)
    megadetector_df['all_class_pred'] = megadetector_df['count'].apply(lambda x:[1]*x)
    megadetector_df.loc[:, "length"] = megadetector_df['image_id'].apply(lambda x: len(x))
    for row, val in megadetector_df.length.items():
        if val == 24: 
            megadetector_df.loc[row, 'image_id'] = megadetector_df.loc[row, 'image_id'][:-4]
        else: 
            megadetector_df.loc[row, 'image_id'] = megadetector_df.loc[row, 'image_id'][:-5]
    
    megadetector_df.drop(columns=['length'], inplace=True)
    
    return megadetector_df

  

In [6]:
megadetector = load_megadetector_output()

In [7]:
megadetector.sort_values(by='image_id')

Unnamed: 0,event_id,image_id,detections,yolo,count,all_conf,max_detection_conf,all_class_pred
966,2008329,2008329_0A,[],[],0,[],0.000,[]
968,2008329,2008329_1B,[],[],0,[],0.000,[]
967,2008329,2008329_2C,[],[],0,[],0.000,[]
1153,2009625,2009625_0A,[],[],0,[],0.000,[]
1154,2009625,2009625_1B,[],[],0,[],0.000,[]
...,...,...,...,...,...,...,...,...
1820,SSWI000000023514111,SSWI000000023514111B,"[{'bbox': [0.8145, 0.6238, 0.1178, 0.2669], 'c...","[[0.8145, 0.6238, 0.1178, 0.2669]]",1,[0.999],0.999,[1]
1819,SSWI000000023514111,SSWI000000023514111C,"[{'bbox': [0.8147, 0.6279, 0.1137, 0.2435], 'c...","[[0.8147, 0.6279, 0.1137, 0.2435]]",1,[0.998],0.998,[1]
1577,SSWI000000023514155,SSWI000000023514155A,"[{'bbox': [0.8724, 0.622, 0.1378, 0.2855], 'co...","[[0.8724, 0.622, 0.1378, 0.2855]]",1,[0.999],0.999,[1]
1576,SSWI000000023514155,SSWI000000023514155B,"[{'bbox': [0.6836, 0.5465, 0.1399, 0.1251], 'c...","[[0.6836, 0.5465, 0.1399, 0.1251]]",1,[0.997],0.997,[1]


In [8]:
def split_and_convert(s):
    """
    Purpose: Utility function used in load_yolo_output function for bounding box.  
    """
    new = []
    out = s.split(',')
    for i in out: 
        new.append(round(float(i), 4))
    return new
    
def load_yolo_output(foldername="results/JSON_txt_outputs/", filename="phase2_yolo_yolosplits4_1.txt"):
    """
    Pkg dependencies: os, glob, re, pandas
    Purpose: 
    Inputs: 
    Outputs: 
    
    """
 


    # Load yolo model output file 
    with open(os.path.join(ROOT, foldername, filename), 'r') as fin: 
        yolov5 = fin.readlines()

    # Parse through file and pick out filename and bounding box
    filenames = []
    bbox = []
    for line_num, line in enumerate(yolov5):
        newline = line.split("\n")[0]
        semicolon_idxs = [m.start() for m in re.finditer(";", newline)]
        bbox_start, bbox_end = re.search(r"Bbox\[list]:", newline).start(), re.search(r"Bbox\[list]:", newline).end()

        for i, idx in list(zip(range(0,len(semicolon_idxs)), semicolon_idxs)): 
            # Filename
            if i == 0:
                filenames.append(newline[:idx].split("Filename: ")[1])#.lstrip()[:-4])

        # Yolo Bounding box
        bbox_data = newline[bbox_end:].lstrip().split(';')[:-1]
        if len(bbox_data) == 0:
            bbox.append([])
        else: 
            subl = [split_and_convert(i) for i in bbox_data]
            bbox.append(subl)

    # Construct DataFrame
    yolov5 = pd.DataFrame([pd.Series(filenames, name="image_id"), pd.Series(bbox, name="yolo_bbox")]).T
    
    yolov5.loc[:, "length"] = yolov5['image_id'].apply(lambda x: len(x))
    for row, val in yolov5.length.items():
        if val == 24: 
            yolov5.loc[row, 'image_id'] = yolov5.loc[row, 'image_id'][:-4]
            yolov5.loc[row, 'event_id'] = yolov5.loc[row, 'image_id'][:-1]
        else: 
            yolov5.loc[row, 'image_id'] = yolov5.loc[row, 'image_id'][:-5]
            yolov5.loc[row, 'event_id'] = yolov5.loc[row, 'image_id'][:-3]
    
    yolov5.sort_values(by="image_id", inplace=True, ignore_index=True)
    yolov5['yolo_count'] = yolov5['yolo_bbox'].apply(lambda x: len(x))
    yolov5.drop(columns=['length'], inplace=True)
    yolov5 = yolov5[['event_id', 'image_id','yolo_bbox','yolo_count']]

    return yolov5
        


In [9]:
yolov5 = load_yolo_output()

In [10]:
yolov5

Unnamed: 0,event_id,image_id,yolo_bbox,yolo_count
0,2008329,2008329_0A,"[[0.5517, 0.3845, 0.0638, 0.1064]]",1
1,2008329,2008329_1B,"[[0.5532, 0.3815, 0.0547, 0.0881]]",1
2,2008329,2008329_2C,[],0
3,2009625,2009625_0A,[],0
4,2009625,2009625_1B,[],0
...,...,...,...,...
4956,SSWI000000023514111,SSWI000000023514111B,"[[0.8161, 0.6261, 0.1185, 0.2675]]",1
4957,SSWI000000023514111,SSWI000000023514111C,"[[0.8131, 0.6064, 0.1064, 0.2036]]",1
4958,SSWI000000023514155,SSWI000000023514155A,"[[0.8769, 0.6261, 0.1307, 0.2736]]",1
4959,SSWI000000023514155,SSWI000000023514155B,"[[0.6839, 0.5426, 0.1581, 0.1368]]",1


In [84]:
megadetector_df = megadetector
yolo_df = yolov5

megadetector_df = megadetector_df.rename(columns = {'yolo': 'md_bbox',
                                                   'count': 'md_count'})

final = megadetector_df[['event_id','image_id','md_bbox', 'md_count']].merge(yolo_df[['image_id', 'yolo_bbox', 'yolo_count']], left_on="image_id", right_on="image_id")

# Group by imageid (there should be 3), take the max count across the imageid that compose the event
gby_eventid_counts = final[['event_id', 'md_count', 'yolo_count']].groupby(by='event_id').agg('max')
#counts_md = gby_imageid['count']
#counts_yolo = gby_imageid['yolo_count']


In [88]:
gby_eventid_counts = gby_eventid_counts.rename(columns = {'md_count': 'md_count_max',
                                    'yolo_count': 'yolo_count_max'})

In [89]:
final_counts = pd.merge(final, gby_eventid_counts,
         on = 'event_id', how = 'left')

In [95]:
final_counts['final_count'] = final_counts.apply(lambda x: x['yolo_count_max'] if x['yolo_count_max'] < x['md_count_max'] else x['md_count_max'], axis = 1)
                                                 
final_counts['final_bbox'] = final_counts.apply(lambda x: x['yolo_bbox'] if x['yolo_count_max'] < x['md_count_max'] else x['md_bbox'], axis = 1)
                                                    
                                                 

In [96]:
final_counts[[['event_id', 'image_id', 'final_count', 'final_bbox']]

Unnamed: 0,event_id,image_id,md_bbox,md_count,yolo_bbox,yolo_count,md_count_max,yolo_count_max,final_count,final_bbox
0,SSWI000000019807656,SSWI000000019807656A,"[[0.5436, 0.4957, 0.8986, 0.9914], [0.8767, 0....",10,"[[0.6033, 0.6429, 0.5502, 0.5988], [0.3389, 0....",4,10,4,4,"[[0.6033, 0.6429, 0.5502, 0.5988], [0.3389, 0...."
1,SSWI000000019807656,SSWI000000019807656B,"[[0.8367, 0.527, 0.3227, 0.8099], [0.2321, 0.2...",4,"[[0.8343, 0.6231, 0.3252, 0.6018], [0.2204, 0....",3,10,4,4,"[[0.8343, 0.6231, 0.3252, 0.6018], [0.2204, 0...."
2,SSWI000000019807656,SSWI000000019807656C,"[[0.8674, 0.3758, 0.2631, 0.4973], [0.2041, 0....",5,"[[0.2188, 0.2903, 0.1459, 0.1185], [0.8647, 0....",2,10,4,4,"[[0.2188, 0.2903, 0.1459, 0.1185], [0.8647, 0...."
3,SSWI000000020162423,SSWI000000020162423B,"[[0.9051, 0.7641, 0.1866, 0.4715]]",1,"[[0.9058, 0.7584, 0.1884, 0.4833]]",1,1,1,1,"[[0.9051, 0.7641, 0.1866, 0.4715]]"
4,SSWI000000020162423,SSWI000000020162423A,"[[0.8962, 0.7596, 0.2074, 0.4778]]",1,"[[0.8906, 0.7523, 0.2188, 0.4954]]",1,1,1,1,"[[0.8962, 0.7596, 0.2074, 0.4778]]"
...,...,...,...,...,...,...,...,...,...,...
4956,SSWI000000007972320,SSWI000000007972320A,"[[0.2733, 0.7588, 0.2554, 0.272], [0.1058, 0.4...",2,"[[0.2736, 0.7508, 0.2736, 0.2553]]",1,2,2,2,"[[0.2733, 0.7588, 0.2554, 0.272], [0.1058, 0.4..."
4957,SSWI000000007972320,SSWI000000007972320B,"[[0.5083, 0.6616, 0.355, 0.4526], [0.1192, 0.4...",2,"[[0.421, 0.6733, 0.538, 0.4529]]",1,2,2,2,"[[0.5083, 0.6616, 0.355, 0.4526], [0.1192, 0.4..."
4958,SSWI000000011771136,SSWI000000011771136C,"[[0.2859, 0.3655, 0.2553, 0.3508], [0.47, 0.36...",4,"[[0.1216, 0.3009, 0.0365, 0.0851], [0.4681, 0....",4,4,4,4,"[[0.2859, 0.3655, 0.2553, 0.3508], [0.47, 0.36..."
4959,SSWI000000011771136,SSWI000000011771136A,"[[0.4886, 0.3638, 0.3084, 0.4075], [0.0562, 0....",2,"[[0.0593, 0.3419, 0.0638, 0.2036], [0.4954, 0....",2,4,4,4,"[[0.4886, 0.3638, 0.3084, 0.4075], [0.0562, 0...."


## Merge MD and Yolo and apply logic

Use yolo prediction unless MD has less boxes

In [122]:
def merge_md_yolo(yolo_df, megadetector_df):
    """
    Pkg dependencies: pandas
    Purpose:
    Inputs: YOLO pd.DataFrame, Megadetector pd.DataFrame, ground truth pd.DataFrame
    Outputs: Merged pd.DataFrame of YOLO, Megadetector and ground truth
    """

    # Merge megadetector to YOLO by "image_id"
    megadetector_df = megadetector_df.rename(columns = {'yolo': 'md_bbox',
                                                   'count': 'md_count'})

    merged_raw = megadetector_df[['event_id','image_id','md_bbox', 'md_count']].merge(yolo_df[['image_id', 'yolo_bbox', 'yolo_count']], left_on="image_id", right_on="image_id")

    # Group by imageid (there should be 3), take the max count across the imageid that compose the event
    gby_eventid_counts = merged_raw[['event_id', 'md_count', 'yolo_count']].groupby(by='event_id').agg('max')
    gby_eventid_counts = gby_eventid_counts.rename(columns = {'md_count': 'md_count_max',
                                    'yolo_count': 'yolo_count_max'})

    final_counts = pd.merge(final, gby_eventid_counts,
         on = 'event_id', how = 'left')

    final_counts['final_count'] = final_counts.apply(lambda x: x['yolo_count_max'] if x['yolo_count_max'] < x['md_count_max'] else x['md_count_max'], axis = 1)

    final_counts['final_bbox'] = final_counts.apply(lambda x: x['yolo_bbox'] if x['yolo_count_max'] < x['md_count_max'] else x['md_bbox'], axis = 1)



    return final_counts[['event_id', 'image_id', 'final_count', 'final_bbox']]



In [123]:
counts_bboxes = merge_md_yolo(yolov5, megadetector)

In [124]:
yolov5

Unnamed: 0,event_id,image_id,yolo_bbox,yolo_count
0,2008329,2008329_0A,"[[0.5517, 0.3845, 0.0638, 0.1064]]",1
1,2008329,2008329_1B,"[[0.5532, 0.3815, 0.0547, 0.0881]]",1
2,2008329,2008329_2C,[],0
3,2009625,2009625_0A,[],0
4,2009625,2009625_1B,[],0
...,...,...,...,...
4956,SSWI000000023514111,SSWI000000023514111B,"[[0.8161, 0.6261, 0.1185, 0.2675]]",1
4957,SSWI000000023514111,SSWI000000023514111C,"[[0.8131, 0.6064, 0.1064, 0.2036]]",1
4958,SSWI000000023514155,SSWI000000023514155A,"[[0.8769, 0.6261, 0.1307, 0.2736]]",1
4959,SSWI000000023514155,SSWI000000023514155B,"[[0.6839, 0.5426, 0.1581, 0.1368]]",1


In [116]:
counts_bboxes.groupby('image_group_id').max()[['final_count']].reset_index()

Unnamed: 0,image_group_id,final_count
0,2008329,0
1,2009625,0
2,2010025,0
3,2010778,0
4,2010824,0
...,...,...
1649,SSWI000000023457314,1
1650,SSWI000000023457319,1
1651,SSWI000000023494506,1
1652,SSWI000000023514111,1


## Merge YOLO, Megadetector and ground truth dataframes

In [14]:
def merge_all(yolo_df, megadetector_df, ground_truth_df): 
    """
    Pkg dependencies: pandas 
    Purpose: 
    Inputs: YOLO pd.DataFrame, Megadetector pd.DataFrame, ground truth pd.DataFrame
    Outputs: Merged pd.DataFrame of YOLO, Megadetector and ground truth
    """
    
    ### FIRST OUTPUT ###
    # Merge all - The image id will repeat 3 times
    final_raw = megadetector_df.merge(yolo_df, left_on="image_id", right_on="image_id")
    final_raw.drop(columns=['event_id_y'], inplace=True)
    final_raw.rename(columns={'event_id_x': 'event_id'}, inplace=True)
    final_raw = ground_truth_df.merge(final_raw, left_on="TRIGGER_ID", right_on="event_id")
    final_raw.rename(columns={'count':'md_count', 'Total':'ground_truth_count', 'yolo':'md_bbox', 
                              'all_class_pred':'md_all_class_pred', 'all_conf':'md_all_conf', 'max_detection_conf': 'md_max_detection_conf'}, inplace=True)
    final_raw.sort_values(by="event_id").reset_index(drop=True)
    final_raw = final_raw[['event_id','CLASS_SPECIES','CLASS_SPECIES_RESTATED','md_all_class_pred','md_all_conf','md_max_detection_conf', \
               'md_bbox','yolo_bbox','ground_truth_count', 'md_count','yolo_count']]
    
    
    ### SECOND OUTPUT ###
    # Merge megadetector to YOLO by "image_id"
    final = megadetector_df.merge(yolo_df, left_on="image_id", right_on="image_id")
    final.drop(columns=["event_id_x", 'event_id_y'], inplace=True)
    final.loc[:, 'image_id'] = final['image_id'].apply(lambda x: x[:-1])
    
    # Group by imageid (there should be 3), take the max count across the imageid that compose the event
    gby_imageid = final.groupby(by='image_id').agg(['max'])
    counts_md = gby_imageid['count']
    counts_yolo = gby_imageid['yolo_count']
    
    # Merge ground truth to megadetector
    merged_md = ground_truth_df.merge(counts_md, left_on="TRIGGER_ID", right_on="image_id")
    merged_md.rename(columns={'max':'md_count'}, inplace=True)
    
    # Merge ground truth to yolo
    merged_yolo = ground_truth_df.merge(counts_yolo, left_on="TRIGGER_ID", right_on="image_id")
    merged_yolo.rename(columns={'max':'yolo_count'}, inplace=True)
    
    # Merge everything 
    merged_final = merged_yolo[['TRIGGER_ID', "CLASS_SPECIES", "Total", "CLASS_SPECIES_RESTATED", 'yolo_count']].merge(merged_md[['TRIGGER_ID','md_count']], left_on="TRIGGER_ID", right_on="TRIGGER_ID")
    merged_final = merged_final[['TRIGGER_ID', 'CLASS_SPECIES', "CLASS_SPECIES_RESTATED", "Total", "yolo_count", "md_count"]]
    merged_final.rename(columns={"Total":"ground_truth_count"}, inplace=True)
    merged_final.sort_values(by="TRIGGER_ID", inplace=True)
    
    # Differences across each of Choose 2 of 3
    merged_final['md_gt_diff'] = merged_final['md_count'] - merged_final['ground_truth_count']
    merged_final['yolo_gt_diff'] = merged_final['yolo_count'] - merged_final['ground_truth_count']
    merged_final['md_yolo_diff'] = merged_final['md_count'] - merged_final['yolo_count']
    
    return final_raw, merged_final

# Duplicates
# print(merged_final[merged_final.duplicated()].shape)

# ground_truth[ground_truth.duplicated()]
# ground_truth[ground_truth.TRIGGER_ID.duplicated(keep=False)].shape
# merged_final['Total'].sum()
# ground_truth[ground_truth.TRIGGER_ID.duplicated(keep=False)]

In [15]:
final_raw, merged_final = merge_all(yolov5, megadetector, ground_truth)

In [18]:
final_raw.shape

(5024, 11)

In [19]:
merged_final.shape

(1519, 9)

In [31]:
final_raw

Unnamed: 0,event_id,CLASS_SPECIES,CLASS_SPECIES_RESTATED,md_all_class_pred,md_all_conf,md_max_detection_conf,md_bbox,yolo_bbox,ground_truth_count,md_count,yolo_count
0,SSWI000000002741773,Porcupine,other,[],[],0.000,[],[],1,0,0
1,SSWI000000002741773,Porcupine,other,[],[],0.000,[],[],1,0,0
2,SSWI000000002741773,Porcupine,other,[1],[0.962],0.962,"[[0.1255, 0.5094, 0.1344, 0.08798]]",[],1,1,0
3,SSWI000000004032002,"Fox, Gray",foxgray_foxred,[1],[0.999],0.999,"[[0.5119, 0.728, 0.07965, 0.1622]]","[[0.5122, 0.7264, 0.0942, 0.1702]]",1,1,1
4,SSWI000000004032002,"Fox, Gray",foxgray_foxred,[1],[0.997],0.997,"[[0.4984, 0.683, 0.05721, 0.1576]]","[[0.497, 0.6763, 0.0699, 0.155]]",1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5019,3039286,blank,blank,[1],[0.9],0.900,"[[0.0305, 0.9459, 0.05864, 0.1038]]","[[0.041, 0.9362, 0.0821, 0.1277]]",0,1,1
5020,3039286,blank,blank,[1],[0.961],0.961,"[[0.0297, 0.9466, 0.05866, 0.1047]]","[[0.041, 0.9362, 0.0821, 0.1277]]",0,1,1
5021,2042734,blank,blank,[],[],0.000,[],[],0,0,0
5022,2042734,blank,blank,[],[],0.000,[],[],0,0,0
