In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to local modules (such as config.py, imported below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import cv2 as cv
import numpy as np
import pprint
import matplotlib.pyplot as plt
import os
import time
import multiprocessing as mp
# Default matplotlib figure size, as [width, height] in inches.
plt.rcParams['figure.figsize'] = [15, 10]
# Do not truncate the columns shown for a DataFrame, and let pandas work out
# the display width by itself.
pd.options.display.max_columns = None
pd.options.display.width=None

from config import folder_locs, interest_cols, rando_seed

# Instructions:
1. Put all your photos in the input_photos folder
2. Put the annotation file in the input_csvs folder
3. Set the input_csv_filename variable (in the User Inputs section below) to the name of the annotation file
4. Run all the cells (top menu: Run > Run All Cells)

# User Inputs

Enter the name of the CSV file.

If the file is an Excel file, please save it as a CSV file by:
+ opening the excel file
+ selecting File from the menu at the top
+ selecting Save As
+ selecting CSV UTF-8 from the drop-down menu
+ selecting Save

In [3]:
# Name of the annotation csv file; it is appended to folder_locs['in_files']
# further down to build the path that gets loaded.
input_csv_filename = 'groundtruth_with_HeadsTails_v2.csv'
# File extensions that mark a file as a photo; read by the is_file_a_photo and
# is_file_not_a_photo helpers defined later in the notebook.
photo_file_types = ['.jpg', '.jpeg', '.png']

# Code

creating the relative filepath for the annotation file

In [4]:
# Build the path to the annotation file. os.path.join is used instead of plain
# string concatenation so the path stays valid even when
# folder_locs['in_files'] is configured without a trailing separator.
input_csv_filepath = os.path.join(folder_locs['in_files'], input_csv_filename)
print(input_csv_filepath)

./input_csvs/groundtruth_with_HeadsTails_v2.csv


loading the annotation file as a dataframe

In [5]:
# Load the annotation file. Its columns are never referenced by literal name in
# this notebook; they are always looked up through the interest_cols mapping.
csv_df = pd.read_csv(input_csv_filepath)
# csv_df

getting all the unique labels in the annotation file

In [6]:
# Collect every distinct label once, in order of first appearance in the file.
label_column = interest_cols['label']
unique_labels = csv_df[label_column].unique()
print(unique_labels)

['Swordfish' 'Swordfish_Head' 'Swordfish_Tail' 'Bigeyetuna'
 'Bigeyetuna_Head' 'Bigeyetuna_Tail' 'Mahimahi' 'Mahimahi_Tail'
 'Mahimahi_Head' 'Makoshark' 'Yellowfintuna' 'Shortbillspearfish' 'Human'
 'Nofish' 'Albacore' 'Wahoo' 'IndoPacificsailfish' 'Stripedmarlin'
 'Oilfish' 'Skipjacktuna' 'Shark' 'Opah' 'Sicklepomfret' 'Greatbarracuda'
 'Blackmarlin' 'Unknownfish' 'roudiescolar' 'Longsnoutedlancetfish'
 'Threshershark' 'Pelagicstingray' 'Bluemarlin' 'Snakemackerel'
 'Rainbowrunner' 'Pomfret' 'Molamola' 'Escolar' 'Tuna' 'Emptysnap'
 'Floatline' 'Heavysnap' 'linesnap' 'Blueshark' 'Lazyline' 'Emptyhand'
 'Heavyhand' 'Floathand' 'DaggerPomfret' 'MarineDebris' 'Loggerhead'
 'BlueShark' 'BramaSpp' 'NoFish' 'StripedMarlin'
 'LongsnoutedlancetfishGills' 'Squid' 'Marlin' 'Gillsonly' 'Hardfloat'
 'BFA' 'Laysan' 'Roughpomfret' 'Bramaspp']


generating colors for the bounding boxes of the unique labels.  
a random seed has been set in the config.py file, under the scripts folder.  
this means that the colors generated for a certain annotation file will always be the same

In [7]:
# Draw three channel values (B, G, R) per label from the range [30, 225).
# Seeding first keeps the colours reproducible for a given annotation file.
np.random.seed(rando_seed)
channel_values = np.arange(start=30, stop=225)
n_channel_draws = 3 * len(unique_labels)
# Sampling without replacement only works while enough distinct channel values
# exist (195 values, i.e. at most 65 labels). Beyond that np.random.choice
# raises a ValueError, so fall back to sampling with replacement in that case
# only -- smaller label sets keep exactly the colours they had before.
rando_bgrs = np.random.choice(a=channel_values,
                              size=n_channel_draws,
                              replace=n_channel_draws > len(channel_values),
                              ).reshape(len(unique_labels), 3).tolist()

creating a dictionary with the unique labels as keys, and the bounding box colors as the values

In [8]:
# Pair each label with its BGR triple, stored as a tuple; unique_labels and
# rando_bgrs have the same length, so zip lines them up one-to-one.
label_bgrs = {a_label: tuple(bgr) for a_label, bgr in zip(unique_labels, rando_bgrs)}
pprint.pprint(label_bgrs)

{'Albacore': (197, 115, 86),
 'BFA': (213, 31, 82),
 'Bigeyetuna': (141, 214, 48),
 'Bigeyetuna_Head': (112, 39, 194),
 'Bigeyetuna_Tail': (147, 99, 143),
 'Blackmarlin': (76, 167, 120),
 'BlueShark': (119, 38, 43),
 'Bluemarlin': (173, 170, 127),
 'Blueshark': (107, 219, 193),
 'BramaSpp': (89, 196, 161),
 'Bramaspp': (117, 104, 151),
 'DaggerPomfret': (79, 110, 64),
 'Emptyhand': (73, 186, 33),
 'Emptysnap': (55, 53, 176),
 'Escolar': (155, 58, 70),
 'Floathand': (205, 182, 210),
 'Floatline': (172, 111, 138),
 'Gillsonly': (87, 190, 217),
 'Greatbarracuda': (216, 128, 32),
 'Hardfloat': (159, 67, 187),
 'Heavyhand': (135, 83, 163),
 'Heavysnap': (109, 114, 69),
 'Human': (49, 198, 103),
 'IndoPacificsailfish': (72, 95, 134),
 'Laysan': (179, 160, 181),
 'Lazyline': (77, 124, 207),
 'Loggerhead': (113, 201, 206),
 'Longsnoutedlancetfish': (63, 41, 154),
 'LongsnoutedlancetfishGills': (93, 84, 137),
 'Mahimahi': (222, 149, 153),
 'Mahimahi_Head': (188, 145, 97),
 'Mahimahi_Tail': (174

create a function to determine if a filename is a photo image.  
uses the extensions defined in the photo_file_types variable under the User Inputs section.

In [9]:
def is_file_a_photo(a_filename, file_types=None):
    """
    PARAMS:
    + str: a_filename
    + list: file_types (optional) extensions to accept, e.g. ['.jpg', '.png'].
      defaults to the notebook-level photo_file_types list.

    RETURNS:
    + str or None: a_filename when it ends with one of the extensions, else None

    DESC:
    checks the END of the filename, ignoring upper/lower case, so 'a.JPG' is a
    photo while 'notes.jpg.txt' is not (a plain substring test gets both wrong).
    anything that is not a string (e.g. NaN from an empty csv cell) is reported
    as not a photo instead of raising a TypeError.
    """
    if file_types is None:
        file_types = photo_file_types
    if not isinstance(a_filename, str):
        return None
    # str.endswith accepts a tuple of candidate suffixes
    photo_suffixes = tuple(a_type.lower() for a_type in file_types)
    if a_filename.lower().endswith(photo_suffixes):
        return a_filename
    return None

get all the files in the input_photos folder, keep only the images.

In [10]:
# List the input folder and keep only the entries recognised as photos.
# is_file_a_photo hands back the filename itself on a match, so filtering on
# its result gives the same list as collecting its return values.
files_in_input_photos = [
    a_file
    for a_file in os.listdir(folder_locs['in_pics'])
    if is_file_a_photo(a_file)
]
len(files_in_input_photos)

0

get all files in the output_photos folder.  
The script will assume that these photos have already been processed, and will not process them again.

In [11]:
# List the output folder and keep only the entries recognised as photos.
# is_file_a_photo hands back the filename itself on a match, so filtering on
# its result gives the same list as collecting its return values.
file_in_output_photos = [
    a_file
    for a_file in os.listdir(folder_locs['out_pics'])
    if is_file_a_photo(a_file)
]
len(file_in_output_photos)

0

create a dataframe for the results of the script

In [12]:
# Empty results table; later cells add one row per file with the outcome recorded for it.
results_df = pd.DataFrame(columns=['filename', 'result'])

add notes for files that have already been processed. 

In [13]:
# Record every photo already present in the output folder, all with the same note.
results_df = results_df.assign(
    filename=file_in_output_photos,
    result='already exists in output_photos',
)
# results_df

make a list of the files to process.   
This is every image in the input_photos folder that does not have a corresponding file in the output_photos folder.

In [14]:
# Photos that are in the input folder but not yet in the output folder.
input_photo_set = set(files_in_input_photos)
output_photo_set = set(file_in_output_photos)
files_to_process = list(input_photo_set - output_photo_set)
len(files_to_process)

0

make a list of photos that exist in the csv file but do NOT exist in the input_photos or output_photos folders.

In [15]:
# Every photo found on disk, in either the input or the output folder.
files_that_exists = list(set(files_in_input_photos + file_in_output_photos))
# Filenames the csv refers to that were not found on disk; only the ones that
# look like photos are kept (non-photo entries are reported separately).
csv_filename_set = set(csv_df[interest_cols['filename']].tolist())
missing_files = [
    a_file
    for a_file in csv_filename_set - set(files_that_exists)
    if is_file_a_photo(a_file)
]
len(missing_files)

123404

add the missing files to the results_df

In [16]:
# One results row per photo that the csv mentions but that is not on disk.
missing_rows = [{'filename': x, 'result':'missing file in input_photos'} for x in missing_files]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# add rows. Passing the column names keeps the frame well-formed even when
# missing_rows is empty.
results_df = pd.concat([results_df,
                        pd.DataFrame(missing_rows, columns=['filename', 'result'])],
                       ignore_index=True)
# results_df

get a list of all files that are not photos in the csv_df

In [17]:
def is_file_not_a_photo(a_filename, file_types=None):
    """
    PARAMS:
    + str: a_filename
    + list: file_types (optional) extensions that count as photos, e.g.
      ['.jpg', '.png']. defaults to the notebook-level photo_file_types list.

    RETURNS:
    + a_filename when it does NOT end with one of the extensions, else None

    DESC:
    checks the END of the filename, ignoring upper/lower case, so 'a.JPG' is a
    photo while 'notes.jpg.txt' is not (a plain substring test gets both wrong).
    anything that is not a string (e.g. NaN from an empty csv cell) cannot be a
    photo, so it is returned unchanged instead of raising a TypeError.
    """
    if file_types is None:
        file_types = photo_file_types
    if not isinstance(a_filename, str):
        return a_filename
    # str.endswith accepts a tuple of candidate suffixes
    photo_suffixes = tuple(a_type.lower() for a_type in file_types)
    if a_filename.lower().endswith(photo_suffixes):
        return None
    return a_filename

In [18]:
# Every filename entry in the csv that is not recognised as a photo
# (repeated entries are kept, one per csv row).
all_csv_filenames = csv_df[interest_cols['filename']].tolist()
not_photo_file = [a_file for a_file in all_csv_filenames if is_file_not_a_photo(a_file)]

add files that are not photos to results_df

In [19]:
# One results row per csv entry that is not a photo.
not_photo_rows = [{'filename': x, 'result':'file is not a photo'} for x in not_photo_file]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# add rows. Passing the column names keeps the frame well-formed even when
# not_photo_rows is empty.
results_df = pd.concat([results_df,
                        pd.DataFrame(not_photo_rows, columns=['filename', 'result'])],
                       ignore_index=True)
# results_df

In [20]:
# Photos waiting to be processed that have no row at all in the csv file.
missing_from_csv = list(set(files_to_process).difference(set(csv_df[interest_cols['filename']].tolist())))
csv_missing_rows = [{'filename': x, 'result':'photo is not in csv file'} for x in missing_from_csv]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# add rows. Passing the column names keeps the frame well-formed even when
# csv_missing_rows is empty.
results_df = pd.concat([results_df,
                        pd.DataFrame(csv_missing_rows, columns=['filename', 'result'])],
                       ignore_index=True)
# results_df

In [21]:
# Leave out the photos that have no csv rows; the rest can actually be annotated.
photos_without_rows = set(missing_from_csv)
files_to_process_2 = list(set(files_to_process) - photos_without_rows)
len(files_to_process_2)

0

subset the csv_df to contain only rows that have existing files in the input_photos folder

In [22]:
# Keep only the csv rows whose filename is in files_to_process_2. Indexing by
# filename and selecting with .loc returns the rows in the order of that list;
# reset_index then turns the filename back into a regular column.
# NOTE: this rebinds csv_df, so the full annotation table is no longer
# available under that name once this cell has run.
csv_df = csv_df.set_index(interest_cols['filename']).loc[files_to_process_2].reset_index()
# csv_df

create the function to annotate and save an image

In [23]:
def create_annotated_image(filename):
    """
    PARAMS:
    + str: filename (name of a photo inside folder_locs['in_pics'])

    RETURNS:
    + dict: {'filename': filename, 'result': 'success'} or
            {'filename': filename, 'result': 'failed'}

    DESC:
    reads the photo, draws one bounding box plus a filled label tag for every
    csv_df row that belongs to the filename (colour looked up in label_bgrs),
    and saves the annotated copy under the same name in folder_locs['out_pics'].
    an unreadable photo, a bad row (unknown label, non-numeric coordinate) or a
    failed save all give result 'failed' instead of raising, so one bad file
    cannot abort a whole parallel run.
    """
    failed_result = {'filename': filename, 'result': 'failed'}
    txt_color = (255, 255, 255) # the text color will be white

    try:
        # subset the dataframe for one photo
        cur_df = csv_df[csv_df[interest_cols['filename']] == filename]

        cur_filepath = folder_locs['in_pics'] + filename # create the filepath for the filename
        cur_img = cv.imread(cur_filepath)
        # cv.imread does not raise for a missing or unreadable file, it returns None
        if cur_img is None:
            return failed_result

        # bbox line thickness grows with the image dimensions, never below 2
        line_thickness = max(round(sum(cur_img.shape) / 2 * 0.003), 2)
        # text settings depend only on the image, so compute them once per photo
        font_thickness = max(line_thickness - 1, 1)
        font_scale = line_thickness / 3

        for _, row in cur_df.iterrows():

            cur_label = row[interest_cols['label']] # get the label of the row

            # get the color (BGR) for the label
            cur_col = label_bgrs[cur_label]

            # make rectangle xy coordinates
            x1 = int(row[interest_cols['x1']])
            y1 = int(row[interest_cols['y1']])
            x2 = int(row[interest_cols['x2']])
            y2 = int(row[interest_cols['y2']])
            box_corner_1, box_corner_2 = (x1, y1), (x2, y2)

            # draw the bounding box
            cv.rectangle(cur_img,
                         box_corner_1,
                         box_corner_2,
                         cur_col,
                         thickness=line_thickness,
                         lineType=cv.LINE_AA,
                        )

            # measure the label text so the filled tag behind it fits exactly
            w, h = cv.getTextSize(cur_label,
                                  0,
                                  fontScale=font_scale,
                                  thickness=font_thickness)[0]
            tag_corner = box_corner_1[0] + w, box_corner_1[1] - h - 3

            # filled tag (thickness -1) sitting on top of the first box corner
            cv.rectangle(cur_img,
                         box_corner_1,
                         tag_corner,
                         cur_col,
                         -1,
                         cv.LINE_AA,
                        )
            # insert text label
            cv.putText(cur_img,
                       cur_label,
                       (box_corner_1[0], box_corner_1[1] - 2),
                       0,
                       font_scale,
                       txt_color,
                       thickness=font_thickness,
                       lineType=cv.LINE_AA,
                      )

        out_filepath = folder_locs['out_pics'] + filename # create relative filepath of the image to be saved
        # cv.imwrite reports a failed save through its return value
        if not cv.imwrite(out_filepath, cur_img):
            return failed_result
        return {'filename': filename, 'result': 'success'}

    except Exception:
        # deliberately best-effort: the failure is reported, not raised.
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return failed_result
    

In [24]:
def mp_map_async_with_results(func,
                              obj):
    """
    PARAMS:
    + function: func
    + list: obj

    RETURNS:
    + list: res (one result per item, in the same order as obj)

    DESC:
    general function for parallel. takes a function, and a list of parameters,
    and applies the function to each item in the list.
    detects number of cores available, and uses about 75% of them, but never
    fewer than one worker.
    parallel application is asynchronous, so the items are not necessarily
    executed in list order, but the returned list follows the order of obj.
    an exception raised by func is re-raised here.

    """

    # int(cpu_count * 0.75) is 0 on a single-core machine, and Pool(0) raises
    n_processors = max(1, int(mp.cpu_count() * 0.75)) # use only 75% of available processors

    # the context manager shuts the workers down even when func raises
    with mp.Pool(n_processors) as pool:
        res = pool.map_async(func,
                             obj)
        # get() blocks until every item is processed, so all results are
        # collected before the pool is torn down
        return res.get()


create a list of filenames to be processed

In [25]:
# One entry per distinct filename left in csv_df (which an earlier cell cut
# down to the photos that can be processed).
filenames = csv_df[interest_cols['filename']].unique()
# Debugging aid: uncomment to run on the last 200 filenames only.
# filenames = filenames[-200::]
len(filenames)

0

process the files in parallel

In [26]:
%%time
# Annotate every photo in a worker pool. `results` holds the dict returned by
# create_annotated_image for each filename.
results = mp_map_async_with_results(create_annotated_image, filenames)

CPU times: user 8.07 ms, sys: 58.1 ms, total: 66.2 ms
Wall time: 70.8 ms


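serial alternative (disabled): iterate through each filename one at a time and get the result of the processing (either 'success' or 'failed'). The cell below is commented out because the parallel run above already produces `results`.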

In [27]:
# %%time
# results = []
# for i, filename in enumerate(filenames):
#     results.append(create_annotated_image(filename))

add the results of the functions to the results_df

In [28]:
# Turn the per-photo result dicts into rows and add them below the notes
# collected so far.
processed_df = pd.DataFrame(results)
results_df = pd.concat([results_df, processed_df], ignore_index=True)
# results_df

save the results_df as a csv file

In [29]:
# The current timestamp goes into the filename, so each run writes its own
# results file.
results_filename = 'results_' + str(time.time()) + '.csv'
results_df.to_csv(folder_locs['out_res'] + results_filename, index=False)