## Train-Val-Test Split

Steps:
For each class:
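* Get the list of image names, sort them by observation date (the `observed_on` metadata column, since the exact time is missing for some observations), and then split them 70-15-15 into train, validation and test.
* Collect the bounding boxes belonging to the train, val and test images respectively, and save each set to its own CSV file.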


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Number of bird species handled by this notebook (one entry per name in bird_names).
NUM_CLASSES = 10

bird_names = [
    "Black-naped Oriole",
    "Brown-throated Sunbird",
    "Collared Kingfisher",
    "Javan Myna",
    "Olive-backed Sunbird",
    "Pink-necked Green Pigeon",
    "Spotted Dove",
    "Striated Heron",
    "White-breasted Waterhen",
    "Yellow-vented Bulbul",
]

# Directory prefixes are plain strings ending in "/" because file names are
# appended to them by string concatenation.
bounding_box_dir = "data/Bounding Boxes/"
metadata_dir = "data/"

# Load "<bird name>.csv" from each directory. Position i in both lists
# corresponds to bird_names[i].
bounding_box_dfs = []
metadata_dfs = []
for bird_name in bird_names:
    csv_file_name = f"{bird_name}.csv"
    bounding_box_dfs.append(pd.read_csv(f"{bounding_box_dir}{csv_file_name}"))
    metadata_dfs.append(pd.read_csv(f"{metadata_dir}{csv_file_name}"))

In [3]:
# Preview the bounding-box table for bird_names[6] ("Spotted Dove").
bounding_box_dfs[6]

Unnamed: 0,label,top_left_x,top_left_y,width,height,image_name,image_width,image_height
0,Spotted Dove,14,78,102,76,Spotted Dove-0.png,224,224
1,Spotted Dove,89,111,52,59,Spotted Dove-0.png,224,224
2,Spotted Dove,71,83,73,47,Spotted Dove-1.png,224,224
3,Spotted Dove,28,124,99,70,Spotted Dove-10.png,224,224
4,Spotted Dove,93,82,74,58,Spotted Dove-100.png,224,224
...,...,...,...,...,...,...,...,...
101,Spotted Dove,69,78,77,45,Spotted Dove-99.png,224,224
102,Spotted Dove,53,97,171,74,Spotted Dove-136.png,224,224
103,Spotted Dove,132,30,43,64,Spotted Dove-138.png,224,224
104,Spotted Dove,96,94,47,52,Spotted Dove-139.png,224,224


In [4]:
# Preview the observation metadata for the same class, bird_names[6] ("Spotted Dove").
metadata_dfs[6]

Unnamed: 0.1,Unnamed: 0,index,id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,created_at,...,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
0,0,414,82387771,2021-05-29 18:41:31,2021-05-29,2021-05-29 10:41:31 UTC,Singapore,44170,gancw1,2021-06-10 04:58:31 UTC,...,,open,False,gps,gps,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
1,1,363,75020896,2021-04-24 2:11:09 PM GMT+08:00,2021-04-24,2021-04-24 06:11:09 UTC,Perth,4063353,gavin_n,2021-04-24 10:12:40 UTC,...,,open,False,gps,gps,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
2,2,299,69368323,2021/02/08 8:35 AM UTC,2021-02-08,2021-02-08 08:35:00 UTC,UTC,3947761,e0424873yingfeng21,2021-02-10 16:04:44 UTC,...,,open,False,,,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
3,3,187,60046018,2020-09-19 8:12:57 AM GMT+08:00,2020-09-19,2020-09-19 00:12:57 UTC,Singapore,162339,sohkamyung,2020-09-19 06:25:20 UTC,...,,open,False,gps,gps,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
4,4,515,93855625,2021/09/06 4:50 PM +0800,2021-09-06,2021-09-06 08:50:00 UTC,Singapore,4833518,patengpat,2021-09-06 14:19:25 UTC,...,,open,False,,,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,145,145,53444106,Fri Jul 17 2020 18:25:16 GMT+0800 (GMT+8),2020-07-17,2020-07-17 10:25:16 UTC,Australia/Perth,3174592,tiark,2020-07-18 05:59:48 UTC,...,,open,False,,,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
146,146,512,93693563,2021-08-23 10:20:39,2021-08-23,2021-08-23 02:20:39 UTC,Singapore,4884191,george_119,2021-09-05 09:36:16 UTC,...,,open,False,gps,gps,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
147,147,528,95504897,2021-09-20 12:05:50,2021-09-20,2021-09-20 04:05:50 UTC,Singapore,162339,sohkamyung,2021-09-20 04:55:14 UTC,...,,open,False,gps,gps,Spotted Dove,Streptopelia chinensis,Spotted Dove,Aves,144549
148,148,428,84033296,2021/06/22 8:23 PM UTC,2021-06-22,2021-06-22 20:23:00 UTC,UTC,4374994,sharikavbhat,2021-06-22 08:55:24 UTC,...,,open,False,,,珠頸斑鳩,Streptopelia chinensis,Spotted Dove,Aves,144549


In [5]:
# Per-class split sizes (70-15-15). Every class is expected to contribute exactly
# NUM_TRAIN_IMAGES + NUM_VAL_IMAGES + NUM_TEST_IMAGES distinct images; the
# assertions inside the loop enforce this.
NUM_TRAIN_IMAGES = 70
NUM_VAL_IMAGES = 15
NUM_TEST_IMAGES = 15

# Fixed seed so the final shuffle, and therefore the saved CSVs, are identical
# on every Restart & Run All.
SHUFFLE_SEED = 42

IMAGE_EXTENSION = ".png"


def image_number_from_name(image_name, bird_name):
    """Return the integer N from an image name of the form "<bird_name>-N.png".

    Raises:
        ValueError: if the name does not have that form or N is not an integer.
    """
    prefix = bird_name + "-"
    if not (image_name.startswith(prefix) and image_name.endswith(IMAGE_EXTENSION)):
        raise ValueError(
            f"Unexpected image name {image_name!r} for class {bird_name!r}"
        )
    return int(image_name[len(prefix):-len(IMAGE_EXTENSION)])


# Collect the per-class pieces and concatenate once at the end instead of
# growing a DataFrame inside the loop.
train_bbs_per_class = []
val_bbs_per_class = []
test_bbs_per_class = []

for i in range(NUM_CLASSES):
    bb_df = bounding_box_dfs[i]
    metadata_df = metadata_dfs[i]
    bird_name = bird_names[i]

    # Get the image numbers we are using (one per distinct image; an image can
    # have several bounding-box rows).
    image_numbers = [
        image_number_from_name(image_name, bird_name)
        for image_name in bb_df["image_name"].unique()
    ]

    # Sort by date observed on (because time can be missing for some observations).
    # The image number is used as the row label into the metadata table. Dates are
    # compared as strings, which is chronological for the YYYY-MM-DD values seen in
    # the metadata preview.
    image_numbers.sort(key=lambda image_num: str(metadata_df.at[image_num, "observed_on"]))

    # Convert back into name format
    image_names = [
        bird_name + "-" + str(image_num) + IMAGE_EXTENSION
        for image_num in image_numbers
    ]

    # Split into train-val-test: earliest observations go to train, latest to test.
    val_end = NUM_TRAIN_IMAGES + NUM_VAL_IMAGES
    train_image_names = image_names[:NUM_TRAIN_IMAGES]
    val_image_names = image_names[NUM_TRAIN_IMAGES:val_end]
    test_image_names = image_names[val_end:]

    assert len(train_image_names) == NUM_TRAIN_IMAGES, (
        f"{bird_name}: expected {NUM_TRAIN_IMAGES} train images, got {len(train_image_names)}"
    )
    assert len(val_image_names) == NUM_VAL_IMAGES, (
        f"{bird_name}: expected {NUM_VAL_IMAGES} val images, got {len(val_image_names)}"
    )
    assert len(test_image_names) == NUM_TEST_IMAGES, (
        f"{bird_name}: expected {NUM_TEST_IMAGES} test images, got {len(test_image_names)}"
    )

    # Select every bounding-box row whose image falls in the given split.
    train_bbs = bb_df.loc[bb_df["image_name"].isin(train_image_names)]
    val_bbs = bb_df.loc[bb_df["image_name"].isin(val_image_names)]
    test_bbs = bb_df.loc[bb_df["image_name"].isin(test_image_names)]

    train_bbs_per_class.append(train_bbs)
    val_bbs_per_class.append(val_bbs)
    test_bbs_per_class.append(test_bbs)

# Combine all classes, then shuffle the rows reproducibly.
train_bb_all = (
    pd.concat(train_bbs_per_class)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
val_bb_all = (
    pd.concat(val_bbs_per_class)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
test_bb_all = (
    pd.concat(test_bbs_per_class)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Ensure that there is no leakage between the training, validation and test sets:
# no image name may appear in more than one split.
train_images = set(train_bb_all["image_name"])
val_images = set(val_bb_all["image_name"])
test_images = set(test_bb_all["image_name"])

assert train_images.isdisjoint(val_images), "train and val sets share images"
assert train_images.isdisjoint(test_images), "train and test sets share images"
assert val_images.isdisjoint(test_images), "val and test sets share images"

# Save these new bounding box files
train_bb_all.to_csv(bounding_box_dir + "train_bbs.csv", index=False)
val_bb_all.to_csv(bounding_box_dir + "val_bbs.csv", index=False)
test_bb_all.to_csv(bounding_box_dir + "test_bbs.csv", index=False)

### Sanity Checks

In [6]:
# metadata_df and image_numbers are left over from the last iteration of the
# split loop, so this only inspects the final class that was processed.
print(metadata_df.loc[image_numbers,"observed_on"]) # Should be in increasing date order

6      2020-03-16
85     2020-03-29
10     2020-04-06
136    2020-05-03
46     2020-05-10
          ...    
78     2022-02-10
69     2022-02-12
87     2022-02-20
94     2022-02-21
44     2022-02-27
Name: observed_on, Length: 100, dtype: object


In [7]:
# Train image names of the last class processed by the split loop
# (the first 70 entries of its date-sorted image list).
train_image_names # Should correspond to the image_numbers in the same order

['Yellow-vented Bulbul-6.png',
 'Yellow-vented Bulbul-85.png',
 'Yellow-vented Bulbul-10.png',
 'Yellow-vented Bulbul-136.png',
 'Yellow-vented Bulbul-46.png',
 'Yellow-vented Bulbul-100.png',
 'Yellow-vented Bulbul-5.png',
 'Yellow-vented Bulbul-108.png',
 'Yellow-vented Bulbul-116.png',
 'Yellow-vented Bulbul-80.png',
 'Yellow-vented Bulbul-95.png',
 'Yellow-vented Bulbul-30.png',
 'Yellow-vented Bulbul-28.png',
 'Yellow-vented Bulbul-107.png',
 'Yellow-vented Bulbul-68.png',
 'Yellow-vented Bulbul-99.png',
 'Yellow-vented Bulbul-32.png',
 'Yellow-vented Bulbul-33.png',
 'Yellow-vented Bulbul-113.png',
 'Yellow-vented Bulbul-47.png',
 'Yellow-vented Bulbul-65.png',
 'Yellow-vented Bulbul-115.png',
 'Yellow-vented Bulbul-2.png',
 'Yellow-vented Bulbul-31.png',
 'Yellow-vented Bulbul-26.png',
 'Yellow-vented Bulbul-98.png',
 'Yellow-vented Bulbul-24.png',
 'Yellow-vented Bulbul-13.png',
 'Yellow-vented Bulbul-3.png',
 'Yellow-vented Bulbul-70.png',
 'Yellow-vented Bulbul-111.png',
 'Ye

In [8]:
# Test image names of the last class processed by the split loop
# (the tail of its date-sorted image list, from position 85 onwards).
test_image_names

['Yellow-vented Bulbul-114.png',
 'Yellow-vented Bulbul-103.png',
 'Yellow-vented Bulbul-91.png',
 'Yellow-vented Bulbul-110.png',
 'Yellow-vented Bulbul-117.png',
 'Yellow-vented Bulbul-29.png',
 'Yellow-vented Bulbul-132.png',
 'Yellow-vented Bulbul-59.png',
 'Yellow-vented Bulbul-101.png',
 'Yellow-vented Bulbul-36.png',
 'Yellow-vented Bulbul-78.png',
 'Yellow-vented Bulbul-69.png',
 'Yellow-vented Bulbul-87.png',
 'Yellow-vented Bulbul-94.png',
 'Yellow-vented Bulbul-44.png']

In [9]:
# (rows, columns) of each combined split. Rows are bounding boxes, not images,
# so the counts can exceed the number of images per split.
print(train_bb_all.shape, val_bb_all.shape, test_bb_all.shape)

(749, 8) (163, 8) (159, 8)


In [10]:
# Display the shuffled training bounding boxes combined across all classes.
train_bb_all

Unnamed: 0,label,top_left_x,top_left_y,width,height,image_name,image_width,image_height
0,Brown-throated Sunbird,23,83,138,73,Brown-throated Sunbird-6.png,224,224
1,Brown-throated Sunbird,74,100,34,50,Brown-throated Sunbird-39.png,224,224
2,Olive-backed Sunbird,45,80,80,102,Olive-backed Sunbird-64.png,224,224
3,Brown-throated Sunbird,118,84,44,77,Brown-throated Sunbird-83.png,224,224
4,Olive-backed Sunbird,53,52,111,138,Olive-backed Sunbird-124.png,224,224
...,...,...,...,...,...,...,...,...
744,Collared Kingfisher,104,66,47,81,Collared Kingfisher-36.png,224,224
745,Pink-necked Green Pigeon,169,101,23,30,Pink-necked Green Pigeon-53.png,224,224
746,Brown-throated Sunbird,84,26,80,108,Brown-throated Sunbird-48.png,224,224
747,Collared Kingfisher,84,64,64,112,Collared Kingfisher-105.png,224,224
