In [None]:
# Import necessary libraries and modules

import pandas as pd
import numpy as np
import json
import os
from random import sample
import shutil
import glob
from google.cloud import storage

In [None]:
# Increase the display size in order to prevent truncation later on

pd.set_option('display.max_rows', 200)

In [None]:
# Function to read in json metadata for the chosen datasets as data frame

def metadata_to_df(file, dataset):
    """Flatten one JSON metadata file into a dataframe with one row per annotation.

    Parameters
    ----------
    file : str
        Path to a JSON file whose top level holds the lists 'images',
        'annotations' and 'categories'.
    dataset : str
        Label written to the ``Dataset`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Indexed by image ``id``. Columns are ``category_id``, ``file_name``,
        whatever category columns survive the clean-up below, and ``Dataset``.
    """
    with open(file) as json_data:
        json_dict = json.load(json_data)

    df_images = pd.DataFrame(json_dict['images']).set_index('id')

    # Key the annotations by the image they belong to so both frames share an
    # index level called 'id'; the annotation's own id is not needed.
    df_annot = (
        pd.DataFrame(json_dict['annotations'])
        .set_index('image_id')
        .drop(['id'], axis=1)
    )
    df_annot.index.name = 'id'

    df = pd.merge(df_annot, df_images, how='left', on='id')
    df = df[['category_id', 'file_name']]

    # Optional taxonomy/count columns differ between files; drop whichever exist.
    df_cats = pd.DataFrame(json_dict['categories']).drop(
        columns=['count', 'species', 'genus', 'family',
                 'ord', 'class', 'common name'],
        errors='ignore',
    )
    df_cats = df_cats.rename(columns={'id': 'category_id'})

    # No explicit key: the merge uses the columns both frames share.
    df = df.reset_index().merge(df_cats, how="left").set_index('id')
    df["Dataset"] = dataset
    return df

In [None]:
# Use function to create dataframe for each dataset

# One metadata_to_df call per source: the first argument is the metadata JSON
# (resolved relative to the working directory), the second is the label stored
# in that frame's "Dataset" column. The same labels are used further down to
# split the combined frame again, so they must stay in sync with those filters.
caltech_df = metadata_to_df('caltech_images_20210113.json', 'Caltech')
island_df = metadata_to_df('island_conservation.json', 'Island')
missouri_df = metadata_to_df("missouri_camera_traps_set1.json", 'Missouri')
camdeboo_df = metadata_to_df("SnapshotCamdeboo_S1_v1.0.json", 'Camdeboo')
wcs_df = metadata_to_df("wcs_camera_traps.json", "WCS")
ena24_df = metadata_to_df("ena24.json", 'ENA24')
wellington_df = metadata_to_df("wellington_camera_traps.json", 'Wellington')
karoo_df = metadata_to_df('SnapshotKaroo_S1_v1.0.json', "Karoo")
kgalagai_df = metadata_to_df('SnapshotKgalagai_S1_v1.0.json', "Kgalagai")
enonkishu_df = metadata_to_df('SnapshotEnonkishu_S1_v1.0.json', "Enonkishu")
mountain_zebra_df = metadata_to_df('SnapshotMountainZebra_S1_v1.0.json',
                             'Mountain Zebra')
kruger_df = metadata_to_df('SnapshotKruger_S1_v1.0.json', 'Kruger')
nacti_df = metadata_to_df('nacti_metadata.json', 'NACTI')
serengeti_df = metadata_to_df('SnapshotSerengeti_S1-11_v2.1.json',
                        'Serengeti')

In [None]:
# Combine resultant dataframes

complete_df = pd.concat([caltech_df, island_df, missouri_df,
                         camdeboo_df, wcs_df, ena24_df, wellington_df,
                         karoo_df, kgalagai_df, enonkishu_df,
                         mountain_zebra_df, kruger_df, nacti_df,
                         serengeti_df])

In [None]:
# Load the corrected species/subfamilies/families information

names_df = pd.read_csv("names_species_families.csv")

In [None]:
# Rename column to match with complete_df

names_df = names_df.rename(columns={'Label':'name'})

In [None]:
# Merge data frames

complete_df = complete_df.reset_index().merge(names_df,  how='left', on='name').set_index('id')

In [None]:
# Check how many images, species, subfamilies and families are contained in dataframe

complete_df.nunique()

In [None]:
# Check how many species have at least 1,000 images

species_counts = complete_df['Species'].value_counts()
highest_species_counts = species_counts[species_counts >= 1000]
len(highest_species_counts)

In [None]:
# Drop species that are either too vague to categorise (e.g. 'Bird') or not animals ('human', 'motorcycle', etc.)

highest_species_counts = highest_species_counts.drop(labels=['Bird', 'Car', 'Deer',
                            'Domestic animal', 'Human', 'Motorcycle', 'Petrel',
                            'Rat', 'Rodent', 'Unknown'])

In [None]:
# Create new dataframe of the species that will be classified

subset_df = complete_df[complete_df['Species'].isin(highest_species_counts.index)]

In [None]:
# Create separate dataframes for each dataset. These will be used to select the images

# NOTE: these assignments rebind the names that held the raw per-dataset
# metadata frames loaded earlier; from here on each name refers to the rows of
# subset_df (species kept for classification) carrying that "Dataset" label.
kgalagai_df = subset_df[subset_df["Dataset"] == "Kgalagai"]
caltech_df = subset_df[subset_df["Dataset"] == "Caltech"]
island_df = subset_df[subset_df["Dataset"] == "Island"]
missouri_df = subset_df[subset_df["Dataset"] == "Missouri"]
camdeboo_df = subset_df[subset_df["Dataset"] == "Camdeboo"]
wcs_df = subset_df[subset_df["Dataset"] == "WCS"]
ena24_df = subset_df[subset_df["Dataset"] == "ENA24"]
wellington_df = subset_df[subset_df["Dataset"] == "Wellington"]
karoo_df = subset_df[subset_df["Dataset"] == "Karoo"]
enonkishu_df = subset_df[subset_df["Dataset"] == "Enonkishu"]
mountain_zebra_df = subset_df[subset_df["Dataset"] == "Mountain Zebra"]
kruger_df = subset_df[subset_df["Dataset"] == "Kruger"]
nacti_df = subset_df[subset_df["Dataset"] == "NACTI"]
serengeti_df = subset_df[subset_df["Dataset"] == "Serengeti"]

In [None]:
# Create a dataframe showing number of species images in each dataset

# One row per (Species, Dataset) pair with the number of matching images.
species_per_dataset = (
    subset_df
    .groupby(['Species', 'Dataset'])
    .size()
    .reset_index(name='Count')
)

In [None]:
# Remove Missouri from dataframe because this will be used as out-of-sample data

# .copy() makes this an independent frame: later cells add and modify columns
# on it, which on a bare filtered slice triggers SettingWithCopyWarning.
species_per_dataset = species_per_dataset[species_per_dataset['Dataset'] != 'Missouri'].copy()

In [None]:
# Add a column which will indicate how many images to use

species_per_dataset['Selection'] = species_per_dataset['Count']

In [None]:
# Change the selection value to maximum of 1,000 divided by the number of datasets containing that species

list_of_species = list(species_per_dataset['Species'].unique())

# Equal share of the 1,000-image target for every dataset that contains the
# species (integer division matches the original int(1000/length)).
datasets_per_species = species_per_dataset.groupby('Species')['Dataset'].transform('size')
share_per_dataset = 1000 // datasets_per_species

# A dataset contributes its equal share, or everything it has if that is less.
species_per_dataset['Selection'] = np.minimum(species_per_dataset['Selection'], share_per_dataset)

In [None]:
# Add a column showing the total images selected so far per species, plus column for images left to choose

# Total: images selected so far for the row's species, summed over all datasets.
# transform('sum') broadcasts the per-species sum back onto every row and only
# touches the 'Selection' column (no aggregation of the string columns).
species_per_dataset['Total'] = species_per_dataset.groupby('Species')['Selection'].transform('sum')
# Remaining: images in this (species, dataset) pair that are not yet selected.
species_per_dataset['Remaining'] = species_per_dataset['Count'] - species_per_dataset['Selection']

In [None]:
# Get new list based on species where less than 1000 images have been selected
# Species whose running total is not yet exactly 1,000
new_list_of_species = list(species_per_dataset[species_per_dataset["Total"] != 1000]['Species'].unique())
# Repeat above process to select more images
for species in new_list_of_species:
    # Number of datasets for this species that still have unselected images
    length = sum(species_per_dataset[species_per_dataset["Remaining"] != 0]['Species'] == species)
    # Row labels of those datasets (rows with Remaining != 0 for this species)
    index = species_per_dataset[species_per_dataset["Remaining"] != 0][species_per_dataset[species_per_dataset["Remaining"] != 0]['Species'] == species].index
    for n in index:
        # Add an equal (truncated) share of the shortfall, limited by what the
        # dataset has left. 'Total' and 'Remaining' are the values from before
        # this loop; they are only refreshed in the loop below.
        species_per_dataset.loc[n,'Selection'] += min(int((1000 - species_per_dataset.loc[n, 'Total'])/length), species_per_dataset.loc[n,'Remaining'])
# Refresh the bookkeeping columns for every species (not only the topped-up ones)
for species in list_of_species:
    species_per_dataset.loc[species_per_dataset['Species'] == species, 'Total'] =  sum(species_per_dataset.loc[species_per_dataset['Species'] == species, 'Selection'])
    species_per_dataset.loc[species_per_dataset["Species"] == species, "Remaining"] = species_per_dataset.loc[species_per_dataset["Species"] == species, "Count"] - species_per_dataset.loc[species_per_dataset["Species"] == species,"Selection"]

In [None]:
# Repeat entire process
# Second top-up pass, same statements as the previous cell: recompute which
# species are still not at exactly 1,000 after the first pass.
new_list_of_species = list(species_per_dataset[species_per_dataset["Total"] != 1000]['Species'].unique())
for species in new_list_of_species:
    # Datasets for this species that still have unselected images
    length = sum(species_per_dataset[species_per_dataset["Remaining"] != 0]['Species'] == species)
    index = species_per_dataset[species_per_dataset["Remaining"] != 0][species_per_dataset[species_per_dataset["Remaining"] != 0]['Species'] == species].index
    for n in index:
        # Equal truncated share of the shortfall, limited by the images left
        species_per_dataset.loc[n,'Selection'] += min(int((1000 - species_per_dataset.loc[n, 'Total'])/length), species_per_dataset.loc[n,'Remaining'])
# Refresh 'Total' and 'Remaining' for every species
for species in list_of_species:
    species_per_dataset.loc[species_per_dataset['Species'] == species, 'Total'] =  sum(species_per_dataset.loc[species_per_dataset['Species'] == species, 'Selection'])
    species_per_dataset.loc[species_per_dataset["Species"] == species, "Remaining"] = species_per_dataset.loc[species_per_dataset["Species"] == species, "Count"] - species_per_dataset.loc[species_per_dataset["Species"] == species,"Selection"]

In [None]:
# Make manual adjustments to final few counts in order to get total to 1,000. 

# Hand-tuned 'Selection' values keyed by row label of species_per_dataset.
# The labels are tied to the exact row order of that table.
manual_selection = {
    19: 148, 20: 148, 21: 148, 23: 148,
    36: 244,
    46: 479,
    54: 251,
    69: 334,
    74: 428,
    90: 355,
    100: 162, 101: 162, 102: 162, 104: 162,
    108: 84, 109: 84, 110: 84, 111: 84,
    121: 216, 123: 216, 130: 216,
    133: 306, 134: 306,
    142: 334,
    159: 202, 160: 202, 161: 202,
    163: 195, 165: 195, 167: 195,
    186: 178, 188: 178, 191: 178, 192: 178,
    195: 230,
    230: 238,
    253: 427,
    255: 166, 256: 166,
    293: 334,
    312: 334,
    315: 306,
    323: 267, 326: 267,
    330: 377,
    346: 143, 347: 143, 348: 143, 349: 143, 350: 143, 351: 143,
}
for row_label, selection in manual_selection.items():
    species_per_dataset.loc[row_label, 'Selection'] = selection

In [None]:
# Check that all totals are now 1,000

species_per_dataset.groupby(['Species']).sum()

In [None]:
# Second check to confirm we have exactly 1,000 of all species:

species_per_dataset.groupby(['Species']).sum()['Selection'].unique()

In [None]:
# Function to select random images based on the quantities defined in the species_per_dataset dataframe

def choose_random_images(dataset, dataframe, random_state=None):
    """Randomly pick the planned number of images per species from one dataset.

    The number to pick is read from the 'Selection' column of the module-level
    ``species_per_dataset`` table, in the row matching (species, ``dataset``).

    Parameters
    ----------
    dataset : str
        Dataset label exactly as it appears in ``species_per_dataset['Dataset']``.
    dataframe : pandas.DataFrame
        Candidate images of that dataset; must have a 'Species' column.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a value for a reproducible draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (original index kept). Empty, with the columns of
        ``dataframe``, when no species applies.

    Raises
    ------
    KeyError
        If a species present in ``dataframe`` has no row for ``dataset`` in
        ``species_per_dataset`` (typically a mistyped dataset label).
    """
    quotas = species_per_dataset[species_per_dataset['Dataset'] == dataset]
    available_species = set(dataframe['Species'].unique())

    picked = []
    for species in species_per_dataset['Species'].unique():
        if species not in available_species:
            continue
        quota = quotas.loc[quotas['Species'] == species, 'Selection']
        if quota.empty:
            raise KeyError(
                f"no selection quota for species {species!r} in dataset {dataset!r}"
            )
        # .iloc[0] instead of int(Series): the latter is deprecated in pandas.
        qty = int(quota.iloc[0])
        candidates = dataframe[dataframe['Species'] == species]
        picked.append(candidates.sample(qty, random_state=random_state))

    if not picked:
        return pd.DataFrame(columns=dataframe.columns)
    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(picked)

In [None]:
# Create new dataframes containing only information on the images to be used in the project

# Each frame is paired (by position) with the label stored in its "Dataset"
# column; choose_random_images uses that label to look up the quota, so the
# spelling must match exactly ('ENA24', not 'ENA 24').
df_list = [kgalagai_df, caltech_df, island_df, camdeboo_df, wcs_df, ena24_df, wellington_df,
           karoo_df, enonkishu_df, mountain_zebra_df, kruger_df, nacti_df, serengeti_df]
name_list = ['Kgalagai', 'Caltech', 'Island', 'Camdeboo', 'WCS', 'ENA24', 'Wellington', 'Karoo', 'Enonkishu',
             'Mountain Zebra', 'Kruger', 'NACTI', 'Serengeti']
subset_list = [choose_random_images(label, frame) for label, frame in zip(name_list, df_list)]
(kgalagai_subset_df, caltech_subset_df, island_subset_df, camdeboo_subset_df, wcs_subset_df,
 ena24_subset_df, wellington_subset_df, karoo_subset_df, enonkishu_subset_df,
 mountain_zebra_subset_df, kruger_subset_df, nacti_subset_df, serengeti_subset_df) = subset_list

In [None]:
# Merge new dataframes

image_selection_df = pd.concat([kgalagai_subset_df, caltech_subset_df,
                            island_subset_df, camdeboo_subset_df, wcs_subset_df,
                            ena24_subset_df, wellington_subset_df, karoo_subset_df,
                            enonkishu_subset_df, mountain_zebra_subset_df,
                            kruger_subset_df, nacti_subset_df, serengeti_subset_df])

In [None]:
# Check length matches expected number (112,000)

len(image_selection_df.index)

In [None]:
# Function to pull out the file names to be downloaded (using AzCopy)

def images_to_download(dataframe):
    """Return every value of the 'file_name' column joined into one ';'-separated string."""
    return ';'.join(dataframe['file_name'].tolist())

In [None]:
# Get lists of images to be downloaded 

# One ';'-separated file list per dataset, built from the sampled subset frames.
kgalagai_images = images_to_download(kgalagai_subset_df)
caltech_images = images_to_download(caltech_subset_df)
island_images = images_to_download(island_subset_df)
camdeboo_images = images_to_download(camdeboo_subset_df)
wcs_images = images_to_download(wcs_subset_df)  # was the undefined name wcs_dsubset_f
ena24_images = images_to_download(ena24_subset_df)
wellington_images = images_to_download(wellington_subset_df)
karoo_images = images_to_download(karoo_subset_df)
enonkishu_images = images_to_download(enonkishu_subset_df)
mountain_zebra_images = images_to_download(mountain_zebra_subset_df)
kruger_images = images_to_download(kruger_subset_df)
nacti_images = images_to_download(nacti_subset_df)
serengeti_images = images_to_download(serengeti_subset_df)

In [None]:
# Dataframe contains some duplicates (two or more entries for same image). Need to drop these:

image_selection_df = image_selection_df.drop_duplicates(subset='file_name')

In [None]:
len(image_selection_df.index)

In [None]:
image_selection_df['Species'].value_counts()

In [None]:
# Function to create training, validation and test datasets (with random choices)

def choose_splits(dataframe, random_state=None):
    """Randomly split ``dataframe`` into training, validation and test frames per species.

    For each distinct non-null 'Species' value, int(80%) of its rows go to
    training and int(10%) to validation. Every remaining row of ``dataframe``
    goes to the test frame.

    The split is done on row positions, so it depends only on ``dataframe``
    (not on notebook globals) and rows with identical content are kept rather
    than cancelled out.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Species' column.
    random_state : optional
        Seed for ``numpy.random.default_rng``; pass a value for a reproducible split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, validation_df, test_df)``. Training and validation rows are
        grouped by species; the test frame keeps the original row order.
    """
    rng = np.random.default_rng(random_state)
    species_values = dataframe['Species']

    train_parts, validation_parts = [], []
    for species in species_values.dropna().unique():
        positions = np.flatnonzero((species_values == species).to_numpy())
        shuffled = rng.permutation(positions)
        n_train = int(len(shuffled) * .8)
        n_validation = int(len(shuffled) * .1)
        train_parts.append(shuffled[:n_train])
        validation_parts.append(shuffled[n_train:n_train + n_validation])

    no_rows = np.array([], dtype=int)
    train_pos = np.concatenate(train_parts) if train_parts else no_rows
    validation_pos = np.concatenate(validation_parts) if validation_parts else no_rows

    # Everything not claimed by training or validation is test data.
    is_test = np.ones(len(dataframe), dtype=bool)
    is_test[train_pos] = False
    is_test[validation_pos] = False

    return dataframe.iloc[train_pos], dataframe.iloc[validation_pos], dataframe[is_test]

In [None]:
# Create folders for training, validation and test datasets on local disk

path = "F:\\"
training_path = os.path.join(path, "training")
validation_path = os.path.join(path, "validation")
test_path = os.path.join(path, "test")
path_list = [training_path, validation_path, test_path]
# Separate loop variable so the base `path` is not overwritten by the loop;
# exist_ok=True makes the cell safe to re-run.
for folder in path_list:
    os.makedirs(folder, exist_ok=True)

In [None]:
# List of species to be used

species_list = list(image_selection_df['Species'].unique())

In [None]:
# Create folders in each of the training, validation and test folders
# One folder for each species in species list

train_path = "F:\\training"
validation_path = "F:\\validation"
test_path = "F:\\test"

# exist_ok=True: re-running the cell must not fail on folders that already exist.
for species in species_list:
    os.makedirs(os.path.join(train_path, species), exist_ok=True)
    os.makedirs(os.path.join(validation_path, species), exist_ok=True)
    os.makedirs(os.path.join(test_path, species), exist_ok=True)

In [None]:
# Dictionary mapping datasets to their current location on the local drive

# Raw strings throughout: in a normal literal, sequences such as "\S" or "\W"
# are invalid escapes (a warning on recent Python versions) and only yield the
# intended path by accident. The resulting values are unchanged.
path_dict = {
    'Serengeti': r'F:\Serengeti\snapshotserengeti-unzipped',
    'WCS': r'F:\WCS\wcs-unzipped',
    'Enonkishu': r'F:\Enonkishu\ENO_public',
    'Camdeboo': r'F:\Camdeboo\CDB_public',
    'Mountain Zebra': r'F:\Mountain Zebra\MTZ_public',
    'Kgalagai': r'F:\Kgalagai\KGA_public',
    'Kruger': r'F:\Kruger\KRU_public',
    'ENA24': r'F:\ENA24\images',
    'Island': r'F:\Island\public',
    'Wellington': r'F:\Wellington\images',
    'Caltech': r'F:\Caltech\cct_images',
    'Karoo': r'F:\Karoo\KAR_public',
    'NACTI': r'F:\NACTI\nacti-unzipped',
}

In [1]:
# On inspection, it is clear that files belonging to the WCS dataset often share the same name. The initial
# download placed them in different folders (Part 0, Part 1, etc). However, they will potentially be placed
# in the same folder when they are arranged into training, validation and test sets. Therefore, it is necessary
# to modify the file names in order to avoid overwriting existing files and mixing up labels.

In [None]:
# Begin by adding a column with a unique number for each file (as a string)

image_selection_df['unique_id'] = np.arange(image_selection_df.shape[0])
image_selection_df['unique_id'] = image_selection_df['unique_id'].astype(str)

In [None]:
# Add another column to dataframe, this time consisting of the file name without "jpg"

# os.path.splitext removes only a real extension of the final path component.
# Slicing at rfind(".") would chop the last character of a name without any
# dot (rfind returns -1) and would cut at a dot inside a folder name.
modification = [os.path.splitext(file)[0] for file in image_selection_df['file_name']]
image_selection_df['file_name_modified'] = modification

In [None]:
# Add the unique number onto the modified file name, plus '.jpg'

# Only WCS rows get the "_<unique_id>.jpg" suffix; other rows are left as they are.
is_wcs = image_selection_df["Dataset"] == 'WCS'
wcs_stems = image_selection_df.loc[is_wcs, "file_name_modified"]
wcs_ids = image_selection_df.loc[is_wcs, "unique_id"]
image_selection_df.loc[is_wcs, "file_name_modified"] = wcs_stems + "_" + wcs_ids + ".jpg"

In [None]:
# Create dataframe of just the WCS dataset images

wcs_new = image_selection_df[image_selection_df['Dataset'] == 'WCS']

In [None]:
# Find the files (using the original file name) and rename using the modified filename

# file_name is unique at this point (duplicates were dropped earlier), so
# pairing the two columns row by row matches the original per-file lookup
# without rescanning the whole frame for every file.
rename_failures = []
for old_relative, new_relative in zip(wcs_new['file_name'], wcs_new['file_name_modified']):
    old_name = os.path.join(path_dict['WCS'], old_relative)
    new_name = os.path.join(path_dict['WCS'], new_relative)
    try:
        os.rename(old_name, new_name)
    except OSError as err:
        # Still best-effort (e.g. file missing or already renamed on a re-run),
        # but only filesystem errors are tolerated and they are recorded.
        rename_failures.append((old_name, err))
print(f"{len(rename_failures)} of {len(wcs_new)} WCS files could not be renamed")

In [None]:
# Can now correct the file name column in the main dataframe

image_selection_df.loc[image_selection_df['Dataset'] == 'WCS', 
        'file_name'] = image_selection_df.loc[image_selection_df['Dataset'] == 'WCS', 'file_name_modified']

In [None]:
# Use the earlier defined function to create the training, validation and test splits

train_df, validation_df, test_df = choose_splits(image_selection_df)

In [None]:
# Function to copy files from their downloaded location into the correct folder in the
# appropriate dataset (training, validation or test)

def file_copier(dataframe, path):
    """Copy every image listed in ``dataframe`` into ``path/<Species>/``.

    Source files are located as ``path_dict[Dataset]/file_name`` using the
    module-level ``path_dict``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns 'Species', 'Dataset' and 'file_name'.
    path : str
        Destination root; one sub-folder per species is used (created if missing).

    Notes
    -----
    Copying is best-effort: files that cannot be copied are skipped and a
    one-line summary is printed. Rows with a missing 'Species' are ignored.
    """
    failed = []
    # Species come from the frame itself rather than from a notebook global.
    for species in dataframe['Species'].dropna().unique():
        species_rows = dataframe[dataframe['Species'] == species]
        new_location = os.path.join(path, species)
        # Without the folder, shutil.copy2 would write a *file* named after the species.
        os.makedirs(new_location, exist_ok=True)
        for dataset in species_rows['Dataset'].unique():
            source_root = path_dict[dataset]
            for file in species_rows.loc[species_rows['Dataset'] == dataset, 'file_name']:
                file_to_locate = os.path.join(source_root, file)
                try:
                    shutil.copy2(file_to_locate, new_location)
                except OSError as err:
                    failed.append((file_to_locate, err))
    if failed:
        print(f"file_copier: {len(failed)} file(s) could not be copied to {path}")

In [None]:
file_copier(train_df, "F:\\training")

In [None]:
file_copier(validation_df, "F:\\validation")

In [None]:
file_copier(test_df, "F:\\test")

In [2]:
# Can now move on to creating the family-level datasets

In [None]:
# Check for and locate any entries without a family value

image_selection_df['Species'][image_selection_df['Family'].isna()].unique()

In [None]:
# Get index of missing value

image_selection_df['Family'][image_selection_df['Species'] == 'Porcupine'][image_selection_df['Family'][image_selection_df['Species'] == 'Porcupine'].isna()]

In [None]:
# Correct missing family value

# Select the affected rows by condition instead of a hard-coded index label:
# the image sample is random, and assigning through .loc with a label that is
# not in the index would silently append a new, otherwise empty row.
porcupine_without_family = (image_selection_df['Species'] == 'Porcupine') & image_selection_df['Family'].isna()
image_selection_df.loc[porcupine_without_family, 'Family'] = 'Hystricidae'

In [None]:
# Set subfamily and family to 'None' for empty images

image_selection_df.loc[image_selection_df['Species'] == 'Empty', ['Subfamily', 'Family']] = 'None'

In [None]:
# Get list of family values for entries that do not have a subfamily classification

families_to_use = list(image_selection_df['Family'][image_selection_df['Subfamily'].isna()].unique())

In [None]:
# Add column for one classification (either subfamily or family), initially filled with subfamily values

image_selection_df['Subfamily/Family'] = image_selection_df['Subfamily']

In [None]:
# Change the subfamily/family entry to family classification when subfamily doesn't exist

# NOTE(review): the mask selects every row of the family, so rows of that
# family that do have a Subfamily value are overwritten with the family name
# as well. Confirm this collapse of the whole family is intended.
for family in families_to_use:
    image_selection_df.loc[image_selection_df['Family'] == family, ['Subfamily/Family']] = family

In [None]:
# Check unique subfamily/family types

image_selection_df['Subfamily/Family'].unique()

In [None]:
# Correct typing error in Reduncinae (extra space at end of word)

image_selection_df.loc[image_selection_df['Subfamily/Family'] == 'Reduncinae ', ['Subfamily/Family']] = 'Reduncinae'

In [None]:
# Save final image selection dataframe

image_selection_df.to_csv("image_selection_df.csv")

In [None]:
# Create folder for the family-level datasets

if not os.path.exists("F:\\family_data"):
    os.makedirs("F:\\family_data")

In [None]:
# Create training, validation and test folders

path = "F:\\family_data"
training_path = os.path.join(path, "training_family")
validation_path = os.path.join(path, "validation_family")
test_path = os.path.join(path, "test_family")
path_list = [training_path, validation_path, test_path]
# Separate loop variable so the base `path` is not overwritten by the loop;
# exist_ok=True makes the cell safe to re-run.
for folder in path_list:
    os.makedirs(folder, exist_ok=True)

In [None]:
# Create subfolders in each main folder for every subfamily/family classification

families = list(image_selection_df['Subfamily/Family'].unique())
train_path = "F:\\family_data\\training_family"
validation_path = "F:\\family_data\\validation_family"
test_path = "F:\\family_data\\test_family"

# exist_ok=True: re-running the cell must not fail on folders that already exist.
for family in families:
    os.makedirs(os.path.join(train_path, family), exist_ok=True)
    os.makedirs(os.path.join(validation_path, family), exist_ok=True)
    os.makedirs(os.path.join(test_path, family), exist_ok=True)

In [None]:
# Function to create random training, validation and test splits

def choose_splits_family(dataframe, random_state=None):
    """Randomly split ``dataframe`` into training, validation and test frames per family class.

    For each distinct non-null 'Subfamily/Family' value, int(80%) of its rows
    go to training and int(10%) to validation. Every remaining row of
    ``dataframe`` goes to the test frame.

    The split is done on row positions, so it depends only on ``dataframe``
    (not on notebook globals) and rows with identical content are kept rather
    than cancelled out.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Subfamily/Family' column.
    random_state : optional
        Seed for ``numpy.random.default_rng``; pass a value for a reproducible split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, validation_df, test_df)``. Training and validation rows are
        grouped by class; the test frame keeps the original row order.
    """
    rng = np.random.default_rng(random_state)
    class_values = dataframe['Subfamily/Family']

    train_parts, validation_parts = [], []
    for family in class_values.dropna().unique():
        positions = np.flatnonzero((class_values == family).to_numpy())
        shuffled = rng.permutation(positions)
        n_train = int(len(shuffled) * .8)
        n_validation = int(len(shuffled) * .1)
        train_parts.append(shuffled[:n_train])
        validation_parts.append(shuffled[n_train:n_train + n_validation])

    no_rows = np.array([], dtype=int)
    train_pos = np.concatenate(train_parts) if train_parts else no_rows
    validation_pos = np.concatenate(validation_parts) if validation_parts else no_rows

    # Everything not claimed by training or validation is test data.
    is_test = np.ones(len(dataframe), dtype=bool)
    is_test[train_pos] = False
    is_test[validation_pos] = False

    return dataframe.iloc[train_pos], dataframe.iloc[validation_pos], dataframe[is_test]

In [None]:
# Create the splits

family_train_df, family_validation_df, family_test_df = choose_splits_family(image_selection_df)

In [None]:
# Function to copy files from original download location to appropriate family-level folder

def file_copier_family(dataframe, path):
    """Copy every image listed in ``dataframe`` into ``path/<Subfamily/Family value>/``.

    Source files are located as ``path_dict[Dataset]/file_name`` using the
    module-level ``path_dict``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns 'Subfamily/Family', 'Dataset' and 'file_name'.
    path : str
        Destination root; one sub-folder per class is used (created if missing).

    Notes
    -----
    Copying is best-effort: files that cannot be copied are skipped and a
    one-line summary is printed. Rows with a missing class are ignored.
    """
    failed = []
    # Classes come from the frame itself rather than from a notebook global.
    for family in dataframe['Subfamily/Family'].dropna().unique():
        family_rows = dataframe[dataframe['Subfamily/Family'] == family]
        new_location = os.path.join(path, family)
        # Without the folder, shutil.copy2 would write a *file* named after the class.
        os.makedirs(new_location, exist_ok=True)
        for dataset in family_rows['Dataset'].unique():
            source_root = path_dict[dataset]
            for file in family_rows.loc[family_rows['Dataset'] == dataset, 'file_name']:
                file_to_locate = os.path.join(source_root, file)
                try:
                    shutil.copy2(file_to_locate, new_location)
                except OSError as err:
                    failed.append((file_to_locate, err))
    if failed:
        print(f"file_copier_family: {len(failed)} file(s) could not be copied to {path}")

In [None]:
file_copier_family(family_validation_df, "F:\\family_data\\validation_family")

In [None]:
file_copier_family(family_test_df, "F:\\family_data\\test_family")

In [None]:
file_copier_family(family_train_df, "F:\\family_data\\training_family")

In [None]:
# Import credentials for GCP bucket

# setdefault: keep GOOGLE_APPLICATION_CREDENTIALS if the environment already
# provides it, and only fall back to the local key file otherwise. The key file
# itself should stay out of version control.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "x-pathway-318914-d2e75ae928d4.json")

In [None]:
# Define GCP bucket

client=storage.Client()
bucket = client.get_bucket("dsm500_bucket_europe_west2_a")

In [None]:
# Create function to upload images from local disk to appropriate folder in GCP bucket

def upload_local_directory_to_gcs(upload_list, location, bucket, local_root="F://"):
    """Upload the contents of several local folders to a bucket, keeping the folder layout.

    For every ``item`` in ``upload_list`` the folder ``<local_root>/<location>/<item>``
    is walked recursively and each file is uploaded as
    ``<location>/<item>/<path relative to that folder>`` (always '/'-separated).

    Parameters
    ----------
    upload_list : iterable of str
        Folder names located under ``<local_root>/<location>``.
    location : str
        Folder path relative to ``local_root``; also the remote prefix.
    bucket
        Object offering ``blob(name)``, whose result offers
        ``upload_from_filename(path)``.
    local_root : str, optional
        Root of the local tree; defaults to the drive used throughout this notebook.

    Raises
    ------
    NotADirectoryError
        If one of the expected local folders does not exist.
    """
    for item in upload_list:
        local_path = os.path.join(local_root, location, item)
        gcs_path = location + "/" + item
        if not os.path.isdir(local_path):
            raise NotADirectoryError(local_path)
        # os.walk handles nested folders itself. The previous recursive call
        # passed its arguments in the wrong order and could not work.
        for folder, subfolders, file_names in os.walk(local_path):
            subfolders.sort()  # deterministic traversal order
            for file_name in sorted(file_names):
                local_file = os.path.join(folder, file_name)
                relative = os.path.relpath(local_file, local_path).replace(os.sep, "/")
                blob = bucket.blob(gcs_path + "/" + relative)
                blob.upload_from_filename(local_file)

In [None]:
upload_local_directory_to_gcs(species_list, "training", bucket)

In [None]:
upload_local_directory_to_gcs(species_list, "validation", bucket)

In [None]:
upload_local_directory_to_gcs(species_list, "test", bucket)

In [None]:
# Create list of subfamily/family classifications to use in uploading function

family_list = list(image_selection_df['Subfamily/Family'].unique())

In [None]:
upload_local_directory_to_gcs(family_list, "family_data/training_family", bucket)

In [None]:
upload_local_directory_to_gcs(family_list, "family_data/validation_family", bucket)

In [None]:
upload_local_directory_to_gcs(family_list, "family_data/test_family", bucket)

In [None]:
# Initial model training attempt required too much computational power. Therefore, decision was made to
# delete any subfamilies/families with only one species

In [None]:
# Get list of families/subfamilies with less than/equal to 1,000 images (indicating presence of only one species) 

# Compute the class sizes once, then keep the labels with at most 1,000 images.
family_sizes = image_selection_df.groupby('Subfamily/Family').size()
families_to_delete = list(family_sizes[family_sizes <= 1000].index)

In [None]:
# Use families list to get species images to delete 
# First species found in each family slated for deletion.
species_to_delete = [
    image_selection_df[image_selection_df['Subfamily/Family'] == family]['Species'].unique()[0]
    for family in families_to_delete
]

In [None]:
# Next, need to get extra unused images to be used for the species_level classifier test sets

In [None]:
# Create dataframe of unused images, then filter it to contain only Canidae, Felinae or Sciuridae images

used_files = list(image_selection_df['file_name'])

# 'id' is the index of subset_df, not a column, so subset_df['id'] raises a
# KeyError. Rows are matched on (Dataset, id) instead of comparing ids with
# file names: file names of WCS images were rewritten above, ids were not.
used_keys = pd.MultiIndex.from_arrays([image_selection_df['Dataset'], image_selection_df.index])
candidate_keys = pd.MultiIndex.from_arrays([subset_df['Dataset'], subset_df.index])
# Keep images never selected before, one row per image.
unused_images_df = subset_df[~candidate_keys.isin(used_keys) & ~candidate_keys.duplicated()]

# Filter on the family column (Felidae is used to reach Felinae) ...
in_target_family = unused_images_df['Family'].isin(['Felidae', 'Sciuridae', 'Canidae'])
# ... then remove Pantherinae so that, of the Felidae, only Felinae remain.
not_pantherinae = unused_images_df['Subfamily'] != 'Pantherinae'
unused_images_df = unused_images_df[in_target_family & not_pantherinae]

In [None]:
# Repeat process performed above to get a proportionate number of images from each different dataset

# NOTE: this rebinds species_per_dataset, which choose_random_images reads as
# its quota table, so the function now samples for the extra test images.
species_per_dataset = (
    unused_images_df
    .groupby(['Species', 'Dataset'])
    .size()
    .reset_index(name='Count')
)
# Missouri is kept aside as out-of-sample data. .copy() gives an independent
# frame so the column assignments below do not act on a filtered slice.
species_per_dataset = species_per_dataset[species_per_dataset['Dataset'] != 'Missouri'].copy()
list_of_species = list(species_per_dataset['Species'].unique())

# Equal share of the 100-image target per dataset holding the species, or
# everything the dataset has if that is less.
datasets_per_species = species_per_dataset.groupby('Species')['Dataset'].transform('size')
species_per_dataset['Selection'] = np.minimum(species_per_dataset['Count'], 100 // datasets_per_species)

# Images selected so far per species, repeated on every row of that species.
species_per_dataset['Total'] = species_per_dataset.groupby('Species')['Selection'].transform('sum')

In [None]:
# Manual correction of one row

species_per_dataset.loc[38,'Selection'] = 50+20

In [None]:
# Confirm selection includes 100 images for each species

species_per_dataset.groupby(['Species']).sum()

In [None]:
# Create new dataframes of unused images by dataset

# One frame per "Dataset" label, taken from the not-yet-used images. Missouri
# is included here as well (missouri_new_df).
kgalagai_new_df = unused_images_df[unused_images_df["Dataset"] == "Kgalagai"]
caltech_new_df = unused_images_df[unused_images_df["Dataset"] == "Caltech"]
island_new_df = unused_images_df[unused_images_df["Dataset"] == "Island"]
missouri_new_df = unused_images_df[unused_images_df["Dataset"] == "Missouri"]
camdeboo_new_df = unused_images_df[unused_images_df["Dataset"] == "Camdeboo"]
wcs_new_df = unused_images_df[unused_images_df["Dataset"] == "WCS"]
ena24_new_df = unused_images_df[unused_images_df["Dataset"] == "ENA24"]
wellington_new_df = unused_images_df[unused_images_df["Dataset"] == "Wellington"]
karoo_new_df = unused_images_df[unused_images_df["Dataset"] == "Karoo"]
enonkishu_new_df = unused_images_df[unused_images_df["Dataset"] == "Enonkishu"]
mountain_zebra_new_df = unused_images_df[unused_images_df["Dataset"] == "Mountain Zebra"]
kruger_new_df = unused_images_df[unused_images_df["Dataset"] == "Kruger"]
nacti_new_df = unused_images_df[unused_images_df["Dataset"] == "NACTI"]
serengeti_new_df = unused_images_df[unused_images_df["Dataset"] == "Serengeti"]

In [None]:
# Use new dataframes and image selection function to get dataframe of new test images

kgalagai_new_test = choose_random_images('Kgalagai', kgalagai_new_df)
caltech_new_test = choose_random_images('Caltech', caltech_new_df)
island_new_test = choose_random_images('Island', island_new_df)
camdeboo_new_test = choose_random_images('Camdeboo', camdeboo_new_df)
wcs_new_test = choose_random_images('WCS', wcs_new_df)
ena24_new_test = choose_random_images('ENA24', ena24_new_df)
wellington_new_test = choose_random_images('Wellington', wellington_new_df)
karoo_new_test = choose_random_images('Karoo', karoo_new_df)
enonkishu_new_test = choose_random_images('Enonkishu', enonkishu_new_df)
mountain_new_test = choose_random_images('Mountain Zebra', mountain_zebra_new_df)
kruger_new_test = choose_random_images('Kruger', kruger_new_df)
nacti_new_test = choose_random_images('NACTI', nacti_new_df)
serengeti_new_test = choose_random_images('Serengeti', serengeti_new_df)

In [None]:
# Combine into single dataframe

new_image_selection_df = pd.concat([kgalagai_new_test, caltech_new_test,
                            island_new_test, camdeboo_new_test, wcs_new_test,
                            ena24_new_test, wellington_new_test,
                            karoo_new_test, enonkishu_new_test,
                            mountain_new_test, kruger_new_test,
                            nacti_new_test, serengeti_new_test])

In [None]:
# Use earlier function to get download information for use with AzCopy

# images_to_download takes a single argument (the dataframe); passing the
# dataset label as well raised a TypeError.
kgalagai_new_images = images_to_download(kgalagai_new_test)
caltech_new_images = images_to_download(caltech_new_test)
island_new_images = images_to_download(island_new_test)
camdeboo_new_images = images_to_download(camdeboo_new_test)
wcs_new_images = images_to_download(wcs_new_test)
ena24_new_images = images_to_download(ena24_new_test)
wellington_new_images = images_to_download(wellington_new_test)
karoo_new_images = images_to_download(karoo_new_test)
enonkishu_new_images = images_to_download(enonkishu_new_test)
mountain_zebra_new_images = images_to_download(mountain_new_test)
kruger_new_images = images_to_download(kruger_new_test)
nacti_new_images = images_to_download(nacti_new_test)
serengeti_new_images = images_to_download(serengeti_new_test)

In [None]:
# Define new mapping and species list for copying files to correct location

# Raw strings throughout: in a normal literal, sequences such as "\F" or "\S"
# are invalid escapes (a warning on recent Python versions) and only yield the
# intended path by accident. The resulting values are unchanged.
# NOTE(review): there is no 'Enonkishu' entry; a lookup for that dataset will
# raise KeyError. Confirm no Enonkishu images are part of the new selection.
path_dict_new = {
    'Serengeti': r'F:\Final_Test_Data\Serengeti\snapshotserengeti-unzipped',
    'WCS': r'F:\Final_Test_Data\WCS\wcs-unzipped',
    'Camdeboo': r'F:\Final_Test_Data\Camdeboo\CDB_public',
    'Mountain Zebra': r'F:\Final_Test_Data\Mountain Zebra\MTZ_public',
    'Kgalagai': r'F:\Final_Test_Data\Kgalagai\KGA_public',
    'Kruger': r'F:\Final_Test_Data\Kruger\KRU_public',
    'ENA24': r'F:\Final_Test_Data\ENA24\images',
    'Island': r'F:\Final_Test_Data\Island\public',
    'Wellington': r'F:\Final_Test_Data\Wellington\images',
    'Caltech': r'F:\Final_Test_Data\Caltech\cct_images',
    'Karoo': r'F:\Final_Test_Data\Karoo\KAR_public',
    'NACTI': r'F:\Final_Test_Data\NACTI\nacti-unzipped',
}

new_species_list = list(new_image_selection_df['Species'].unique())

In [None]:
# Modified file copier function

def file_copier(dataframe, path):
    """Copy the selected images into one folder per species under ``path``.

    For every species in the notebook-level ``new_species_list``, the matching
    rows of ``dataframe`` are split by 'Dataset'; each 'file_name' is resolved
    against ``path_dict_new[dataset]`` and copied with ``shutil.copy2`` into
    ``<path>/<species>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns 'Species', 'Dataset' and 'file_name'.
    path : str
        Destination root. The per-species sub-folder is created when missing.

    Raises
    ------
    KeyError
        If a dataset in ``dataframe`` has no entry in ``path_dict_new``.
    OSError
        If a species folder cannot be created.

    Notes
    -----
    Copying stays best-effort: a file that cannot be copied (for example one
    that was never downloaded) is skipped, and the number of skipped files is
    printed once at the end instead of being silently ignored.
    """
    skipped = []
    for species in new_species_list:
        species_rows = dataframe[dataframe['Species'] == species]
        if species_rows.empty:
            continue
        new_location = os.path.join(path, species)
        # Without an existing directory shutil.copy2 treats the destination as a
        # file name, so every image would overwrite one file called <species>.
        os.makedirs(new_location, exist_ok=True)
        for dataset in species_rows['Dataset'].unique():
            source_root = path_dict_new[dataset]
            dataset_rows = species_rows[species_rows['Dataset'] == dataset]
            for file in dataset_rows['file_name']:
                file_to_locate = os.path.join(source_root, file)
                try:
                    shutil.copy2(file_to_locate, new_location)
                except OSError:
                    # Missing/unreadable source file: keep going, but remember it.
                    skipped.append(file_to_locate)
    if skipped:
        print("file_copier: skipped %d file(s) that could not be copied, e.g. %s"
              % (len(skipped), skipped[0]))

In [None]:
# Copy files

# Raw string: same value as before, without the unrecognized "\T" escape that the
# previous mixed "\\"/"\" literal relied on.
file_copier(new_image_selection_df, r"F:\Final_Test_Data\Test")

In [None]:
# Create lists for uploading to GCP

# Unique species names per taxonomic group; the row filter and the column
# selection are done in a single .loc lookup.
felinae_list = list(
    new_image_selection_df
    .loc[new_image_selection_df['Subfamily'] == 'Felinae', 'Species']
    .unique()
)
canidae_list = list(
    new_image_selection_df
    .loc[new_image_selection_df['Family'] == 'Canidae', 'Species']
    .unique()
)
sciuridae_list = list(
    new_image_selection_df
    .loc[new_image_selection_df['Family'] == 'Sciuridae', 'Species']
    .unique()
)

In [None]:
# Upload the folders of every species in felinae_list under the "felinae/test" prefix.
# NOTE(review): in the visible part of the notebook upload_local_directory_to_gcs is only
# defined in a later cell and bucket is not created here — confirm an earlier cell
# provides both, otherwise this fails on Restart & Run All.
upload_local_directory_to_gcs(felinae_list, "felinae/test", bucket)

In [None]:
# Upload the folders of every species in canidae_list under the "canidae/test" prefix.
upload_local_directory_to_gcs(canidae_list, "canidae/test", bucket)

In [None]:
# Upload the folders of every species in sciuridae_list under the "sciuridae/test" prefix.
upload_local_directory_to_gcs(sciuridae_list, "sciuridae/test", bucket)

In [None]:
# Now need to upload out-of-sample data using the Missouri dataset. The entire dataset was previously downloaded in bulk,
# so it is simply a case of selecting the appropriate images and uploading them to GCP

In [None]:
# Keep only the rows of unused_images_df that come from the Missouri dataset.
# No family/subfamily filter is applied here. Note that this rebinds missouri_df,
# which an earlier cell created from the raw Missouri metadata.

missouri_df = unused_images_df.loc[unused_images_df['Dataset'].eq('Missouri')]

In [None]:
def upload_local_directory_to_gcs(upload_list, location, bucket, local_root="F://"):
    """Upload one local folder per item to a GCS bucket, including sub-folders.

    For each ``item`` the directory ``<local_root>/<location>/<item>`` is walked
    and every file is uploaded to the blob ``<location>/<item>/<relative path>``
    (always with forward slashes).

    Parameters
    ----------
    upload_list : list of str, or str
        Folder names (species) to upload; a single string is treated as a
        one-element list.
    location : str
        Path below ``local_root`` that holds the folders; also used as the
        blob-name prefix.
    bucket : object
        Anything exposing ``blob(name)`` whose result has
        ``upload_from_filename(path)`` (e.g. a google-cloud-storage bucket).
    local_root : str, optional
        Local root directory; defaults to the previously hard-coded "F://".

    Raises
    ------
    AssertionError
        If an item's local directory does not exist.

    Notes
    -----
    The earlier version recursed into sub-folders with its arguments in the
    wrong order, which raised TypeError as soon as a sub-folder was met; the
    walk below replaces that recursion. Entries whose name starts with "."
    are skipped, matching what the former glob pattern matched.
    """
    if isinstance(upload_list, str):
        upload_list = [upload_list]
    for item in upload_list:
        local_path = os.path.join(local_root, location, item)
        gcs_path = location + "/" + item
        assert os.path.isdir(local_path), "not a directory: " + local_path
        for dirpath, dirnames, filenames in os.walk(local_path):
            # Prune hidden folders in place and fix the traversal order.
            dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
            for filename in sorted(filenames):
                if filename.startswith("."):
                    continue
                local_file = os.path.join(dirpath, filename)
                # Blob names use "/" regardless of the local path separator.
                relative = os.path.relpath(local_file, local_path).replace(os.sep, "/")
                blob = bucket.blob(gcs_path + "/" + relative)
                blob.upload_from_filename(local_file)

In [None]:
# Simplified upload function to account for the fact that only one species is present per class (so a list is not used)

def upload_local_directory_to_gcs2(item, location, bucket, local_root="F://"):
    """Upload a single local folder to a GCS bucket, including sub-folders.

    The directory ``<local_root>/<location>/<item>`` is walked and every file
    is uploaded to the blob ``<location>/<item>/<relative path>`` (always with
    forward slashes).

    Parameters
    ----------
    item : str
        Folder name (species) to upload.
    location : str
        Path below ``local_root`` that holds the folder; also used as the
        blob-name prefix.
    bucket : object
        Anything exposing ``blob(name)`` whose result has
        ``upload_from_filename(path)`` (e.g. a google-cloud-storage bucket).
    local_root : str, optional
        Local root directory; defaults to the previously hard-coded "F://".

    Raises
    ------
    AssertionError
        If the local directory does not exist.

    Notes
    -----
    The earlier version handed sub-folders to the list-based uploader with its
    arguments in the wrong order, which raised TypeError; the walk below covers
    sub-folders directly. Entries whose name starts with "." are skipped,
    matching what the former glob pattern matched.
    """
    local_path = os.path.join(local_root, location, item)
    gcs_path = location + "/" + item
    assert os.path.isdir(local_path), "not a directory: " + local_path
    for dirpath, dirnames, filenames in os.walk(local_path):
        # Prune hidden folders in place and fix the traversal order.
        dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
        for filename in sorted(filenames):
            if filename.startswith("."):
                continue
            local_file = os.path.join(dirpath, filename)
            # Blob names use "/" regardless of the local path separator.
            relative = os.path.relpath(local_file, local_path).replace(os.sep, "/")
            blob = bucket.blob(gcs_path + "/" + relative)
            blob.upload_from_filename(local_file)

In [None]:
# Locate in species list and upload the specific species present in missouri dataset

# NOTE(review): index 2 depends on the order .unique() returned when canidae_list was
# built — confirm it still selects the species present in the Missouri data.
upload_local_directory_to_gcs2(canidae_list[2], "canidae/out_of_sample", bucket)

In [None]:
# NOTE(review): index 1 depends on the order .unique() returned when sciuridae_list was
# built — confirm it still selects the species present in the Missouri data.
upload_local_directory_to_gcs2(sciuridae_list[1], "sciuridae/out_of_sample", bucket)

In [None]:
# NOTE(review): index 3 depends on the order .unique() returned when felinae_list was
# built — confirm it still selects the species present in the Missouri data.
upload_local_directory_to_gcs2(felinae_list[3], "felinae/out_of_sample", bucket)