In [22]:
import pandas as pd
import pickle as pkl
import json
import os
import unicodedata
import numpy as np

### 1. Load the main dataframe that maps the historical names from Iconographia Zoologica to GBIF terms

In [2]:
# Location of the pickled dataframe that links Iconographia Zoologica names to GBIF terms.
IZ_GBIF_MAPPING_PKL = "./main-dataset/dataframes/iz_gbif_mapping.pkl"

iz_gbif = pd.read_pickle(IZ_GBIF_MAPPING_PKL)
# Bare last expression so the notebook renders the (truncated) frame.
iz_gbif

Unnamed: 0,kingdom,phylum,class,order,family,acceptedNameUsageID,acceptedName,scientificName,taxonID,iz_name,file,genus,species,subspecies,binomial
0,Animalia,Arthropoda,Insecta,Coleoptera,Erotylidae,1042638,Triplax,"Triplax Herbst, 1793",1042638,Triplax,Triplax - Print - Iconographia Zoologica - Spe...,Triplax,species,subspecies,Triplax species
1,Animalia,Arthropoda,Malacostraca,Stomatopoda,Squillidae,5177253,Alima neptuni,"Alima neptuni (Linnaeus, 1768)",7803110,Alima gracilis,Alima gracilis - - Print - Iconographia Zoolog...,Alima,neptuni,subspecies,Alima neptuni
2,Animalia,Chordata,Reptilia,Squamata,Viperidae,2444468,Crotalus horridus,"Crotalus horridus Linnaeus, 1758",2444468,Crotalus horridus,Crotalus horridus - 1700-1880 - Print - Iconog...,Crotalus,horridus,subspecies,Crotalus horridus
3,Animalia,Chordata,Aves,Passeriformes,Tichodromidae,2484918,Tichodroma muraria,"Tichodroma muraria (Linnaeus, 1766)",2484918,Tichodroma muraria,Tichodroma muraria - 1820-1860 - Print - Icono...,Tichodroma,muraria,subspecies,Tichodroma muraria
4,Animalia,Chordata,Mammalia,Rodentia,Sciuridae,5219685,Sciurus niger cinereus,"Sciurus niger cinereus Linnaeus, 1758",5219687,Sciurus cinereus,Sciurus cinereus - 1700-1880 - Print - Iconogr...,Sciurus,niger,cinereus,Sciurus niger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15552,Animalia,Mollusca,Gastropoda,Neogastropoda,Muricidae,6499818,Vokesimurex recurvirostris,"Vokesimurex recurvirostris (Broderip, 1833)",5193437,Murex recurvirostris,Murex recurvirostris - - Print - Iconographia ...,Vokesimurex,recurvirostris,subspecies,Vokesimurex recurvirostris
15553,Animalia,Chordata,Aves,,,8896691,Tchitrea mutata,Tchitrea mutata,8896691,Tchitrea mutata,Tchitrea mutata - 1700-1880 - Print - Iconogra...,Tchitrea,mutata,subspecies,Tchitrea mutata
15554,Animalia,Cnidaria,Anthozoa,Alcyonacea,Gorgoniidae,9019872,Gorgonia cerea,Gorgonia cerea Esper,9019872,Gorgonia cerea,Gorgonia cerea - - Print - Iconographia Zoolog...,Gorgonia,cerea,subspecies,Gorgonia cerea
15555,Animalia,Chordata,Ascidiacea,Stolidobranchia,Styelidae,2331954,Styela plicata,"Styela plicata (Lesueur, 1823)",4355135,Ascidia phusca,Ascidia phusca - - Print - Iconographia Zoolog...,Styela,plicata,subspecies,Styela plicata


### 2. Select classes
Select the class names for training that occur both in the Iconographia Zoologica and in at least one of the auxiliary datasets

In [3]:
# iNaturalist 2018: each category record carries its binomial under the 'name' key.
with open('./main-dataset/additional_data/iNat2018_categories.json') as f:
    iNat18_categories = json.load(f)

iNat18_binomials = [category['name'] for category in iNat18_categories]

# BHL: the binomials are the index labels of the text-embedding dataframe.
bhl_binomials = (
    pd.read_pickle('./main-dataset/class_embeddings/texts/bhl_embeddings_norm.pkl')
    .index
    .tolist()
)

# GBIF binomials.
# The keys of all_classes.txt carry a suffix that refers to the rank of the
# taxon, where _b refers to the binomial.
with open('./main-dataset/class_embeddings/taxonomy/all_classes.txt', 'r') as f:
    all_gbif_taxa = json.load(f)

# Remove only the trailing '_b'. The previous `cl.split('_b')[0]` cut the key at
# the FIRST '_b', so a name that itself contains '_b' was truncated
# (e.g. 'Foo_bar_b' -> 'Foo' instead of 'Foo_bar').
gbif_binomials = [cl[:-len('_b')] for cl in all_gbif_taxa if cl.endswith('_b')]
# Binomials present in the Iconographia Zoologica mapping.
iz_binomials = set(iz_gbif['binomial'].values)

# NOTE: despite the `union_` prefix, each of these three sets is the
# intersection of the IZ binomials with one auxiliary source.
union_classes_tax = iz_binomials.intersection(gbif_binomials)
union_classes_bhl = iz_binomials.intersection(bhl_binomials)
union_classes_iNat = iz_binomials.intersection(iNat18_binomials)

# A class is kept for training when at least one auxiliary source covers it.
union_classes = union_classes_tax | union_classes_bhl | union_classes_iNat

is_selected = iz_gbif['binomial'].isin(union_classes)
df = iz_gbif[is_selected]
df.describe()

Unnamed: 0,kingdom,phylum,class,order,family,acceptedNameUsageID,acceptedName,scientificName,taxonID,iz_name,file,genus,species,subspecies,binomial
count,14502,14499,14495,14416,14497,14502,14502,14502,14502,14502,14502,14502,14502,14502,14502
unique,1,21,71,373,1939,8016,8012,8016,8575,8575,14502,5844,3674,128,7973
top,Animalia,Chordata,Insecta,Coleoptera,Conidae,5219173,Canis lupus,"Canis lupus Linnaeus, 1758",5219173,Canis lupus,Cervus elaphus - gewei - 1700-1880 - Print - I...,Conus,species,subspecies,Canis lupus
freq,14502,7358,2802,1738,410,283,283,283,280,280,1,393,3240,14098,285


### Load class embeddings

In [4]:
# Load the three class-embedding tables (photographs, taxonomy, BHL texts).
# The generator keeps the loop variable out of the notebook namespace.
photo_embeddings, taxon_embeddings, bhl_embeddings = (
    pd.read_pickle(embedding_pkl)
    for embedding_pkl in (
        './main-dataset/class_embeddings/photographs/photo_embeddings_norm.pkl',
        './main-dataset/class_embeddings/taxonomy/taxon_embeddings_norm.pkl',
        './main-dataset/class_embeddings/texts/bhl_embeddings_norm.pkl',
    )
)

### Load features of augmented images
Before cropping all images, the largest side of each image was first resized to 300. During resizing, we kept the aspect ratio identical to the original image.
2048-dimensional features were extracted by applying the pre-trained Inception V3 model to crops (middle, upper left, upper right, lower left and lower right) of each resized original illustration and its horizontally flipped version. 

1. Load the features extracted from ALL crops

**Please note** that every dataframe is filtered to the class labels in _union_classes_, as these have been selected for training

In [53]:
# Center-crop features, restricted to rows whose label is a selected training class.
centerCrop_feat_df = pd.read_pickle(
    './main-dataset/illustrations/augmented_data/centerCrop/df_centerCrop_feats.pkl'
).loc[lambda feats: feats['labels'].isin(union_classes)]

# Ten-crop features, restricted in the same way.
tenCrop_feat_df = pd.read_pickle(
    './main-dataset/illustrations/augmented_data/tenCrop/df_tenCrop_feats.pkl'
).loc[lambda feats: feats['labels'].isin(union_classes)]

2. Crops containing only white space or text were manually discarded. 

In [51]:
# File names of the crops that survived the manual filtering. Hidden entries
# (names starting with '.') are skipped and every name is NFC-normalised
# before it is compared with the `aug_files` column.
non_empty_images = [
    unicodedata.normalize('NFC', entry)
    for entry in os.listdir('./main-dataset/illustrations/augmented_data/tenCrop/tenCrop_Images_No_Empty/')
    if not entry.startswith('.')
]

# Keep only the ten-crop feature rows whose augmented file is still on disk.
tenCrop_feat_df_non_empty_images = tenCrop_feat_df.loc[
    tenCrop_feat_df['aug_files'].isin(non_empty_images)
]

In [19]:
# Preview the first five rows of the filtered ten-crop features.
tenCrop_feat_df_non_empty_images.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2041,2042,2043,2044,2045,2046,2047,labels,files,aug_files
0,0.018,0.019,0.029,0.117,0.011,0.598,0.197,0.157,0.263,0.036,...,0.014,0.113,0.653,0.166,1.108,0.174,0.854,Triplax species,Triplax - Print - Iconographia Zoologica - Spe...,0.Triplax - Print - Iconographia Zoologica - S...
10,0.045,0.054,0.011,0.199,0.009,0.259,0.241,0.166,0.262,0.104,...,0.022,0.143,0.564,0.313,0.554,0.016,1.061,Alima neptuni,Alima gracilis - - Print - Iconographia Zoolog...,10.Alima gracilis - - Print - Iconographia Zoo...
11,0.056,0.055,0.014,0.204,0.014,0.317,0.264,0.107,0.222,0.099,...,0.035,0.115,0.518,0.336,0.466,0.014,0.993,Alima neptuni,Alima gracilis - - Print - Iconographia Zoolog...,11.Alima gracilis - - Print - Iconographia Zoo...
100,0.033,0.013,0.133,0.211,0.032,0.641,0.501,0.243,0.471,0.016,...,0.007,0.215,0.876,0.238,0.612,0.105,0.596,Oxychilus draparnaudi,Helix lucida - - Print - Iconographia Zoologic...,100.Helix lucida - - Print - Iconographia Zool...
101,0.043,0.019,0.105,0.219,0.019,0.579,0.432,0.168,0.354,0.005,...,0.038,0.185,0.774,0.239,0.523,0.08,0.633,Oxychilus draparnaudi,Helix lucida - - Print - Iconographia Zoologic...,101.Helix lucida - - Print - Iconographia Zool...


In [54]:
# Summary counts of the selected images/classes and of each embedding table,
# printed one "caption value" pair per line.
summary_rows = [
    ('Nr images image dataset:', len(centerCrop_feat_df)),
    ('Nr classes image dataset:', len(np.unique(df.binomial.values))),
    ('Nr classes photo embeddings:', len(photo_embeddings)),
    ('Nr classes hierarchy embeddings:', len(taxon_embeddings)),
    ('Nr classes text embeddings:', len(bhl_embeddings)),
]
for caption, total in summary_rows:
    print(caption, total)

Nr images image dataset: 14502
Nr classes image dataset: 7973
Nr classes photo embeddings: 547
Nr classes hierarchy embeddings: 7920
Nr classes text embeddings: 3040
