In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import random

# Lift pandas' display limits so neither columns nor rows get truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Seed Python's global RNG so that random.sample() calls are repeatable.
random.seed(42)

# Relative path to the folder that holds all data files used below.
PATH_TO_DATA = '../data/'

In [4]:
# Load the processed occurrence records and preview the last three rows.
verbatim_file = f'{PATH_TO_DATA}processed/single_csv/verbatim.csv'
verbatim_df = pd.read_csv(verbatim_file)
verbatim_df.tail(3)

Unnamed: 0,gbifID,eventDate,continent,country,stateProvince,decimalLatitude,decimalLongitude,scientificName,vernacularName,higherClassification,family
3674057,3905003113,2022-08-13,Europe,Germany,Nordrhein-Westfalen,51.186953,6.072675,Lycaena phlaeas,Small Copper,Animalia|Lycaenidae,Lycaenidae
3674058,3907062141,2022-08-13,Europe,Netherlands,Drenthe,53.0,6.4,Pyronia tithonus,Gatekeeper,Animalia|Nymphalidae,Nymphalidae
3674059,4889724145,2022-08-14,Europe,United Kingdom,England - Cheshire,53.221595,-2.51297,Triodia sylvina,Orange Swift,Animalia|Hepialidae,Hepialidae


In [5]:
# Load the processed media records (one image URL per row) and preview the last three rows.
multimedia_file = f'{PATH_TO_DATA}processed/single_csv/multimedia.csv'
multimedia_df = pd.read_csv(multimedia_file)
multimedia_df.tail(3)

Unnamed: 0,gbifID,format,identifier
4271285,4891549309,image/jpeg,https://observation.org/photos/91183706.jpg
4271286,4891549354,image/jpeg,https://observation.org/photos/91196424.jpg
4271287,4891549377,image/jpeg,https://observation.org/photos/91209494.jpg


In [8]:
# Occurrences per species; value_counts() sorts by count in descending order,
# which the later `.head(20)` selections rely on.
species_names = verbatim_df['scientificName']
species_counts = species_names.value_counts()

### Creating testing dataset (top20max50)
- 20 species with most occurrences
- 50 images per species
- Total images: 1000

In [9]:
# Names of the 20 species with the most occurrences (species_counts is sorted descending).
species_for_testing_ds = species_counts.index[:20]
species_for_testing_ds

Index(['Pararge aegeria', 'Vanessa atalanta', 'Noctua pronuba', 'Aglais io',
       'Lycaena phlaeas', 'Polyommatus icarus', 'Autographa gamma',
       'Pieris napi', 'Maniola jurtina', 'Pieris rapae', 'Polygonia c-album',
       'Cydalima perspectalis', 'Coenonympha pamphilus', 'Aglais urticae',
       'Gonepteryx rhamni', 'Peribatodes rhomboidaria', 'Araschnia levana',
       'Celastrina argiolus', 'Camptogramma bilineata', 'Vanessa cardui'],
      dtype='object', name='scientificName')

In [11]:
# Build the top20max50 testing dataset: up to SAMPLES_PER_SPECIES randomly
# chosen gbifIDs for each of the 20 most frequent species.
SAMPLES_PER_SPECIES = 50

# gbifIDs whose images are of poor quality and must never be selected
gbifID_blacklist = [1966132886, 4459520332]
blacklisted_ids = set(gbifID_blacklist)  # set for O(1) membership tests

selected_species = species_counts.head(20).keys()
selected_data = {'gbifID': [], 'scientificName': []}

# Dedicated, freshly seeded RNG: re-running this cell always produces the same
# sample, no matter how much of the global `random` state was consumed before.
rng = random.Random(42)

for species in selected_species:
    species_ids = verbatim_df.loc[verbatim_df['scientificName'] == species, 'gbifID'].tolist()
    valid_species_ids = [gbif_id for gbif_id in species_ids if gbif_id not in blacklisted_ids]

    # Draw from the blacklist-filtered IDs only (sampling from `species_ids`
    # would let blacklisted images back in). If a species has fewer valid IDs
    # than requested, all of them are taken instead of reusing the selection
    # of the previous species.
    sample_size = min(SAMPLES_PER_SPECIES, len(valid_species_ids))
    selected = rng.sample(valid_species_ids, sample_size)

    # Append the selected IDs together with their species label
    selected_data['gbifID'].extend(selected)
    selected_data['scientificName'].extend([species] * len(selected))


selected_df = pd.DataFrame(selected_data)

output_file = PATH_TO_DATA + 'processed/testing_dataset_top20max50.csv'
# selected_df.to_csv(output_file, index=False)

### Creating testing dataset (top589max3000)
- 589 species with the most occurrences (each with at least 1,000)
- maximum of 3000 images per species (randomly selected)
- Total images: ~1.39 million

In [16]:
# Attach the media rows to their occurrence record via the shared gbifID
# (default inner join; an occurrence with several images yields several rows).
full_df = verbatim_df.merge(multimedia_df, on='gbifID')
print(full_df.shape)
full_df.tail(3)

(4271288, 13)


Unnamed: 0,gbifID,eventDate,continent,country,stateProvince,decimalLatitude,decimalLongitude,scientificName,vernacularName,higherClassification,family,format,identifier
4271285,3905003113,2022-08-13,Europe,Germany,Nordrhein-Westfalen,51.186953,6.072675,Lycaena phlaeas,Small Copper,Animalia|Lycaenidae,Lycaenidae,image/jpeg,https://observation.org/photos/56429659.jpg
4271286,3907062141,2022-08-13,Europe,Netherlands,Drenthe,53.0,6.4,Pyronia tithonus,Gatekeeper,Animalia|Nymphalidae,Nymphalidae,image/jpeg,https://observation.org/photos/56466702.jpg
4271287,4889724145,2022-08-14,Europe,United Kingdom,England - Cheshire,53.221595,-2.51297,Triodia sylvina,Orange Swift,Animalia|Hepialidae,Hepialidae,image/jpeg,https://observation.org/photos/56494589.jpg


In [18]:
# Keep only the first row per gbifID so every occurrence appears exactly once.
is_repeated_id = full_df.duplicated(subset=['gbifID'])
full_df = full_df[~is_repeated_id]
print(full_df.shape)

(3674060, 13)


In [19]:
MAX_AMOUNT = 3000  # upper limit of rows kept per species
MIN_AMOUNT = 1000  # species with fewer rows than this are dropped entirely

full_df_grouped = full_df.groupby('scientificName')
filtered_dfs = []

for _, species_rows in full_df_grouped:
    row_count = len(species_rows)

    # Too few samples: leave this species out of the dataset.
    if row_count < MIN_AMOUNT:
        continue

    # MAX_AMOUNT rows or more: draw exactly MAX_AMOUNT rows at random.
    # Species between MIN_AMOUNT and MAX_AMOUNT are kept unchanged.
    if row_count >= MAX_AMOUNT:
        species_rows = species_rows.sample(n=MAX_AMOUNT, random_state=42)

    filtered_dfs.append(species_rows)

# Stack the per-species frames into one dataset and show the class sizes.
final_df = pd.concat(filtered_dfs)
print(final_df.shape)
final_df['scientificName'].value_counts()

(1388034, 13)


scientificName
Udea ferrugalis                       3000
Tyria jacobaeae                       3000
Zygaena filipendulae                  3000
Triodia sylvina                       3000
Acrobasis tumidana                    3000
Abrostola triplasia                   3000
Yponomeuta spec.                      3000
Acentria ephemerella                  3000
Aglais urticae                        3000
Aglais io                             3000
Agapeta hamana                        3000
Adela reaumurella                     3000
Acronicta tridens / psi               3000
Acronicta rumicis                     3000
Vanessa atalanta                      3000
Parapoynx stratiotata                 3000
Papilio machaon                       3000
Pandemis cerasana                     3000
Lomaspilis marginata                  3000
Anania hortulata                      3000
Agriphila straminella                 3000
Agriopis marginaria                   3000
Agriopis leucophaearia                3

In [None]:
# save dataset to file
# full_df.to_csv(PATH_TO_DATA + 'processed/dataset_full.csv', index=False)
# final_df.to_csv(PATH_TO_DATA + 'processed/dataset_top589_max3000.csv', index=False)

### Create Datasets for CV and Baseline

In [3]:
# Reload the previously saved top589/max3000 dataset and preview the first three rows.
top589_max3000_file = f'{PATH_TO_DATA}processed/dataset_top589_max3000.csv'
top589_max3000_df = pd.read_csv(top589_max3000_file)
top589_max3000_df.head(3)

Unnamed: 0,gbifID,eventDate,continent,country,stateProvince,decimalLatitude,decimalLongitude,scientificName,vernacularName,higherClassification,family,format,identifier
0,3906248438,2022-06-13,Europe,Netherlands,Gelderland,52.2,6.05,Abraxas grossulariata,Magpie,Animalia|Geometridae,Geometridae,image/jpeg,https://observation.org/photos/52122202.jpg
1,3715637791,2018-06-15,Europe,Netherlands,Utrecht,52.2,4.85,Abraxas grossulariata,Magpie,Animalia|Geometridae,Geometridae,image/jpeg,https://observation.org/photos/17321218.jpg
2,3721642196,2021-07-03,Europe,Netherlands,Noord-Holland,53.05,4.8,Abraxas grossulariata,Magpie,Animalia|Geometridae,Geometridae,image/jpeg,https://observation.org/photos/38571328.jpg


In [4]:
# Derive smaller, balanced datasets from the top589/max3000 dataset.
# Each entry maps a dataset name to
# (number of most frequent species to keep, [rows to sample per species]).
dataset_configs = {
    'top277': (277, [3000, 2000, 1000, 500]),
    'top387': (387, [2000, 1000, 500]),
    'top589': (589, [1000, 500])
}

# The species ranking is the same for every configuration, so compute it once.
# value_counts() sorts by count in descending order.
species_ranking = top589_max3000_df['scientificName'].value_counts()

for class_amount, sample_amounts in dataset_configs.values():
    top_species_counts = species_ranking.head(class_amount)
    selected_species = top_species_counts.keys()
    smallest_class_size = top_species_counts.min()

    # Rows of the `class_amount` most frequent species, grouped per species.
    subset_df = top589_max3000_df[top589_max3000_df['scientificName'].isin(selected_species)]
    subset_grouped = subset_df.groupby('scientificName')

    for amount in sample_amounts:
        # Fail early with a readable message instead of pandas' generic
        # "larger sample than population" error raised somewhere mid-loop.
        if amount > smallest_class_size:
            raise ValueError(
                f'Cannot sample {amount} rows per species: the smallest of the top '
                f'{class_amount} species only has {smallest_class_size} rows'
            )

        # Same seed for every species and size keeps the datasets reproducible.
        filtered_dfs = [group.sample(n=amount, random_state=42) for _, group in subset_grouped]
        variation_df = pd.concat(filtered_dfs)
        print(f'Shape for dataset with {amount} samples for {class_amount} species: {variation_df.shape}')

        # NOTE: an existing file with the same name is overwritten without asking.
        variation_df.to_csv(PATH_TO_DATA + f'processed/resized_datasets/dataset_top{class_amount}_max{amount}.csv', index=False)
        print(f'Saved dataset_top{class_amount}_max{amount}.csv')

Shape for dataset with 3000 samples for 277 species: (831000, 13)
Saved dataset_top277_max3000.csv
Shape for dataset with 2000 samples for 277 species: (554000, 13)
Saved dataset_top277_max2000.csv
Shape for dataset with 1000 samples for 277 species: (277000, 13)
Saved dataset_top277_max1000.csv
Shape for dataset with 500 samples for 277 species: (138500, 13)
Saved dataset_top277_max500.csv
Shape for dataset with 2000 samples for 387 species: (774000, 13)
Saved dataset_top387_max2000.csv
Shape for dataset with 1000 samples for 387 species: (387000, 13)
Saved dataset_top387_max1000.csv
Shape for dataset with 500 samples for 387 species: (193500, 13)
Saved dataset_top387_max500.csv
Shape for dataset with 1000 samples for 589 species: (589000, 13)
Saved dataset_top589_max1000.csv
Shape for dataset with 500 samples for 589 species: (294500, 13)
Saved dataset_top589_max500.csv
