## The Pl@ntNet Trusted dataset

In [21]:
import os
import pandas as pd
import numpy as np
# notebook-wide pandas display setting: the max_columns display option is set to 10
pd.options.display.max_columns=10

In [26]:
from IPython.display import display

# random seed for reproducibility (nothing in this cell draws random numbers;
# the call is kept so the global NumPy RNG state seen by later cells is unchanged)
np.random.seed(42)

# Load the Pl@ntNet Trusted occurrences table. The whole file is read:
# no row subsampling happens in this cell.
df = pd.read_csv('../data/occurrences/PL_trusted.csv',
                 sep=';', header='infer', quotechar='"', low_memory=True)

# Keep the coordinates and the species id. Rows without a species id are
# dropped before the integer cast: int64 cannot hold NaN, so a row that has
# coordinates but no id would otherwise make astype() raise.
df = (
    df[['Longitude', 'Latitude', 'glc19SpId']]
    .dropna(axis=0, subset=['glc19SpId'])
    .astype({'glc19SpId': 'int64'})
)

# pandas Series of the species identifiers
target_df = df['glc19SpId']
print(len(df), 'examples in the dataset')
print(target_df.nunique(), 'number of species\n')

# keep=False marks every row whose (Latitude, Longitude) pair is shared with
# at least one other row, not only the second and later occurrences
duplicated_df = df[df.duplicated(subset=['Latitude', 'Longitude'], keep=False)]
print(f'{len(duplicated_df)} entries observed at interfering locations')
display(duplicated_df.head(5))

237087 examples in the dataset
1364 number of species

84524 entries observed at interfering locations


Unnamed: 0,Longitude,Latitude,glc19SpId
1,-0.5925,45.10639,31997
4,2.34315,48.87488,33228
7,4.82718,45.77687,32127
8,6.225866,44.08693,30207
9,-1.738766,46.47871,32236


PlantNet Trusted contient 237,087 occurrences, 1364 classes, et 84,524 points superposés (36%).

## The PlantNet Complete dataset

In [27]:
from IPython.display import display

# random seed for reproducibility (nothing in this cell draws random numbers;
# the call is kept so the global NumPy RNG state seen by later cells is unchanged)
np.random.seed(42)

# Load the Pl@ntNet Complete occurrences table. The whole file is read:
# no row subsampling happens in this cell.
df = pd.read_csv('../data/occurrences/PL_complete.csv',
                 sep=';', header='infer', quotechar='"', low_memory=True)

# Keep the coordinates, the species id and the scName column.
df = df[['Longitude', 'Latitude', 'glc19SpId', 'scName']]
# Drop rows without a species id before the integer cast: int64 cannot hold
# NaN, so a row with other fields filled in but no id would otherwise make
# astype() raise.
df = df.dropna(axis=0, subset=['glc19SpId'])
df = df.astype({'glc19SpId': 'int64'})

# pandas Series of the species identifiers
target_df = df['glc19SpId']
print(len(df), 'examples in the dataset')
print(target_df.nunique(), 'number of species\n')

# keep=False marks every row whose (Latitude, Longitude) pair is shared with
# at least one other row, not only the second and later occurrences
duplicated_df = df[df.duplicated(subset=['Latitude', 'Longitude'], keep=False)]
print(f'{len(duplicated_df)} entries observed at interfering locations :')
display(duplicated_df.head(5))

2377610 examples in the dataset
3906 number of species

1056501 entries observed at interfering locations :


Unnamed: 0,Longitude,Latitude,glc19SpId,scName
1,-0.900372,46.10539,30504,Cercis siliquastrum L.
3,7.09043,47.75413,30721,Capsella bursa-pastoris (L.) Medik.
4,6.241667,43.11694,30747,Pancratium maritimum L.
6,4.997221,43.51215,32383,Aloe vera (L.) Burm.f.
8,-2.854644,47.56847,30582,Erigeron karvinskianus DC.


PlantNet Complete contient 2,377,610 occurrences, 3906 classes, et 1,056,501 points superposés (44%).

In [33]:
from IPython.display import display

# random seed for reproducibility (nothing in this cell draws random numbers;
# the call is kept so the global NumPy RNG state seen by later cells is unchanged)
np.random.seed(42)

# Load the GLC 2018 occurrences table. The whole file is read:
# no row subsampling happens in this cell.
df = pd.read_csv('../data/occurrences/GLC_2018.csv',
                 sep=';', header='infer', quotechar='"', low_memory=True)

# Keep the coordinates and the species id. Rows without a species id are
# dropped before the integer cast: int64 cannot hold NaN, so a row that has
# coordinates but no id would otherwise make astype() raise.
df = (
    df[['Longitude', 'Latitude', 'glc19SpId']]
    .dropna(axis=0, subset=['glc19SpId'])
    .astype({'glc19SpId': 'int64'})
)

# pandas Series of the species identifiers
target_df = df['glc19SpId']
print(len(df), 'examples in the dataset')
print(target_df.nunique(), 'number of species\n')

# keep=False marks every row whose (Latitude, Longitude) pair is shared with
# at least one other row, not only the second and later occurrences
duplicated_df = df[df.duplicated(subset=['Latitude', 'Longitude'], keep=False)]
print(f'{len(duplicated_df)} entries observed at interfering locations')
display(duplicated_df.head(5))

281952 examples in the dataset
3231 number of species

229255 entries observed at interfering locations


Unnamed: 0,Longitude,Latitude,glc19SpId
0,2.7267,47.83388,29976
1,7.13615,47.95033,30115
2,3.5597,45.63251,30102
3,6.391726,49.415718,30378
4,7.537994,47.623173,30830


GLC 2018 contient 281,952 occurrences, 3231 classes, et 229,255 points superposés (81%).

## Species taxonomic names

In [32]:
# Lookup table pairing each glc19SpId with its species taxonomic name
# (Taxref names, including the year of discovery).
taxonomic_names = pd.read_csv(
    '../data/occurrences/taxaName_glc19SpId.csv',
    sep=';',
    header='infer',
    quotechar='"',
    low_memory=True,
)
# NOTE(review): no random_state is passed to sample(), so the five rows shown
# are not pinned by this cell alone — consider passing one. `display` is not
# imported in this cell; it relies on an earlier cell having imported it.
display(taxonomic_names.sample(n=5))

Unnamed: 0,taxaName,glc19SpId,test
4711,"Arrhenia obscurata (D.A.Reid) Redhead, Lutzoni...",4712,False
29190,"Anacampsis temerella (Lienig & Zeller, 1846)",29191,False
9781,"Plagiodera versicolora (Laicharting, 1781)",9782,False
18684,"Hellinsia tephradactyla (Hubner, 1813)",18685,False
12263,"Cryptarcha strigata (Fabricius, 1787)",12264,False


34,719 classes peuvent apparaître au total (mais, même sur l'ensemble des jeux de données, on en observe beaucoup moins).