In [36]:
%matplotlib inline
import numpy as np
import pandas as pd

In [37]:
import sys
sys.path.append("../gbif_species_name_extraction/")
from gbif_species_name_extraction import extract_species_information, extract_gbif_species_names_info

In [38]:
sys.path.append("../../mapping/")
from support_functions import pandas_df_to_markdown_table

## Recorder unique scientific names derived from the nameserver

I extracted all unique species names from Recorder (NBN data) to get an overview of the species (for all surveys). After matching with GBIF, we can create a mapping between the 'recommended names' as defined by Recorder and the accepted keys from GBIF. If we manage to create this mapping, we can later easily select in both directions: GBIF <-> Recorder.

First of all, extract the unique Recorder species names by querying the NBN database; the query is stored in `SQL_unique_names.sql`:

In [39]:
# Load the exported Recorder names (tab-separated); dtype=object switches off
# pandas' type inference for every column.
recorder_names = pd.read_csv(
    "recorder_sql_unique_names_output.csv",
    sep='\t',
    dtype=object,
)

In [40]:
# Preview the first five rows of the Recorder name list.
recorder_names.head(n=5)

Unnamed: 0,RECOMMENDED_TAXON_VERSION_KEY,scientificName,TAXON_GROUP_NAME
0,NHMSYS0000456996,Caylusea,bloemplant
1,NHMSYS0000900079,Listrognathus mactator,insect - vliesvleugelige (Hymenoptera)
2,NBNSYS0100004462,Paratanytarsus dissimilis,insect - vlieg & mug (Diptera)
3,NHMSYS0000603228,Gymnodinium colymbeticum,wier
4,NHMSYS0000875969,Formicinae,insect - vliesvleugelige (Hymenoptera)


In [41]:
# Number of rows (unique Recorder names) that were loaded.
recorder_names.shape[0]

144135

In [42]:
# Rows whose taxon group is the 'niet toegewezen' placeholder.
recorder_names.loc[
    recorder_names['TAXON_GROUP_NAME'].eq('niet toegewezen')
]

Unnamed: 0,RECOMMENDED_TAXON_VERSION_KEY,scientificName,TAXON_GROUP_NAME
6467,NHMSYS0001697203,Prokaryota,niet toegewezen
10559,NHMSYS0000841310,Arthropoda,niet toegewezen
37772,INBSYS0000012542,Procyon lotor,niet toegewezen
38067,NHMSYS0000841034,Lepidoptera,niet toegewezen
42688,NBNSYS0000160353,Lycopodiopsida,niet toegewezen
65550,NHMSYS0000842161,Protista,niet toegewezen
70145,NBNSYS0000172125,Bryophyta,niet toegewezen
79432,NHMSYS0000841378,Pisces,niet toegewezen
92240,NHMSYS0000841351,Insecta,niet toegewezen
101203,NHMSYS0001747258,Lycopodiophyta,niet toegewezen


In [43]:
# Distinct taxon group names, in order of first appearance.
pd.unique(recorder_names['TAXON_GROUP_NAME'])

array(['bloemplant', 'insect - vliesvleugelige (Hymenoptera)',
       'insect - vlieg & mug (Diptera)', 'wier', 'diatomee',
       'pseudoschorpioen (Pseudoscorpiones)', 'raderdiertje (Rotifera)',
       'springstaart (Collembola)', 'insect - kever (Coleoptera)',
       'levermos', 'foraminifeer', 'rondworm (Nematoda)', 'zwam',
       'lintworm (Cestoda)', 'ringworm', 'amfibie', 'mollusk (Mollusca)',
       'insect - nachtvlinder', 'bacterie', 'neteldier (=cnidarian)',
       'insect - vlo (Siphonaptera)', 'schaaldier', 'fungoide',
       'insect - trips (Thysanoptera)', 'buikhaarworm (Gastrotricha)',
       'insect - wants, cicade, bladluis (Hemiptera)', 'mijt (Acari)',
       'vogel', 'zuigworm (Trematoda)', 'beenvis (Actinopterygii)',
       'insect - dagvlinder', 'miljoenpoot', 'korstmos', 'spin (Araneae)',
       'mos', 'kraakbeenvis  (Chondrichthyes)', 'platworm (Turbellaria)',
       'varen', 'slurfworm (Kinorhyncha)', 'insect - haft (Ephemeroptera)',
       'ribkwal (Ctenophora

We introduce a mapping between the INFORMAL GROUP defined by Recorder and the kingdom we're dealing with:

In [44]:
# Lookup from the Recorder taxon group name to the kingdom that is passed to
# the GBIF name matching as `kingdomcol`.
# - Groups without a usable kingdom ('niet toegewezen', 'ongedetermineerd',
#   missing values) map to None.
# - Keys are matched literally, so spelling and spacing must be kept exactly
#   as they occur in the data (note the double space in
#   'kraakbeenvis  (Chondrichthyes)').
# - Every key occurs exactly once; 'insect - nachtvlinder' was previously
#   listed twice with the same value.
# NOTE(review): some assignments look debatable against the GBIF backbone
# (e.g. 'diatomee' and 'korstmos' mapped to Plantae) - confirm with a domain
# expert before relying on strict matching for these groups.
mapping_kingdom = {
    'insect - kever (Coleoptera)': 'Animalia',
    'conifeer': 'Plantae',
    'beenvis (Actinopterygii)': 'Animalia',
    'vogel': 'Animalia',
    'insect - nachtvlinder': 'Animalia',
    'bloemplant': 'Plantae',
    'spin (Araneae)': 'Animalia',
    'insect - vlieg & mug (Diptera)': 'Animalia',
    'diatomee': 'Plantae',
    'mollusk (Mollusca)': 'Animalia',
    'insect - dagvlinder': 'Animalia',
    'insect - kokerjuffer (Trichoptera)': 'Animalia',
    'ringworm': 'Animalia',
    'mos': 'Plantae',
    'insect - wants, cicade, bladluis (Hemiptera)': 'Animalia',
    'mijt (Acari)': 'Animalia',
    'varen': 'Plantae',
    'schaaldier': 'Animalia',
    'landzoogdier': 'Animalia',
    'kranswier': 'Plantae',
    'insect - sprinkhaan & krekel (Orthoptera)': 'Animalia',
    'korstmos': 'Plantae',
    'platworm (Turbellaria)': 'Animalia',
    'insect - libel (Odonata)': 'Animalia',
    'wier': 'Plantae',
    'paardenstaart': 'Plantae',
    'niet toegewezen': None,
    'levermos': 'Plantae',
    'tweestaart (Diplura)': 'Animalia',
    'rondbek (Agnatha)': 'Animalia',
    'duizendpoot': 'Animalia',
    'wolfsklauw': 'Plantae',
    'amfibie': 'Animalia',
    'bacterie': 'Bacteria',
    'reptiel': 'Animalia',
    'insect - vliesvleugelige (Hymenoptera)': 'Animalia',
    'zwam': 'Fungi',
    'kraakbeenvis  (Chondrichthyes)': 'Animalia',
    'rondworm (Nematoda)': 'Animalia',
    'virus': 'Viruses',
    'insect - steenvlieg (Plecoptera)': 'Animalia',
    'zeezoogdier': 'Animalia',
    'ginkgo': 'Plantae',
    'parasitaire nematode (Nematoda)': 'Animalia',
    'zuigworm (Trematoda)': 'Animalia',
    'biesvaren': 'Plantae',
    'neteldier (=cnidarian)': 'Animalia',
    'insect - trips (Thysanoptera)': 'Animalia',
    'oerdiertje (Protozoa)': 'Protozoa',
    'hauwmos': 'Plantae',
    'lintworm (Cestoda)': 'Animalia',
    'manteldiertje (Urochordata)': 'Animalia',
    'stekelhuidige': 'Animalia',
    'ectoparasitaire platworm': 'Animalia',
    'ribkwal (Ctenophora)': 'Animalia',
    'mosdiertje (Bryozoa)': 'Animalia',
    'pseudoschorpioen (Pseudoscorpiones)': 'Animalia',
    'raderdiertje (Rotifera)': 'Animalia',
    'springstaart (Collembola)': 'Animalia',
    'foraminifeer': 'Protozoa',
    'insect - vlo (Siphonaptera)': 'Animalia',
    'fungoide': 'Fungi',
    'buikhaarworm (Gastrotricha)': 'Animalia',
    'miljoenpoot': 'Animalia',
    'slurfworm (Kinorhyncha)': 'Animalia',
    'insect - haft (Ephemeroptera)': 'Animalia',
    'slijmzwam': 'Protozoa',
    'spons (Porifera)': 'Animalia',
    'slurfworm (Echiura)': 'Animalia',
    'zeespin (Pycnogonida)': 'Animalia',
    'spuitworm (Sipuncula)': 'Animalia',
    'insect - gaasvlieg (Neuroptera)': 'Animalia',
    'insect - houtluis (Psocoptera)': 'Animalia',
    'insect - waaiervleugelige (Strepsiptera)': 'Animalia',
    'kelkworm (Entoprocta)': 'Animalia',
    'beerdiertje (Tardigrada)': 'Animalia',
    'brachiopood (Brachiopoda)': 'Animalia',
    'baardworm (Pogonophora)': 'Animalia',
    'snoerworm (Nemertinea)': 'Animalia',
    'insect - oorworm (Dermaptera)': 'Animalia',
    'insect - luis (Phthiraptera)': 'Animalia',
    'kaakworm (Gnathostomulida)': 'Animalia',
    'pauropood': 'Animalia',
    'oerinsect (Protura)': 'Animalia',
    'loricifeer': 'Animalia',
    'insect - schorpioenvlieg (Mecoptera)': 'Animalia',
    'paardenhaarworm (Nematomorpha)': 'Animalia',
    'insect - kakkerlak (Dictyoptera)': 'Animalia',
    'priapulid': 'Animalia',
    'insect - zilvervis (Thysanura)': 'Animalia',
    'eikelworm (Hemichordata)': 'Animalia',
    'hooiwagen (Opiliones)': 'Animalia',
    'middendiertje (Mesozoa)': 'Animalia',
    'pijlworm (Chaetognatha)': 'Animalia',
    'insect - rotsspringer (Archaeognatha)': 'Animalia',
    'dwergpotige (Symphyla)': 'Animalia',
    'insect - kameelhalsvlieg (Raphidioptera)': 'Animalia',
    'hoefijzerworm (Phoronida)': 'Animalia',
    'insect - wandelende tak (Phasmida)': 'Animalia',
    'lipklever (Cycliophora)': 'Animalia',
    'insect - elzenvlieg (Megaloptera)': 'Animalia',
    'schorpioen': 'Animalia',
    'insect - bidsprinkhaan (Mantodea)': 'Animalia',
    'ongedetermineerd': None,
    np.nan: None,
}

In [45]:
# Derive the kingdom from the taxon group via the lookup dict. `replace`
# leaves any group that is absent from the dict unchanged, so an unmapped
# group name would end up in the 'kingdom' column as-is.
recorder_names['kingdom'] = recorder_names['TAXON_GROUP_NAME'].replace(
    to_replace=mapping_kingdom
)

In [46]:
# Check which kingdom values resulted from the mapping.
pd.unique(recorder_names["kingdom"])

array(['Plantae', 'Animalia', 'Protozoa', 'Fungi', 'Bacteria', None,
       'Viruses'], dtype=object)

In [47]:
# Row count after adding the kingdom column (should be unchanged).
recorder_names.shape[0]

144135

In [70]:
%%timeit
gbif_matching = extract_species_information(testcase, 
                                            namecol="scientificName", 
                                            kingdomcol = "kingdom")

Using columns scientificName and kingdom for API request.
Using columns scientificName and kingdom for API request.
Using columns scientificName and kingdom for API request.
Using columns scientificName and kingdom for API request.
1 loop, best of 3: 11.3 s per loop


Mapping with GBIF:

In [50]:
# Match every Recorder name against GBIF, using the scientific name together
# with the derived kingdom. The helper reports the columns it uses for the
# API request; network access to api.gbif.org is required.
gbif_matching = extract_species_information(
    recorder_names,
    namecol="scientificName",
    kingdomcol="kingdom",
)

Using columns scientificName and kingdom for API request.


ConnectionError: HTTPConnectionPool(host='api.gbif.org', port=80): Max retries exceeded with url: /v1/species/match?verbose=False&strict=True&kingdom=Animalia&name=Nicothoidae (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7f3ecf1d6c50>: Failed to establish a new connection: [Errno -2] Name or service not known',))

Get some statistics about matching:

The number of names that were not found by the GBIF API:

In [31]:
# Count the names without a GBIF match type, using the vectorised Series
# reduction instead of iterating over the Series with the builtin sum().
gbif_matching["gbifapi_matchType"].isnull().sum()

180

The type of matching for the other taxon names:

In [34]:
# Number of names per (match type, confidence) combination.
# The grouping keys are passed as a list: a tuple is deprecated as a way to
# pass several keys and is interpreted as one single key by newer pandas.
# The column is selected before counting so only that column is aggregated.
# NOTE(review): 'recommended nameserver scientific name' is not among the
# columns of `recorder_names` shown earlier in this notebook - confirm that
# `gbif_matching` was produced from the current input.
(gbif_matching
 .groupby(["gbifapi_matchType", "gbifapi_confidence"])
 ["recommended nameserver scientific name"]
 .count())

gbifapi_matchType  gbifapi_confidence
EXACT              90                       1
                   93                       8
                   95                      79
                   96                      47
                   97                     164
                   98                      12
                   99                      55
                   100                   2826
FUZZY              84                       1
                   87                       1
                   88                       7
                   89                       1
                   90                       1
                   93                       1
                   95                       6
Name: recommended nameserver scientific name, dtype: int64

In [91]:
# Render the names that GBIF could not match as a markdown table.
pandas_df_to_markdown_table(
    gbif_matching.loc[gbif_matching["gbifapi_matchType"].isnull()]
)

In [67]:
# Persist the matching result (index included) next to the notebook.
gbif_matching.to_csv(path_or_buf="recorder_gbif_matched.csv")

##  Species list to report on defined by Tim/Sander

Read species list:

In [68]:
# Load the tab-separated species list; dtype=object switches off type
# inference for every column.
invasive_species = pd.read_csv(
    "../../species-list/species-list.tsv",
    sep="\t",
    dtype=object,
)

In [69]:
# Preview only the first rows of the species list instead of rendering the
# whole table in the notebook output.
invasive_species.head(10)

Unnamed: 0,name,kingdom,euConcernStatus,gbifapi_usageKey,gbifapi_scientificName,gbifapi_canonicalName,gbifapi_status,gbifapi_rank,gbifapi_matchType,gbifapi_confidence,gbifapi_acceptedKey,gbifapi_acceptedScientificName
0,Acer negundo,Plantae,under consideration,3189866,Acer negundo L.,Acer negundo,ACCEPTED,SPECIES,EXACT,100,3189866,Acer negundo L.
1,Alopochen aegyptiaca,Animalia,under consideration,2498252,"Alopochen aegyptiaca (Linnaeus, 1766)",Alopochen aegyptiaca,ACCEPTED,SPECIES,EXACT,100,2498252,"Alopochen aegyptiaca (Linnaeus, 1766)"
2,Alternanthera philoxeroides,Plantae,under consideration,3084923,Alternanthera philoxeroides (Mart.) Griseb.,Alternanthera philoxeroides,ACCEPTED,SPECIES,EXACT,100,3084923,Alternanthera philoxeroides (Mart.) Griseb.
3,Ameiurus melas,Animalia,under consideration,2340977,"Ameiurus melas (Rafinesque, 1820)",Ameiurus melas,ACCEPTED,SPECIES,EXACT,100,2340977,"Ameiurus melas (Rafinesque, 1820)"
4,Asclepias syriaca,Plantae,under consideration,3170247,Asclepias syriaca L.,Asclepias syriaca,ACCEPTED,SPECIES,EXACT,100,3170247,Asclepias syriaca L.
5,Symphyotrichum salignum,Plantae,,3151811,Symphyotrichum ×salignum (Willd.) G.L. Nesom,Symphyotrichum salignum,ACCEPTED,SPECIES,EXACT,100,3151811,Symphyotrichum ×salignum (Willd.) G.L. Nesom
6,Baccharis halimifolia,Plantae,listed,3129663,Baccharis halimifolia L.,Baccharis halimifolia,ACCEPTED,SPECIES,EXACT,100,3129663,Baccharis halimifolia L.
7,Bison bison,Animalia,under consideration,2441176,"Bison bison (Linnaeus, 1758)",Bison bison,ACCEPTED,SPECIES,EXACT,100,2441176,"Bison bison (Linnaeus, 1758)"
8,Cabomba caroliniana,Plantae,listed,2882443,Cabomba caroliniana A. Gray,Cabomba caroliniana,ACCEPTED,SPECIES,EXACT,100,2882443,Cabomba caroliniana A. Gray
9,Callosciurus erythraeus,Animalia,listed,2437394,"Callosciurus erythraeus (Pallas, 1779)",Callosciurus erythraeus,ACCEPTED,SPECIES,EXACT,100,2437394,"Callosciurus erythraeus (Pallas, 1779)"


In [70]:
# List the columns available in the GBIF matching result before selecting
# the subset used for the merge with the species list.
gbif_matching.columns

Index(['recommended nameserver scientific name',
       'INBO_TAXON_VERSION_KEY_FOR_RECOMMENDED_NAME', 'INFORMAL GROUP',
       'kingdom', 'gbifapi_usageKey', 'gbifapi_scientificName',
       'gbifapi_canonicalName', 'gbifapi_status', 'gbifapi_rank',
       'gbifapi_matchType', 'gbifapi_confidence', 'gbifapi_acceptedKey',
       'gbifapi_acceptedScientificName'],
      dtype='object')

In [71]:
# Keep only the species that have a GBIF accepted key, restricted to the
# columns needed for the merge (row filter and column selection in one .loc).
invasive_species_key = invasive_species.loc[
    invasive_species["gbifapi_acceptedKey"].notnull(),
    ['name', 'kingdom', 'gbifapi_acceptedKey', 'gbifapi_acceptedScientificName'],
]

In [72]:
# Keep only the Recorder names that have a GBIF accepted key, restricted to
# the columns needed for the merge (row filter and column selection in one
# .loc).
gbif_matching_key = gbif_matching.loc[
    gbif_matching["gbifapi_acceptedKey"].notnull(),
    ['recommended nameserver scientific name', 'INFORMAL GROUP',
     'gbifapi_acceptedKey', 'gbifapi_status', 'gbifapi_confidence'],
]

In [102]:
# Left join: every species of the list is kept, and the Recorder columns are
# filled in wherever the GBIF accepted key matches.
combination = invasive_species_key.merge(
    gbif_matching_key,
    how='left',
    on='gbifapi_acceptedKey',
    suffixes=('_species', '_recorder'),
)

In [103]:
# Preview only the first rows of the merged table instead of rendering the
# whole table in the notebook output.
combination.head(10)

Unnamed: 0,name,kingdom,gbifapi_acceptedKey,gbifapi_acceptedScientificName,recommended nameserver scientific name,INFORMAL GROUP,gbifapi_status,gbifapi_confidence
0,Acer negundo,Plantae,3189866,Acer negundo L.,Acer negundo,bloemplant,ACCEPTED,100
1,Alopochen aegyptiaca,Animalia,2498252,"Alopochen aegyptiaca (Linnaeus, 1766)",,,,
2,Alternanthera philoxeroides,Plantae,3084923,Alternanthera philoxeroides (Mart.) Griseb.,,,,
3,Ameiurus melas,Animalia,2340977,"Ameiurus melas (Rafinesque, 1820)",,,,
4,Asclepias syriaca,Plantae,3170247,Asclepias syriaca L.,,,,
5,Symphyotrichum salignum,Plantae,3151811,Symphyotrichum ×salignum (Willd.) G.L. Nesom,,,,
6,Baccharis halimifolia,Plantae,3129663,Baccharis halimifolia L.,Baccharis halimifolia,bloemplant,ACCEPTED,100
7,Bison bison,Animalia,2441176,"Bison bison (Linnaeus, 1758)",,,,
8,Cabomba caroliniana,Plantae,2882443,Cabomba caroliniana A. Gray,,,,
9,Callosciurus erythraeus,Animalia,2437394,"Callosciurus erythraeus (Pallas, 1779)",Callosciurus erythraeus,landzoogdier,ACCEPTED,100


In [97]:
# Persist the merged species list (index included) next to the notebook.
combination.to_csv(path_or_buf="species_list_with_recorder_version.csv")

In [95]:
# Species of the list for which a Recorder name was found in the merge.
recorder_matched = combination.loc[
    combination["recommended nameserver scientific name"].notnull()
]

In [83]:
# Number of species from the list that could be linked to a Recorder name.
recorder_matched.shape[0]

40