# Verify mapping: planten-exoten

In [1]:
import pandas as pd
import numpy as np
import sys
import yaml
from pywhip import whip_csv
from IPython.display import HTML, display_html

## Process data

1. Go to https://ipt.inbo.be/resource?r=planten-exoten-natuurpunt-occurrences
2. Download DwC-A
3. Unzip and place `occurrence.txt` in `data/raw/plants/`

Read data:

In [2]:
# Load the tab-separated occurrence file; dtype=object stops pandas from
# inferring numeric types for the columns.
occurrence_path = '../data/raw/plants/occurrence.txt'
data = pd.read_csv(occurrence_path, sep='\t', dtype=object)

## Verify data

Verification steps that cannot be expressed in a whip specification and are therefore done with pandas.

### occurrenceID duplicates

In [3]:
# Number of records whose occurrenceID occurs more than once. Should be 0.
# keep=False flags every member of a duplicate group, not only the repeats.
# count() ignores missing values, so records without an occurrenceID are not
# included. No sorting is needed to count the flagged records.
occurrence_ids = data['occurrenceID']
occurrence_ids[occurrence_ids.duplicated(keep=False)].count()

0

### scientificName

In [4]:
# One row per distinct taxon; the 'occurrenceID' column holds the number of
# occurrence records for that taxon.
# dropna=False (pandas >= 1.1) keeps taxa for which one of the key columns is
# missing (e.g. no vernacularName); with the default dropna=True those taxa
# would be silently left out of the list.
taxon_columns = ['taxonID', 'scientificName', 'taxonRank', 'vernacularName']
names = (
    data
    .groupby(taxon_columns, dropna=False)['occurrenceID']
    .count()
    .reset_index()
)

Save unique scientific names to file:

In [5]:
# Write the taxon columns (without the per-taxon record count) to the interim
# species list; the DataFrame index is not written.
species_columns = ['taxonID', 'scientificName', 'taxonRank', 'vernacularName']
species_list = names[species_columns]
species_list.to_csv('../data/interim/plants-species.csv', index=False)

## Some metadata stats

### Basic metadata

Number of records:

In [6]:
# Number of rows (occurrence records) in the loaded table.
data.shape[0]

1468904

### Taxonomic coverage

In [7]:
# Number of distinct scientific names per taxon rank.
data.groupby('taxonRank')['scientificName'].nunique().reset_index()

Unnamed: 0,taxonRank,scientificName
0,cultivar,47
1,forma,6
2,hybrid,167
3,multispecies,350
4,species,1887
5,subspecies,148
6,variety,60


### Temporal coverage

In [8]:
# Smallest eventDate value in the dataset.
first_event_date = data['eventDate'].min()
first_event_date

'1862-06-30'

In [9]:
# Largest eventDate value in the dataset.
last_event_date = data['eventDate'].max()
last_event_date

'2025-12-14'

### Sampling methods

In [10]:
# Number of records per identificationVerificationStatus.
# occurrenceID is selected before counting so that only that column is
# aggregated, instead of counting every column and discarding all but one.
data.groupby('identificationVerificationStatus')['occurrenceID'].count().reset_index()

Unnamed: 0,identificationVerificationStatus,occurrenceID
0,approved on expert judgement,143398
1,approved on knowledge rules,519890
2,approved on photographic evidence,498660
3,unverified,306956


## Whip data

In [11]:
# Read the whip specification file and parse it as YAML.
# The context manager closes the file handle as soon as the text is read;
# spec_file still holds the file's text content afterwards.
with open('../specification/plants-dwc-occurrence.yml') as spec_handle:
    spec_file = spec_handle.read()
specifications = yaml.safe_load(spec_file)

In [12]:
# Check the tab-separated occurrence file against the loaded specifications.
whipped = whip_csv(
    "../data/raw/plants/occurrence.txt",
    specifications,
    delimiter='\t',
)

Your dataset does not comply with the specifications, use get_report() for more detailed information.


Show report:

In [13]:
# Build the HTML version of the whip report and display it, with the output
# flagged as isolated in its metadata.
report_html = whipped.get_report('html')
display_html(HTML(report_html), metadata={'isolated': True})

#,Data value,Message,Failed rows,First row
1,translocation,unallowed value translocation,4,367929

#,Data value,Message,Failed rows,First row
1,aecium,unallowed value aecium,34,1290313
2,remains_old_fruiting_body,unallowed value remains_old_fruiting_body,5,1163367
3,translocation,unallowed value translocation,4,367929
4,spermogonium,unallowed value spermogonium,2,1290369
5,telium,unallowed value telium,1,1290308
6,uredinium,unallowed value uredinium,1,1290365

#,Data value,Message,Failed rows,First row
1,aecium,unallowed value aecium,34,1290313
2,remains_old_fruiting_body,unallowed value remains_old_fruiting_body,5,1163367
3,spermogonium,unallowed value spermogonium,2,1290369
4,telium,unallowed value telium,1,1290308
5,uredinium,unallowed value uredinium,1,1290365

#,Data value,Message,Failed rows,First row
1,translocation,unallowed value translocation,4,367929
2,specimen collected | DNA barcoding,unallowed value specimen collected | DNA barcoding,3,1165289
