# Verify mapping: planten-exoten

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../../../src/')
import support_functions as support
import yaml
from pywhip import whip_csv
from IPython.display import HTML, display_html

## Process data

1. Go to https://ipt.inbo.be/resource?r=planten-exoten-natuurpunt-occurrences
2. Download DwC-A
3. Unzip and place `occurrence.txt` in `data/raw/`

Read data:

In [2]:
# Read the tab-separated occurrence file; dtype=object disables numeric type
# inference so every column is loaded with object dtype.
data = pd.read_csv(
    '../data/raw/occurrence.txt',
    sep='\t',
    dtype=object,
)

## Verify data

Steps that cannot be done with whip.

### occurrenceID duplicates

In [3]:
# Number of records with a duplicate occurrenceID. Should be 0.
# The duplicate mask is summed directly (instead of counting the non-null IDs
# of the flagged rows) so that rows sharing a missing occurrenceID are reported
# too; no sorting is needed to obtain a count.
int(data['occurrenceID'].duplicated(keep=False).sum())

0

### scientificName

In [4]:
# Number of occurrences per distinct taxon (taxonID, name, rank, vernacular name).
# dropna=False (pandas >= 1.1) keeps taxa for which one of the grouping columns
# is empty, e.g. a taxon without a vernacularName; by default groupby drops
# those rows and the taxon would be missing from the resulting list.
names = (
    data
    .groupby(
        ['taxonID', 'scientificName', 'taxonRank', 'vernacularName'],
        dropna=False,
    )['occurrenceID']
    .count()
    .reset_index()
)

Save unique scientific names to file:

In [5]:
# Write only the taxon columns (leaving out the occurrence count), without the
# row index.
names.to_csv(
    '../data/interim/species.csv',
    columns=['taxonID', 'scientificName', 'taxonRank', 'vernacularName'],
    index=False,
)

## Some metadata stats

### Basic metadata

Number of records:

In [6]:
# Row count of the occurrence table.
data.shape[0]

942751

### Taxonomic coverage

In [7]:
# Number of distinct scientific names per taxon rank.
(
    data
    .groupby('taxonRank')['scientificName']
    .nunique()
    .reset_index()
)

Unnamed: 0,taxonRank,scientificName
0,cultivar,48
1,forma,4
2,hybrid,163
3,multispecies,321
4,species,1742
5,subspecies,150
6,variety,52


### Temporal coverage

In [8]:
# Earliest eventDate. The column holds text, so this is the smallest string in
# lexicographic order.
data.loc[:, 'eventDate'].min()

'1862-06-30'

In [9]:
# Latest eventDate. The column holds text, so this is the largest string in
# lexicographic order.
data.loc[:, 'eventDate'].max()

'2023-11-19'

### Sampling methods

In [10]:
# Number of records (non-null occurrenceID values) per verification status.
# occurrenceID is selected before counting so that only this one column is
# aggregated, rather than counting every column of the table and discarding
# all but one afterwards.
(
    data
    .groupby('identificationVerificationStatus')['occurrenceID']
    .count()
    .reset_index()
)

Unnamed: 0,identificationVerificationStatus,occurrenceID
0,approved on expert judgement,123578
1,approved on knowledge rules,310524
2,approved on photographic evidence,342434
3,unverified,166215


## Whip data

In [11]:
# Read the whip specification text inside a context manager so the file handle
# is closed as soon as the content has been read, then parse it as YAML.
with open('../specification/dwc-occurrence.yaml') as spec_handle:
    spec_file = spec_handle.read()
specifications = yaml.safe_load(spec_file)

In [12]:
# Check the raw tab-separated occurrence file against the loaded specifications.
whipped = whip_csv(
    "../data/raw/occurrence.txt",
    specifications,
    delimiter='\t',
)

Your dataset does not comply with the specifications, use get_report() for more detailed information.


Show report:

In [13]:
# Render the whip report as HTML, passing isolated=True as display metadata.
report_html = whipped.get_report('html')
display_html(HTML(report_html), metadata={'isolated': True})

#,Data value,Message,Failed rows,First row
1,2147483647,max value is 100000000,1,317272
