# Verify mapping

In [1]:
import pandas as pd
import numpy as np
import yaml
from pywhip import whip_csv
from IPython.display import HTML, display_html

## Read data

In [2]:
# Name of this dataset's folder under ../datasets/; the whip cells further down
# use it to build the paths to the specification YAML files.
dataset_dir = "34 meetnetten-rugstreeppad-zichtwaarneming-occurrences"

In [3]:
# Load the event file as tab-separated text, keeping every column as raw
# objects (no type inference) so values are checked exactly as written.
event = pd.read_csv(
    "../data/event.txt",
    sep="\t",
    dtype=object,
)

In [4]:
# Load the occurrence file as tab-separated text, keeping every column as raw
# objects (no type inference) so values are checked exactly as written.
occ = pd.read_csv(
    "../data/occurrence.txt",
    sep="\t",
    dtype=object,
)

In [5]:
# mof = pd.read_csv("../data/measurementorfact.txt", delimiter="\t", dtype=object)

## Some stats

Number of records:

In [6]:
# Row count of the event table.
event.shape[0]

19

In [7]:
# Row count of the occurrence table.
occ.shape[0]

117

In [8]:
# len(mof)

In [9]:
# Smallest eventDate. The column was read with dtype=object, so this is a
# string comparison rather than a date comparison.
event.loc[:, "eventDate"].min()

'2018-05-30'

In [10]:
# Largest eventDate. The column was read with dtype=object, so this is a
# string comparison rather than a date comparison.
event.loc[:, "eventDate"].max()

'2018-06-23'

In [11]:
# Distinct scientific names present in the occurrence table, in order of first appearance.
pd.unique(occ["scientificName"])

array(['Bufo calamita', 'Bufo bufo', 'Rana temporaria',
       'Pelophylax ridibundus'], dtype=object)

In [12]:
# Count occurrence IDs per (scientificName, taxonRank, vernacularName)
# combination and flatten the group keys back into ordinary columns.
(
    occ
    .groupby(["scientificName", "taxonRank", "vernacularName"])["occurrenceID"]
    .count()
    .reset_index()
)

Unnamed: 0,scientificName,taxonRank,vernacularName,occurrenceID
0,Bufo bufo,species,Gewone pad,1
1,Bufo calamita,species,Rugstreeppad,114
2,Pelophylax ridibundus,species,Meerkikker,1
3,Rana temporaria,species,Bruine kikker,1


## Verify data

### Relationships between files

In [13]:
# Left-join every occurrence row onto the event table. No `on` is given, so
# pandas joins on every column name the two frames have in common.
occ_event = occ.merge(event, how="left")
# mof_event = pd.merge(mof, event, how = "left")

Number of records that have empty values after merging with event. Should be 0 for all.

In [14]:
# Distinct ids of merged rows whose `type` is empty (presumably `type` is
# supplied by the event file, so an empty value means no event matched — confirm).
occ_event.loc[occ_event["type"].isna(), "id"].unique()

array([], dtype=object)

In [15]:
# mof_event[mof_event["type"].isnull()]["id"].unique()

### Unique IDs

Number of records with duplicate IDs. Should be 0 for all.

In [16]:
# Number of event rows whose eventID occurs more than once
# (keep=False flags every member of a duplicated group, not just the repeats).
event.loc[event["eventID"].duplicated(keep=False), "eventID"].count()

0

In [17]:
# Number of occurrence rows whose occurrenceID occurs more than once
# (keep=False flags every member of a duplicated group, not just the repeats).
occ.loc[occ["occurrenceID"].duplicated(keep=False), "occurrenceID"].count()

0

## Whip data

### Event

In [18]:
# Read the whip specification for the event file and parse it into Python objects.
# The context manager closes the file handle (the bare open().read() never did),
# and safe_load replaces yaml.load(), which without an explicit Loader is
# deprecated since PyYAML 5.1 and raises a TypeError in PyYAML 6.
with open("../datasets/" + dataset_dir + "/specification/dwc-event.yaml") as spec_handle:
    event_spec_file = spec_handle.read()  # still the raw YAML text, as before
event_spec = yaml.safe_load(event_spec_file)

  


In [19]:
# Run pywhip on the tab-delimited event file using the parsed event
# specification; the returned object provides the report rendered in the next cell.
event_whipped = whip_csv("../data/event.txt", event_spec, delimiter="\t")

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [20]:
# Render the HTML whip report for the event file; the `isolated` metadata flag
# is passed along with the output.
display_html(
    HTML(event_whipped.get_report("html")),
    metadata={"isolated": True},
)

#,Data value,Message,Failed rows,First row
1,"Meetnetten.be - Rugstreeppad in Flanders, Belgium","unallowed value Meetnetten.be - Rugstreeppad in Flanders, Belgium",19,1


### Occurrence

In [21]:
# Read the whip specification for the occurrence file and parse it into Python objects.
# The context manager closes the file handle (the bare open().read() never did),
# and safe_load replaces yaml.load(), which without an explicit Loader is
# deprecated since PyYAML 5.1 and raises a TypeError in PyYAML 6.
with open("../datasets/" + dataset_dir + "/specification/dwc-occurrence.yaml") as spec_handle:
    occ_spec_file = spec_handle.read()  # still the raw YAML text, as before
occ_spec = yaml.safe_load(occ_spec_file)

  


In [22]:
# Run pywhip on the tab-delimited occurrence file using the parsed occurrence
# specification; the returned object provides the report rendered in the next cell.
occ_whipped = whip_csv("../data/occurrence.txt", occ_spec, delimiter="\t")

Hooray, your data set is according to the guidelines!


In [23]:
# Render the HTML whip report for the occurrence file; the `isolated` metadata
# flag is passed along with the output.
display_html(
    HTML(occ_whipped.get_report("html")),
    metadata={"isolated": True},
)

### Measurement or fact

In [24]:
# Read the whip specification for the measurement-or-fact file and parse it into Python objects.
# The context manager closes the file handle (the bare open().read() never did),
# and safe_load replaces yaml.load(), which without an explicit Loader is
# deprecated since PyYAML 5.1 and raises a TypeError in PyYAML 6.
with open("../datasets/" + dataset_dir + "/specification/dwc-mof.yaml") as spec_handle:
    mof_spec_file = spec_handle.read()  # still the raw YAML text, as before
mof_spec = yaml.safe_load(mof_spec_file)

  


In [None]:
# Run pywhip on the tab-delimited measurement-or-fact file using the parsed mof specification.
# NOTE(review): this cell has no execution count and the other measurement-or-fact
# steps in this notebook are commented out — confirm that
# ../data/measurementorfact.txt exists for this dataset before running.
mof_whipped = whip_csv("../data/measurementorfact.txt", mof_spec, delimiter="\t")

In [None]:
# Render the HTML whip report for the measurement-or-fact file; the `isolated`
# metadata flag is passed along with the output.
display_html(
    HTML(mof_whipped.get_report("html")),
    metadata={"isolated": True},
)