# Verify mapping

In [1]:
import pandas as pd
import numpy as np
import yaml
from pywhip import whip_csv
from IPython.display import HTML, display_html

## Read data

In [2]:
# Event core file: tab-separated text; dtype=object switches off per-column type inference.
event = pd.read_csv("../data/event.txt", sep="\t", dtype=object)

In [3]:
# Occurrence file: tab-separated text; dtype=object switches off per-column type inference.
occ = pd.read_csv("../data/occurrence.txt", sep="\t", dtype=object)

In [4]:
# Measurement-or-fact file: tab-separated text; dtype=object switches off per-column type inference.
mof = pd.read_csv("../data/measurementorfact.txt", sep="\t", dtype=object)

## Some stats

Number of records:

In [5]:
# Row count of the event table.
event.shape[0]

9816

In [6]:
# Row count of the occurrence table.
occ.shape[0]

24124

In [7]:
# Row count of the measurement-or-fact table.
mof.shape[0]

72372

In [8]:
# eventDate was loaded as text (dtype=object), so this is a string minimum; it equals the
# earliest date only while every value is ISO 8601 (YYYY-MM-DD) — TODO confirm format.
event.loc[:, "eventDate"].min()

'2016-05-01'

In [9]:
# eventDate was loaded as text (dtype=object), so this is a string maximum; it equals the
# latest date only while every value is ISO 8601 (YYYY-MM-DD) — TODO confirm format.
event.loc[:, "eventDate"].max()

'2018-09-09'

In [10]:
# Distinct scientific names, in order of first appearance in the occurrence file.
occ.loc[:, "scientificName"].unique()

array(['Hipparchia semele', 'Pieris brassicae', 'Vanessa atalanta',
       'Pieris rapae', 'Lasiommata megera', 'Aglais io',
       'Pyronia tithonus', 'Hesperia comma', 'Cupido minimus',
       'Gonepteryx rhamni', 'Erynnis tages', 'Pararge aegeria',
       'Coenonympha pamphilus', 'Polyommatus icarus', 'Aricia agestis',
       'Melitaea cinxia', 'Pyrgus malvae', 'Maniola jurtina',
       'Lycaena phlaeas', 'Thymelicus lineola', 'Leptidea sinapis',
       'Cyaniris semiargus', 'Carcharodus alceae', 'Papilio machaon',
       'Polygonia c-album', 'Carterocephalus palaemon', 'Vanessa cardui',
       'Pieris napi', 'Araschnia levana', 'Aglais urticae',
       'Favonius quercus', 'Celastrina argiolus',
       'Anthocharis cardamines', 'Ochlodes sylvanus', 'Pieris spec.',
       'Aphantopus hyperantus', 'Colias croceus', 'Issoria lathonia',
       'Callophrys rubi', 'Plebejus argus', 'Thymelicus sylvestris',
       'Colias hyale'], dtype=object)

In [11]:
# Occurrence count per (scientificName, taxonRank, vernacularName) combination.
# count() tallies non-missing occurrenceID values within each group.
(
    occ
    .groupby(["scientificName", "taxonRank", "vernacularName"])["occurrenceID"]
    .count()
    .reset_index()
)

Unnamed: 0,scientificName,taxonRank,vernacularName,occurrenceID
0,Aglais io,species,Dagpauwoog,773
1,Aglais urticae,species,Kleine vos,123
2,Anthocharis cardamines,species,Oranjetipje,49
3,Aphantopus hyperantus,species,Koevinkje,68
4,Araschnia levana,species,Landkaartje,303
5,Aricia agestis,species,Bruin blauwtje,453
6,Callophrys rubi,species,Groentje,3
7,Carcharodus alceae,species,Kaasjeskruiddikkopje,54
8,Carterocephalus palaemon,species,Bont dikkopje,120
9,Celastrina argiolus,species,Boomblauwtje,342


## Verify data

### Relationships between files

In [12]:
# Left-join each extension table onto the event table. No `on=` is passed, so pandas
# joins on every column name the two frames have in common.
# NOTE(review): presumably the only shared column is `id` — confirm against the file headers.
occ_event = occ.merge(event, how="left")
mof_event = mof.merge(event, how="left")

Number of records that have empty values after merging with event. Should be 0 for all.

In [13]:
# ids of occurrence rows whose `type` is missing after the left join.
# `type` is presumably supplied by the event side, so a missing value means no event matched.
occ_event.loc[occ_event["type"].isna(), "id"].unique()

array([], dtype=object)

In [14]:
# ids of measurement-or-fact rows whose `type` is missing after the left join.
# `type` is presumably supplied by the event side, so a missing value means no event matched.
mof_event.loc[mof_event["type"].isna(), "id"].unique()

array([], dtype=object)

### Unique IDs

Number of records with duplicate IDs. Should be 0 for all.

In [15]:
# keep=False flags every member of a duplicated eventID group, not just the repeats.
# count() only tallies non-missing values, so missing eventIDs are not counted here.
(
    event.loc[event["eventID"].duplicated(keep=False), "eventID"]
    .sort_values()
    .count()
)

0

In [16]:
# keep=False flags every member of a duplicated occurrenceID group, not just the repeats.
# count() only tallies non-missing values, so missing occurrenceIDs are not counted here.
(
    occ.loc[occ["occurrenceID"].duplicated(keep=False), "occurrenceID"]
    .sort_values()
    .count()
)

0

## Whip data

### Event

In [17]:
# Read the whip specification for the event file. The context manager closes the
# file handle, which a bare open(...).read() leaves open until garbage collection.
with open("../datasets/meetnetten-butterflies-occurrences/specification/dwc-event.yaml") as spec_handle:
    event_spec_file = spec_handle.read()
# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and raises a
# TypeError in PyYAML 6; safe_load is the supported call for plain-data documents.
# Assumes the specification uses only standard YAML tags — TODO confirm.
event_spec = yaml.safe_load(event_spec_file)

  


In [18]:
# Check the tab-delimited event file against event_spec with pywhip; the returned
# object is what the HTML report is generated from.
event_whipped = whip_csv("../data/event.txt", event_spec, delimiter="\t")

Hooray, your data set is according to the guidelines!


In [19]:
# Render the HTML report of the event whip result inline in the notebook.
display(HTML(event_whipped.get_report("html")))

### Occurrence

In [20]:
# Read the whip specification for the occurrence file. The context manager closes the
# file handle, which a bare open(...).read() leaves open until garbage collection.
with open("../datasets/meetnetten-butterflies-occurrences/specification/dwc-occurrence.yaml") as spec_handle:
    occ_spec_file = spec_handle.read()
# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and raises a
# TypeError in PyYAML 6; safe_load is the supported call for plain-data documents.
# Assumes the specification uses only standard YAML tags — TODO confirm.
occ_spec = yaml.safe_load(occ_spec_file)

  


In [21]:
# Check the tab-delimited occurrence file against occ_spec with pywhip; the returned
# object is what the HTML report is generated from.
occ_whipped = whip_csv("../data/occurrence.txt", occ_spec, delimiter="\t")

Hooray, your data set is according to the guidelines!


In [22]:
# Render the HTML report of the occurrence whip result inline in the notebook.
display(HTML(occ_whipped.get_report("html")))

### Measurement or fact

In [23]:
# Read the whip specification for the measurement-or-fact file. The context manager closes
# the file handle, which a bare open(...).read() leaves open until garbage collection.
with open("../datasets/meetnetten-butterflies-occurrences/specification/dwc-mof.yaml") as spec_handle:
    mof_spec_file = spec_handle.read()
# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and raises a
# TypeError in PyYAML 6; safe_load is the supported call for plain-data documents.
# Assumes the specification uses only standard YAML tags — TODO confirm.
mof_spec = yaml.safe_load(mof_spec_file)

  


In [24]:
# Check the tab-delimited measurement-or-fact file against mof_spec with pywhip; the
# returned object is what the HTML report is generated from.
mof_whipped = whip_csv("../data/measurementorfact.txt", mof_spec, delimiter="\t")

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [25]:
# Render the HTML report of the measurement-or-fact whip result inline in the notebook.
display(HTML(mof_whipped.get_report("html")))

#,Data value,Message,Failed rows,First row
1,partially clouded,unallowed value partially clouded,244,2010
2,unclouded,unallowed value unclouded,174,1629
3,half clouded,unallowed value half clouded,60,840
4,heavily clouded,unallowed value heavily clouded,20,1119

#,Data value,Message,Failed rows,First row
1,windstil (0 Bft),unallowed value windstil (0 Bft),641,209

#,Data value,Message,Failed rows,First row
1,unknown,value 'unknown' is not numeric,1893,28

#,Data value,Message,Failed rows,First row
1,unknown,value 'unknown' is not numeric,1893,28
