# Verify mapping

In [51]:
import pandas as pd
import numpy as np
import yaml
from pywhip import whip_csv
from IPython.display import HTML, display_html

## Read data

In [52]:
# Load the tab-separated event file; dtype=object disables type inference
# so every column is kept as object dtype.
event = pd.read_csv(
    "../data/event.txt",
    sep="\t",
    dtype=object,
)

In [53]:
# Load the tab-separated occurrence file; dtype=object disables type
# inference so every column is kept as object dtype.
occ = pd.read_csv(
    "../data/occurrence.txt",
    sep="\t",
    dtype=object,
)

In [54]:
# Load the tab-separated measurement-or-fact file; dtype=object disables
# type inference so every column is kept as object dtype.
mof = pd.read_csv(
    "../data/measurementorfact.txt",
    sep="\t",
    dtype=object,
)

## Some stats

Number of records:

In [55]:
# Number of rows in the event table.
event.shape[0]

39

In [56]:
# Number of rows in the occurrence table.
occ.shape[0]

466

In [57]:
# Number of rows in the measurement-or-fact table.
mof.shape[0]

1398

In [58]:
# Earliest event date. The column was read with dtype=object, so this is a
# string comparison rather than a datetime comparison.
event.loc[:, 'eventDate'].min()

'2018-06-10'

In [59]:
# Latest event date. The column was read with dtype=object, so this is a
# string comparison rather than a datetime comparison.
event.loc[:, 'eventDate'].max()

'2018-07-15'

In [61]:
# Distinct scientific names, in order of first appearance.
pd.unique(occ['scientificName'])

array(['Apatura iris', 'Vanessa atalanta', 'Gonepteryx rhamni',
       'Polygonia c-album', 'Maniola jurtina', 'Pieris brassicae',
       'Ochlodes sylvanus', 'Satyrium ilicis', 'Pararge aegeria',
       'Celastrina argiolus', 'Favonius quercus', 'Aglais io',
       'Papilio machaon', 'Argynnis paphia', 'Limenitis camilla',
       'Vanessa cardui', 'Pieris rapae', 'Pieris napi',
       'Coenonympha pamphilus', 'Issoria lathonia', 'Lycaena phlaeas',
       'Pyronia tithonus', 'Araschnia levana', 'Aphantopus hyperantus',
       'Polyommatus icarus', 'Plebejus argus', 'Aricia agestis',
       'Satyrium w-album'], dtype=object)

In [62]:
# Count occurrence IDs per (scientificName, taxonRank, vernacularName)
# combination and flatten the group keys back into regular columns.
(
    occ
    .groupby(['scientificName', 'taxonRank', 'vernacularName'])['occurrenceID']
    .count()
    .reset_index()
)

Unnamed: 0,scientificName,taxonRank,vernacularName,occurrenceID
0,Aglais io,species,Dagpauwoog,14
1,Apatura iris,species,Grote weerschijnvlinder,43
2,Aphantopus hyperantus,species,Koevinkje,6
3,Araschnia levana,species,Landkaartje,9
4,Argynnis paphia,species,Keizersmantel,12
5,Aricia agestis,species,Bruin blauwtje,2
6,Celastrina argiolus,species,Boomblauwtje,29
7,Coenonympha pamphilus,species,Hooibeestje,11
8,Favonius quercus,species,Eikenpage,19
9,Gonepteryx rhamni,species,Citroenvlinder,29


## Verify data

### Relationships between files

In [63]:
# Left-join each extension table onto the event table. No `on=` is given,
# so pandas joins on every column name the two frames have in common.
# NOTE(review): presumably the only shared column is the event identifier;
# confirm against the files.
occ_event = occ.merge(event, how="left")
mof_event = mof.merge(event, how="left")

Number of records that have empty values after merging with event. Should be 0 for all.

In [64]:
# Occurrence rows whose `type` is empty after the left merge.
# NOTE(review): assumes `type` is supplied by the event file and is never
# empty there, so a null marks an unmatched event; confirm.
occ_event.loc[occ_event["type"].isna(), "occurrenceID"].count()

135

In [65]:
# Measurement rows whose `type` is empty after the left merge.
# NOTE(review): assumes `type` is supplied by the event file and is never
# empty there, so a null marks an unmatched event; confirm.
mof_event.loc[mof_event["type"].isna(), "measurementType"].count()

405

### Unique IDs

Number of records with duplicate IDs. Should be 0 for all.

In [66]:
# Count event rows whose eventID occurs more than once. keep=False flags
# every member of a duplicate group, not just the repeats. Sorting before
# counting has no effect on the result, so the sort is omitted.
event.loc[event["eventID"].duplicated(keep=False), "eventID"].count()

0

In [67]:
# Count occurrence rows whose occurrenceID occurs more than once.
# keep=False flags every member of a duplicate group, not just the repeats.
# Sorting before counting has no effect on the result, so the sort is omitted.
occ.loc[occ["occurrenceID"].duplicated(keep=False), "occurrenceID"].count()

0

## Whip data

### Event

In [69]:
# Read the whip specification for the event file. The context manager
# closes the file handle once the text has been read.
with open('../datasets/meetnetten-butterflies-area-occurrences/specification/dwc-event.yaml') as spec_stream:
    event_spec_file = spec_stream.read()

# yaml.load() without an explicit Loader is deprecated, can construct
# arbitrary Python objects, and raises TypeError on PyYAML >= 6.
# safe_load builds only plain Python types.
# NOTE(review): presumably the specification uses only plain YAML
# (mappings, lists, scalars); confirm.
event_spec = yaml.safe_load(event_spec_file)

  


In [70]:
# Validate the tab-separated event file against the event specification.
event_whipped = whip_csv(
    "../data/event.txt",
    event_spec,
    delimiter="\t",
)

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [71]:
# Render the HTML validation report for the event file, passing the
# `isolated` flag through the display metadata.
display_html(
    HTML(event_whipped.get_report('html')),
    metadata={"isolated": True},
)

#,Data value,Message,Failed rows,First row
1,data are generalized from POLYGON to a UTM 1Km grid,unallowed value data are generalized from POLYGON to a UTM 1Km grid,20,1
2,data are generalized from POLYGON to a UTM 5Km grid,unallowed value data are generalized from POLYGON to a UTM 5Km grid,19,2

#,Data value,Message,Failed rows,First row
1,"Meetnetten - Site count for butterflies in Flanders, Belgium","unallowed value Meetnetten - Site count for butterflies in Flanders, Belgium",39,1

#,Data value,Message,Failed rows,First row
1,High resolution data available on request,unallowed value High resolution data available on request,39,1

#,Data value,Message,Failed rows,First row
1,butterfly site count,unallowed value butterfly site count,39,1


### Occurrence

In [72]:
# Read the whip specification for the occurrence file. The context manager
# closes the file handle once the text has been read.
with open('../datasets/meetnetten-butterflies-area-occurrences/specification/dwc-occurrence.yaml') as spec_stream:
    occ_spec_file = spec_stream.read()

# yaml.load() without an explicit Loader is deprecated, can construct
# arbitrary Python objects, and raises TypeError on PyYAML >= 6.
# safe_load builds only plain Python types.
# NOTE(review): presumably the specification uses only plain YAML
# (mappings, lists, scalars); confirm.
occ_spec = yaml.safe_load(occ_spec_file)

  


In [73]:
# Validate the tab-separated occurrence file against the occurrence
# specification.
occ_whipped = whip_csv(
    "../data/occurrence.txt",
    occ_spec,
    delimiter="\t",
)

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [74]:
# Render the HTML validation report for the occurrence file, passing the
# `isolated` flag through the display metadata.
display_html(
    HTML(occ_whipped.get_report('html')),
    metadata={"isolated": True},
)

#,Data value,Message,Failed rows,First row
1,INBO:MEETNETTEN:OCC:0622250,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,1
2,INBO:MEETNETTEN:OCC:0622356,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,2
3,INBO:MEETNETTEN:OCC:0622357,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,3
4,INBO:MEETNETTEN:OCC:0622358,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,4
5,INBO:MEETNETTEN:OCC:0622359,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,5
6,INBO:MEETNETTEN:OCC:0622360,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,6
7,INBO:MEETNETTEN:OCC:0622361,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,7
8,INBO:MEETNETTEN:OCC:0622362,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,8
9,INBO:MEETNETTEN:OCC:0622352,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,9
10,INBO:MEETNETTEN:OCC:0623010,value does not match regex 'INBO:MEETNET:OCC:\d{7}',1,10


### Measurement or fact

In [75]:
# Read the whip specification for the measurement-or-fact file. The context
# manager closes the file handle once the text has been read.
with open('../datasets/meetnetten-butterflies-area-occurrences/specification/dwc-mof.yaml') as spec_stream:
    mof_spec_file = spec_stream.read()

# yaml.load() without an explicit Loader is deprecated, can construct
# arbitrary Python objects, and raises TypeError on PyYAML >= 6.
# safe_load builds only plain Python types.
# NOTE(review): presumably the specification uses only plain YAML
# (mappings, lists, scalars); confirm.
mof_spec = yaml.safe_load(mof_spec_file)

  


In [76]:
# Validate the tab-separated measurement-or-fact file against its
# specification.
mof_whipped = whip_csv(
    "../data/measurementorfact.txt",
    mof_spec,
    delimiter="\t",
)

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [77]:
# Render the HTML validation report for the measurement-or-fact file,
# passing the `isolated` flag through the display metadata.
display_html(
    HTML(mof_whipped.get_report('html')),
    metadata={"isolated": True},
)

#,Data value,Message,Failed rows,First row
1,wind-force,unallowed value wind-force,466,1

#,Data value,Message,Failed rows,First row
1,cloudiness,unallowed value cloudiness,466,3

#,Data value,Message,Failed rows,First row
1,clear sky (0/8),unallowed value clear sky (0/8),254,6
2,partially cloudy (1 to 2/8,unallowed value partially cloudy (1 to 2/8,111,3
3,half cloudy (3 to 5/8),unallowed value half cloudy (3 to 5/8),49,15
4,unknown,unallowed value unknown,40,37
5,heavy clouded (6 to 7/8),unallowed value heavy clouded (6 to 7/8),2,12
