# Verify mapping

In [1]:
import pandas as pd
import numpy as np
import yaml
from pywhip import whip_csv
from IPython.display import HTML, display_html

## Read data

In [2]:
event = pd.read_csv("../data/event.txt", delimiter="\t", dtype=object)

In [3]:
occ = pd.read_csv("../data/occurrence.txt", delimiter="\t", dtype=object)

In [4]:
mof = pd.read_csv("../data/measurementorfact.txt", delimiter="\t", dtype=object)

## Some stats

Number of records:

In [5]:
len(event)

55

In [6]:
len(occ)

466

In [7]:
len(mof)

1398

In [8]:
event["eventDate"].min()

'2018-06-09'

In [9]:
event["eventDate"].max()

'2018-07-17'

In [10]:
occ["scientificName"].unique()

array(['Apatura iris', 'Vanessa atalanta', 'Gonepteryx rhamni',
       'Polygonia c-album', 'Satyrium ilicis', 'Maniola jurtina',
       'Celastrina argiolus', 'Limenitis camilla',
       'Coenonympha pamphilus', 'Issoria lathonia', 'Pieris brassicae',
       'Ochlodes sylvanus', 'Aglais io', 'Papilio machaon',
       'Pararge aegeria', 'Argynnis paphia', 'Pieris rapae',
       'Pieris napi', 'Favonius quercus', 'Vanessa cardui',
       'Lycaena phlaeas', 'Polyommatus icarus', 'Plebejus argus',
       'Araschnia levana', 'Aphantopus hyperantus', 'Pyronia tithonus',
       'Aricia agestis', 'Satyrium w-album'], dtype=object)

In [11]:
occ.groupby(["scientificName","taxonRank","vernacularName"])["occurrenceID"].count().reset_index()

Unnamed: 0,scientificName,taxonRank,vernacularName,occurrenceID
0,Aglais io,species,Dagpauwoog,14
1,Apatura iris,species,Grote weerschijnvlinder,43
2,Aphantopus hyperantus,species,Koevinkje,6
3,Araschnia levana,species,Landkaartje,9
4,Argynnis paphia,species,Keizersmantel,12
5,Aricia agestis,species,Bruin blauwtje,2
6,Celastrina argiolus,species,Boomblauwtje,29
7,Coenonympha pamphilus,species,Hooibeestje,11
8,Favonius quercus,species,Eikenpage,19
9,Gonepteryx rhamni,species,Citroenvlinder,29


## Verify data

### Relationships between files

In [12]:
occ_event = pd.merge(occ, event, how = "left")
mof_event = pd.merge(mof, event, how = "left")

Number of records with that have empty values when merging with event. Should be 0 for all.

In [13]:
occ_event[occ_event["type"].isnull()]["id"].unique()

array([], dtype=object)

In [14]:
mof_event[mof_event["type"].isnull()]["id"].unique()

array([], dtype=object)

### Unique IDs

Number of records with a duplicate ids. Should be 0 for all.

In [15]:
event[event["eventID"].duplicated(keep=False)]["eventID"].sort_values().count()

0

In [16]:
occ[occ["occurrenceID"].duplicated(keep=False)]["occurrenceID"].sort_values().count()

0

## Whip data

### Event

In [17]:
event_spec_file = open("../datasets/meetnetten-butterflies-occurrences/specification/dwc-event.yaml").read()
event_spec = yaml.load(event_spec_file)

  


In [18]:
event_whipped = whip_csv("../data/event.txt", event_spec, delimiter="\t")

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [19]:
# display(HTML(event_whipped.get_report("html")))
f = open("../reports/event.html", "w")
f.write(event_whipped.get_report("html"))

69663

### Occurrence

In [20]:
occ_spec_file = open("../datasets/meetnetten-butterflies-occurrences/specification/dwc-occurrence.yaml").read()
occ_spec = yaml.load(occ_spec_file)

  


In [21]:
occ_whipped = whip_csv("../data/occurrence.txt", occ_spec, delimiter="\t")

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [22]:
# display(HTML(occ_whipped.get_report("html")))
f = open("../reports/occurrence.html", "w")
f.write(occ_whipped.get_report("html"))

50065

### Measurement or fact

In [23]:
mof_spec_file = open("../datasets/meetnetten-butterflies-occurrences/specification/dwc-mof.yaml").read()
mof_spec = yaml.load(mof_spec_file)

  


In [24]:
mof_whipped = whip_csv("../data/measurementorfact.txt", mof_spec, delimiter="\t")

Dataset does not comply the specifications, check reportsfor a more detailed information.


In [25]:
# display(HTML(mof_whipped.get_report("html")))
f = open("../reports/mof.html", "w")
f.write(mof_whipped.get_report("html"))

27591