In [1]:
import csv
import numpy as np
import pandas as pd
# Let displayed DataFrame cells show up to 80 characters before pandas truncates them.
pd.options.display.max_colwidth = 80

In [2]:
from dwcavalidator.collect import DwcaScreening

# Dataset to screen:

In [3]:
# Read every column as a plain Python object (dtype=object) so pandas does not
# infer numeric or date types and the values are shown as written in the file.
dataset_draft = pd.read_csv('./example_draft_presentation.tsv', sep='\t', dtype=object)
# Number the records starting at 1. The length is derived from the data instead
# of being hard-coded to five rows, so the cell keeps working if the file grows
# or shrinks (a fixed-length index raises a length-mismatch error otherwise).
dataset_draft.index = np.arange(1, len(dataset_draft) + 1)

In [4]:
# Peek at the raw file: print its first two lines (the header and the first record).
!head -n 2 example_draft_presentation.tsv

id	type	language	license	rightsHolder	accessRights	datasetID	institutionCode	datasetName	ownerInstitutionCode	basisOfRecord	informationWithheld	dataGeneralizations	occurrenceID	recordedBy	individualCount	sex	lifeStage	associatedReferences	samplingProtocol	samplingEffort	eventDate	verbatimEventDate	continent	countryCode	stateProvince	municipality	verbatimLocality	verbatimCoordinates	verbatimCoordinateSystem	verbatimSRS	decimalLatitude	decimalLongitude	geodeticDatum	coordinateUncertaintyInMeters	georeferenceRemarks	identificationVerificationStatus	scientificName	kingdom	phylum	class	order	taxonRank	scientificNameAuthorship	vernacularName	nomenclaturalCode
INBO:VLINDERS:00000637	Event	en	http://creativecommons.org/publicdomain/zero/1.0/	INBO	http://www.inbo.be/en/norms-for-data-use	http://doi.org/10.15468/njgbmh	INBO	Vlinderdatabank - Butterflies in Flanders and the Brussels Capital Region, Belgium	INBO	HumanObservation	see metadata	coordinates are generalized to centroid a 5x5km UTM gri

In [16]:
# Restrict the view to the six columns discussed in this walkthrough.
dataset_draft.loc[:, ["occurrenceID", "basisOfRecord", "eventDate",
                      "decimalLatitude", "decimalLongitude", "recordedBy"]]

Unnamed: 0,occurrenceID,basisOfRecord,eventDate,decimalLatitude,decimalLongitude,recordedBy
1,INBO:VLINDERS:00000637,HumanObservation,1984-07-24/2000-01-03,51.15889,3.10725,observerID:JNHRLP | observerID:JNHRLCTGH
2,INBO:VLINDERS:00000651,HumanObservation,07241981,51.02361,3.32081,observerID:JNHRLP ; observerID:JNHRLC
3,INBO:VLINDERS:00000717,Human Observation,1985-04-19,50.88093,4.38597,observerID:JNHRLP | observerID:JNHRLC
4,INBO:VLINDERS:00000786,HumanObservation,1985-09-12,58.11234,58.0,observerID:DRCKPL
5,,HumanObservation,1987-04-15,51.05594,4.74792,observerID:925


In [6]:
# Print the first 30 lines of the YAML specification file that is passed to
# DwcaScreening in the screening cells of this notebook.
!head -n 30 dwc-occurrence.yaml

occurrenceID:
 empty: False # Every record should have an occurrenceID.

basisOfRecord:
 allowed: [HumanObservation, PreservedSpecimen, Occurrence]

eventDate:
 dateformat: ['%Y-%m-%d', '%Y-%m', '%Y'] # The ISO8601 format, but no ranges
 mindate: 1830-01-01
 maxdate: 2014-12-31

decimalLatitude:
 type: float
 numberformat: '.5'
 # Coordinates are within Flanders, Belgium
 min: 50.68
 max: 51.51

decimalLongitude:
 type: float
 numberformat: '.5'
 # Coordinates are within Flanders, Belgium
 min: 2.54
 max: 5.92

recordedBy:
 delimitedvalues:
   delimiter: " | " # Observers are delimited with space pipe space
   regex: 'observerID:.+'
   maxlength: 17 # ID contains 6 characters (11 for observerID: + 6 for ID itself)


In [7]:
# Build a screener from the YAML specification ...
validate_my_dataset = DwcaScreening(
    './dwc-occurrence.yaml',
    lowercase_terms=False,
    unknown_fields=True,
)
# ... and run it against the tab-delimited draft file.
validate_my_dataset.screen_dwc(
    './example_draft_presentation.tsv',
    delimiter='\t',
    maxentries=50,
)

Dataset does not comply the specifications, check errors for a more detailed information.


In [14]:
# Fetch the error table from the screening run and blank out missing entries so
# that only the actual error messages stand out in the display.
# fillna returns a new frame, so the table handed back by export_table() is not
# modified in place, and it avoids boolean-mask assignment
# (`df[df.isnull()] = ''`), which pandas can reject on mixed-dtype frames.
my_errors = validate_my_dataset.export_table().fillna('')
my_errors[["occurrenceID", "basisOfRecord", "eventDate",
           "decimalLatitude", "decimalLongitude", "recordedBy"]]

Unnamed: 0,occurrenceID,basisOfRecord,eventDate,decimalLatitude,decimalLongitude,recordedBy
1,,,"String format not compliant with %Y-%m-%d, %Y-%m, %Y",,,{1: ['max length is 17']}
2,,,"String format not compliant with %Y-%m-%d, %Y-%m, %Y",,,{0: ['max length is 17']}
3,,unallowed value Human Observation,,,,
4,,,,max value is 51.51,numberformat of value 58.0 not in agreement with .5,
5,empty values not allowed,,,,,


# After correction...

In [9]:
# Load the corrected, tab-separated version of the dataset.
# NOTE(review): unlike the draft load, no dtype=object is passed here, so pandas
# infers column types and numeric values may display differently from how they
# are written in the file - confirm this is intended.
vlinders_subset = pd.read_csv(
    './example_corrected_presentation.tsv',
    sep='\t',
)

In [18]:
# Restrict the view to the six columns discussed in this walkthrough.
vlinders_subset.loc[:, ["occurrenceID", "basisOfRecord", "eventDate",
                        "decimalLatitude", "decimalLongitude", "recordedBy"]]

Unnamed: 0,occurrenceID,basisOfRecord,eventDate,decimalLatitude,decimalLongitude,recordedBy
0,INBO:VLINDERS:00000637,HumanObservation,1984-07-24,51.15889,3.10725,observerID:JNHRLP | observerID:JNHRLC
1,INBO:VLINDERS:00000651,HumanObservation,1981-07-24,51.02361,3.32081,observerID:JNHRLP | observerID:JNHRLC
2,INBO:VLINDERS:00000717,HumanObservation,1985-04-19,50.88093,4.38597,observerID:JNHRLP | observerID:JNHRLC
3,INBO:VLINDERS:00000786,HumanObservation,1985-09-12,51.11,3.96424,observerID:DRCKPL
4,INBO:VLINDERS:00001041,HumanObservation,1987-04-15,51.05594,4.74792,observerID:925


In [11]:
# Screen the corrected file with a fresh screener built from the same
# specification and settings as the draft run.
vlinders = DwcaScreening(
    './dwc-occurrence.yaml',
    lowercase_terms=False,
    unknown_fields=True,
)
vlinders.screen_dwc(
    './example_corrected_presentation.tsv',
    delimiter='\t',
    maxentries=50,
)

Hooray, your data set is according to the guidelines!
