# Purpose of this notebook

Show how we fetch data from the CVDR repository to be used to create our corresponding datasets

## Fetching

In [1]:
import collections
import datetime
import time
import random
import pprint

import wetsuite.helpers.notebook
import wetsuite.helpers.localdata
import wetsuite.datacollect.koop_repositories 
import wetsuite.helpers.date
import wetsuite.helpers.etree
import wetsuite.helpers.koop_parse

In [2]:
# store to put downloads into:
cvdr_fetched = wetsuite.helpers.localdata.LocalKV( 'cvdr_fetched.db', str, bytes )

# out of interest  (can take a few seconds once it's large, because get_num_items walks through everything)
cvdr_fetched.summary(get_num_items=True)

{'size_bytes': 56832757760,
 'size_readable': '57G',
 'num_items': 881258,
 'avgsize_bytes': 64490}

In [3]:
# no fancy queries, just date ranges
queries = []

if 0: # larger range in smaller chunks
    # For reference, there 20 to 250 things per day,
    # split many years into shorter spans, to do many fetches in smaller chunks
    for from_date, to_date in wetsuite.helpers.date.date_ranges( 
        datetime.date( 2000, 1, 1 ),
        datetime.date.today(),
        increment_days=50,
        strftime_format="%Y-%m-%d"
    ):
        # TODO: check whether there is a better field than modified
        queries.append( f'dcterms.modified>={from_date} and dcterms.modified<={to_date}' )

else: # last X days
    # ask for recent changes (we actually treat this as 
    #   "fetch documents that were mentioned", not as a "re-fetch things that were changed" )
    some_time_ago = datetime.date.today() - datetime.timedelta(days=40)
    queries.append( f'dcterms.modified>={some_time_ago.strftime("%Y-%m-%d")}' )

#queries = list(reversed(queries))
#random.shuffle(queries)
print( queries )

['dcterms.modified>=2023-12-24']


In [4]:
sru_cvdr = wetsuite.datacollect.koop_repositories.CVDR()

for query in queries:
    print( f'Search: {query}' )
    sru_cvdr.search_retrieve( query ) # purely for the number of records, itself only for the progress bar

    numrecs = sru_cvdr.num_records()
    pbar = wetsuite.helpers.notebook.progress_bar( numrecs, description='fetching' )

    count_cached, count_fetched, count_error = 0, 0, 0

    def cvdr_callback( record_node ):
        ''' Read search result records, pick out the URLs to fetch and fetch them. 
            Is a local function because we count per query, in a slightly weirdly scoped way '''
        #print( wetsuite.helpers.etree.debug_pretty( record_node ) ) # for later reference, if you want to extract more out of these search records
        global count_cached, count_fetched, count_error

        merged = wetsuite.helpers.koop_parse.cvdr_meta( record_node, flatten=True ) 
        # using flatten is a little creative for something that needs to be a precise value (see cvdr_meta's docstring) but in current use it is valid.
        #pprint.pprint( merged )

        for resource_name, resource_key in ( 
            ('XML', 'publicatieurl_xml'),
            ('HTML', 'publicatieurl_xhtml'),
        ):
            if resource_key not in merged:
                print('SKIP: no %r in %r'%(resource_key, merged))
            else:
                try:
                    _, came_from_cache = wetsuite.helpers.localdata.cached_fetch( cvdr_fetched, merged[ resource_key] ) # we currently care only about the XML it links to
                    if not came_from_cache:
                        count_fetched += 1
                        time.sleep( 1 )
                    else:
                        count_cached += 1
                # mainly expecting 404, 500
                except ValueError as e:
                    count_error += 1
                    print( "ERROR downloading %s: %s  for %r"%(resource_name, e, merged[resource_key]))

        pbar.value       += 1
        pbar.description  = f'{count_fetched} fetched, {count_cached} cached' # , {count_error} errors

    try:
        sru_cvdr.search_retrieve_many( query, at_a_time=500, up_to=50000, callback=cvdr_callback)
    except ValueError as e:
        count_error += 1
        print( "ERROR querying %s: %s"%(query, e) )

Search: dcterms.modified>=2023-12-24


fetching:   0%|          | 0/10060 [00:00<?, ?it/s]

SKIP: no 'publicatieurl_xhtml' in {'organisatietype': 'Gemeente', 'publicatieurl_xml': 'https://repository.officiele-overheidspublicaties.nl/cvdr/CVDR696162/1/xml/CVDR696162_1.xml', 'preferred_url': 'https://lokaleregelgeving.overheid.nl/CVDR696162/1', 'identifier': 'CVDR696162_1', 'title': 'Omgevingsplan gemeente Groningen', 'language': 'nl', 'type': 'regeling (overheid:Informatietype)', 'creator': 'Groningen (overheid:Gemeente)', 'modified': '2024-01-01+01:00', 'isFormatOf': 'stcrt-2023-35432 (https://zoek.officielebekendmakingen.nl/stcrt-2023-35432.html),  stb-2023-113 (https://zoek.officielebekendmakingen.nl/stb-2023-113.html),  stb-2022-181 (https://zoek.officielebekendmakingen.nl/stb-2022-181.html),  stb-2022-172 (https://zoek.officielebekendmakingen.nl/stb-2022-172.html),  stb-2021-98 (https://zoek.officielebekendmakingen.nl/stb-2021-98.html),  stb-2020-557 (https://zoek.officielebekendmakingen.nl/stb-2020-557.html),  stb-2020-400 (https://zoek.officielebekendmakingen.nl/stb-202

## Creating dataset

We'll spare you the full contents of that store,
because it contains most versions of most things, 
is even more overcomplete than that because of past experiments,
and probably not something you want to fetch yourself for the sheer size of it.

Mostly for our own reference, it contains keys that are URLs like:
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/html/100078_1.html
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml

The values are the according files, as bytestrings.

Right now we care more about parseable data than readable pages,
so we focus on the XML (also in the parsing helper functions), 
but also extract HTML for those that prefer it.
We ignore anything else it might contain.

Also, it seems that KOOP search results expose some variation in the capitalisation, led to duplicate URLs in the above, e.g. 
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml
- https://repository.officiele-overheidspublicaties.nl/cvdr/100078/1/xml/100078_1.xml

...so we also ensure we pick just one.

In [5]:
# case insensitive choice
casededup_xml   = collections.defaultdict(list)  # lowercased version of URL -> actual URLs
casededup_html  = collections.defaultdict(list)  # lowercased version of URL -> actual URLs
ignore_list     = []

unique_xml_urls = []
unique_html_urls = []

for url in cvdr_fetched:
    if url.endswith('.xml'):
        casededup_xml[ url.lower() ].append( url )
    elif url.endswith('.xhtml'):
        casededup_html[ url.lower() ].append( url )
    else:
        ignore_list.append( url )

for lurl in list(casededup_xml):
     url_list = sorted( casededup_xml[lurl] ) #some consistency in which one we pick
     unique_xml_urls.append( url_list[0] )

for lurl in list(casededup_html):
     url_list = sorted( casededup_html[lurl] ) #some consistency in which one we pick
     unique_html_urls.append( url_list[0] )

print( len(cvdr_fetched), len( unique_xml_urls ), len( unique_html_urls ) )

882889 283379 0


In [18]:
# Group expressions by their work ID 
#   work_id -> [ (version_in_expression_id, expression_id, xml_url, html_url), ... ]
groups = collections.defaultdict(list)


for url in unique_xml_urls:
    # fish IDs out of the URL
    ids = url.rsplit('/',1)[1].rsplit('.',1)[0]    
    work_id, expression_id = wetsuite.helpers.koop_parse.cvdr_parse_identifier(ids)
    version_int = int( expression_id.split('_',1)[1], 10)  # made an integer, mainly for sorting
    #print (work_id, expression_id, url)
    #TODO:TOFINISH
    groups[work_id].append(  ( version_int, expression_id, url )  )

lasts_only = []    
for work_id in groups:
    expression_and_urls = sorted( groups[work_id], key=lambda x: x[0] )
    version_int, expression_id, url = expression_and_urls[-1] #right now we just pick the last revision
    #TODO: check the actual validity date.   Right now it will pick a few of the 'planned for the near future' ones
    lasts_only.append( url )

print( len( unique_xml_urls ), len(groups), len(lasts_only) )

283379 236527 236527


In [None]:
## This was some double checking 
#  that these not-quite-unique URLs they have identical content (they do)
#  and count how many of these duplicates are there
# 
# def nxml(bytedata): # reindent as a form of normalization
#     tree = wetsuite.helpers.etree.fromstring(bytedata)
#     tree = wetsuite.helpers.etree.indent( tree )
#     return wetsuite.helpers.etree.tostring( tree, encoding='UTF-8' ) 
# counts = collections.defaultdict(int)        # how many -> how often
# for lurl in list(dedupdata):
#     url_list = dedupdata[lurl]
#     xml_list = list( cvdr_fetched.get(url)   for url in url_list )
#     lxl = len(xml_list)
#     counts[lxl]+=1
#     if lxl > 1:
#         print( url_list )
#         norm_xml_list = list( nxml( cvdr_fetched.get(url) )    for url in url_list )
#         for i in range( 1, len(norm_xml_list) ):
#             is_same = norm_xml_list[0] == norm_xml_list[i]
#             #print("cmp(0,%d)==%s"%(i, is_same))
#             if not is_same:
#                 print('%s not the same as %s'%(url_list[0], url_list[i]))
#                 print(difflib.context_diff(norm_xml_list[0].splitlines(), norm_xml_list[i].splitlines()))
#pprint.pprint(counts)

Start a store that intends to contain just the most recent expression XML for each work.
(Maybe do the same for HTML?)

In [19]:
# takes minute or to just to write that much data  (order of a few GB)
cvdr_latestonly_xml = wetsuite.helpers.localdata.LocalKV( 'cvdr-latestonly-xml.db', str, bytes )

cvdr_latestonly_xml._put_meta('description_short',  'Raw XML for the latest expression within each CVDR work set')
cvdr_latestonly_xml._put_meta('description',''' ''')

#pbar = wetsuite.helpers.notebook.progress_bar( numrecs, description='fetching' )
for url in wetsuite.helpers.notebook.ProgressBar( lasts_only ):
    cvdr_latestonly_xml.put( url, cvdr_fetched.get( url ), commit=False )
cvdr_latestonly_xml.commit() # reducing that many commits is factor dozens speed difference (on SSD)
cvdr_latestonly_xml.close()

  0%|          | 0/236527 [00:00<?, ?it/s]

...and stores that contain the plain text, and the metadata, for the same latest expressions. 

These three stores should have exactly the same keys (unless maybe we forget to clean the lastest leftoves betwen rerunning this).

In [21]:
cvdr_latestonly_text = wetsuite.helpers.localdata.LocalKV( 'cvdr-latestonly-text.db', str, str )
cvdr_latestonly_text._put_meta('description_short','Flattened plain text for the latest expression within each CVDR work set') 
cvdr_latestonly_text._put_meta('description','') 

cvdr_latestonly_meta = wetsuite.helpers.localdata.MsgpackKV( 'cvdr-latestonly-meta-struc.db', str, None)
cvdr_latestonly_meta._put_meta('description_short','Metadata for the latest expression within each CVDR work set') 
cvdr_latestonly_meta._put_meta('description','') 

cvdr_latestonly_xml = wetsuite.helpers.localdata.LocalKV( 'cvdr_latestonly_xml.db', str, bytes, read_only=True )

for url, xml_bytes in wetsuite.helpers.notebook.ProgressBar( cvdr_latestonly_xml.items() ):
        tree = wetsuite.helpers.etree.fromstring( xml_bytes )
        
        meta = wetsuite.helpers.koop_parse.cvdr_meta(tree, flatten=True)
        #pprint.pprint(meta)
        cvdr_latestonly_meta.put(url, meta, commit=False)

        text = wetsuite.helpers.koop_parse.cvdr_text(tree)
        #print(repr(text))
        cvdr_latestonly_text.put(url, text, commit=False)
        
        #break
cvdr_latestonly_meta.commit()
cvdr_latestonly_text.commit()

  0%|          | 0/169421 [00:00<?, ?it/s]

In [24]:
# examples of the metadata
cvdr_latestonly_meta.random_sample(3)

[('https://repository.officiele-overheidspublicaties.nl/CVDR/144619/1/xml/144619_1.xml',
  {'identifier': '144619_1',
   'title': 'LANDSBESLUIT, HOUDENDE ALGEMENE MAATREGELEN, van de 17de februari 2006 ter uitvoering van artikel 12 van de Regeling Muntstelsel van de Nederlandse Antillen (P.B. 1989, no. 70)',
   'language': 'nl',
   'type': 'regeling (overheid:Informatietype)',
   'creator': 'Curaçao (overheid:Koninkrijksdeel)',
   'modified': '2009-09-23',
   'isRatifiedBy': 'Gouverneur van de Nederlandse Antillen (overheid:BestuursorgaanKoninkrijksdeel)',
   'isFormatOf': 'A.B. 2010, no. 86 en A.B. 2010, no. 87 ()',
   'issued': '2010-10-10',
   'alternative': 'Onbekend',
   'source': 'Regeling Muntstelsel van de Nederlandse Antillen ()',
   'subject': 'financiën en economie',
   'rights': 'De tekst in dit document is vrij van auteursrecht en databankrecht',
   'inwerkingtredingDatum': '2010-10-10',
   'betreft': 'bestendiging Antilliaanse regelgeving in Curaçao',
   'kenmerk': 'onbek

In [None]:
# TODO: See if there is anyything useful in the below that should go above


print( 'PARSING' )
for work_id in groups:

    print( '\n== %s =='%( work_id ) )
    wl = sorted( groups[work_id], key=lambda x:x[0])
    for expression_id, url in wl:
        print( '  %10s  %s'%( expression_id, url ) )

    # pick and parse the most recent (we might want previous versions later, though)
    expression_id, url = wl[-1] 

    bytestring,_,_ = wetsuite.datacollect.db.cached_fetch(url, dbonly=True, given_conn=conn)
    tree = wetsuite.helpers.etree.fromstring( bytestring )
    tree = wetsuite.helpers.etree.strip_namespace( tree )

    #work_id, expression_id = wetsuite.helpers.koop_parse.cvdr_parse_identifier( tree.find('meta/owmskern/identifier').text )

    meta = wetsuite.helpers.koop_parse.cvdr_meta(tree)


    indat  = meta.get('inwerkingtredingDatum')
    if indat is not None:
        indat = indat[0]['text']
    if indat is not None:
        #print(indat)
        indat = parse_date( indat )

    uitdat = meta.get('uitwerkingtredingDatum')
    if uitdat is not None: 
        uitdat = uitdat[0]['text']
    if uitdat is not None: 
        #print(uitdat)
        uitdat = parse_date( uitdat )

    ###  
    # collect things into a dict
    doc = {
        'xml_url':url, 
        'web_url':'https://lokaleregelgeving.overheid.nl/CVDR%s'%( expression_id.replace('_','/') ) # presumably?
    }

    doc['title']      = meta.get('title')[0]['text'] # assumes there's always exactly one

    for fetch_as_list in (
            'alternative', 'subject', 'issued', 'modified', 'onderwerp','betreft',
            'inwerkingtredingDatum', 'uitwerkingtredingDatum', 
            'kenmerk', 'redactioneleToevoeging',
        ):
        dict_list = meta.get(fetch_as_list)
        if dict_list is not None:
            doc[fetch_as_list] = []
            for d in dict_list:
                dtext = d.get('text')
                if dtext is not None:
                    doc[fetch_as_list].append( dtext )

    for fetch_as_list_with_attr in ( 
            ('creator', 'scheme'),
            ('spatial', 'scheme'),
            ('isRatifiedBy', 'scheme'),
            ('source', 'resourceIdentifier'),
            ('isFormatOf', 'resourceIdentifier'),
        ):
        want_key, want_attrkey = fetch_as_list_with_attr
        dict_list = meta.get(want_key)
        if dict_list is not None:
            doc[want_key] = []
            for d in dict_list:
                dtext = d.get('text')
                if dtext is not None:
                    attr  = d.get('attr')
                    if want_attrkey in attr:
                        doc[want_key].append( (attr.get(want_attrkey), dtext) )


    # for 'print what haven't I handled yet' purposes:
    for rem in ['title', 'alternative', 'subject', 'issued', 'modified',
                'language', 'format', 'rights', 'identifier', 'type',
                'creator', 'spatial', 'isRatifiedBy', 'source', 'isFormatOf',
                'onderwerp','betreft', 'kenmerk', 'redactioneleToevoeging',
                'inwerkingtredingDatum', 'uitwerkingtredingDatum', 
                ]:
        if rem in meta:
            meta.pop(rem)

    text = wetsuite.helpers.koop_parse.cvdr_text( tree )
    doc['text']       = text

    #pprint.pprint( doc )

    def tuple_or_none(val):
        ' '
        if val is not None:
            val = tuple(v  for v in val)
        return val

    def get_tuple_or_none(key, join_if_sequence=' '):
        ' '
        val = doc.get(key)
        ret = []
        if val is not None:
            for item in val:
                if type(item) in (list, tuple):
                    ret.append( join_if_sequence.join(item) )
                else:
                    ret.append( item )
        return ret
    
    #print( inwerkingtreding )

    alternative            = get_tuple_or_none( 'alternative' )
    inwerkingtredingDatum  = tuple_or_none( doc.get( 'inwerkingtredingDatum'  ) )
    uitwerkingtredingDatum = tuple_or_none( doc.get( 'uitwerkingtredingDatum' ) )
    issued                 = get_tuple_or_none( 'issued' )
    subject                = get_tuple_or_none( 'subject' )
    creator                = get_tuple_or_none( 'creator' )
    spatial                = get_tuple_or_none( 'spatial' )

    curs2.execute('''INSERT INTO cvdr  (work_id, expression_id, title, alternative, inwerkingtreding, uitwerkingtreding, 
                                        issued, subject, creator, spatial, web_url, xml_url, plaintext)
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', (
        work_id, expression_id,  doc['title'], alternative, inwerkingtredingDatum, uitwerkingtredingDatum,
        issued, subject, creator, spatial, doc['web_url'], doc['xml_url'], text,
    ) )

    count+=1
    if count%1000==0:
        conn2.commit()
        gc.collect()

conn2.commit()


#with open('cvdr.json','w', encoding='utf8') as f:
#    f.write( json.dumps(dataset) )


if 0:

    if 0:
        for resource_name, dds in meta.items():
            print( resource_name, dds)
            for dd in dds:
                text = dd.get('text')
                if text is None:
                    print('text is None for %r'%dd)
                    continue
                tlow = text.lower()

                #attr = dd.get('attr')
                #def normalize_isformatof(text):

                if resource_name in ('isFormatOf',):
                    if tlow.startswith('wsb-'):
                        pass
                    elif tlow.startswith('gmb-'):
                        pass
                    elif tlow.startswith('prb-'):
                        pass
                    elif tlow.startswith('bgr-'):
                        pass
                    elif tlow.startswith('stcrt-'):
                        pass
                    elif tlow.startswith('gemeenteblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('digitaal gemeenteblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('elektronisch gemeenteblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('waterschapsblad'):
                        pass # TODO: parse 
                    elif tlow.startswith('provinciaal blad'):
                        pass # TODO: parse 
                    #else:
                    #    print("TODO: handle isFormatOf %r"%text)

    if 0:
        refs = wetsuite.helpers.koop_parse.cvdr_sourcerefs( tree )
        if len(refs)>0:
            doc['refs'] = []
            for typ, raw, bwb, params, reftext in refs:
                #print( [typ, raw,bwb,params,reftext] )
                if typ=='BWB':
                    shortref = bwb
                    #print(params)
                    if 'hoofdstuk' in params:
                        shortref += ' hoofdstuk '+params['hoofdstuk'][0]
                    if 'artikel' in params:
                        shortref += ' artikel '+params['artikel'][0]
                    if 'lid' in params:
                        shortref += ' lid '+params['lid'][0]

                    if 1:
                        print('RAW:      %s'%raw)
                        print('BWB-ID:   %s'%bwb)
                        print('PARAMS:   %s'%params)
                        print('SHORTREF: %s'%shortref)
                        print('TEXT:     %s'%reftext)
                        print('')
                    for k in params:
                        params[k]=params[k][0] # probably usually good enough
                    doc['refs'].append( (bwb, params, reftext) )