# Purpose of this notebook

This notebook shows how we fetch data from the BWB repository, and how those downloads are then used to create our corresponding datasets.

TODO: finish this notebook; it is currently a copy-paste from a script.

## Fetching

In [5]:
import re
import collections
import datetime
import pprint
import random

import wetsuite.helpers.etree
import wetsuite.helpers.notebook
import wetsuite.helpers.localdata
import wetsuite.helpers.koop_parse
import wetsuite.datacollect.koop_repositories 

In [2]:
# Local key-value store (str keys -> bytes values) that holds everything we download:
# the toestand XML, manifest, and WTI documents
bwb_fetched = wetsuite.helpers.localdata.LocalKV(
    'bwb_fetched.db',
    str,
    bytes,
)

In [6]:
def bwb_search_callback( search_record_node ):
    ''' Called for each search result record: makes sure the documents it points to are in our local store.

        BWB records follow http://standaarden.overheid.nl/sru/gzd.xsd

        Right now we merge all the parts of a record into one dict,
            which throws away some structure (on top of the already removed namespaces)
            but is easier to deal with.

        For each record we fetch (into the module-level bwb_fetched store)
            the toestand XML, the manifest, and the WTI document,
            and print a line if any of those were not served from the cache.

        Returns nothing.
    '''
    #print( wetsuite.helpers.etree.debug_pretty( search_record_node ) ) # for later reference, if you want to extract more out of these search records
    cached_fetch = wetsuite.helpers.localdata.cached_fetch
    meta_dict    = wetsuite.helpers.koop_parse.bwb_searchresult_meta( search_record_node )

    # The toestand XML is only fetched when we do not have it yet.
    # (the second item of cached_fetch's result is presumably "did this come from the cache?" - confirm against its documentation)
    _, toestand_was_cached = cached_fetch( bwb_fetched,  meta_dict['locatie_toestand'],  force_refetch=False )

    # A toestand we did not have before suggests the manifest and WTI probably changed as well, so re-fetch those in that case
    toestand_is_new = not toestand_was_cached
    _, manifest_was_cached = cached_fetch( bwb_fetched,  meta_dict['locatie_manifest'],  force_refetch=toestand_is_new )
    _, wti_was_cached      = cached_fetch( bwb_fetched,  meta_dict['locatie_wti'],       force_refetch=toestand_is_new )

    # Anything that did not come out of the cache is new data, which is worth mentioning
    everything_was_cached = (toestand_was_cached and manifest_was_cached and wti_was_cached)
    if not everything_was_cached:
        print( "FETCHED new data for %s - %r"%(meta_dict['identifier'],meta_dict) )


# This is an "add recent changes" run, after previously having done a lot more fetching:
# ask only for records modified in the last 30 days (roughly one month)
one_month_ago = datetime.date.today() - datetime.timedelta(days=30)
recent_query  = 'dcterms.modified >= %s'%( one_month_ago.strftime('%Y-%m-%d'), )

sru_bwb = wetsuite.datacollect.koop_repositories.BWB( verbose=True )
_ = sru_bwb.search_retrieve_many(
    recent_query,
    up_to=20000,
    at_a_time=500,
    callback=bwb_search_callback,   # does the actual fetching, once per search result record
)

[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1&maximumRecords=500&query=dcterms.modified%20%3E%3D%202023-12-24'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=501&maximumRecords=500&query=dcterms.modified%20%3E%3D%202023-12-24'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1001&maximumRecords=500&query=dcterms.modified%20%3E%3D%202023-12-24'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1501&maximumRecords=500&query=dcterms.modified%20%3E%3D%202023-12-24'
[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=2001&maximumRecords=500&

## Take that downloaded store, extract useful things into datasets

CONSIDER: smaller subset to start with, e.g. just 2023

In [5]:
from importlib import reload
reload( wetsuite.helpers.notebook )  # development convenience: pick up edits to the helper module without restarting the kernel

# Go through all fetched URLs and group   manifest, wti, and all toestand,   per BWB-id.
# This assumes the URL structure is consistent, which it seems to be.

bwbr_groups = collections.defaultdict(dict)  #  bwbr -> { toestanden:   latest_toestand_url:    wti_url:    manifest_url:  }

print("Grouping relevant URLs")

for url in wetsuite.helpers.notebook.ProgressBar( bwb_fetched.keys() ):

    # This both filters for basic URLs we care about at all (in case other things got dropped in),
    # and filters for URLs with BWBR  - which implies skipping BWBV (verdragen/treaties), BWBW (?)
    # (the matching here and below is a little hacky, though, clean up?)
    bwbr_match = re.search('/bwb/(BWBR[0-9]{7})', url)
    if bwbr_match is None:
        continue
    bwbr = bwbr_match.group(1) # the BWBR-and-number text

    if url.endswith('manifest.xml'): # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0019805/manifest.xml
        bwbr_groups[bwbr]['manifest_url'] = url

    elif url.endswith('.WTI'):       # e.g.  https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0016700/BWBR0016700.WTI
        bwbr_groups[bwbr]['wti_url'] = url

    else:
        # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2002-03-21_0/xml/BWBR0001840_2002-03-21_0.xml
        toestand_match = re.search('/bwb/(BWBR[0-9]{7})(/[0-9].*[.]xml)', url)
        if toestand_match is not None:
            # the groups will be something like 'BWBR0001821'  and  '/1998-01-01_0/xml/BWBR0001821_1998-01-01_0.xml'
            sortname = toestand_match.group(2) # assume that the date in there makes this lexically sortable
            bwbr_groups[bwbr].setdefault('toestanden', []).append( (sortname,url) )
        else:
            print( "SKIP / LOOKAT   %s"%url )


print( 'We have %d Unique BWB-id groups'%len(bwbr_groups) )


print( "Finding latest versions" )
bwbrs_without_toestand = []
for bwbr, details in wetsuite.helpers.notebook.ProgressBar( list( bwbr_groups.items() ) ): # within each BWB-id
    toestanden = details.get('toestanden')
    if not toestanden:
        # Only a manifest and/or WTI was stored for this BWB-id (e.g. an incomplete fetch).
        # Indexing details['toestanden'] would raise a KeyError here, so remember it and deal with it below.
        bwbrs_without_toestand.append( bwbr )
        continue
    # the largest (sortname, url) pair is the latest version, given the lexically sortable date
    _latest_sortname, latest_url = max( toestanden )
    details['latest_toestand_url'] = latest_url

# Groups without any toestand cannot contribute to the latest-toestand datasets,
# so drop them here (and say so) rather than have later cells fail on a missing 'latest_toestand_url'
for bwbr in bwbrs_without_toestand:
    del bwbr_groups[bwbr]
if bwbrs_without_toestand:
    print( "Dropped %d BWB-id groups without any toestand, e.g. %s"%(
        len(bwbrs_without_toestand), ', '.join( bwbrs_without_toestand[:5] )
    ) )




Grouping relevant URLs


  0%|          | 0/207001 [00:00<?, ?it/s]

We have 37778 Unique BWB-id groups
Finding latest versions


  0%|          | 0/37778 [00:00<?, ?it/s]

In [6]:
# Dataset: for each BWB-id, its latest toestand, stored as the original XML bytes we downloaded
print("Writing latest-toestand-XML dataset")

bwb_latestonly_xml = wetsuite.helpers.localdata.LocalKV(
    'bwb_latestonly_xml.db',
    str,
    bytes,
) # bwbr -> xmlbytes
bwb_latestonly_xml._put_meta('description','TODO')

for bwbr, details in wetsuite.helpers.notebook.ProgressBar( bwbr_groups.items() ): # within each BWB-id
    latest_toestand_url = details['latest_toestand_url']
    latest_xml_bytes    = bwb_fetched.get( latest_toestand_url )
    # commit=False on each put, with a single commit after the loop
    bwb_latestonly_xml.put( bwbr, latest_xml_bytes, commit=False )

bwb_latestonly_xml.commit()

Writing latest-toestand-XML dataset


  0%|          | 0/37778 [00:00<?, ?it/s]

In [16]:
# Now do some extraction, and store the results as datasets as well:
#   bwb_latestonly_text:  bwbr -> text extracted from the latest toestand
#   bwb_latestonly_meta:  bwbr -> dict of metadata from the toestand, plus WTI and manifest details when we have them
from importlib import reload
reload(wetsuite.helpers.koop_parse)  # development convenience: pick up edits to the parsing helpers without restarting the kernel

autocommit = False  # faster but locks database if you stop it, so less handy during debug
#autocommit = True

bwb_latestonly_text = wetsuite.helpers.localdata.LocalKV( 'bwb_latestonly_text.db', str, str )
bwb_latestonly_text._put_meta('description','TODO')

bwb_latestonly_meta = wetsuite.helpers.localdata.MsgpackKV( 'bwb_latestonly_meta.db', str, None )
bwb_latestonly_meta._put_meta('description','TODO')


#for bwbr, details in wetsuite.helpers.notebook.ProgressBar( random.sample( list(bwbr_groups.items()), 100) ): # debug: test on a few
for bwbr, details in wetsuite.helpers.notebook.ProgressBar( bwbr_groups.items() ): # within each BWB-id

    toestand_tree = wetsuite.helpers.etree.fromstring( bwb_fetched.get( details['latest_toestand_url'] ) )

    text          = wetsuite.helpers.koop_parse.bwb_toestand_text(toestand_tree)

    meta_dict     = wetsuite.helpers.koop_parse.bwb_toestand_usefuls(toestand_tree)

    # The WTI and manifest entries only exist in a group if those URLs were actually stored,
    # so use .get() - plain indexing would raise KeyError and the None checks below would never get to do their job
    wti_url       = details.get('wti_url')
    if wti_url is not None:
        wti_tree              = wetsuite.helpers.etree.fromstring( bwb_fetched.get( wti_url ) )
        meta_dict['wti']      = wetsuite.helpers.koop_parse.bwb_wti_usefuls(wti_tree)


    manifest_url  = details.get('manifest_url')
    if manifest_url is not None:
        manifest_tree         = wetsuite.helpers.etree.fromstring( bwb_fetched.get( manifest_url ) )
        meta_dict['manifest'] = wetsuite.helpers.koop_parse.bwb_manifest_usefuls(manifest_tree)

        # redundant, but sometimes nice to have more accessible
        version_dates = []
        for expression in manifest_tree.findall('expression'):
            date_node = expression.find('metadata/datum_inwerkingtreding')
            if date_node is not None: # an expression without this element would otherwise raise AttributeError on .text
                version_dates.append( date_node.text )
        # assigned once, after the loop, so that it is also present (as an empty list) when the manifest lists no expressions
        meta_dict['version_dates'] = version_dates


    # Not yet ported from the original script (note that 'merged' does not exist in this notebook):
    # intitule     = merged.get('intitule', None)
    # citeertitel  = merged.get('citeertitel', None)
    # soort        = merged.get('soort', None)

    # authority          = merged.get('authority', None)
    # overheidsdomeinen  = merged.get('overheidsdomeinen', None)
    # rechtsgebieden     = merged.get('rechtsgebieden', None)
    # if rechtsgebieden is not None:
    #     #list of tuples, e.g.
    #     #  (Belastingrecht,Vennootschapsbelastingrecht),  (Belastingrecht,Dividendbelasting)
    #     # Typically just one tuple, but let's handle it multiple properly, because the database expects a text[]
    #     temp = []
    #     for rgtup in rechtsgebieden:
    #         temp.append( ' - '.join( list( rg   for rg in rgtup   if rg is not None ) ) ) # flattened, but in a recoverable way
    #     rechtsgebieden = temp

    #pprint.pprint(meta_dict)

    bwb_latestonly_text.put(bwbr, text, commit=autocommit)

    bwb_latestonly_meta.put(bwbr, meta_dict, commit=autocommit)

# with autocommit off, nothing is committed until here
bwb_latestonly_meta.commit()
bwb_latestonly_text.commit()

  0%|          | 0/37778 [00:00<?, ?it/s]