# Purpose of this notebook

Show how we fetch data from the BWB repository, and how we turn what was fetched into the corresponding datasets.

## Fetching

In [3]:
import re, collections
import tqdm
import wetsuite.helpers.etree
import wetsuite.helpers.localdata
import wetsuite.helpers.koop_parse
import wetsuite.datacollect.koop_repositories 

In [4]:
# Local store of everything fetched so far: key is the fetched URL (str), value is the fetched content (bytes).
bwb_fetched = wetsuite.helpers.localdata.LocalKV('bwb_fetched.db', str, bytes)

In [9]:
def bwb_callback(record):
    ''' Callback for one BWB search result: ensure its toestand XML, manifest and WTI are in bwb_fetched.

        BWB records follow http://standaarden.overheid.nl/sru/gzd.xsd

        The record is reduced to a single flat dict (meta); that loses some structure
            (on top of the namespaces that were already removed) but is easier to work with.

        Prints a short report whenever anything was actually fetched rather than served from the cache.
    '''
    cached_fetch = wetsuite.helpers.localdata.cached_fetch
    meta = wetsuite.helpers.koop_parse.bwb_searchresult_meta( record )

    # The toestand XML is only fetched when we do not have it yet.
    _, toestand_came_from_cache = cached_fetch( bwb_fetched, meta['locatie_toestand'], force_refetch=False )

    # A toestand we did not have before is taken as a sign that the metadata changed as well,
    # so in that case (and only then) the manifest and WTI are refetched regardless of the cache.
    force_refetch_meta = not toestand_came_from_cache

    _, man_cached = cached_fetch( bwb_fetched, meta['locatie_manifest'], force_refetch=force_refetch_meta )
    _, wti_cached = cached_fetch( bwb_fetched, meta['locatie_wti'],      force_refetch=force_refetch_meta )

    # One flag per document, in the order: toestand, manifest, WTI.
    newly_fetched = [not toestand_came_from_cache, not man_cached, not wti_cached]
    if any( newly_fetched ):
        print( "FETCHED new data for %s: %s"%(meta['identifier'], newly_fetched) )
        print('    ',meta)


# Query the BWB search interface for records modified on or after the given date,
# handing bwb_callback to the search as its per-result callback.
sru_bwb = wetsuite.datacollect.koop_repositories.BWB( verbose=True )
_ = sru_bwb.search_retrieve_many(
    'dcterms.modified>=2023-11-01',
    up_to=20000,
    at_a_time=1000,
    callback=bwb_callback,
)

# Single-document queries, handy for inspecting specific cases:
#sru_bwb.search_retrieve_many('dcterms.identifier==BWBR0004825', callback=bwb_callback) # Reglement verkeersregels en verkeerstekens, to see how images work
#sru_bwb.search_retrieve_many('dcterms.identifier==BWBR0001840', callback=bwb_callback) # Grondwet

[SRU searchRetrieve] fetching 'http://zoekservice.overheid.nl/sru/Search?&version=1.2&x-connection=BWB&operation=searchRetrieve&startRecord=1&maximumRecords=1000&query=dcterms.modified%3E%3D2023-11-01'


## Creating dataset

In [None]:
if 0: # CONSIDERING: smaller subset to start with
    # Copy only the XML documents whose URL contains a 2023 date into a separate store.
    bwb_2023_xml = wetsuite.helpers.localdata.LocalKV( 'bwb_2023_xml.db', str, bytes )

    for key in bwb_fetched.keys():
        if key.endswith('xml') and '/2023-' in key: # not as good as reading the metadata, but good enough for now
            bwb_2023_xml.put( key, bwb_fetched.get(key) )

    # A bare expression nested inside an if-block is not echoed by the notebook
    # (only a top-level final expression is), so report the count explicitly.
    print( len( bwb_2023_xml ) )

In [None]:
# Go through everything fetched and sort each URL into a per-BWB-id bucket:
# manifest, WTI, or toestand XML (from which the latest per BWBR can then be chosen).

bwbr_toestanden = collections.defaultdict(list)  # BWB-id -> list of (toestand_sortname, toestand_url)
bwbr_wti        = {}                             # BWB-id -> WTI url
bwbr_manifest   = {}                             # BWB-id -> manifest url

for url in bwb_fetched.keys():
    # CONSIDER: seeing if we can separate collections less hackily. Maybe just mark source when inserting?
    id_match = re.search('/bwb/(BWBR[0-9]{7})', url)
    if id_match is None:
        continue  # no BWB-id in this url; ignored without comment
    bwbr = id_match.group(1)

    # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0019805/manifest.xml
    if url.endswith('manifest.xml'):
        bwbr_manifest[bwbr] = url
        continue

    # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0016700/BWBR0016700.WTI
    if url.endswith('.WTI'):
        bwbr_wti[bwbr] = url
        continue

    # e.g. https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2002-03-21_0/xml/BWBR0001840_2002-03-21_0.xml
    # TODO: this is too hacky, clean up
    toestand_match = re.search('/bwb/(BWBR[0-9]{7})(/[0-9].*[.]xml)', url)
    if toestand_match is None:
        print( "SKIP         %s"%url )
        continue

    # sortname is everything after the BWB-id, starting at the date; assumed to be lexically sortable
    bwbr, sortname = toestand_match.groups()
    bwbr_toestanden[bwbr].append( (sortname,url) )

print( 'Unique BWB-ids:', len(bwbr_toestanden) )

In [None]:
# Store intended for just the most recent toestand XML of each BWB-id (url -> XML bytes).
bwb_latestonly_xml = wetsuite.helpers.localdata.LocalKV('bwb_latestonly_xml.db', str, bytes)
bwb_latestonly_xml._put_meta('description', '')  # description is left empty for now

In [None]:
# what we're making: for each BWB-id, copy only its latest toestand XML into bwb_latestonly_xml

def _toestand_sort_key(entry):
    ''' Sort key for a (sortname, url) pair that orders digit runs numerically.

        A plain string comparison puts a same-day version suffix like '_10' before '_2';
        comparing digit runs as integers avoids picking an older toestand as "latest".
        The url is kept as a tie-breaker, as it was when sorting the raw tuples.
    '''
    sortname, url = entry
    # re.split with a capturing group alternates text, digits, text, ... so odd positions are
    # always the digit runs; keys therefore compare str with str and int with int.
    pieces = re.split(r'([0-9]+)', sortname)
    natural = [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]
    return (natural, url)


print("Finding latest versions")
for bwbr in tqdm.tqdm( bwbr_toestanden ):
    candidates = bwbr_toestanden[bwbr]
    if not candidates: # nothing to choose from; skip rather than fail
        continue
    _, url = max( candidates, key=_toestand_sort_key ) # the latest toestand
    bwb_latestonly_xml.put(url, bwb_fetched.get(url), commit=False)

# puts above were deferred (commit=False); write them out in one go
bwb_latestonly_xml.commit()


In [None]:
# From each latest-version XML document, extract metadata and text,
# and store both under the same key the XML was stored under (its url).

bwb_latestonly_text = wetsuite.helpers.localdata.LocalKV( 'bwb_latestonly_text.db', str, str )
bwb_latestonly_text._put_meta('description','')

bwb_latestonly_meta = wetsuite.helpers.localdata.MsgpackKV( 'bwb_latestonly_meta.db', str, None )
bwb_latestonly_meta._put_meta('valtype','msgpack')
bwb_latestonly_meta._put_meta('description','')

for url, xml_bytes in tqdm.tqdm( bwb_latestonly_xml.items() ):
    tree = wetsuite.helpers.etree.fromstring( xml_bytes )

    # metadata first, then text, each written without committing yet
    meta = wetsuite.helpers.koop_parse.bwb_toestand_usefuls(tree)
    bwb_latestonly_meta.put(url, meta, commit=False)

    text = wetsuite.helpers.koop_parse.bwb_toestand_text(tree)
    bwb_latestonly_text.put(url, text, commit=False)

# flush the deferred writes of both stores
bwb_latestonly_meta.commit()
bwb_latestonly_text.commit()