# Purpose of this notebook

Show how we fetch data from the CVDR repository to be used to create our corresponding datasets

## Fetching

In [2]:
import collections, pprint
import tqdm
import wetsuite.helpers.etree
import wetsuite.helpers.localdata
import wetsuite.datacollect.koop_repositories 
import wetsuite.helpers.koop_parse
import wetsuite.helpers.notebook

In [3]:
# Local key-value store that a separate fetch script has already been filling:
# keys are URLs (str), values are the downloaded documents (bytes).
cvdr_fetched = wetsuite.helpers.localdata.LocalKV(
    'cvdr_fetched.db',
    str,
    bytes,
)

In [5]:
from importlib import reload
# Development aid: re-import the notebook helper module so that edits to it
# are picked up without restarting the kernel.
reload( wetsuite.helpers.notebook )

sru_cvdr = wetsuite.datacollect.koop_repositories.CVDR()

queries = [ 
    # the time period is balanced with the up_to later.  No real reason to the choices.
    #'dcterms.modified>=2023-01-01',

    #'dcterms.modified>=2023-06-01 and dcterms.modified<=2023-12-31',
    #'dcterms.modified>=2023-01-01 and dcterms.modified<=2023-06-01',
    #'dcterms.modified>=2022-06-01 and dcterms.modified<=2022-12-31',
    #'dcterms.modified>=2022-01-01 and dcterms.modified<=2022-06-01',
    #'dcterms.modified>=2021-06-01 and dcterms.modified<=2021-12-31',
    #'dcterms.modified>=2021-01-01 and dcterms.modified<=2021-06-01',
    #'dcterms.modified>=2020-06-01 and dcterms.modified<=2020-12-31',
    #'dcterms.modified>=2020-01-01 and dcterms.modified<=2020-06-01',
    'dcterms.modified>=2019-06-01 and dcterms.modified<=2019-12-31',
    'dcterms.modified>=2019-01-01 and dcterms.modified<=2019-06-01',
]

for query in queries:
    # This first request is made purely to learn the number of records,
    # which in turn is only used to size the progress bar.
    sru_cvdr.search_retrieve( query )
    numrecs = sru_cvdr.num_records()
    pbar = wetsuite.helpers.notebook.progress_bar( max=numrecs, description='fetching')

    # Per-query tallies. This cell runs at module level, so these are module
    # globals, which is why the callback below declares them `global`.
    count_cached, count_fetched = 0, 0

    def cvdr_callback(record):
        """Handle one search result record: download (or find cached) the XML it points to.

        Collects the key/value metadata found under the record's
        originalData (meta/owmskern, meta/owmsmantel, meta/cvdripm) and
        enrichedData elements into one dict, then passes its
        'publicatieurl_xml' value to cached_fetch, which stores into cvdr_fetched.

        Side effects: updates the count_cached / count_fetched globals and
        advances the progress bar of the query currently being processed.

        A ValueError from cached_fetch is reported with print() and the record
        is skipped (counted as neither cached nor fetched). A record without
        a 'publicatieurl_xml' entry raises KeyError, which is not caught here.
        """
        global count_cached, count_fetched

        # Other children seen on a record, not needed here:
        #   <recordSchema>http://standaarden.overheid.nl/sru/</recordSchema>
        #   <recordPacking>xml</recordPacking>
        #   <recordPosition>12</recordPosition>
        recordData = record.find('recordData')        # the actual record
        gzd = recordData[0]
        originalData = gzd.find('originalData')

        # Later sources overwrite earlier ones on key collisions (dict.update semantics).
        merged = {}
        for meta_path in ('meta/owmskern', 'meta/owmsmantel', 'meta/cvdripm'):
            merged.update( wetsuite.helpers.etree.kvelements_to_dict( originalData.find( meta_path ) ) )
        merged.update( wetsuite.helpers.etree.kvelements_to_dict( gzd.find('enrichedData') ) )
        #pprint.pprint( merged )

        xml_url = merged['publicatieurl_xml'] # we currently care only about the XML it links to
        try:
            _, came_from_cache = wetsuite.helpers.localdata.cached_fetch( cvdr_fetched, xml_url )
            if came_from_cache:
                count_cached += 1
                #print( "CACHED : %r"%(xml_url))
            else:
                count_fetched += 1
                #print( "FETCHED : %r"%(xml_url))
                #time.sleep( 1 ) # be somewhat nice to server
        except ValueError as ve:
            print( "ERROR downloading: %s  for %r"%(ve, xml_url))

        pbar.value += 1
        if pbar.value % 25 == 0:   # only refresh the text now and then
            pbar.description = '%d cached, %d fetched'%(count_cached, count_fetched)

    sru_cvdr.search_retrieve_many( query, at_a_time=1000, up_to=50000, callback=cvdr_callback)

    # The in-callback refresh only fires on every 25th record, so write the
    # final tallies once more; otherwise the finished bar usually shows stale counts.
    pbar.description = '%d cached, %d fetched'%(count_cached, count_fetched)

fetching:   0%|          | 0/8179 [00:00<?, ?it/s]

fetching:   0%|          | 0/14254 [00:00<?, ?it/s]

In [None]:

# -----------------------------------------------

#sru_cvdr.search_retrieve_many("creator any Delft", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("creator any Amsterdam", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("creator any Utrecht", at_a_time=1000, up_to=50000, callback=cvdr_callback)

#sru_cvdr.search_retrieve_many("title any Damocles or title any damoclesbeleid", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("title any Opiumwet and title any 13b", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("title any Opiumwet and title any 13", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("dcterms.source any BWBR0001941", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("dcterms.source any 13b", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("dcterms.source any opiumwet", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("isFormatOf='CVDR640125'", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#  https://repository.officiele-overheidspublicaties.nl/CVDR/CVDR640125/1/xml/CVDR640125_1.xml


#sru_cvdr.search_retrieve_many('dcterms.modified>=2022-06-01', at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many('dcterms.modified>=2022-01-01 and dcterms.modified<=2022-06-01', at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many('dcterms.modified>=2021-01-01 and dcterms.modified<=2021-12-31', up_to=50000, callback=cvdr_callback) 
#sru_cvdr.search_retrieve_many('dcterms.modified>=2013-01-01 and dcterms.modified<=2013-12-31', up_to=50000, callback=cvdr_callback) 
#sru_cvdr.search_retrieve_many('dcterms.modified<=2012-12-31', up_to=50000, callback=cvdr_callback) 

# doesn't seem to let you search for "all versions of"
#sru_cvdr.search_retrieve_many("dcterms.identifier=CVDR272112_2", at_a_time=1000, up_to=50000, callback=cvdr_callback)
#sru_cvdr.search_retrieve_many("dcterms.identifier=CVDR272112", at_a_time=1000, up_to=50000, callback=cvdr_callback)

# ERROR case
#sru_cvdr.search_retrieve_many("dcterms.identifier=CVDR7915_1", at_a_time=1000, up_to=50000, callback=cvdr_callback)

## Creating dataset

We'll spare you the full contents of that store,
because it contains most versions of most things, is even more overcomplete than that, and probably not something you want to fetch yourself.

Mostly for our own reference, it contains keys that are URLs like:
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/html/100078_1.html
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml
and values that are said files as bytestrings.

Right now we care about data, so just the XML, so we ignore everything else.

Also, it seems that KOOP search results expose some variation in capitalisation, which led to duplicate URLs in the above, e.g. 
- https://repository.officiele-overheidspublicaties.nl/CVDR/100078/1/xml/100078_1.xml
- https://repository.officiele-overheidspublicaties.nl/cvdr/100078/1/xml/100078_1.xml

...so we also ensure we pick just one.

In [9]:

# Bucket the stored .xml URLs by their lowercased form, so that spellings
# differing only in capitalisation end up in the same list.
dedupdata = collections.defaultdict(list)  # lowercased URL -> [actual URLs]
for url in cvdr_fetched:
    if url.endswith('.xml'):
        dedupdata[ url.lower() ].append( url )

# Keep one spelling per bucket. Taking the smallest in sort order makes the
# choice consistent between runs.
unique_xml_urls = [ min( variants )  for variants in dedupdata.values() ]

print( len(cvdr_fetched), len( unique_xml_urls ) )

## This was some double checking they have identical content   (and count how many of these duplicates are there)
# def nxml(bytedata): # reindent as a form of normalization
#     tree = wetsuite.helpers.etree.fromstring(bytedata)
#     tree = wetsuite.helpers.etree.indent( tree )
#     return wetsuite.helpers.etree.tostring( tree, encoding='UTF-8' ) 
# counts = collections.defaultdict(int)        # how many -> how often
# for lurl in list(dedupdata):
#     url_list = dedupdata[lurl]
#     xml_list = list( cvdr_fetched.get(url)   for url in url_list )
#     lxl = len(xml_list)
#     counts[lxl]+=1
#     if lxl > 1:
#         print( url_list )
#         norm_xml_list = list( nxml( cvdr_fetched.get(url) )    for url in url_list )
#         for i in range( 1, len(norm_xml_list) ):
#             is_same = norm_xml_list[0] == norm_xml_list[i]
#             #print("cmp(0,%d)==%s"%(i, is_same))
#             if not is_same:
#                 print('%s not the same as %s'%(url_list[0], url_list[i]))
#                 print(difflib.context_diff(norm_xml_list[0].splitlines(), norm_xml_list[i].splitlines()))
#pprint.pprint(counts)

432294 182492


In [10]:
# NOTE(review): cvdr_groups is created here but never filled or read in the
# cells visible in this notebook - confirm whether it can go.
cvdr_groups = collections.defaultdict(list)  # CVDR-workid -> [ (expression_id, xml_url), ... ]

# Collect every expression under the work it belongs to.
# The integer version comes first in each tuple so that tuple comparison orders by it.
groups = collections.defaultdict(list)   # work_id -> [ (version_in_expression_id, expression_id, url), ... ]
for url in unique_xml_urls:
    filename = url.rsplit('/', 1)[1]      # last path component of the URL
    ids      = filename.rsplit('.', 1)[0] # ...without its extension
    work_id, expression_id = wetsuite.helpers.koop_parse.cvdr_parse_identifier( ids )
    version_text = expression_id.split('_', 1)[1]   # the part after the first underscore
    groups[ work_id ].append( (int( version_text, 10 ), expression_id, url) )

# For each work keep only the URL of its highest-numbered expression.
#TODO: check the actual validity date.   Right now it will pick a few of the 'planned for the near future' ones
lasts_only = [ max( candidates )[2]  for candidates in groups.values() ]

print( len( unique_xml_urls ), len(groups), len(lasts_only) )

182492 167416 167416


Start a store that intends to contain just the most recent expression XML for each work.

In [11]:
# Takes a minute or two, just because it writes that much data (order of a few GB).
#
# NOTE(review): nothing in this cell removes keys already present in the store,
# and the recorded outputs show 167416 items written here but 169421 items
# iterated from this store further down - presumably leftovers of an earlier run; confirm.
cvdr_latestonly_xml = wetsuite.helpers.localdata.LocalKV( 'cvdr_latestonly_xml.db', str, bytes )

for latest_url in tqdm.tqdm( lasts_only ):
    xml_bytes = cvdr_fetched.get( latest_url )
    cvdr_latestonly_xml.put( latest_url, xml_bytes, commit=False )

# One commit at the end rather than one per put: per the original author,
# a factor of dozens in speed (on SSD).
cvdr_latestonly_xml.commit()
cvdr_latestonly_xml.close()

100%|██████████| 167416/167416 [00:51<00:00, 3249.45it/s]


In [14]:
# NOTE(review): this cell was executed as In [14], i.e. after the In [13] cell
# further down had reopened the store read-only. On a top-to-bottom run the
# handle was just closed by the previous cell - confirm that bytesize() still
# works on a closed store, or move this cell below the reopen.
cvdr_latestonly_xml.bytesize()

6405173248

...and stores that contain the plain text, and the metadata, for the same latest expressions. 

These three stores should have exactly the same keys (unless we forget to clean out the leftovers of a previous run before rerunning this).

In [13]:
# Plain-text store: URL -> text of that CVDR item (the output of wetsuite.helpers.koop_parse.cvdr_text).
cvdr_latestonly_text = wetsuite.helpers.localdata.LocalKV( 'cvdr_latestonly_text.db', str, str )
cvdr_latestonly_text._put_meta( 'description', '' )   # description text is still to be written

# Metadata store: URL -> metadata of that CVDR item (the output of wetsuite.helpers.koop_parse.cvdr_meta).
cvdr_latestonly_meta = wetsuite.helpers.localdata.MsgpackKV( 'cvdr_latestonly_meta.db', str, None )
cvdr_latestonly_meta._put_meta( 'valtype', 'msgpack' )
cvdr_latestonly_meta._put_meta( 'description', '' )   # description text is still to be written

# The XML store written earlier is the input here, so open it read-only.
cvdr_latestonly_xml = wetsuite.helpers.localdata.LocalKV( 'cvdr_latestonly_xml.db', str, bytes, read_only=True )

# Parse each stored XML document once, and derive both its metadata and its text from that tree.
for url, xml_bytes in tqdm.tqdm( cvdr_latestonly_xml.items() ):
    tree = wetsuite.helpers.etree.fromstring( xml_bytes )

    meta = wetsuite.helpers.koop_parse.cvdr_meta( tree, flatten=True )
    #pprint.pprint(meta)
    cvdr_latestonly_meta.put( url, meta, commit=False )

    text = wetsuite.helpers.koop_parse.cvdr_text( tree )
    #print(repr(text))
    cvdr_latestonly_text.put( url, text, commit=False )

    #break

# Every put above was made with commit=False, so commit each store once here.
cvdr_latestonly_meta.commit()
cvdr_latestonly_text.commit()

100%|██████████| 169421/169421 [10:06<00:00, 279.53it/s]
