# Purpose of this notebook

Figure out what POD (PUC Open Data) does.

This has previously not worked, and it overlaps with PLOOI,
so I'm not quite sure about the status and future of this one
(or whether it was always fine and the failure was just a bug of ours).

Right now it's useful to find … (TODO: this sentence was left unfinished)

In [4]:
import datetime
import pprint

import requests

import wetsuite.helpers.etree
import wetsuite.helpers.localdata
import wetsuite.helpers.koop_parse
import wetsuite.helpers.format
import wetsuite.helpers.date
import wetsuite.helpers.notebook

from wetsuite.datacollect.koop_repositories import PUCOpenData

In [None]:
# Ask the repository to describe itself, and pretty-print that SRU index summary.
explain_summary = PUCOpenData().explain_parsed()
pprint.pprint( explain_summary )

In [2]:
# Getting slightly ahead of ourselves: this is the store that documents we want to keep go into.
# It is opened with str keys and bytes values (presumably URL -> downloaded document; see its use with cached_fetch).
frbr_fetched = wetsuite.helpers.localdata.LocalKV(
    'frbr_fetched.db',
    key_type=str,
    value_type=bytes,
)

In [4]:
count_cached, count_fetched, count_errors = 0,0,0

def puc_callback(record):
    """ Callback for one SRU result record: collect its metadata, then fetch (or find cached) its preferred documents.

        Updates the module-level counters count_cached, count_fetched and count_errors.
        Each chosen document URL is handed to cached_fetch together with the frbr_fetched store.

        Records that lack the expected structure are reported or skipped rather than raising,
        so that one odd record does not abort a long retrieval run.

        @param record: an etree element for one search result. Expected to have a 'recordData' child,
        whose first child in turn holds 'originalData' (metadata) and 'enrichedData' (document URLs).
        @return: None
    """
    global count_cached, count_fetched, count_errors

    recordData = record.find('recordData')        # the actual record
    if recordData is None  or  len(recordData) == 0:
        print("Record without recordData contents, skipping it")
        return
    payload = recordData[0]

    # TODO: figure out data model, parse other parts - the below just throws distinct sections' keys into the same dict.
    # this is hoping for lack of collision, and bad practice in general, but good enough for debug
    # (merged is currently only used by the commented-out debug print at the end)
    merged = {}
    originalData = payload.find('originalData')
    if originalData is not None:
        for section_path in ('meta/owmskern', 'meta/owmsmantel', 'meta/tpmeta'):
            section = originalData.find( section_path )
            if section is not None:    # not every record necessarily has every section
                merged.update( wetsuite.helpers.etree.kvelements_to_dict( section ) )

    # enrichedData will mention a list of (document type, document URL)
    enriched = {}  # doctype -> url       assumption: at most one of each type
    enrichedData = payload.find('enrichedData')
    if enrichedData is not None:
        for ediu in enrichedData.findall('itemUrl'):
            url = (ediu.text or '').strip()   # .text is None for an empty element
            if url:                           # an itemUrl without a URL gives us nothing to fetch
                enriched[ ediu.get('manifestation') ] = url
    merged.update( enriched )   # should arguably be more structured

    # Counts documents we chose to fetch, whether or not the fetch then succeeds.
    docs_selected = 0
    # given a list of types like  html, metadata, odt, pdf,  choose just preferred forms to avoid duplication
    for chosen_type in wetsuite.helpers.koop_parse.prefer_types( enriched.keys() ):
        itemurl = enriched[chosen_type]
        docs_selected += 1
        try:
            _, came_from_cache = wetsuite.helpers.localdata.cached_fetch( frbr_fetched, itemurl )
            if came_from_cache:
                #print('CACHED', itemurl)
                count_cached += 1
            else:
                #print('FETCHED', itemurl)
                count_fetched += 1
        except requests.exceptions.Timeout:   # base class: covers ReadTimeout and also ConnectTimeout
            print('TIMEOUT', itemurl)
            count_errors += 1
        except ValueError as ve:
            print('error %s'%ve, itemurl)
            count_errors += 1

    if docs_selected == 0:
        # Nothing was chosen for this record; show the record so we can see why.
        print("Didn't select any documents from")
        payload = wetsuite.helpers.etree.indent(payload)
        print( wetsuite.helpers.etree.tostring( payload, encoding='unicode' ) )
        #pprint.pprint( merged )

In [None]:
# One shared PUCOpenData SRU client; the queries below are all run through it.
sru_puc = PUCOpenData()

def search_retrieve_progressbar(query, callback, up_to=50000, at_a_time=1000, wait_between_sec=0.01):
    """ Run one query through sru_puc and hand each result record to callback, while showing a notebook progress bar.

        Resets the module-level counters count_cached, count_fetched and count_errors before starting.
        The callback is expected to update them; the progress bar's description shows the first two.

        @param query: query string, passed as-is to sru_puc.search_retrieve() and sru_puc.search_retrieve_many()
        @param callback: called once per result record, with that record as its only argument
        @param up_to: int, passed on to search_retrieve_many (presumably the maximum number of records to retrieve)
        @param at_a_time: passed on to search_retrieve_many (presumably the number of records requested per call)
        @param wait_between_sec: passed on to search_retrieve_many (presumably the pause between requests)
        @return: None
    """
    global count_cached, count_fetched, count_errors
    count_cached, count_fetched, count_errors = 0,0,0

    sru_puc.search_retrieve(query) # only really for the amount
    numrec = sru_puc.num_records()
    print( "%d items for %r"%(numrec, query) )
    if numrec > up_to:
        # do not let a large result set be cut short without anyone noticing
        print( "WARNING: that is more than up_to=%d, so this run will probably not retrieve all of them"%up_to )

    # The bar's total assumes no more than up_to records will come in.
    pb = wetsuite.helpers.notebook.progress_bar( min(numrec, up_to), description=query )
    def cbwrap(record):
        pb.value += 1
        callback(record)
        # updated after the callback, so that the counts include the record that was just handled
        pb.description = '%d fetched, %d cached'%( count_fetched, count_cached )
    sru_puc.search_retrieve_many( query, up_to=up_to, at_a_time=at_a_time, callback=cbwrap, wait_between_sec=wait_between_sec)


# Split 2021-01-01 .. now into date ranges (the 5 is presumably the range length in days - TODO confirm),
# then work through them most-recent-first, retrieving whatever was modified within each range.
modified_ranges = wetsuite.helpers.date.date_ranges('2021-01-01', datetime.datetime.now(), 5, '%Y-%m-%d')
for frd, tod in reversed( modified_ranges ):
    range_query = 'dt.modified >= %s and dt.modified <= %s'%(frd,tod)
    search_retrieve_progressbar( range_query, callback=puc_callback )

# alternative: a single open-ended query
#search_retrieve_progressbar('dt.modified >= 2024-01-01', callback=puc_callback)    


In [None]:
# The same retrieval without a progress bar: report the result count, then pass every record to puc_callback.
adhoc_queries = ('dt.modified >= 2024-01-01',)
for query in adhoc_queries:
    sru_puc.search_retrieve(query)   # run once up front, so that num_records() has something to report
    result_count = sru_puc.num_records()
    print("%d results for %r"%(result_count, query))

    sru_puc.search_retrieve_many( query, callback=puc_callback, up_to=50000, wait_between_sec=0.01 )

#sru_puc.search_retrieve_many('dt.modified>=2022-01-01 and dt.modified<=2022-12-31', up_to=50000, callback=puc_callback)
#sru_puc.search_retrieve_many('dcterms.modified>=2021-01-01 and dcterms.modified<=2021-12-31', up_to=50000, callback=puc_callback) 
#sru_puc.search_retrieve_many('dcterms.modified>=2020-01-01 and dcterms.modified<=2020-12-31', up_to=50000, callback=puc_callback) 
#sru_puc.search_retrieve_many('dcterms.modified>=2019-01-01 and dcterms.modified<=2019-12-31', up_to=50000, callback=puc_callback) 
#sru_puc.search_retrieve_many('dcterms.modified>=2018-01-01 and dcterms.modified<=2018-12-31', up_to=50000, callback=puc_callback) 
#sru_puc.search_retrieve_many('dcterms.modified>=2017-01-01 and dcterms.modified<=2017-12-31', up_to=50000, callback=puc_callback) 
#sru_puc.search_retrieve_many('dcterms.modified>=2016-01-01 and dcterms.modified<=2016-12-31', up_to=50000, callback=puc_callback) 
#sru_puc.search_retrieve_many('dcterms.modified>=2015-01-01 and dcterms.modified<=2015-12-31', up_to=50000, callback=puc_callback) 

#sru_puc.search_retrieve_many('dcterms.identifier==BWBR0004825', callback=puc_callback) # Reglement verkeersregels en verkeerstekens, to see how images work

## 