<a href="https://colab.research.google.com/github/knobs-dials/wetsuite-dev/blob/main/notebooks/extras/datacollect/extras_datacollect_kamervragen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Purpose of this notebook

Extract the text of kamervragen (questions from members of parliament, and the answers to them) from the data fetched by [extras_datacollect_frbr](extras_datacollect_frbr.ipynb) / _pod.

In [2]:
import re
import collections
import random
import pprint

import wetsuite.helpers.etree
import wetsuite.helpers.format
import wetsuite.helpers.localdata
import wetsuite.helpers.strings

In [3]:
# Open the local key-value store holding the fetched documents:  URL (str) -> raw document (bytes)
frbr_fetched = wetsuite.helpers.localdata.LocalKV(
    'frbr_fetched.db',
    key_type=str,
    value_type=bytes,
)

### remind ourselves what's in there


In [4]:
# Print the URLs (keys) of 20 randomly picked entries; the document bytes are not needed here
for sample_url, _docbytes in frbr_fetched.random_sample( 20 ):
    print( sample_url )

https://repository.overheid.nl/frbr/lokalebekendmakingen/aabf23c3a051212503616f9285587d4d/1/metadata/metadata.xml
https://repository.overheid.nl/frbr/officielepublicaties/stcrt/2008/stcrt-2008-334/1/metadata/metadata.xml
https://repository.overheid.nl/frbr/datacollecties/2023/dc-2023-221/1/metadata/metadata.xml
https://repository.overheid.nl/frbr/officielepublicaties/gmb/2023/gmb-2023-286484/1/metadata/metadata.xml
https://repository.overheid.nl/frbr/officielepublicaties/gmb/2022/gmb-2022-401314/1/html/gmb-2022-401314.html
https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20202021/ah-tk-20202021-1298/1/metadataowms/metadata_owms.xml
https://repository.overheid.nl/frbr/officielepublicaties/gmb/2022/gmb-2022-571127/1/html/gmb-2022-571127.html
https://repository.overheid.nl/frbr/officielepublicaties/wsb/2014/wsb-2014-6756/1/html/maintiendrai-grijs.svg
https://repository.overheid.nl/frbr/officielepublicaties/h-ek/20182019/h-ek-20182019-22-8/1/metadataowms/metadata_owms.xml
htt

In [5]:
# Okay, lots of different things - can we quickly get a count of what seem to be the collections?
# The collection is taken to be the path segment right after '/frbr/', e.g. 'officielepublicaties'.
count_collections = collections.Counter()

# Iterate over the keys directly instead of first copying all of them into a list:
# the store holds millions of URLs, and we only need to look at each one once.
for url in frbr_fetched.keys():
    if url.startswith( 'https://repository.overheid.nl/frbr/' ):
        count_collections[ url.split('/')[4] ] += 1
    else:  # anything that does not live under the FRBR root
        count_collections[ 'unsorted' ] += 1

dict( count_collections )

{'cga': 132,
 'datacollecties': 9296,
 'lokalebekendmakingen': 659002,
 'officielepublicaties': 5882603,
 'samenwerkendecatalogi': 71599,
 'sgd': 422798,
 'tuchtrecht': 40846,
 'vd': 3580}

In [5]:
# Focus on just the officiele publicaties: count URLs per document category,
# which is the path segment right after 'officielepublicaties/'  (e.g. 'ah-tk', 'kst', 'stcrt')
OP_PREFIX = 'https://repository.overheid.nl/frbr/officielepublicaties/'

count_optypes = collections.defaultdict(int)

for url in frbr_fetched.keys():
    if not url.startswith( OP_PREFIX ):
        continue
    optype = url[ len(OP_PREFIX): ].split('/', 1)[0]
    count_optypes[ optype ] += 1

dict( count_optypes )

{'ag-ek': 3896,
 'ag-tk': 4596,
 'ag-vv': 120,
 'ag': 204,
 'ah-ek': 1748,
 'ah-tk': 334152,
 'ah': 333,
 'bgr': 40055,
 'blg': 181016,
 'gmb': 2687167,
 'h-ek': 37926,
 'h-tk': 88110,
 'h-vv': 208,
 'h': 20,
 'kst': 373683,
 'kv-ek': 460,
 'kv-tk': 153628,
 'kv': 93974,
 'nds-tk': 20254,
 'nds': 13234,
 'prb': 224810,
 'stb': 167642,
 'stcrt': 1049026,
 'trb': 64569,
 'wsb': 341776}

In [6]:
# It seems that we care only about ah-tk  (detailed elsewhere TODO: mention where),
# so collect those, and group the documents that belong to each identifier.
#
# The URLs look something like
#   'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20012002/ah-tk-20012002-1580/1/metadata/metadata.xml'
# where
#   we ignore '20012002'
#   we consider 'ah-tk-20012002-1580' the identifier
#   we ignore '1'
#   we consider 'metadata' the type
#
# Compiled once, with the dots in the host name escaped so they only match literal dots.
AH_TK_URL_RE = re.compile(
    r'https://repository\.overheid\.nl/frbr/officielepublicaties/ah-tk/'
    r'[^/]+/'      # date-like segment (e.g. '20012002'), ignored
    r'([^/]+)/'    # identifier
    r'[^/]+/'      # segment after the identifier (e.g. '1'), ignored
    r'([^/]+)/'    # type
)

ah_tk_sets = collections.defaultdict( list )   # identifier -> list of (type, url)

# 'ident' rather than 'id', so the builtin id() is not rebound for the rest of the session
for url in frbr_fetched.keys():
    m = AH_TK_URL_RE.match( url )
    if m is not None:
        ident, typ = m.groups()
        ah_tk_sets[ ident ].append( (typ, url) )


print( "Amount of document sets: %s"%len(ah_tk_sets) )

# as an illustration of what we just made:
random.sample(  list(ah_tk_sets.items()), 2  )

Amount of document sets: 83254


[('ah-tk-20142015-3240',
  [('html',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20142015/ah-tk-20142015-3240/1/html/ah-tk-20142015-3240.html'),
   ('metadata',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20142015/ah-tk-20142015-3240/1/metadata/metadata.xml'),
   ('metadataowms',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20142015/ah-tk-20142015-3240/1/metadataowms/metadata_owms.xml'),
   ('xml',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20142015/ah-tk-20142015-3240/1/xml/ah-tk-20142015-3240.xml')]),
 ('ah-tk-20022003-918',
  [('html',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20022003/ah-tk-20022003-918/1/html/ah-tk-20022003-918.html'),
   ('metadata',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20022003/ah-tk-20022003-918/1/metadata/metadata.xml'),
   ('metadataowms',
    'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20

In [7]:
# define some helper functions that we're about to use
def first_of_type(set, typ):
    """ Look up a URL by document type within one document set.

        `set` is a list of (type, url) tuples, `typ` is the type to look for, e.g. 'xml'.

        Returns the URL of the first tuple with that type (if there happen to be multiple, the rest are ignored),
        or None if no tuple has that type.
    """
    matching_urls = ( entry_url   for entry_type, entry_url in set   if typ == entry_type )
    return next( matching_urls, None )


def simpler_metadata(tree):
    """ Eases reading metadata.xml

        Looks at the <metadata> elements directly under `tree`. For each one that has a name attribute,
        the part of that name after the last '.' becomes the key, and its content attribute is collected under it.
        Elements without a name attribute are skipped.

        Returns a defaultdict:  short name -> list of content values (often just one)
    """
    simplified = collections.defaultdict(list)
    for node in tree.findall('metadata'):
        full_name = node.get('name')
        if full_name is not None:
            short_name = full_name.rsplit('.', 1)[-1]
            simplified[ short_name ].append( node.get('content') )
    return simplified


def simpler_metadata_owms(tree):
    """ Eases reading metadata_owms.xml

        Strips namespaces from `tree` (via wetsuite.helpers.etree.strip_namespace), then
        - picks out a few known elements by path, storing the text of all matches under a short key
          (a key is only set if its path matched at least one element)
        - additionally collects name -> content from any <metadata> elements directly under the root

        Returns a defaultdict:  key -> list of values (often just one)
    """
    ret = collections.defaultdict(list)
    tree = wetsuite.helpers.etree.strip_namespace(tree)

    # (path in the namespace-stripped tree,   key we store the matches under)
    for (path, key) in (
        ('owmskern/type',         'type'),
        ('owmsmantel/available',  'available'),
        # 'issued' used to read owmsmantel/available again, so it just duplicated the availability date
        ('owmsmantel/issued',     'issued'),
        # 'aanhangselNummer' used to read oep/available
        # NOTE(review): element name assumed to equal the key - confirm against a real metadata_owms.xml
        ('oep/aanhangselNummer',  'aanhangselNummer'),
        ('oep/indiener',          'indiener'),
        ('oep/ontvanger',         'ontvanger'),
    ):
        tags = tree.findall(path)
        if len(tags)>0:
            ret[key] = list( tag.text   for tag in tags )

    for metadata in tree.findall('metadata'):
        ret[ metadata.get('name') ].append( metadata.get('content') )
    return ret

In [8]:
# Collect most of the data that goes into the dataset later.
collected       = {}  # identifier -> detail_dict
count_postponed = 0   # documents without parsed questions/answers that contain one of POSTPONED_PHRASES

# Metadata keys copied into each item. Those in METADATA_SINGLE_VALUED are stored as their first value, the rest stay lists.
METADATA_KEYS          = ('issued', 'category', 'indiener', 'ontvanger', 'available','vergaderjaar', 'identifier', 'type')
METADATA_SINGLE_VALUED = ('available', 'issued', 'identifier' ,'category', 'vergaderjaar')

# Phrases used to recognize "we will answer / did answer that later" (or withdrawn) documents
POSTPONED_PHRASES = [
    'gestelde termijn ','gebruikelijke termijn',
    'zo spoedig mogelijk',
    'zijn beantwoord',
    'als ingetrokken worden beschouwd',
]

# Tags whose text is left out of the plain-text version (references and notes)
IGNORE_TAGS = ('extref','noot', 'noot.nr', 'noot.al')


def _number_and_text(node):
    """ Given a <vraag> or <antwoord> node, returns   (number_string, (plain_text, xml_fragment))
        or None if the node has no <nr> with text in it (which looks like mild abuse of the format).

        number_string is the <nr> text with every run of non-digits replaced by a space, e.g. '5 6' for grouped answers.
    """
    nr = node.find('nr')
    if nr is None or nr.text is None:
        return None
    number = re.sub('[^0-9]+', ' ', nr.text).strip() # removes the words and commas
    fragments = []
    for al in node.findall('al'):
        fragments.extend( wetsuite.helpers.etree.all_text_fragments( under_node=al, ignore_empty=True, ignore_tags=IGNORE_TAGS ) )
    plain_text = ('\n\n'.join(fragments)).strip().replace('\xa0',' ')
    return number, ( plain_text, wetsuite.helpers.etree.tostring(node, encoding='unicode') )


# 'doc_id' rather than 'id', so the builtin id() is not rebound for the rest of the session
for doc_id, urls_in_set in ah_tk_sets.items():

    ### Fetch metadata, and check whether it tells us there are answers in this set
    metadata_url = first_of_type(urls_in_set, 'metadata')
    xml_url      = first_of_type(urls_in_set, 'xml')
    if metadata_url is None or xml_url is None:
        # incomplete set (no metadata, or no xml): skip
        continue

    # start collecting the information going to the dataset
    item_merged = {
        'urls':[ metadata_url, xml_url ]
    }

    metadata_bytestring = frbr_fetched.get( metadata_url )
    metadata_tree       = wetsuite.helpers.etree.fromstring( metadata_bytestring )
    metadata_dict       = simpler_metadata( metadata_tree )
    for key, values in metadata_dict.items():
        if key in METADATA_KEYS:
            item_merged[key] = values[0] if key in METADATA_SINGLE_VALUED else values

    # .get(): metadata without any type is treated as "no answers" rather than raising KeyError
    if 'Antwoord' not in item_merged.get('type', ()):
        # no answers in this document set: skip
        continue

    ### There are answers, so actually fetch, parse, and collect
    xml_bytestring = frbr_fetched.get( xml_url )
    xml_tree       = wetsuite.helpers.etree.fromstring( xml_bytestring )

    # TODO/CONSIDER:
    # * what is kamervraagomschrijving?

    # two document layouts are looked for
    vragen     = xml_tree.findall('kamervragen/vraag')+xml_tree.findall('body/vragen/vraag')
    antwoorden = xml_tree.findall('kamervragen/antwoord')+xml_tree.findall('body/reactie/antwoord')
    if len(vragen)==0 or len(antwoorden)==0:
        # To see whether we are failing to parse any that are present, we filter out the "we will answer / did answer that later"
        #   and print what's left after that
        # NOTE(review): this passes the raw bytes together with str phrases - confirm contains_any_of handles that mix
        if wetsuite.helpers.strings.contains_any_of( xml_bytestring, POSTPONED_PHRASES ):
            count_postponed += 1
        else:
            print( f'WARN: {len(vragen)} questions, {len(antwoorden)} answers in {xml_url!r}' )

    va = collections.defaultdict(dict) # number -> {'vraag':(text, xml), 'antwoord':(text, xml)}
    for part_name, nodes in ( ('vraag', vragen), ('antwoord', antwoorden) ):
        for node in nodes:
            numbered = _number_and_text( node )
            if numbered is None:
                continue
            number, text_and_xml = numbered
            va[ number ][ part_name ] = text_and_xml

    if len(va)==0:
        print("WARN: No useful question data in %r"%xml_url)
    item_merged['vraagdata'] = dict(va) # defaultdict to dict

    collected[doc_id] = item_merged   # TODO: finish

    # TODO: put amounts of document sets, and amount of questions, in description.

WARN: No useful question data in 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20022003/ah-tk-20022003-1391-h1/1/xml/ah-tk-20022003-1391-h1.xml'
WARN: No useful question data in 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20022003/ah-tk-20022003-486/1/xml/ah-tk-20022003-486.xml'
WARN: No useful question data in 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20072008/ah-tk-20072008-1362/1/xml/ah-tk-20072008-1362.xml'
WARN: No useful question data in 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20092010/ah-tk-20092010-1000/1/xml/ah-tk-20092010-1000.xml'
WARN: No useful question data in 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20092010/ah-tk-20092010-1001/1/xml/ah-tk-20092010-1001.xml'
WARN: No useful question data in 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20092010/ah-tk-20092010-1002/1/xml/ah-tk-20092010-1002.xml'
WARN: No useful question data in 'https://repository.o

In [9]:
print( "Number of document sets: %d\n"%len(collected) )

# Print a few random examples.
# random.sample() needs a sequence (dict views raise TypeError since Python 3.11), hence the list().
# The min() keeps this working when fewer than three document sets were collected.
for doc_id, detail_dict in random.sample( list(collected.items()), min(3, len(collected)) ):
    print( doc_id )
    pprint.pprint( detail_dict )

Number of document sets: 38247

ah-tk-20202021-876
{'available': '2020-11-24',
 'category': 'Zorg en gezondheid | Organisatie en beleid',
 'identifier': 'ah-tk-20202021-876',
 'indiener': ['A.E. Diertens'],
 'issued': '2020-11-23',
 'ontvanger': ['P. Blokhuis'],
 'type': ['officiële publicatie', 'Antwoord', 'Aanhangsel van de Handelingen'],
 'urls': ['https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20202021/ah-tk-20202021-876/1/metadata/metadata.xml',
          'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20202021/ah-tk-20202021-876/1/xml/ah-tk-20202021-876.xml'],
 'vergaderjaar': '2020-2021',
 'vraagdata': {'1': {'antwoord': ('Ja',
                                  '<antwoord '
                                  'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n'
                                  '      <nr status="officieel">Antwoord '
                                  '1</nr>\n'
                                  '      <al>Ja</al>\n'
         

In [13]:

# The store that becomes the dataset:  document identifier (e.g. 'ah-tk-20102011-2857') -> detail dict
kamervragen_dataset = wetsuite.helpers.localdata.MsgpackKV('tweedekamer-kamervragen-struc.db', key_type=str )

kamervragen_dataset._put_meta( 'description_short', ''' Questions from members of the parliament (tweede kamer) to the government. ''' )

# The long description is what users of the dataset read, so the structure example in it is kept well-formed
kamervragen_dataset._put_meta( 'description', '''
    Questions from members of the parliament (tweede kamer) to the government.

    Source: Officiele publicaties, specifically the ah-tk documents (aanhangsels, tweede kamer), as fetched from the KOOP repositories,
    and further restricted to the documents that actually contain answers
    (so ignoring documents that are only the questions, or only the questions and a note that they will be answered later).

    The structure of .data looks something like:
       {'ah-tk-20102011-2857': {
            'identifier': 'ah-tk-20102011-2857',
            'urls':   [ 'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20102011/ah-tk-20102011-2857/1/metadata/metadata.xml',
                        'https://repository.overheid.nl/frbr/officielepublicaties/ah-tk/20102011/ah-tk-20102011-2857/1/xml/ah-tk-20102011-2857.xml',
                        ...
                      ],
            'issued':       '2011-06-17',
            'available':    '2011-06-23',
            'vergaderjaar': '2010-2011',
            'category':     'Zorg en gezondheid | Jongeren',
            'type':         ['officiële publicatie', 'Antwoord', 'Aanhangsel van de Handelingen'],
            'indiener':     ['A.G. Wolbert'],
            'ontvanger':    ['M.L.L.E. Veldhuijzen van Zanten-Hyllner'],
            'vraagdata':    {
                '1': {
                      'vraag':    ['Bent u bekend met de tv-uitzending, waarin o.a. de moeder van Lucas haar verhaal doet?', '<vraag> Bent ...'],
                      'antwoord': ['Ja.',  '<antwoord> Ja. ...']
                },
                '2': {
                      'vraag':    ['Staat u nog steeds achter het principe dat mensen mogen kiezen uit zorg in een instelling en/of natura?', '<vraag> ...'],
                      'antwoord': ['Mijn uitgangspunt blijft dat cliënten mogen kiezen bij welke zorgaanbieder zij de zorg in natura willen afnemen.', '<antwoord> ...']
                },
                ...
            }
        },
        ...
       }

    Note that
    * the 'vraag' and 'antwoord' give a list of two items:
      * plain text, stripped of some references and notes, to have more natural text
      * the XML fragment it was taken from, in case you care about those references and notes

    * the keys in vraagdata are strings, not numbers directly, because...

    * questions and/or answers may be grouped, so you may e.g. see questions numbered '5' and '6' and an answer '5 6'.
      In this data structure, that means '5' and '6' will have only a 'vraag', and '5 6' will have only an 'antwoord'.
      This currently affects roughly 5% of questions.

    * The URLs mentioned are those that were actually used.  There are more, e.g. .pdf, .odt, html, and owms metadata


    TODO:
    * There are non-numbered questions in the XML, which are currently omitted - investigate and fix
    * summarize how complete this data is with its source in general
    * there are mistakes in the XML we could fix (e.g. use of al instead of nr)

    ''')

# Store every collected document set under its identifier
# ('doc_id' rather than 'id', so the builtin id() is not rebound for the rest of the session)
for doc_id, detail_dict in collected.items():
    kamervragen_dataset.put( doc_id, detail_dict )