In [16]:
import collections, pprint, random

import wetsuite.datasets
import wetsuite.helpers.etree
import wetsuite.helpers.koop_parse

## CVDR

In [28]:

cvdr_xml = wetsuite.datasets.load('cvdr-mostrecent-xml')

cvdr_parsed = [] # list of (source url, etree object) tuples

cvdr_urls = cvdr_xml.data.keys()
cvdr_urls_subset = random.sample(cvdr_urls, 2000) # 160K is a bit much in RAM, and a sizeable random selection should be enough

for cvdr_url in cvdr_urls_subset: 
    bytestring = cvdr_xml.data.get( cvdr_url )
    tree = wetsuite.helpers.etree.fromstring( bytestring )
    tree = wetsuite.helpers.etree.strip_namespace( tree )
    meta = wetsuite.helpers.koop_parse.cvdr_meta( tree, flatten=True )
    tables = tree.xpath('//table')
    if len(tables)>0:
        cvdr_parsed.append( (cvdr_url, tree, meta['identifier'], tables) )

#cvdr_parsed

### Figure out which things have tables

and roughly how many and roughly are where they are in the document

In [29]:
print("%d tables in %d (of %d) CVDRs "%( 
    sum( len(tables)   for cvdr_url, tree, exprid, tables in cvdr_parsed ),
    len( cvdr_parsed ),
    len( cvdr_urls_subset )
))

2693 tables in 519 (of 2000) CVDRs 


In [42]:
cvdr_parent_names_count = collections.defaultdict( int )

for cvdr_url, tree, exprid, tables in cvdr_parsed:
    parent_names = list(table.getparent().tag  for table in tables)
    #print( '%-20s %s'%( exprid, ', '.join( parent_names  ) ))
    for parent_name in parent_names:
        cvdr_parent_names_count[parent_name] += 1

#pprint.pprint( list(cvdrs_with_table.items()) )

In [43]:
cvdr_parent_names_count

defaultdict(int,
            {'artikel': 724,
             'structuurtekst': 439,
             'bijlage': 1122,
             'lid': 97,
             'nota-toelichting': 144,
             'li': 79,
             'tekst': 19,
             'divisie': 69})

## BWB

Make a preselection of BWBRs with tables, so that we don't have to this each time

At the same time, get an idea of what node that table is nested in.

In [None]:
# load BWB documents
#   there are currently roughly 37k active toestanden.   
#   all of it in one go takes a while, and takes a lot of RAM, so let's make a selection.

bwb_parsed = []

bwb_xml = wetsuite.datasets.load('bwb-mostrecent-xml')
bwb_urls = bwb_xml.data.keys()
bwb_urls_subset = random.sample(bwb_urls, 50)

for bwb_url in bwb_urls_subset:
    bytestring = bwb_xml.data.get( bwb_url )
    tree = wetsuite.helpers.etree.fromstring( bytestring )
    tree = wetsuite.helpers.etree.strip_namespace( tree )
    bwb_parsed.append( (bwb_url, tree) )

    meta = wetsuite.helpers.koop_parse.bwb_toestand_usefuls( tree )
    tables = tree.xpath('//table')
    print(meta)
    if len(tables)>0:
        bwb_parsed.append( (bwb_url, tree, meta['bwb-id'], tables) )

print('DONE parsing %d items'%len(bwb_parsed))

In [40]:
bwb_parent_names_count = collections.defaultdict( int )

for cvdr_url, tree, bwbid, tables in cvdr_parsed:
    parent_names = list(table.getparent().tag   for table in tables)
    for parent_name in parent_names:
        bwb_parent_names_count[parent_name] += 1

bwb_parent_names_count
#pprint.pprint( list(cvdrs_with_table.items()) )

defaultdict(int,
            {'artikel': 724,
             'structuurtekst': 439,
             'bijlage': 1122,
             'lid': 97,
             'nota-toelichting': 144,
             'li': 79,
             'tekst': 19,
             'divisie': 69})

The tables seem to be DocBook style, see e.g. 
https://www.oasis-open.org/docbook/documentation/reference/html/table.html
https://tdg.docbook.org/tdg/4.5/table.html

but maybe just grab the rendered version and use pandas?
https://pbpython.com/pandas-html-table.html