In [1]:
import re, datetime, time, json, random, pprint, urllib.parse

import bs4

import wetsuite.helpers.localdata
import wetsuite.helpers.etree
import wetsuite.helpers.net
import wetsuite.helpers.format
import wetsuite.helpers.notebook

In [2]:
# Local key-value store backed by 'rvs_fetched.db'. Later cells use it as the download cache:
# keys are detail-page URLs, values are the page data as fetched.
# NOTE(review): the str / bytes arguments are presumably the key and value types -- confirm against LocalKV's documentation.
rvs_fetched = wetsuite.helpers.localdata.LocalKV('rvs_fetched.db', str, bytes)

In [3]:
class ExtractParagraphs:
    ''' Takes (part of) an advice HTML page and outputs text.

        Helps fetch the "Volledige tekst" part of a BeautifulSoup-parsed HTML page, with minor interpretation of its HTML
        (that node logic is somewhat explicit to be able to refine that - and to have it explicit what we're not handling yet:
         any tag we do not know about raises a ValueError rather than being silently dropped)

        This is a class mostly to be a collection of related data without cluttering a scope.

        After calling _handle():
          - pars    is the list of paragraph-like text fragments collected so far
          - links   is the list of absolute URLs of the <a href> links that were seen
          - curpar  is the list of text pieces of the paragraph still being built (empty once flushed)
    '''
    # Inline markup: its text is added to the paragraph being built.
    # TODO: sub/sup will end up space-separated from the surrounding text, which it probably shouldn't be.
    _INLINE_TAGS    = frozenset( ('em', 'sub', 'sup') )

    # Its text becomes a paragraph of its own.
    # TODO: check what the spans actually are
    # TODO: see if a strong is a header. Currently sort of assumed to be at least a paragraph splitter
    #       note: headers generally are a <strong> - but they may not be part of the <p> you think.
    _OWN_PAR_TAGS   = frozenset( ('span', 'strong', 'h2', 'h3', 'h4') )

    # Only ends the paragraph being built.
    _BREAK_TAGS     = frozenset( ('br', 'hr') )

    # Currently ignored. Maybe collect these into something like self.images?
    _IGNORED_TAGS   = frozenset( ('img', 'figure') )

    # Structure we just descend into (not the cleanest; thead could maybe be ignored instead).
    _CONTAINER_TAGS = frozenset( ('ol', 'ul', 'li', 'p', 'div', 'table', 'tbody', 'thead', 'tr') )

    # Table cells: descended into, and always end a paragraph. TODO: check this makes sense.
    _CELL_TAGS      = frozenset( ('td', 'th') )

    def __init__(self):
        self.pars   = []
        self.curpar = []
        self.links  = []

    def flush_curpar(self):
        ' If any text was collected for the current paragraph, join it with spaces, store it in pars, and start a new one. '
        if len(self.curpar)>0:
            self.pars.append( ' '.join(self.curpar) )
            self.curpar = []

    def _handle(self, elem, page_url):
        ''' Walks the direct children of elem (recursing into container tags), adding to self.pars and self.links.

            Generally expects to be handed a <p>, or something that contains them.

            @param elem:     a bs4 node; only its .contents is read here
            @param page_url: URL of the page this came from; used to make link URLs absolute, and in error messages
            @raise ValueError: when we meet a node we have no rule for
        '''
        for thing in elem.contents:
            if isinstance(thing, bs4.Comment):
                # HTML comments are not document text; skip them rather than abort the whole page on them
                continue

            if type(thing) is bs4.NavigableString:
                self.curpar.append( str(thing) ) # TODO: is that the best way to get the text?
                continue

            name = thing.name

            if name in self._INLINE_TAGS:
                self.curpar.append( thing.text )

            elif name in self._OWN_PAR_TAGS:
                self.flush_curpar()
                self.curpar.append( thing.text )
                self.flush_curpar()

            elif name in self._BREAK_TAGS:
                self.flush_curpar()

            elif name == 'a':
                href = thing.get('href')
                # An <a> without href (e.g. a named anchor) is not a link.
                # urljoin() would hand back page_url itself for it, so only resolve real hrefs.
                if href:
                    self.links.append( urllib.parse.urljoin(page_url, href) ) # resolve in context of page
                self.curpar.append( thing.text )

            elif name in self._IGNORED_TAGS:
                pass

            elif name in self._CONTAINER_TAGS:
                self._handle(thing, page_url)

            elif name in self._CELL_TAGS:
                self._handle(thing, page_url)
                self.flush_curpar()

            else:
                raise ValueError( "Don't know what to do with %r (on %r)"%(thing, page_url))

        self.flush_curpar() # TODO: check this always makes sense.


# Month name -> month number. Dutch names, with 'october' accepted as an alternative spelling.
_MAANDEN = {
    'januari':1, 'februari':2, 'maart':3, 'april':4,  'mei':5,  'juni':6, 'juli':7,
    'augustus':8, 'september':9, 'october':10, 'oktober':10, 'november':11, 'december':12,
}


def _dutch_date_to_iso( val ):
    ''' Turns text like '26 april 2000' into '2000-04-26'.
        Raises ValueError when the text is not exactly "day monthname year" with a month name we know.
    '''
    day, month, year = val.split()      # ValueError unless there are exactly three parts
    month_number = _MAANDEN.get( month.lower() )
    if month_number is None:
        raise ValueError('not doing half a parse')
    return '%04d-%02d-%02d'%(int(year), month_number, int(day))


def extract_via_store( page_url ):
    ''' Given the URL to an advice,
        this fetches it from a store (the module-level rvs_fetched, so the page must already be in there),
        and extracts all interesting metadata, and text.

        Returns a dict with keys
          - 'url':     page_url as given
          - 'kenmerk': the value of the 'Kenmerk' metadata field
          - 'title':   text of the page title heading
          - 'meta':    dict of the metadata fields as they are named on the page, plus 'trefwoorden' (a list, possibly empty).
                       Fields with 'Datum' in their name are rewritten to YYYY-MM-DD when they parse,
                       and kept as the text we found when they do not.
          - 'body':    list of paragraph-like text fragments from the "Volledige tekst" part
          - 'links':   list of absolute URLs linked to from that part

        Raises ValueError if the page has no Kenmerk, or no "Volledige tekst" part to take text from.

        CONSIDER: separate voetnoten
    '''
    page_data = rvs_fetched.get( page_url )
    soup = bs4.BeautifulSoup( page_data, 'lxml' )

    title = soup.select('div.rol-paginatitel h1.grid-title')[0].text

    meta = {'trefwoorden':[]}

    ## Pick out metadata
    # The <dl> alternates <dt> (field name) and <dd> (field value), so remember the last name we saw
    last_dt = ''
    metadata_blok_dl = soup.find('div',  attrs={'class':re.compile(r'\brol-metadata-blok\b')}).find('dl')
    for ch in metadata_blok_dl.find_all(['dt','dd']):
        if ch.name == 'dt':
            last_dt = ch.text.strip()
        elif ch.name == 'dd':
            val = ch.text.strip()
            if 'Datum' in last_dt:
                try:
                    val = _dutch_date_to_iso( val )
                except ValueError as e: # the date didn't manage to parse - we set the string as we got it
                    # %s, not %e: %e is a float conversion, and would itself raise TypeError on an exception
                    print("Didn't parse %r as date: %s"%(val, e))
            meta[last_dt] = val
        else:
            raise ValueError("Don't understand dd child %r"%ch.name)

    trefwoorden_ul = soup.find('ul',  attrs={'class':re.compile(r'\btrefwoorden\b')})
    if trefwoorden_ul is not None:       # no keyword list on the page just means no keywords
        for ch in trefwoorden_ul.find_all('li'):
            meta['trefwoorden'].append( ch.text.strip() )

    # .get(), so that a missing Kenmerk reaches the intended error below instead of a KeyError
    kenmerk = meta.get('Kenmerk')
    if kenmerk is None:
        raise ValueError( "No kenmerk for %r"%page_url )


    ## Pick out "Volledige tekst" paragraphs
    volledigetekst = soup.find(id='volledigetekst')
    if volledigetekst is None:
        raise ValueError('ERROR: no #volledigetekst')

    volledigetekst_div = volledigetekst.find( attrs={'class':re.compile(r'\biprox-content\b')} )
    if volledigetekst_div is None:
        raise ValueError('ERROR: no iprox-content inside #volledigetekst (on %r)'%page_url)

    ep = ExtractParagraphs()
    ep._handle( volledigetekst_div, page_url )

    return {
        'url':    page_url,
        'kenmerk':kenmerk,
        'title':  title,
        'meta':   meta,
        'body':   [ par.replace('\xa0',' ').strip()   for par in ep.pars ],   # non-breaking spaces become plain ones
        'links':  ep.links,
    }

## Go through the website's paginated listing of all advices, and fetch each case

In [4]:
maxpage  = 0        # highest page index seen in the pagination links so far; starts at 0 and grows as we fetch pages
cur_page = 0        # note: zero-based counting in the pagination

pbar = None         # slight trickery with the progress bar (creating it after we've started work),
                    # because we won't know the max until after the first fetch

count_fetched, count_cached = 0, 0

while cur_page <= maxpage:
    page_url = 'https://www.raadvanstate.nl/adviezen/?pager_page=%d&pager_rows=100'%cur_page
    page_data = wetsuite.helpers.net.download(page_url)
    soup = bs4.BeautifulSoup( page_data, 'lxml' )

    # get the amount of pages, from the pagination links
    for pager_link in soup.select('a.pager_step'):
        data_page = pager_link.get('data-page')
        try:
            maxpage = max( maxpage, int(data_page) )
        except (TypeError, ValueError) as err:
            # TypeError: the link has no data-page attribute at all (int(None));  ValueError: it is not a number
            print( "WARNING: didn't understand %r as page number (%s)"%(data_page, err) )
            continue

        if pbar is None and maxpage > 5:
            pbar = wetsuite.helpers.notebook.progress_bar( maxpage, description='fetching pages...')

    #print( "\nPAGE %d of %d"%( cur_page+1, maxpage+1 ) ) # numbering is zero-based,  print out one-based for humans

    # fetch all links to specific case detail pages  -- URLs that look like https://www.raadvanstate.nl/adviezen/@133837/w02-22-00162-ii/
    detail_pages = {} # url -> text
    for detail_page_a in soup.select('a[href*="/adviezen/@"]'):
        url  = detail_page_a.get('href').split('#')[0]   # already absolute  (otherwise we'd have to urljoin them).   Anchor strip is also dedupe
        text = detail_page_a.text.strip()
        detail_pages[ url ] = text

    for detail_page_url, detail_page_linktext in detail_pages.items():
        # there seems to be nothing on the search result page that isn't on the detail pages, so we can just fetch now, and handle each individually later
        # Such pages contain both samenvatting and volledigetekst, and the links to them are just #hash that presumably scripting pays attention to
        bytedata, was_cached = wetsuite.helpers.localdata.cached_fetch( rvs_fetched, detail_page_url )
        if not was_cached:
            count_fetched += 1
            time.sleep( 1 ) # be somewhat nice to the server
        else:
            count_cached += 1
            # TODO: if "Vindplaats" is just "Website Raad van State" (and date is relatively recent), try re-fetching.

        if 0: # debug - print a list of URLs with their titles
            print('%-70s '%detail_page_url, end='')
            print('%s '%detail_page_linktext)
            print( extract_via_store( detail_page_url ) )
            print()

    cur_page += 1
    if pbar is not None:
        pbar.value = cur_page
        pbar.description = 'fetching pages (%d cases fetched, %d cases cached)...'%(count_fetched, count_cached)

    #if cur_page > 2: # when testing changes
    #    break

fetching pages...:   0%|          | 0/119 [00:00<?, ?it/s]

## Go through fetched pages, massage into dataset

In [5]:
# kenmerk -> extracted item, for every page in the fetch store.
# Note that pages that share a kenmerk overwrite each other here: the last one handled wins.
collected = {}

pbar = wetsuite.helpers.notebook.progress_bar( len(rvs_fetched), description='massaging...' )

for fetched_url in rvs_fetched.keys():
    # all the real work is in extract_via_store; this loop is just bookkeeping around it
    extracted = extract_via_store( fetched_url )
    collected[ extracted.get('kenmerk') ] = extracted
    pbar.value = pbar.value + 1

massaging...:   0%|          | 0/11928 [00:00<?, ?it/s]

In [27]:
# Spot check on the parse: pretty-print two (kenmerk, item) pairs picked at random
pprint.pprint( random.sample( [*collected.items()], k=2 ) )

[('W01.00.0162/I',
  {'body': ['Voorstel van Rijkswet inzake het verlenen van toestemming aan '
            'Prins Bernhard Lucas Emmanuel van Oranje-Nassau, Van Vollenhoven '
            'om een huwelijk aan te gaan met Annette Sekrève.',
            'Dit advies is een zogenoemd advies conform.',
            'Dit betekent dat de tekst van het advies "zonder meer instemmend '
            'luidt, dan wel uitsluitend opmerkingen van redactionele aard '
            'bevat". Openbaarmaking van een advies conform blijft achterwege '
            '(artikel 25a, vierde lid, van de Wet op de Raad van State). De '
            'tekst van het advies wordt dus nergens gepubliceerd, niet in het '
            'Bijvoegsel van de Staatscourant en niet in de Kamerstukken.',
            ''],
   'kenmerk': 'W01.00.0162/I',
   'links': [],
   'meta': {'Datum advies': '2000-04-26',
            'Kenmerk': 'W01.00.0162/I',
            'Vindplaats': 'Niet van toepassing',
            'trefwoorden': ['Algemene 

## Write dataset into file

In [18]:
# Output store for the dataset, backed by 'rvs_extracted.db'. The cells below fill it with kenmerk -> extracted item.
# NOTE(review): the str / None arguments are presumably the key type and "no fixed value type" -- confirm against MsgpackKV's documentation.
rvs_extracted = wetsuite.helpers.localdata.MsgpackKV('rvs_extracted.db', str, None)

In [28]:
# Store a human-readable description of the dataset alongside the data.
# The key has to be spelled 'description' (it used to say 'descrition'), or nothing looking for the description will find it.
rvs_extracted._put_meta('description', '''
These are a parsed form of Raad van State (state council) advice,
specifically the set of documents under https://www.raadvanstate.nl/adviezen
scraped into plain-text documents. 

Items look like:    
'W01.19.0027/I': {'title': 'Voorstel van wet van het lid [...]',
                'url': 'https://www.raadvanstate.nl/adviezen/@113252/w01-19-0027/'
                'body': ['Bij brief van de voorzitter van de [...]',  # a list of paragraph-like fragments. 
                        ],
                'links': ['http://www.rijksoverheid.nl/documenten/rapporten/2015/11/19/het-lokale-referendum-in-Nederland,).(156'],
                'meta': {'Kenmerk': 'W01.19.0027/I',
                            'trefwoorden': ['Algemene zaken', 'Initiatiefwet']
                            'Datum aanhangig': '2019-01-30',
                            'Datum advies': '2019-09-18',
                            'Datum vastgesteld': '2019-09-18',
                            'Datum publicatie': '2019-10-28',
                            'Vindplaats': 'Kamerstukken II 2019/20, 35129, nr. 4', #  if at scraping time this was not settled, it will probably say "Website Raad van State" instead
                        },
                },

This dataset generated on %s
    '''%datetime.date.today().strftime('%Y-%m-%d'))

In [29]:
# Put every collected item into the output store without committing each one, then commit once at the end.
for kenmerk, item in collected.items():
    rvs_extracted.put( kenmerk, item, commit=False )

rvs_extracted.commit()

In [30]:
# Double check that the write worked: print how many items the store holds,
# then show three randomly picked items as this cell's output.
print( len(rvs_extracted) )
rvs_extracted.random_sample(3)

11872


[('W06.19.0380/III',
  {'url': 'https://www.raadvanstate.nl/adviezen/@118751/w06-19-0380-iii/',
   'kenmerk': 'W06.19.0380/III',
   'title': 'Fiscale verzamelwet 2021.',
   'meta': {'trefwoorden': ['Financiën', 'Wet'],
    'Kenmerk': 'W06.19.0380/III',
    'Datum aanhangig': '2019-11-26',
    'Datum vastgesteld': '2020-02-19',
    'Datum advies': '2020-02-20',
    'Datum publicatie': '2020-04-14',
    'Vindplaats': 'Kamerstukken II 2019/20, 35437, nr. 4'},
   'body': ['Bij Kabinetsmissive van 26 november 2019, no.2019002486, heeft Uwe Majesteit, op voordracht van de Staatssecretaris van Financiën, bij de Afdeling advisering van de Raad van State ter overweging aanhangig gemaakt het voorstel van wet tot wijziging van enkele belastingwetten en enige andere wetten, alsmede invoering grondslag voor compensatieregeling (Fiscale verzamelwet 2021), met memorie van toelichting.',
    'Het voorstel van wet omvat wijzigingen van verschillende wetten. Naast enkele wijzigingen van meer inhoudelijk