In [3]:
import re, datetime, time, json

import bs4

import wetsuite.helpers.localdata
import wetsuite.helpers.net
import wetsuite.helpers.format

In [4]:
rvs_fetched = wetsuite.helpers.localdata.open_store('rvs_fetched.db', str, bytes)

## Go through the webpage list of all advices; fetch

In [None]:
maxpage  = 0        # will be set to the real number from the first (well, every) page we fetch
cur_page = 0        # zero-based counting in the pagination

while cur_page <= maxpage:
    page_url = 'https://www.raadvanstate.nl/adviezen/?pager_page=%d&pager_rows=100'%cur_page
    page_data = wetsuite.helpers.net.download(page_url)
    soup = bs4.BeautifulSoup( page_data, 'lxml' )

    # get the amount of pages, from the pagination links
    pager_links = soup.select('a.pager_step')
    for pager_link in pager_links:
        try:
            data_page = pager_link.get('data-page')
            maxpage = max( maxpage, int(data_page))
        except ValueError as ve:
            print( "WARNING: didn't understand %r as page number (%s)"%(data_page, ve) )

    print( "\nPAGE %d of %d"%( cur_page+1, maxpage+1 ) ) # numbering is zero-based,  print out one-based for humans


    # fetch all links to specific case detail pages
    for detail_page_a in soup.select('a[href*="/adviezen/@"]'):   # links that look like https://www.raadvanstate.nl/adviezen/@133837/w02-22-00162-ii/
        detail_page_url = detail_page_a.get('href') # these are already absolute  (otherwise we'd have to urljoin them)

        if '#' in detail_page_url:
            detail_page_url = detail_page_url.split('#',1)[0]

        # there seems to be nothing on the search result page that isn't on the detail pages, so we can just fetch now, and handle each individually later
        # Such pages contain both samenvatting and volledigetekst, and the links to them are just #hash that presumably scripting pays attention to
        bytedata, is_cached = wetsuite.helpers.localdata.cached_fetch( rvs_fetched, detail_page_url )
        if not is_cached:
            print(' FETCHED  - ', end='')
            time.sleep(0.2)
        else:
            print(' CACHED   - ', end='')

        # debug - print a list of URLs with their titles
        print('%-70s '%detail_page_url, end='')
        print('%s '%detail_page_a.text)

    cur_page += 1

## Go through fetched pages, massage into dataset

In [18]:
dataset = {}

import os
os.environ['PATH'] = os.environ['PATH']+':/var/www/coding'
import helpers_progressbar
pb = helpers_progressbar.ProgressBar(len(rvs_fetched))

for page_url in rvs_fetched.keys():
    pb.increment()
    pb.simple()
    page_data = rvs_fetched[page_url]

    kenmerk = None
    soup = bs4.BeautifulSoup( page_data, 'lxml' )

    title = soup.select('div.rol-paginatitel h1.grid-title')[0].text        

    meta = {'trefwoorden':[]}
    links = []
    #images = []

    last_dt = ''
    metadata_blok_dl = soup.find('div',  attrs={'class':re.compile(r'\brol-metadata-blok\b')}).find('dl')
    for ch in metadata_blok_dl.findAll(['dt','dd']):
        if ch.name == 'dt':
            last_dt = ch.text.strip()
        elif ch.name == 'dd':
            val = ch.text.strip()
            try:
                maanden = {
                    'januari':1, 'februari':2, 'maart':3, 'april':4,  'mei':5,  'juni':6, 'juli':7,
                    'augustus':8, 'september':9, 'october':10, 'oktober':10, 'november':11, 'december':12,
                }
                if 'Datum' in last_dt:
                    day, month, year = val.split()
                    if month.lower() in maanden:
                        month = '%02d'%maanden[month.lower()]
                    else:
                        raise ValueError('not doing half a parse')
                    val = '%04d-%s-%02d'%(int(year), month, int(day))

                meta[last_dt] = val

            except ValueError as e: # assume the date didn't manage to parse - we set the string as we got it
                print("Didn't parse %r as date: %e"%(val, e))
                meta[last_dt] = val 
                
        else:
            raise ValueError("Don't understand dd child %r"%ch.name)

    trefwoorden_ul = soup.find('ul',  attrs={'class':re.compile(r'\btrefwoorden\b')})
    for ch in trefwoorden_ul.findAll('li'):
        meta['trefwoorden'].append( ch.text.strip() )

    kenmerk = meta['Kenmerk']

    if 0:
        samenvatting_pars = []
        samenvatting   = soup.find(id='samenvatting')
        if samenvatting is not None:
            samenvatting_div = samenvatting.find(attrs={'class':re.compile(r'\biprox-content\b')})
            for ch in samenvatting_div.children:
                if ch.name not in ('p h2 h3 ol'.split()):
                    pass
                    #print( [ch.name, ch]  )
                else:
                    samenvatting_pars.append( ' '.join( ch.findAll(text=True) ) )
        
    pars = []
    curpar = []
    def flush_curpar():
        global pars, curpar
        if len(curpar)>0:
            pars.append( ' '.join(curpar) )
            curpar=[]

    # note: headers generally are a <strong> - but they may not be part of the <p> you think.
    volledigetekst = soup.find(id='volledigetekst')
    if volledigetekst is not None:
        volledigetekst_div = volledigetekst.find(attrs={'class':re.compile(r'\biprox-content\b')})
        #print( volledigetekst )
        for ch in volledigetekst_div.children:
            if ch.name not in ('p'.split()):
                pass
                #print( [ch.name, ch]  )
            else:
                # CONSIDER: separate voetnoten
                for thing in ch.contents:
                    if type(thing) is bs4.NavigableString:
                        curpar.append( str(thing) ) # TODO: is that the best way to get the text?

                    elif thing.name == 'em':
                        curpar.append( thing.text )

                    elif thing.name == 'span': # TODO: check what these actually are
                        curpar.append( thing.text )

                    elif thing.name == 'sub': # TODO: current code will separate that, which it probably shouldn't be.
                        curpar.append( thing.text )

                    elif thing.name == 'br':
                        flush_curpar()

                    elif thing.name == 'a':
                        links.append( thing.get('href') )
                        curpar.append( thing.text )

                    elif thing.name == 'img':
                        pass
                        #img_abs = urllib.parse.urljoin( page_url , thing.get('src'))  # TODO: look at all the attributes if we want to use it

                    elif thing.name == 'strong':
                        # see if it's a header
                        flush_curpar()
                        curpar.append( 
                                #'[%s]'%
                                thing.text
                        )
                        flush_curpar()

                    #elif type(thing) is bs4.NavigableString:
                    #    curpar.append( thing.text )
                    else:
                        raise ValueError( "Don't know what to do with %r"%thing)
                flush_curpar()

        #for par in pars:       
        #   print( par )
        #    print()

    if kenmerk is None:
        raise ValueError( "No kenmerk for %u"%page_url )

    dataset[kenmerk] = {
        'title':title,
        'url':page_url,
        'meta':meta,
        'body':pars,
        'links':links,
    }
    #pprint.pprint(dataset[kenmerk])




ModuleNotFoundError: No module named 'helpers_progressbar'

## Write dataset into file

In [None]:

dataset= {
    'descrition':'''
These are a parsed form of Raad van State (state council) advice,
specifically the set of documents under https://www.raadvanstate.nl/adviezen
scraped into plain-text documents. 

Items look like:    
'W01.19.0027/I': {'title': 'Voorstel van wet van het lid [...]',
                'url': 'https://www.raadvanstate.nl/adviezen/@113252/w01-19-0027/'
                'body': ['Bij brief van de voorzitter van de [...]',  # a list of paragraph-like fragments. 
                        ],
                'links': ['http://www.rijksoverheid.nl/documenten/rapporten/2015/11/19/het-lokale-referendum-in-Nederland,).(156'],
                'meta': {'Kenmerk': 'W01.19.0027/I',
                            'trefwoorden': ['Algemene zaken', 'Initiatiefwet']
                            'Datum aanhangig': '2019-01-30',
                            'Datum advies': '2019-09-18',
                            'Datum vastgesteld': '2019-09-18',
                            'Datum publicatie': '2019-10-28',
                            'Vindplaats': 'Kamerstukken II 2019/20, 35129, nr. 4', #  if at scraping time this was not settled, it will probably say "Website Raad van State" instead
                        },
                },

This dataset generated on %s
    '''%datetime.date.today().strftime('%Y-%m-%d'),
    'data':dataset,
}


with open('raadvanstate_adviezen.json', 'w') as wf:
    wf.write( json.dumps(dataset) ) 
wf.close()