# Load Python packages

In [None]:
# Show the installation prefix of the Python environment this kernel runs in.
import sys
print(sys.prefix)

In [None]:
import glob
import os
import re
import shutil
import sys
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdfplumber
import requests
import seaborn as sns

# Set up the working directory

In [None]:
# Project root: two levels above the current working directory.
base_dir = os.path.realpath('../..')
print(base_dir)
# Data folder below the project root (Windows-style separator). The backslash
# is escaped explicitly because '\D' is an invalid escape sequence; the
# resulting string is unchanged.
data_dir = base_dir + '\\Data'

In [None]:
# Echo the data directory so the path can be checked by eye.
data_dir

# Find documents on website

Collect the document URLs from the tweedekamer.nl search function.

In [None]:
# Search-result URL, split per query parameter for readability. It ends in
# 'sta=' so that a start index can be appended to it.
base_search_page_URL = (
    'https://www.tweedekamer.nl/kamerstukken/commissieverslagen'
    '?qry=%2A'
    '&fld_prl_kamerstuk=Commissieverslagen'
    '&fld_tk_categorie=kamerstukken'
    '&srt=date%3Adesc%3Adate'
    '&fld_prl_voortouwcommissie=vaste+commissie+voor+Volksgezondheid%2C+Welzijn+en+Sport'
    '&fromdate=01%2F01%2F2005'
    '&clusterName=Tweedekamer.nl'
    '&sta='
)
# Download endpoint; a document id is appended to it.
base_doc_URL = 'https://www.tweedekamer.nl/downloads/document' + '?id='

In [None]:
# Walk the search result pages and collect, for every document link found,
# its listing date, document id and download URL.
url_columns = ['page', 'doc', 'doc_date', 'doc_id', 'URL']
# Start offsets of the result pages, in steps of 15.
search_start_indices = np.arange(1,800,15) #613
page_frames = []
for pi, si in enumerate(search_start_indices):
    print(pi, si)
    search_page_URL = base_search_page_URL + str(si)
    print(search_page_URL)
    print('')
    # The second positional argument of requests.get is `params`; passing
    # 'html.parser' there appended '&html.parser' to the query string. The page
    # is scanned with regular expressions, so no parser is involved at all.
    req = requests.get(search_page_URL)
    doc_dates = re.findall(r'>([a-zA-Z0-9 ]+)</time>', req.text)
    doc_ids = re.findall(r'href="/downloads/document\?id=([0-9]+D[0-9]+)"', req.text)
    doc_URLs = [base_doc_URL + a for a in doc_ids]
    # Dates and links are matched independently, so make sure they pair up.
    if len(doc_dates) != len(doc_URLs):
        raise ValueError('unequal length of recovered dates and files')
    if len(doc_URLs) != len(set(doc_URLs)):
        raise ValueError('duplicate URLs recovered')
    tmp = pd.DataFrame({'doc_date': doc_dates, 'doc_id': doc_ids, 'URL': doc_URLs})
    tmp['page'] = pi
    # Running document number: start offset of the page plus position on it.
    tmp['doc'] = np.arange(si, si + tmp.shape[0])
    # Pages without hits contribute nothing to the table.
    if not tmp.empty:
        page_frames.append(tmp[url_columns])

# DataFrame.append was removed in pandas 2.0; concatenate all pages at once.
if page_frames:
    all_doc_URLs = pd.concat(page_frames, ignore_index=True)
else:
    all_doc_URLs = pd.DataFrame(columns=url_columns)

In [None]:
# Preview the first rows of the collected URL table.
all_doc_URLs.head()

In [None]:
# (rows, columns) of the collected URL table.
all_doc_URLs.shape

In [None]:
# Number of distinct URLs; equals the row count when no URL occurs twice.
len(all_doc_URLs['URL'].unique())

In [None]:
# Persist the URL table. The backslash separators are escaped explicitly
# because '\T' is an invalid escape sequence; the resulting path is unchanged.
all_doc_URLs.to_csv(data_dir + '\\TK_commissieVWS\\all_doc_URLs.csv')

# Download documents to the Data folder

In [None]:
# Reload the saved URL table; the first CSV column holds the row index.
# The backslash separators are escaped explicitly because '\T' is an invalid
# escape sequence; the resulting path is unchanged.
all_doc_URLs = pd.read_csv(data_dir + '\\TK_commissieVWS\\all_doc_URLs.csv', index_col = 0)
all_doc_URLs.head()

In [None]:
def download_file(download_url, filename):
    """Download the resource at *download_url* and store it as *filename*.

    Args:
        download_url: URL understood by ``urllib.request.urlopen``.
        filename: Path of the file to write; an existing file is overwritten.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the target file cannot be written.
    """
    # The URL is opened first, so no file is created when the request fails.
    # Both handles are closed even if the transfer is interrupted.
    with urllib.request.urlopen(download_url) as response, \
            open(filename, 'wb') as out_file:
        # Copy in chunks instead of holding the whole body in memory.
        shutil.copyfileobj(response, out_file)

In [None]:
# Characters stripped from file names before saving. The backslash is escaped
# explicitly because '\|' is an invalid escape sequence; the set of characters
# is unchanged.
forbidden_chars = '<>:"/\\|?*!@/#$,.'

In [None]:
from datetime import date

# Today's date as a compact YYYYMMDD stamp.
datestr = f'{date.today():%Y%m%d}'
print(datestr)

In [None]:
# Dated output folder for this download run. The backslash separators are
# escaped explicitly because '\T' and '\o' are invalid escape sequences; the
# resulting path is unchanged.
out_dir = data_dir + '\\TK_commissieVWS\\auto_download_%s\\original'%datestr
print('saving to: ',out_dir)
# Create the folder (including parents) on the first run of the day.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
    print('Out dir created')

In [None]:
# Accumulator for the metadata of downloaded files; the download loop adds
# one row per saved document.
downloaded_doc_info = pd.DataFrame()

In [None]:
# Download every PDF listed in all_doc_URLs into out_dir and record the
# metadata of each saved file. Raise the start of the iloc slice to resume
# after an interrupted run.
new_rows = []
try:
    for doci, doc_info in all_doc_URLs.iloc[0:, :].iterrows():
        print(doci, end='')
        URL = doc_info['URL']

        # One request serves both the header inspection and the download; the
        # context manager closes the connection for skipped documents as well.
        with urllib.request.urlopen(URL) as response:
            # get_content_type() lower-cases the value and drops parameters
            # (e.g. '; charset=...'), so such variants still count as PDF.
            ftype = response.headers.get_content_type()

            if ftype == 'application/pdf':
                doc_nr = doc_info['doc']
                doc_id = doc_info['doc_id']
                # Case-insensitive header lookup; None when the header is absent.
                disposition = response.headers.get('Content-Disposition')
                if disposition is not None:
                    fname = disposition.split('filename=')[-1].strip('\"\'')
                    # The debate date is taken from the file name, if present.
                    date_match = re.search(r'gehouden op ([0-9]+ *[a-zA-Z]+ *[0-9]+)', fname)
                    debate_date = date_match.group(1) if date_match else 'nodate'
                else:
                    fname = 'noname.pdf'
                    debate_date = 'nodate'
                # Clean filename: not too long and without forbidden characters such as / \
                cleaned_doc_name = fname.translate(str.maketrans('', '', forbidden_chars))
                # The dot is among the removed characters, so the last three
                # characters (presumably 'pdf') are replaced by '.pdf'.
                cleaned_doc_name = cleaned_doc_name[:-3] + '.pdf'
                # At most 40 leading characters plus the last 10 (incl. extension).
                short_doc_name = (cleaned_doc_name[:min(40, len(cleaned_doc_name) - 10)]
                                  + '_' + cleaned_doc_name[-10:])
                # '\\' spelled out: '\d' is an invalid escape sequence.
                out_fname = '\\doc%04d_%s_%s_%s'%(doc_nr, doc_id, debate_date, short_doc_name)
                with open(out_dir + out_fname, 'wb') as out_file:
                    shutil.copyfileobj(response, out_file)

                doc_info['debate_date'] = debate_date
                doc_info['debate_name'] = fname
                doc_info['short_name'] = short_doc_name
                doc_info['fname'] = out_fname

                new_rows.append(doc_info)
            else:
                print('doc', end='')
        print(', ', end='')
finally:
    # Runs on failure too, so the rows gathered so far are kept.
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    if new_rows:
        frames = [downloaded_doc_info, pd.DataFrame(new_rows)]
        downloaded_doc_info = pd.concat([f for f in frames if not f.empty])
In [None]:
# Show the first and last three downloaded documents as a quick sanity check.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([downloaded_doc_info.head(n=3), downloaded_doc_info.tail(n=3)])

In [None]:
# Store the download metadata next to the downloaded files. The backslash is
# escaped explicitly because '\d' is an invalid escape sequence; the resulting
# path is unchanged.
downloaded_doc_info.to_csv(out_dir + '\\downloaded_doc_info.csv')