In [2]:
%matplotlib inline
from os import listdir, path
from requests import get
import urllib3
from lxml import etree
import pandas as pd
import warnings
from tqdm import tqdm
import itertools
# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')
# The download cell issues its requests with verify=False (no TLS certificate
# verification); suppress the InsecureRequestWarning urllib3 would otherwise
# emit for each such request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Download the Handelingen XML files and their metadata files from
# zoek.officielebekendmakingen.nl into HandelingenTK/ and HandelingenTKmeta/.
#
# Files are named "h-tk-<year><year+1>-<assemblyNr>-<debateNr>". For each year
# the assembly number is incremented until the first debate of an assembly
# cannot be fetched; within an assembly the debate number is incremented until
# a debate cannot be fetched. Files already present on disk are skipped, so the
# cell can be re-run to resume an interrupted download.
for year in tqdm(range(2011, 2018)):
    endOfAssembly = False
    assemblyNr = 0
    while not endOfAssembly:
        assemblyNr += 1
        for debateNr in range(1, 200):
            file = "h-tk-%s-%s-%s" %(str(year)+str(year+1), str(assemblyNr), str(debateNr))
            if path.isfile('HandelingenTK/%s.xml' %file):
                # Already downloaded on an earlier run.
                continue

            url = "https://zoek.officielebekendmakingen.nl/%s" %file
            resp = get(url + ".xml", verify=False, allow_redirects=False)
            if resp.status_code != 200:
                # A missing first debate means the assembly itself does not
                # exist, which ends this year; any other missing debate only
                # ends the current assembly.
                if debateNr == 1:
                    endOfAssembly = True
                break

            # Download the Handelingen file
            with open('HandelingenTK/%s.xml' %file, 'wb') as hand_file:
                hand_file.write(resp.content)

            # Download the corresponding meta file. Only store it when the
            # request succeeded, so that an error page is never saved as XML
            # and later handed to the metadata parser.
            meta_resp = get(url+"/metadata.xml", verify=False,
                            allow_redirects=False)
            if meta_resp.status_code == 200:
                with open('HandelingenTKmeta/%smetadata.xml' %file, 'wb') as meta_file:
                    meta_file.write(meta_resp.content)

100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [24:38<00:00, 211.15s/it]


In [8]:
# Build one row per <spreekbeurt> element found in the downloaded Handelingen
# files: [surname, party, text, file, speech category].
#
# Speech category, tracked per file:
#   - "Main Speech":  the element has nieuw="ja"; its speaker becomes the
#                     current main speaker.
#   - "Response":     the speaker's surname equals the current main speaker.
#   - "Interruption": anyone else.
enddict = {}
index = 0
for file in listdir('HandelingenTK/'):
    context = etree.iterparse('HandelingenTK/' + file,  tag='spreekbeurt')
    mainSpeaker = None
    for _, elem in context:
        # Look the surname up once; it is needed for both the category
        # decision and the output row.
        surname = elem.findtext('spreker/naam/achternaam')
        if elem.get('nieuw') == 'ja':
            speechCategory = "Main Speech"
            mainSpeaker = surname
        elif surname == mainSpeaker:
            speechCategory = "Response"
        else:
            speechCategory = "Interruption"
        party = elem.findtext('spreker/politiek')
        # A spreekbeurt without a <tekst> child would make find() return None;
        # record the text as missing (like findtext does for surname/party)
        # instead of aborting the whole parse with an AttributeError.
        textElem = elem.find('tekst')
        text = ' '.join(textElem.itertext()) if textElem is not None else None
        enddict[index] = [surname, party, text, file, speechCategory]
        index += 1
df = pd.DataFrame.from_dict(enddict, orient='index')
df = df.rename(columns={0:'surname', 1:'party', 2:'text', 3:'file', 4:'speech category'})
df.head(5)

Unnamed: 0,surname,party,text,file,speech category
0,voorzitter,,\n Ik deel aan de Kamer mee dat er g...,h-tk-20102011-100-1.xml,Main Speech
1,voorzitter,,\n Ik constateer dat dit amendement ...,h-tk-20102011-100-10.xml,Interruption
2,voorzitter,,\n Ik constateer dat de aanwezige le...,h-tk-20102011-100-10.xml,Interruption
3,voorzitter,,\n Ik constateer dat dit wetsvoorste...,h-tk-20102011-100-10.xml,Interruption
4,voorzitter,,\n Ik constateer dat de aanwezige le...,h-tk-20102011-100-11.xml,Interruption


In [10]:
# Collect, for every metadata file, the meeting date and the list of category
# tags, keyed by the name of the Handelingen file it belongs to.
metaDict = {}
for file in listdir('HandelingenTKmeta'):
    date, tags = None, []
    # Metadata files are named "<name>metadata.xml"; stripping the 12-character
    # suffix and appending ".xml" gives the matching Handelingen file name.
    handelingenName = file[:-12] + '.xml'
    context = etree.iterparse('HandelingenTKmeta/' + file,  tag='metadata')
    for _, elem in context:
        fieldName = elem.get('name')
        if fieldName == "OVERHEIDop.datumVergadering":
            date = elem.get('content')
        elif fieldName == "OVERHEID.category":
            tags.append(elem.get('content'))
        # Stored on every <metadata> element, so a file without any such
        # element produces no entry at all.
        metaDict[handelingenName] = [date, tags]
metaDF = pd.DataFrame.from_dict(metaDict, orient='index').rename(
    columns={0:'date', 1:'tags'})
metaDF.head(5)

Unnamed: 0,date,tags
h-tk-19971998-1-1.xml,1998-03-31,[Bestuur | Parlement]
h-tk-19981999-1-1.xml,1998-09-15,[Bestuur | Parlement]
h-tk-19981999-1-2.xml,1998-09-15,"[Financiën | Begroting, Financiën | Organisati..."
h-tk-19992000-1-1.xml,1999-09-21,[Bestuur | Parlement]
h-tk-19992000-1-2.xml,1999-09-21,[Bestuur | Parlement]


In [12]:
# Attach date and tags to every speech via its file name, drop rows with any
# missing value, and parse the date column into datetimes.
combineddf = (
    df.join(metaDF, on='file')
      .dropna()
      .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Keep speeches dated after 2012-11-05 up to and including 2017-03-23.
#https://stackoverflow.com/questions/29370057/select-dataframe-rows-between-two-dates
afterStart = combineddf['date'] > '2012-11-05'
untilEnd = combineddf['date'] <= '2017-03-23'
combineddf = combineddf[afterStart & untilEnd]

# Keep only speakers whose party label is exactly one of these parties.
keptParties = ['50PLUS', 'CDA', 'ChristenUnie', 'D66', 'GroenLinks', 'PVV',
               'PvdA', 'PvdD', 'SGP', 'SP', 'VVD']
combineddf = combineddf[combineddf['party'].isin(keptParties)]

combineddf.to_csv("Handelingen.csv")
combineddf.head(5)

['PVV' 'SP' 'D66' 'CDA' 'PvdD' 'ChristenUnie' 'PvdA' 'GroenLinks' 'SGP'
 'VVD' '50PLUS' 'EP/VVD' 'EP/CU' 'EP/Christenunie' 'EP/PvdA' 'EP/SP'
 'EP/D66' 'EP/CDA' 'EP/Onafhankelijk' 'EP/GroenLinks' 'Van Vliet'
 '50PLUS/Klein' 'Groep Bontes/Van Klaveren' '50PLUS/Baay-Timmerman'
 '50PLUS/Krol' 'Bontes' 'Groep Kuzu/Öztürk' 'Houwers' 'Klein' 'EP/CU/SGP'
 'EP/PvdD' 'Monasch' 'Forum voor Democratie' 'DENK']


Unnamed: 0,surname,party,text,file,speech category,date,tags
107136,Wilders,PVV,\n \n Mevrouw de voorzit...,h-tk-20122013-100-3.xml,Main Speech,2013-06-26,"[Bestuur | Parlement, Financiën | Begroting]"
107137,Roemer,SP,\n \n Voorzitter. Vorige...,h-tk-20122013-100-3.xml,Main Speech,2013-06-26,"[Bestuur | Parlement, Financiën | Begroting]"
107138,Pechtold,D66,\n \n Voorzitter. Twee m...,h-tk-20122013-100-3.xml,Main Speech,2013-06-26,"[Bestuur | Parlement, Financiën | Begroting]"
107139,Van Haersma Buma,CDA,\n \n Voorzitter. Alleen...,h-tk-20122013-100-3.xml,Main Speech,2013-06-26,"[Bestuur | Parlement, Financiën | Begroting]"
107141,Van Haersma Buma,CDA,\n \n Dan het sociaal ak...,h-tk-20122013-100-3.xml,Response,2013-06-26,"[Bestuur | Parlement, Financiën | Begroting]"
