In [4]:
# std library imports
from datetime import datetime as Datetime
import datetime
# from copy import deepcopy
# import html
import json
# from json import JSONDecodeError
from pathlib import Path
# import re  # regex
import ssl
# from socket import timeout
# import sys
from typing import List
import urllib.request
from urllib.error import HTTPError, URLError


# stuff needed for parsing and manipulating XML
# this moduel does not come with python and needs to be installed with pip
from lxml import etree  # type: ignore
# from lxml.etree import QName, Element, SubElement, iselement  # type: ignore
# from lxml import html as lhtml


FILEEXTENSION = '.xml'

BASE_URL = 'http://services.vnp.parliament.uk/voteitems'

# xml namespaces used
# AID = 'http://ns.adobe.com/AdobeInDesign/4.0/'
# AID5 = 'http://ns.adobe.com/AdobeInDesign/5.0/'

# NS_ADOBE = {'aid': AID, 'aid5': AID5}

# ns2 = 'http://www.w3.org/2001/XMLSchema-instance'
# ns1 = 'http://www.w3.org/2001/XMLSchema'

# Text before the following should get the speaker style
# chair_titles = ('SPEAKER', 'CHAIRMAN OF WAYS AND MEANS', 'SPEAKER ELECT')

In [5]:
CONTEXT = ssl._create_unverified_context()
def json_from_uri(uri: str, default=None, showerror=True):
    headers = {'Content-Type': 'application/json'}
    request = urllib.request.Request(uri, headers=headers)
    try:
        response = urllib.request.urlopen(request, context=CONTEXT, timeout=30)
        json_obj = json.load(response)
    except (HTTPError, URLError, timeout, JSONDecodeError) as e:
        if showerror:
            warning(f'Error getting data from:\n{uri}\n{e}')
        return default
    else:
        return json_obj

In [7]:
def get_sitting_dates_in_range(from_date: Datetime, to_date: Datetime) -> List[Datetime]:
    """get return a list of sitting day"""
    
    # date
    cal_api_url_template = 'http://service.calendar.parliament.uk/calendar/proceduraldates/commons/nextsittingdate.json?dateToCheck={}'

    
    # the calendar api gives you the next sitting day so we need to start form the day before
    start_date = from_date - datetime.timedelta(days=1)
    
    current_date = start_date
    dates = []
    count = 0
    while current_date < to_date:
        current_date = start_date + datetime.timedelta(days=count)
        dates.append(current_date)
        count += 1
    
    sitting_dates = []
    for date in dates:
        
        next_sitting_date_str = json_from_uri(cal_api_url_template.format(date.strftime('%Y-%m-%d')))
        next_sitting_date = Datetime.strptime(next_sitting_date_str[:10], '%Y-%m-%d')
        sitting_dates.append(next_sitting_date)
    
    return sitting_dates

In [8]:
sitting_dates = get_sitting_dates_in_range(Datetime(2019, 10, 14), Datetime(2019, 11, 14))
sitting_dates = list(dict.fromkeys(sitting_dates))
print(sitting_dates)

[datetime.datetime(2019, 10, 14, 0, 0), datetime.datetime(2019, 10, 15, 0, 0), datetime.datetime(2019, 10, 16, 0, 0), datetime.datetime(2019, 10, 17, 0, 0), datetime.datetime(2019, 10, 21, 0, 0), datetime.datetime(2019, 10, 22, 0, 0), datetime.datetime(2019, 10, 23, 0, 0), datetime.datetime(2019, 10, 24, 0, 0), datetime.datetime(2019, 10, 28, 0, 0), datetime.datetime(2019, 10, 29, 0, 0), datetime.datetime(2019, 10, 30, 0, 0), datetime.datetime(2019, 10, 31, 0, 0), datetime.datetime(2019, 11, 4, 0, 0), datetime.datetime(2019, 11, 5, 0, 0), datetime.datetime(2019, 12, 13, 0, 0)]


In [13]:
# Save a bunch of XML files for future use
def save_xml_from_dates(dates: List[Datetime]):

    for date in [datetime.datetime(2019, 10, 14, 0, 0), datetime.datetime(2019, 10, 15, 0, 0), datetime.datetime(2019, 10, 16, 0, 0), datetime.datetime(2019, 10, 17, 0, 0), datetime.datetime(2019, 10, 21, 0, 0), datetime.datetime(2019, 10, 22, 0, 0), datetime.datetime(2019, 10, 23, 0, 0), datetime.datetime(2019, 10, 24, 0, 0), datetime.datetime(2019, 10, 28, 0, 0), datetime.datetime(2019, 10, 29, 0, 0), datetime.datetime(2019, 10, 30, 0, 0), datetime.datetime(2019, 10, 31, 0, 0), datetime.datetime(2019, 11, 4, 0, 0), datetime.datetime(2019, 11, 5, 0, 0), datetime.datetime(2019, 12, 13, 0, 0)]:
        url = f'{BASE_URL}/{date.strftime("%Y-%m-%d")}.xml'
        # parse and build up a tree for the input file
        try:
            print(url)
            xml = urllib.request.urlopen(url, context=CONTEXT, timeout=30)
        except URLError:
            print('Can\'t seem to get XML. Are you on a parliamentary computer?')
            return
        output_path = Path(f'datedJournalFragemnts/{date.strftime("%Y-%m-%d")}.xml')
        with open(output_path, 'wb') as f:
            f.write(xml.read())

In [16]:
save_xml_from_dates(sitting_dates)

http://services.vnp.parliament.uk/voteitems/2019-10-14.xml
http://services.vnp.parliament.uk/voteitems/2019-10-15.xml
http://services.vnp.parliament.uk/voteitems/2019-10-16.xml
http://services.vnp.parliament.uk/voteitems/2019-10-17.xml
http://services.vnp.parliament.uk/voteitems/2019-10-21.xml
http://services.vnp.parliament.uk/voteitems/2019-10-22.xml
http://services.vnp.parliament.uk/voteitems/2019-10-23.xml
http://services.vnp.parliament.uk/voteitems/2019-10-24.xml
http://services.vnp.parliament.uk/voteitems/2019-10-28.xml
http://services.vnp.parliament.uk/voteitems/2019-10-29.xml
http://services.vnp.parliament.uk/voteitems/2019-10-30.xml
http://services.vnp.parliament.uk/voteitems/2019-10-31.xml
http://services.vnp.parliament.uk/voteitems/2019-11-04.xml
http://services.vnp.parliament.uk/voteitems/2019-11-05.xml
http://services.vnp.parliament.uk/voteitems/2019-12-13.xml


In [18]:
from create_journal import main
main()

[datetime.datetime(2019, 10, 14, 0, 0), datetime.datetime(2019, 10, 15, 0, 0), datetime.datetime(2019, 10, 16, 0, 0), datetime.datetime(2019, 10, 17, 0, 0), datetime.datetime(2019, 10, 21, 0, 0), datetime.datetime(2019, 10, 22, 0, 0), datetime.datetime(2019, 10, 23, 0, 0), datetime.datetime(2019, 10, 24, 0, 0), datetime.datetime(2019, 10, 28, 0, 0), datetime.datetime(2019, 10, 29, 0, 0), datetime.datetime(2019, 10, 30, 0, 0), datetime.datetime(2019, 10, 31, 0, 0), datetime.datetime(2019, 11, 4, 0, 0), datetime.datetime(2019, 11, 5, 0, 0), datetime.datetime(2019, 12, 13, 0, 0)]


OSError: Error reading file '2019-10-14.xml': failed to load external entity "2019-10-14.xml"