# Extract XML to JSON (documents > sections > subsections) w/ links

This notebook cleans up, combines, and improves on the code from the previous two notebooks.

It produces a large JSON file that is used by the notebooks that follow.

In [1]:
import json
import os
import random
import re
from collections import Counter

import lxml.etree as ET
import tqdm
from joblib import Parallel, delayed
from markdownify import markdownify as md

# The laws repository is expected to sit next to this notebook's directory.
current_dir = os.getcwd()
root_dir = os.path.dirname(current_dir)
laws_dir = os.path.join(root_dir, "laws-lois-xml")
xsl_filename = os.path.join(laws_dir, "xslt", "LIMS2HTML.xsl")


def _list_xml_files(*subdirs):
    """Return the paths of all ``.xml`` files directly inside ``laws_dir/<subdirs>``.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    to keep the order of the extracted documents reproducible between runs.
    """
    directory = os.path.join(laws_dir, *subdirs)
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith(".xml")
    ]


# English and French acts and regulations
en_acts = _list_xml_files("eng", "acts")
en_regs = _list_xml_files("eng", "regulations")
fr_acts = _list_xml_files("fra", "lois")
fr_regs = _list_xml_files("fra", "reglements")

In [3]:
def _get_text(element):
    """Return ``element.text``, or None when ``element`` itself is None."""
    if element is None:
        return None
    return element.text


def _get_link(element):
    """Return the element's ``link`` attribute.

    None is returned when ``element`` is None or has no ``link`` attribute.
    """
    if element is None:
        return None
    attributes = element.attrib
    if "link" not in attributes.keys():
        return None
    return attributes["link"]


def _get_joined_text(element_list, exclude_tags=("MarginalNote",)):
    """Join the ``.text`` of the given elements with newlines.

    Args:
        element_list: Iterable of elements exposing ``.tag`` and ``.text``.
            ``None`` or an empty list yields an empty string.
        exclude_tags: Tags whose own text is skipped. Only the element with
            that tag is skipped; other elements in the list are unaffected.
            The default is an immutable tuple so it cannot be modified by
            accident between calls.

    Returns:
        The non-empty texts joined with ``"\\n"``, or ``""``.
    """
    if not element_list:
        return ""
    # NOTE(review): only `.text` is read; any text held in an element's `.tail`
    # (text following an inline child element) is not included - confirm
    # whether that is intended.
    return "\n".join(
        t.text for t in element_list if t.tag not in exclude_tags and t.text
    )


def get_dict_from_xml(xml_filename, extract_full_text=False):
    """Extract a JSON-serializable dictionary from an act/regulation XML file.

    Args:
        xml_filename: Path to the XML file. The document ``id`` is taken from
            the file name and ``lang`` from the name of the grandparent
            directory.
        extract_full_text: When True, also add a ``full_text`` key holding the
            Markdown rendering of the document.

    Returns:
        A dict with the document metadata, a ``sections`` list (the preamble,
        if any, followed by every ``Section`` element) and the aggregated
        ``internal_refs`` / ``external_refs`` lists, each entry being
        ``{"link": ..., "count": ...}``.
    """
    dom = ET.parse(xml_filename)
    root = dom.getroot()
    enabling_authority = root.find(".//EnablingAuthority/XRefExternal")
    d = {
        "id": os.path.basename(xml_filename).replace(".xml", ""),
        "lang": os.path.basename(os.path.dirname(os.path.dirname(xml_filename))),
        "type": "act" if root.tag == "Statute" else "regulation",
        "short_title": _get_text(root.find(".//ShortTitle")),
        "long_title": _get_text(root.find(".//LongTitle")),
        "bill_number": _get_text(root.find(".//BillNumber")),
        "instrument_number": _get_text(root.find(".//InstrumentNumber")),
        "consolidated_number": _get_text(root.find(".//ConsolidatedNumber")),
        "last_amended_date": root.attrib.get(
            "{http://justice.gc.ca/lims}lastAmendedDate", None
        ),
        "current_date": root.attrib.get(
            "{http://justice.gc.ca/lims}current-date", None
        ),
        "in_force_start_date": root.attrib.get(
            "{http://justice.gc.ca/lims}inforce-start-date", None
        ),
        "enabling_authority": {
            "link": _get_link(enabling_authority),
            "text": _get_text(enabling_authority),
        },
        "sections": get_preamble(root)
        + [get_section(section) for section in root.findall(".//Section")],
    }
    # Aggregate all internal and external references and count instances of each.
    # A Counter tallies every link in a single pass and keeps first-seen order,
    # so the resulting list is the same on every run.
    for ref_name in ("internal_refs", "external_refs"):
        link_counts = Counter(
            ref["link"]
            for section in d["sections"]
            for ref in section[ref_name]
            if ref["link"] is not None
        )
        d[ref_name] = [
            {"link": link, "count": count} for link, count in link_counts.items()
        ]
    if extract_full_text:
        # Sometimes the XML will not convert correctly, so fall back to the
        # plain joined element text. Only `Exception` is caught so that
        # KeyboardInterrupt / SystemExit still stop the run.
        try:
            d["full_text"] = xml_to_markdown(xml_filename)
        except Exception:
            d["full_text"] = _get_joined_text(root.findall(".//*"))
    return d


def get_section(section):
    """Build the dictionary describing a ``Section`` or ``Subsection`` element.

    Args:
        section: The element to describe. Subsections, headings and the
            recursive descent are only produced when its tag is ``Section``;
            for any other tag ``subsections`` and ``headings`` are empty lists.

    Returns:
        A dict with the keys ``id``, ``text``, ``marginal_note``, ``lims_id``,
        ``subsections``, ``headings``, ``external_refs`` and ``internal_refs``.
        ``id`` is always a string: the text of the direct ``Label`` child, or
        ``"None"`` when that child or its text is missing.
    """
    is_section = section.tag == "Section"
    return {
        # _get_text tolerates a missing Label instead of raising AttributeError
        "id": str(_get_text(section.find("Label"))),
        "text": _get_joined_text(section.findall(".//*")),
        "marginal_note": _get_text(section.find("MarginalNote")),
        "lims_id": section.attrib.get("{http://justice.gc.ca/lims}id", None),
        "subsections": [
            get_section(subsection) for subsection in section.findall(".//Subsection")
        ]
        if is_section
        else [],
        "headings": get_headings(section) if is_section else [],
        "external_refs": get_external_xrefs(section),
        "internal_refs": get_internal_xrefs(section),
        # "xml": ET.tostring(section, encoding="unicode"),
    }


def get_headings(section):
    """Collect the ``Heading`` siblings that directly precede ``section``.

    Walks backwards through the previous siblings and stops at the first one
    that is not a ``Heading``, so the result is ordered from the nearest
    heading to the farthest. Each entry holds the heading's ``level``
    attribute and the joined text of its descendants.
    """
    collected = []
    node = section.getprevious()
    while True:
        # Stop when the siblings run out or the run of headings is broken
        if node is None or node.tag != "Heading":
            return collected
        level = node.get("level")
        text = _get_joined_text(node.findall(".//*"))
        collected.append({"level": level, "text": text})
        node = node.getprevious()


def get_external_xrefs(section):
    """List every ``XRefExternal`` descendant of ``section``.

    External references have an explicit ``link`` attribute. Each entry holds
    that attribute, the ``reference-type`` attribute and the element text;
    a missing attribute is reported as None.
    """
    references = []
    for xref in section.findall(".//XRefExternal"):
        attributes = xref.attrib
        references.append(
            {
                "link": attributes.get("link", None),
                "reference_type": attributes.get("reference-type", None),
                "text": xref.text,
            }
        )
    return references


def get_internal_xrefs(section):
    """List every ``XRefInternal`` descendant of ``section``.

    Internal references are always a section number, carried as the element
    text, so that text is what is stored under ``link``.
    """
    references = []
    for xref in section.findall(".//XRefInternal"):
        references.append({"link": xref.text})
    return references


def get_preamble(root):
    """Return the preamble as a one-element list, or an empty list.

    Returning a list lets the result be prepended directly to the sections
    array. The preamble gets the id ``"0"`` and one subsection per
    ``Provision`` descendant, numbered from 0 with integer ids.

    NOTE(review): unlike the dicts built by ``get_section``, this entry has no
    ``marginal_note``, ``lims_id`` or ``headings`` keys, and its subsections
    carry only ``id`` and ``text`` - confirm downstream code expects that.
    """
    preamble = root.find(".//Preamble")
    if preamble is None:
        return []
    # Look the provisions up once and reuse the result below
    provisions = preamble.findall(".//Provision")
    return [
        {
            "id": "0",
            "text": _get_joined_text(preamble.findall(".//*")),
            "subsections": [
                {
                    "id": i,
                    "text": _get_joined_text(provision.findall(".//*")),
                }
                for i, provision in enumerate(provisions)
            ],
            "internal_refs": get_internal_xrefs(preamble),
            "external_refs": get_external_xrefs(preamble),
        }
    ]


def xslt_transform(xml_filename, xsl_filename):
    """Apply the XSLT stylesheet in ``xsl_filename`` to ``xml_filename``.

    Returns the pretty-printed, serialized result of the transformation.
    """
    # https://stackoverflow.com/questions/16698935/how-to-transform-an-xml-file-using-xslt-in-python
    source_tree = ET.parse(xml_filename)
    apply_stylesheet = ET.XSLT(ET.parse(xsl_filename))
    result_tree = apply_stylesheet(source_tree)
    return ET.tostring(result_tree, pretty_print=True)


def xml_to_markdown(xml_filename, xsl_filename=xsl_filename, remove_links=True):
    """Render an XML file as Markdown via the XSLT-generated HTML.

    Args:
        xml_filename: Path of the XML file to convert.
        xsl_filename: Path of the stylesheet used to produce the HTML.
        remove_links: When True, Markdown links are replaced by their link
            text only.

    Returns:
        The Markdown text with runs of spaces collapsed to a single space and
        runs of newlines collapsed to at most two.
    """
    html = xslt_transform(xml_filename, xsl_filename).decode("utf-8")
    # A space after each closing span corrects an issue where words are
    # concatenated in the converted output.
    markdown = md(html.replace("</span>", "</span> "))

    # At most one space in a row, then at most two newlines in a row
    single_spaced = re.sub(" +", " ", markdown)
    markdown = re.sub("\n{2,}", "\n\n", single_spaced)

    if not remove_links:
        return markdown
    # Keep only the text of each [text](target) link
    return re.sub(r"\[(.*?)\]\(.*?\)", r"\1", markdown)

In [7]:
# Every English and French act and regulation, in that order
all_xml = [*en_acts, *en_regs, *fr_acts, *fr_regs]


def extract_xml_no_fulltext(filename):
    """Extract one XML file into a dictionary without the ``full_text`` key."""
    return get_dict_from_xml(filename, extract_full_text=False)


# Process the files in parallel (n_jobs=-1), with a progress bar over the inputs
legislation = Parallel(n_jobs=-1)(
    delayed(extract_xml_no_fulltext)(xml_path) for xml_path in tqdm.tqdm(all_xml)
)

100%|██████████| 11136/11136 [00:19<00:00, 563.85it/s] 


In [8]:
# Resulting file size: 537,021 KB (export currently disabled; see the commented-out code below)

# with open(os.path.join(current_dir, "legislation.json"), "w") as f:
#     json.dump(legislation, f, indent=2)

In [4]:
def extract_xml_with_fulltext(filename):
    """Extract one XML file into a dictionary including the ``full_text`` key."""
    return get_dict_from_xml(filename, extract_full_text=True)


# Same parallel extraction as without full text, over the files in all_xml
legislation_full = Parallel(n_jobs=-1)(
    delayed(extract_xml_with_fulltext)(xml_path) for xml_path in tqdm.tqdm(all_xml)
)

100%|██████████| 11136/11136 [05:10<00:00, 35.83it/s] 


In [6]:
# Write the full-text extraction next to this notebook (859,755 KB)
output_path = os.path.join(current_dir, "legislation_fulltext.json")
with open(output_path, "w") as f:
    json.dump(legislation_full, f, indent=2)