In [366]:
import os
import json
import lxml
from lxml import etree as ET


# Parent of the current working directory; the data folders used below
# (e.g. "laws-lois-xml") are resolved relative to it.
# NOTE(review): this depends on where the kernel was started — confirm the
# notebook is run from a direct subfolder of the project.
base_dir = os.path.dirname(os.getcwd())

# Using a subsample of the data for testing purposes
# Each entry is an English law ID; the trailing comment gives the law's title.
law_ids = [
    "A-0.6",  # Accessible Canada Act
    "SOR-2021-241",  # Accessible Canada Regulations
    "A-2",  # Aeronautics Act
    "B-9.01",  # Broadcasting Act
    "SOR-97-555",  # Broadcasting Distribution Regulations
    "SOR-96-433",  # Canadian Aviation Regulations
    "SOR-2011-318",  # Canadian Aviation Security Regulations, 2012
    "C-15.1",  # Canadian Energy Regulator Act
    "C-15.31",  # Canadian Environmental Protection Act, 1999
    "C-24.5",  # Cannabis Act
    "SOR-2018-144",  # Cannabis Regulations
    "C-46",  # Criminal Code
    "SOR-2021-25",  # Cross-border Movement of Hazardous Waste and Hazardous Recyclable Material Regulations
    "F-14",  # Fisheries Act
    "SOR-93-53",  # Fishery (General) Regulations
    "C.R.C.,_c._870",  # Food and Drug Regulations
    "F-27",  # Food and Drugs Act
    "I-2.5",  # Immigration and Refugee Protection Act
    "SOR-2002-227",  # Immigration and Refugee Protection Regulations
    "I-21",  # Interpretation Act
    "SOR-2016-151",  # Multi-Sector Air Pollutants Regulations
    "SOR-2010-189",  # Renewable Fuels Regulations
    "S-22",  # Statutory Instruments Act
    "C.R.C.,_c._1509",  # Statutory Instruments Regulations
]


def get_file_paths(eng_law_ids, base_dir):
    """
    Locate the English and French XML files for the given English law IDs.

    Args:
        eng_law_ids: English law IDs (file names without the ".xml" suffix).
        base_dir: Directory that contains the "laws-lois-xml" folder.

    Returns:
        A list of file paths, English categories first, then French ones.
        Within a category the order is whatever ``os.listdir`` returns.
    """
    laws_dir = os.path.join(base_dir, "laws-lois-xml")

    # French regulations have different names, unfortunately:
    # "SOR-" becomes "DORS-", "SI-" becomes "TR-" and "_c." becomes "_ch."
    wanted_names = set()
    for law_id in eng_law_ids:
        wanted_names.add(f"{law_id}.xml")
        wanted_names.add(f"{law_id.replace('_c.', '_ch.')}.xml")
        wanted_names.add(f"{law_id.replace('SOR-', 'DORS-')}.xml")
        wanted_names.add(f"{law_id.replace('SI-', 'TR-')}.xml")

    categories_by_lang = {
        "eng": ["acts", "regulations"],
        "fra": ["lois", "reglements"],
    }

    file_paths = []
    for lang, categories in categories_by_lang.items():
        for category in categories:
            category_dir = os.path.join(laws_dir, lang, category)
            file_paths.extend(
                os.path.join(category_dir, name)
                for name in os.listdir(category_dir)
                if name in wanted_names
            )

    print(len(eng_law_ids), "law IDs provided")
    print(len(file_paths), "files found (should be 2x the number of law IDs)")
    return file_paths

In [367]:
# Resolve the English and French XML paths for the sampled law IDs
file_paths = get_file_paths(law_ids, base_dir)

24 law IDs provided
48 files found (should be 2x the number of law IDs)


In [377]:
def _get_text(element):
    return element.text if element is not None else None


def _get_link(element):
    return (
        element.attrib["link"]
        if element is not None and "link" in element.attrib.keys()
        else None
    )


def _get_joined_text(
    element,
    exclude_tags=["MarginalNote", "Label", "OriginatingRef"],
    break_tags=[
        "Provision",
        "Subsection",
        "Paragraph",
        "Definition",
        "row",
        "TableGroup",
        "HistoricalNote",
        "MarginalNote",
    ],
    double_break_tags=["Subsection", "TableGroup"],
    pipe_tags=["entry"],
    em_tags=["DefinedTermEn", "DefinedTermFr", "XRefExternal", "XRefInternal", "Emphasis"],
    strong_tags=["MarginalNote", "TitleText"],
    underline_tags=[],
):
    # TODO: Improve table parsing
    def stylized_text(text, tag):
        if tag in em_tags:
            return f"*{text}*"
        if tag in strong_tags:
            return f"**{text}**"
        if tag in underline_tags:
            return f"__{text}__"
        # if tag in strike_tags:
        #     return f"~~{text}~~"
        return text

    all_text = []
    exclude_tags = exclude_tags.copy()
    for e in element.iter():
        if e.tag in exclude_tags:
            exclude_tags.remove(e.tag)
            continue
        if e.text and e.text.strip():
            all_text.append(stylized_text(e.text.strip(), e.tag))
        if e.tail and e.tail.strip():
            all_text.append(e.tail.strip())
        if e.tag in break_tags:
            all_text.append("\n")
        elif e.tag in double_break_tags:
            all_text.append("\n\n")
        if e.tag in pipe_tags:
            all_text.append("|")
        if e.tag == "tbody":
            all_text.append("\n<tbody>")
    text = (
        " ".join(all_text)
        .replace(" \n ", "\n")
        .strip()
        .replace("\u2002", " ")
        .replace("( ", "(")
        .replace(" )", ")")
        .replace(" .", ".")
        .replace("* ;", "*;")
        .strip()
    )
    # When a line ends in a pipe, it should also start with a pipe and space
    lines = text.split("\n")
    for i, line in enumerate(lines):
        line = line.strip()
        if line.endswith("|"):
            lines[i] = "| " + line
        # Replace the <tbody> tag with | --- | --- | --- | etc. for tables
        if line == "<tbody>" and i > 0 and lines[i - 1].strip().endswith("|"):
            lines[i] = "| --- " * (len(lines[i - 1].split("|")) - 2) + "|"
        elif line == "<tbody>":
            lines[i] = ""
    text = "\n".join(lines)
    return text


def _aggregate_ref_links(sections, ref_name):
    # Count how often each non-null link occurs, in first-seen (deterministic) order.
    counts = {}
    for section in sections:
        for ref in section[ref_name]:
            link = ref["link"]
            if link is not None:
                counts[link] = counts.get(link, 0) + 1
    return [{"link": link, "count": count} for link, count in counts.items()]


def _format_all_str(title, label, heading, text):
    # Build the "all_str" block: title, indented label, heading line, body text.
    return "\n".join([title, " " + label, heading, text])


def _pick_keys(source, keys):
    # Shallow copy of `source` restricted to `keys`, keeping the source's key order.
    return {k: v for k, v in source.items() if k in keys}


def get_dict_from_xml(xml_filename):
    """
    Extract a JSON-serializable dictionary from an act/regulation XML file.

    Args:
        xml_filename: Path to the XML file. The language is taken from the name
            of the grandparent directory of the file.

    Returns:
        A dict with document metadata, "preamble", "sections" and "schedules",
        aggregated "internal_refs" / "external_refs" link counts, display strings
        ("section_id", "heading_str", "section_str", "all_str") on each part, and
        "all_chunkable_sections": a flat, indexed list of preamble, preamble
        provisions, sections, subsections and schedules.
    """
    lims = "{http://justice.gc.ca/lims}"
    dom = ET.parse(xml_filename)
    root = dom.getroot()
    # French regulations have slightly different filenames, but we want a unique ID
    # to link the English and French versions
    filename = os.path.basename(xml_filename).replace(".xml", "")
    # Replace "DORS-" with "SOR-", "TR-" with "SI-" and "_ch." with "_c."
    eng_id = (
        filename.replace("DORS-", "SOR-").replace("TR-", "SI-").replace("_ch.", "_c.")
    )
    enabling_authority = root.find(".//EnablingAuthority/XRefExternal")
    d = {
        "id": eng_id,
        "lang": os.path.basename(os.path.dirname(os.path.dirname(xml_filename))),
        "filename": filename,
        "type": "act" if root.tag == "Statute" else "regulation",
        "short_title": _get_text(root.find(".//ShortTitle")),
        "long_title": _get_text(root.find(".//LongTitle")),
        "bill_number": _get_text(root.find(".//BillNumber")),
        "instrument_number": _get_text(root.find(".//InstrumentNumber")),
        "consolidated_number": _get_text(root.find(".//ConsolidatedNumber")),
        "last_amended_date": root.attrib.get(f"{lims}lastAmendedDate", None),
        "current_date": root.attrib.get(f"{lims}current-date", None),
        "in_force_start_date": root.attrib.get(f"{lims}inforce-start-date", None),
        "enabling_authority": {
            "link": _get_link(enabling_authority),
            "text": _get_text(enabling_authority),
        },
        "preamble": get_preamble(root),
        # get_section / get_schedule return None for parts that must be skipped
        "sections": [
            section
            for section in map(get_section, root.findall(".//Section"))
            if section is not None
        ],
        "schedules": [
            schedule
            for schedule in map(get_schedule, root.findall(".//Schedule"))
            if schedule is not None
        ],
    }
    # Aggregate all internal and external references and count instances of each.
    # First-seen ordering keeps the JSON output stable from run to run.
    for ref_name in ["internal_refs", "external_refs"]:
        d[ref_name] = _aggregate_ref_links(d["sections"], ref_name)

    # Some pretty-print and/or unique versions of the fields
    d["doc_id"] = f'{d["id"]}_{d["lang"]}'
    d["title_str"] = d["short_title"] if d["short_title"] else d["long_title"]
    # A document without any title text must not break the string joins below
    title = d["title_str"] or ""
    for section in d["sections"]:
        section["section_id"] = f'{d["doc_id"]}_section_{section["id"]}'
        section["heading_str"] = get_heading_str(section)
        section["section_str"] = f"Section {section['id']}"
        section["all_str"] = _format_all_str(
            title, section["section_str"], section["heading_str"], section["text"]
        )
        for subsection in section["subsections"]:
            subsection[
                "section_id"
            ] = f'{d["doc_id"]}_subsection_{section["id"]}{subsection["id"]}'
            subsection["parent_id"] = section["section_id"]
            subsection["heading_str"] = get_heading_str(subsection)
            subsection[
                "section_str"
            ] = f"Sub{section['section_str'].lower()}{subsection['id']}"
            subsection["all_str"] = _format_all_str(
                title,
                subsection["section_str"],
                subsection["heading_str"],
                subsection["text"],
            )
    for schedule in d["schedules"]:
        schedule["section_id"] = f'{d["doc_id"]}_schedule_{schedule["id"]}'
        schedule["all_str"] = _format_all_str(
            title,
            schedule["id"] if schedule["id"] else "Schedule",
            "",
            schedule["text"],
        )
    # Finally, the preamble also needs a "all_str" field
    if d["preamble"]:
        preamble = d["preamble"][0]
        preamble["section_id"] = f'{d["doc_id"]}_preamble'
        preamble["all_str"] = _format_all_str(title, "Preamble", "", preamble["text"])
        for provision in preamble["subsections"]:
            provision["section_id"] = f'{d["doc_id"]}_preamble_provision_{provision["id"]}'
            provision["parent_id"] = preamble["section_id"]
            provision["heading_str"] = get_heading_str(provision)
            provision["section_str"] = f"Preamble provision {provision['id']}"
            provision["all_str"] = _format_all_str(
                title,
                provision["section_str"],
                provision["heading_str"],
                provision["text"],
            )

    # Add a list of all sections, including preamble and schedules and subsections
    keep_keys = [
        "section_id",
        "parent_id",
        "section_str",
        "headings",
        "text",
        "id",
        "marginal_note",
        "in_force_start_date",
        "last_amended_date",
        "internal_refs",
        "external_refs",
        "lims_id",
    ]
    chunks = []
    if d["preamble"]:
        chunks.append(_pick_keys(d["preamble"][0], keep_keys))
        for provision in d["preamble"][0]["subsections"]:
            chunks.append(_pick_keys(provision, keep_keys))
    for section in d["sections"]:
        chunks.append(_pick_keys(section, keep_keys))
        for subsection in section["subsections"]:
            chunks.append(_pick_keys(subsection, keep_keys))
    for schedule in d["schedules"]:
        chunks.append(_pick_keys(schedule, keep_keys))
    for i, chunk in enumerate(chunks):
        chunk["doc_id"] = d["doc_id"]
        chunk["index"] = i
        # Chunks are copies, so this only changes the chunk's text, not the
        # text stored on the original section/schedule dict.
        if chunk.get("marginal_note"):
            chunk["text"] = f"**{chunk['marginal_note']}**\n{chunk['text']}"
    d["all_chunkable_sections"] = chunks
    return d


def get_heading_str(section):
    """Render the section's "headings" list as a single breadcrumb, e.g. "Part 1 > Division A"."""
    separator = " > "
    heading_chain = section["headings"]
    return separator.join(heading_chain)


def get_section(section):
    """
    Convert a <Section> (or <Subsection>) element into a dictionary.

    Args:
        section: The lxml element to convert.

    Returns:
        None when the element sits inside a <Schedule>; otherwise a dict with the
        label ("id"), headings, marginal note, joined text, lims dates and id,
        cross-references, and a "subsections" list built by applying this same
        function to every descendant <Subsection>.
    """
    # If the section has an ancestor <Schedule> tag, skip it.
    # The ancestor axis is required here: ".//Schedule" would look for
    # <Schedule> elements nested INSIDE the section, which never matches.
    if section.xpath("ancestor::Schedule"):
        return None
    lims = "{http://justice.gc.ca/lims}"
    return {
        "id": _get_text(section.find("Label")),
        "headings": get_headings(section),
        "marginal_note": _get_text(section.find("MarginalNote")),
        "text": _get_joined_text(section),
        "in_force_start_date": section.attrib.get(f"{lims}inforce-start-date", None),
        "last_amended_date": section.attrib.get(f"{lims}lastAmendedDate", None),
        "subsections": [
            get_section(subsection) for subsection in section.findall(".//Subsection")
        ],
        "external_refs": get_external_xrefs(section),
        "internal_refs": get_internal_xrefs(section),
        "lims_id": section.attrib.get(f"{lims}id", None),
    }


def get_external_xrefs(section):
    """
    List every <XRefExternal> found anywhere below ``section``.

    Each entry carries the "link" and "reference-type" attributes (None when
    missing) and the element's own text.
    """
    # External references have an explicit link attribute
    collected = []
    for xref in section.findall(".//XRefExternal"):
        attributes = xref.attrib
        entry = {
            "link": attributes.get("link", None),
            "reference_type": attributes.get("reference-type", None),
            "text": xref.text,
        }
        collected.append(entry)
    return collected


def get_internal_xrefs(section):
    """List every <XRefInternal> found anywhere below ``section`` as ``{"link": <text>}``."""
    # Internal references are always a section number which is the text
    collected = []
    for xref in section.findall(".//XRefInternal"):
        collected.append({"link": xref.text})
    return collected


def get_preamble(root):
    """
    Extract the document's preamble, if any.

    Args:
        root: Root element of the parsed document.

    Returns:
        An empty list when there is no <Preamble>; otherwise a one-element list
        holding the preamble dict. Its "subsections" are the preamble's
        <Provision> elements, identified by their 0-based position.
    """
    # Returns an array with a single element, the preamble, or no elements
    # so that it can be easily prepended to the sections array
    preamble = root.find(".//Preamble")
    if preamble is None:
        return []
    lims = "{http://justice.gc.ca/lims}"
    provisions = preamble.findall(".//Provision")
    return [{
        "id": "preamble",
        "headings": get_headings(preamble),
        "marginal_note": None,
        "text": _get_joined_text(preamble),
        "in_force_start_date": preamble.attrib.get(f"{lims}inforce-start-date", None),
        "last_amended_date": preamble.attrib.get(f"{lims}lastAmendedDate", None),
        "subsections": [
            {
                "id": i,
                "text": _get_joined_text(provision),
                "headings": get_headings(provision),
                "marginal_note": None,
                "in_force_start_date": provision.attrib.get(
                    f"{lims}inforce-start-date", None
                ),
                "last_amended_date": provision.attrib.get(
                    f"{lims}lastAmendedDate", None
                ),
                "internal_refs": get_internal_xrefs(provision),
                "external_refs": get_external_xrefs(provision),
                "lims_id": provision.attrib.get(f"{lims}id", None),
            }
            for i, provision in enumerate(provisions)
        ],
        "internal_refs": get_internal_xrefs(preamble),
        "external_refs": get_external_xrefs(preamble),
        "lims_id": preamble.attrib.get(f"{lims}id", None),
    }]


def get_schedule(schedule):
    """
    Convert a <Schedule> element into a dictionary.

    Returns None for schedules whose "id" attribute is "RelatedProvs" or
    "NifProvs"; otherwise a dict with the first descendant label ("id"),
    marginal note, joined text, lims dates and id, cross-references, the
    originating reference and an always-empty "subsections" list.
    """
    # Schedules with these "id" attributes are skipped entirely
    skipped_ids = ["RelatedProvs", "NifProvs"]
    if schedule.attrib.get("id", None) in skipped_ids:
        return None

    lims = "{http://justice.gc.ca/lims}"
    attributes = schedule.attrib
    result = {}
    result["id"] = _get_text(schedule.find(".//Label"))
    # "headings" is intentionally not populated for schedules
    result["marginal_note"] = _get_text(schedule.find(".//MarginalNote"))
    result["text"] = _get_joined_text(schedule)
    result["in_force_start_date"] = attributes.get(lims + "inforce-start-date", None)
    result["last_amended_date"] = attributes.get(lims + "lastAmendedDate", None)
    result["subsections"] = []
    result["internal_refs"] = get_internal_xrefs(schedule)
    result["external_refs"] = get_external_xrefs(schedule)
    result["originating_ref"] = _get_text(schedule.find(".//OriginatingRef"))
    result["lims_id"] = attributes.get(lims + "id", None)
    return result


def get_headings(element):
    """
    Collect the chain of headings that applies to ``element``.

    Headings live in the inner text of <Heading> tags and carry a "level"
    attribute (treated as 1 when absent). The result lists, from the outermost
    level inwards, the heading of each level that most recently precedes the
    element, e.g. ["HeadingLevel1", "HeadingLevel2", "HeadingLevel3"].

    Headings are NOT correctly nested in the hierarchy (they may be siblings of
    the element, etc.), so xpath cannot be relied upon.
    """
    # Brute force: walk the whole document from the top, tracking the current
    # heading per level, and stop once the target element is reached.
    max_levels = 6
    trail = [None] * max_levels
    document_root = element.getroottree().getroot()
    for node in document_root.iter():
        if node.tag == "Heading":
            depth = int(node.attrib.get("level", 1))
            # Headings are stored without formatting markers (e.g. bold)
            plain_heading = _get_joined_text(node).replace("**", "").replace("__", "")
            trail[depth - 1] = plain_heading
            # A new heading invalidates every deeper-level heading seen so far
            for deeper in range(depth, max_levels):
                trail[deeper] = None
        if node == element:
            break
    return list(filter(lambda heading: heading is not None, trail))

In [378]:
# NOTE: this replaces the file list built earlier with a single act.
# The path is built from components so it resolves on every OS; a literal
# backslash path is not split by os.path on non-Windows systems, which breaks
# the directory-based language detection in get_dict_from_xml.
file_paths = [os.path.join(base_dir, "laws-lois-xml", "eng", "acts", "D-3.4.xml")]

# Create output directory if it doesn't exist; otherwise, clear it
output_dir = os.path.join(base_dir, "laws-lois-json")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
else:
    for file in os.listdir(output_dir):
        os.remove(os.path.join(output_dir, file))

# Convert each XML file and write one "<id>_<lang>.json" file per document
for file_path in file_paths:
    print(file_path)
    d = get_dict_from_xml(file_path)
    with open(
        os.path.join(output_dir, f"{d['id']}_{d['lang']}.json"),
        "w",
        encoding="utf8",
    ) as f:
        json.dump(d, f, indent=2, ensure_ascii=False)

c:\Users\jkuehn\git_repos\laws-qna\laws-lois-xml\eng\acts\D-3.4.xml


In [388]:
import textwrap as tr

# Spot-check the rendered text of one chunk of `d`, the document dict produced
# by the conversion cell.
# NOTE(review): index 15 looks like an arbitrary sample — confirm or adjust.
print(d["all_chunkable_sections"][15]["text"])

**Jurisdiction if two proceedings commenced on same day**
If corollary relief proceedings between the same former spouses and in respect of the same matter are pending in two courts that would otherwise have jurisdiction under subsection (1) and were commenced on the same day, and neither proceeding is discontinued within 40 days after it was commenced, the Federal Court shall, on application by either or both former spouses, determine which court retains jurisdiction by applying the following rules:
(a) if at least one of the proceedings includes an application for a parenting order, the court that retains jurisdiction is the court in the province in which the child is habitually resident;
(b) if neither of the proceedings includes an application for a parenting order, the court that retains jurisdiction is the court in the province in which the former spouses last maintained a habitual residence in common if one of the former spouses is habitually resident in that province; and
(c) i

In [372]:

# Dump every section, followed by its subsections (indented), for a visual check
for section in d["sections"]:
    print(section["all_str"], end="\n\n\n")
    for subsection in section["subsections"]:
        # Indent subsections
        indented = tr.indent(subsection["all_str"], "    ")
        print(indented, end="\n\n\n")
    print(end="\n\n")

Bretton Woods and Related Agreements Act
 Section 1

This Act may be cited as the *Bretton Woods and Related Agreements Act*.
R.S., 1985, c. B-7, s. 1; R.S., 1985, c. 24 (1st Supp.), s. 3




Bretton Woods and Related Agreements Act
 Section 2

The Agreements for an International Monetary Fund, an International Bank for Reconstruction and Development, an International Development Association and an International Finance Corporation and the Convention establishing the Multilateral Investment Guarantee Agency, in this Act referred to as “the Agreements”, set out in Schedules I to V, respectively, are hereby approved.
R.S., 1985, c. B-7, s. 2; R.S., 1985, c. 24 (1st Supp.), s. 4, c. 32 (3rd Supp.), s. 3




Bretton Woods and Related Agreements Act
 Section 3

The Governor in Council may authorize the acceptance on behalf of Canada of the Agreements and may make such appointments, do and authorize such acts and things and make such orders and regulations as are necessary for that purpose a

In [None]:
def write_chunk_txt(json_path):
    """
    Render one law JSON file as a Markdown-style text file next to it.

    The text file has the same path as ``json_path`` with the final extension
    replaced by ".txt". It contains a title line, a metadata block, and then the
    "all_str" of the preamble, every section and every schedule, each followed
    by a "---" separator.

    Args:
        json_path: Path to a JSON file written from ``get_dict_from_xml`` output.
    """
    with open(json_path, "r", encoding="utf8") as f:
        d = json.load(f)

    parts = [f'# {d["title_str"]} ({d["lang"]})']

    # Add some metadata
    parts.append("\n\n\n## Metadata\n\n")
    parts.append(f"**ID:** {d['id']}\n")
    parts.append(f"**Short Title:** {d['short_title']}\n")
    parts.append(f"**Long Title:** {d['long_title']}\n")
    parts.append(f"**Type:** {d['type']}\n")
    parts.append(f"**Filename:** {d['filename']}\n")
    parts.append(f"**Instrument Number:** {d['instrument_number']}\n")
    parts.append(f"**Consolidated Number:** {d['consolidated_number']}\n")
    parts.append(f"**Last Amended Date:** {d['last_amended_date']}\n")
    parts.append(f"**Current Date:** {d['current_date']}\n")
    parts.append(f"**In Force Start Date:** {d['in_force_start_date']}\n")
    parts.append(f"**Enabling Authority:** {d['enabling_authority']['link']}\n")
    parts.append("\n\n\n## Preamble, Sections and Schedules\n\n")

    separator = "\n\n---\n\n"
    if d["preamble"]:
        parts.append(d["preamble"][0]["all_str"])
        parts.append(separator)
    for section in d["sections"]:
        parts.append(section["all_str"])
        parts.append(separator)
    for schedule in d["schedules"]:
        parts.append(schedule["all_str"])
        parts.append(separator)

    # Swap only the final extension. A plain str.replace(".json", ".txt") would
    # also rewrite ".json" inside directory names, and would leave the path
    # unchanged (overwriting the input) when the file has another extension.
    txt_path = os.path.splitext(json_path)[0] + ".txt"
    with open(txt_path, "w", encoding="utf8") as f:
        f.write("".join(parts))

In [None]:
# Write a single text file for each JSON file produced (in laws-lois-json)
json_dir = os.path.join(base_dir, "laws-lois-json")
json_file_paths = []
for entry_name in os.listdir(json_dir):
    if entry_name.endswith(".json"):
        json_file_paths.append(os.path.join(json_dir, entry_name))

for json_file_path in json_file_paths:
    write_chunk_txt(json_file_path)