In [1]:

from datetime import datetime
import os
import re
from pathlib import Path
import sys
import traceback
from typing import Optional
import webbrowser


# 3rd party saxon imports
import saxonche

In [2]:

XSLT_MARSHAL_PARAM_NAME = 'marsh-path'

DASH_XML_URL = "https://hopuk.sharepoint.com/sites/bct-ppu/_api/web/lists/" \
               "GetByTitle('Added%20Names')/items?$filter=Checked_x003f_%20eq%20%27false%27"

XSL_1_NAME = 'added-names-spo-rest.xsl'
XSL_2_NAME = 'post-processing-html.xsl'

XML_FOLDER = 'Amendment_Paper_XML'
DASHBOARD_DATA_FOLDER = 'Dashboard_Data'

# path to folder containing the XSLT files
PARENT_FOLDER = Path("/Users/mark/projects/added-names/")

XSL_FOLDER = PARENT_FOLDER / 'XSLT'

REPORTS_FOLDER = PARENT_FOLDER / '_Reports'

XSL_1_PATH = XSL_FOLDER / XSL_1_NAME
XSL_2_PATH = XSL_FOLDER / XSL_2_NAME

# WORKING_FOLDER: Optional[Path] = None
WORKING_FOLDER: Optional[Path] = Path("/Users/mark/projects/added-names/_Reports/2023-05-17")


In [3]:
def remove_docstring(parameter: Path):
    """Process FM XML files to remove docstring and overwrite original"""

    print('hi')

    # lets also turn it into an absolute path
    parameter_abs = parameter.resolve()

    # now let's go through all the XML files in the folder and remove the doctypes
    fm_xml_files = list(parameter_abs.glob('*.xml'))

    # loop through XML files
    for file in fm_xml_files:
        print(file.name)
        with open(file, 'r', encoding='utf-8') as f:
            file_lines = f.readlines()

        root_start = 0  # line the root element starts at
        # remove anything before the root element
        for i, line in enumerate(file_lines):
            line_content = line.strip()
            if re.match(r'<[A-Za-z0-9._]', line_content):
                # found root
                root_start = i
                break

        # if LawMaker XML we expect the root to be akomaNtoso
        # in which case completely ignore
        if file_lines[root_start].find('akomaNtoso') != -1:
            print('LM bill')
            continue
        else:
            print('FM bill')

        # try to overwrite file
        with open(file, 'w', encoding='UTF-8') as fi:
            fi.writelines(file_lines[root_start:])

In [4]:
def extract_date(input_Path: Path) -> str:
    """Extract date form input XML from SharePoint"""

    with open(input_Path) as f:
        input_xml_str = f.read()

    # print(f'input_xml_str length is: {len(input_xml_str)}')
    # get the updated date
    match = re.search(r'(<updated>)([A-Z0-9:+-]+)(</updated>)', input_xml_str)
    date_str = ''
    if match:
        date_str = match.group(2)
        print(f'Date extracted from input xml (from SharePoint) {date_str}')

    try:
        dt = datetime.strptime(date_str[:19], '%Y-%m-%dT%H:%M:%S')
        formated_date = dt.strftime("%Y-%m-%d__%H-%M")
    except Exception as e:
        print(repr(e))
        formated_date = ''

    return formated_date

In [5]:
def check_xsl_paths(*xsls: Path) -> bool:

    for xsl_Path in xsls:
        try:
            # check xsl paths are valid
            xsl_Path = xsl_Path.resolve(strict=True)

        except FileNotFoundError as e:
            err_txt = ('The following required XSLT file is missing:'
                    f'\n\n{xsl_Path}'
                    '\n\nUsually you should have two XSL files in a folder called \'XSLT\''
                    ' and that folder should be in the same folder as this program.')
            print('Error:', err_txt)
            if USE_GUI:
                # this can be caught in the GUI code and the Error message displayed in a GUI window
                raise Exception(err_txt) from e
            return False

    return True

In [16]:
input_Path=Path("/Users/mark/projects/added-names/_Reports/2023-05-17/Dashboard_Data/items.txt")
xsl_1_Path = XSL_1_PATH
xsl_2_Path = XSL_2_PATH
parameter = Path("/Users/mark/projects/added-names/_Reports/2023-05-17/XML_Files")

formated_date = extract_date(input_Path)

intermediate_file_name = f"{formated_date}_intermediate.xml"
input_file_resave_name = f"{formated_date}_input_from_SP.xml"
output_file_name = f"Added_Names_Report.html"


if WORKING_FOLDER is None:
    dated_folder_Path = REPORTS_FOLDER.joinpath(formated_date).resolve()
else:
    dated_folder_Path = WORKING_FOLDER.resolve()  # working folder selected in UI
dated_folder_Path.mkdir(parents=True, exist_ok=True)

xml_folder_Path = dated_folder_Path.joinpath(DASHBOARD_DATA_FOLDER)
xml_folder_Path.mkdir(parents=True, exist_ok=True)

intermidiate_Path = xml_folder_Path.joinpath(intermediate_file_name)
out_html_Path     = dated_folder_Path.joinpath(output_file_name)

print(f'{intermidiate_Path=}')
print(f'{out_html_Path=}')

# resave the input file
resave_Path = xml_folder_Path.joinpath(input_file_resave_name)
with open(resave_Path, 'w') as f:
    f.write(input_Path.read_text())

with saxonche.PySaxonProcessor(license=False) as proc:

    print(proc.version)

    # need to be as uri in case there are spaces in the path
    input_path = input_Path.as_uri()
    # intermidiate_path = intermidiate_Path.as_uri()
    intermidiate_path = str(Path('/Users/mark/projects/added-names/_Reports/2023-05-17/Dashboard Data/intermediate2.xml'))
    outfilepath = out_html_Path.as_uri()

    # --- 1st XSLT ---
    xsltproc = proc.new_xslt30_processor()

    executable = xsltproc.compile_stylesheet(stylesheet_file=str(xsl_1_Path))
    print(
        f"\nsource_file={input_path=}\n"
        f"output_file={intermidiate_path=}\n"
    )
    executable.set_output_file(intermidiate_path)
    executable.transform_to_file(source_file=input_path)

    print("Before or after error?")


    # --- 2nd XSLT ---
    # xsltproc2 = proc.new_xslt30_processor()

    # executable2 = xsltproc2.compile_stylesheet(stylesheet_file=str(xsl_2_Path))

    # if parameter:
    #     # get path to folder containing /FrameMaker XML file(s)
    #     # and pass this to XSLT processor as a parameter.
    #     # This is for for marshelling.
    #     remove_docstring(parameter)
    #     parameter_str = parameter.as_uri()  # uri works best with Saxon
    #     param = proc.make_string_value(parameter_str)

    #     executable2.set_parameter(XSLT_MARSHAL_PARAM_NAME, param)

    # executable2.transform_to_file(source_file=intermidiate_path, output_file=outfilepath)

    # # --- finished transforms ---

    # print(f'Created: {out_html_Path}')

    # webbrowser.open(outfilepath)

    print('Done.')

Date extracted from input xml (from SharePoint) 2023-05-17T17:02:38Z
intermidiate_Path=PosixPath('/Users/mark/projects/added-names/_Reports/2023-05-17/Dashboard_Data/2023-05-17__17-02_intermediate.xml')
out_html_Path=PosixPath('/Users/mark/projects/added-names/_Reports/2023-05-17/Added_Names_Report.html')
SaxonC-HE 12.0 from Saxonica

source_file=input_path='file:///Users/mark/projects/added-names/_Reports/2023-05-17/Dashboard_Data/items.txt'
output_file=intermidiate_path='/Users/mark/projects/added-names/_Reports/2023-05-17/Dashboard Data/intermediate2.xml'

Before or after error?
Done.


In [20]:
remove_docstring(Path('/Users/mark/projects/added-names/_Reports/2023-05-17/XML_Files'))

hi
non_domestic_rm_cwh_0517.xml
LM bill
procurement_rm_rep_0517.xml
FM bill
