In [None]:
import xml.etree.ElementTree as ET
from typing import NamedTuple
import itertools
import json
from decimal import Decimal
from itertools import groupby, starmap, takewhile
from functools import partial
from typing import List, Iterator
import logging
import sys

from parsel import Selector

In [None]:
# Notebook-wide logger used by the parsing helpers defined in later cells.
log = logging.getLogger("parser")
# Send log records to stdout and request WARN as the minimum level, so the
# log.info(...) calls in the parsing functions stay quiet unless the level is lowered.
logging.basicConfig(stream=sys.stdout, level=logging.WARN)

In [None]:
# Generate XML file:
# pdf2txt.py --output_type xml --outfile - -A -L 0.51 -F +0.8 -V test_may_2023.pdf | xmllint --format - > test_may_2023_new.xml

In [None]:
# Load the XML dump produced by the pdf2txt.py command documented above and wrap it
# in a parsel Selector so later cells can query it with XPath.
# `Selector` comes from the notebook's import cell; it is not re-imported here.
# The encoding is given explicitly so the read does not depend on the platform's
# locale default (assumes the dump is UTF-8 — confirm if the pdf2txt codec was changed).
with open("test_may_2023_new.xml", "r", encoding="utf-8") as fp:
    xmlraw = fp.read()

s = Selector(text=xmlraw, type="xml")

In [None]:
# Spot check: pick the first <textline> inside textbox 21 on page 1 ...
textline = s.xpath("./page[@id='1']//textbox[@id='21']/textline")[0]
# ... and display its text, rebuilt by concatenating the text of its <text> children
# and trimming surrounding whitespace.
"".join(textline.xpath("./text/text()").getall()).strip()

In [None]:
class BBox(NamedTuple):
    """Axis-aligned bounding box with fields in the order left, bottom, right, top."""

    left: float
    bottom: float
    right: float
    top: float

def process_bbox(tag: Selector) -> BBox:
    """Parse a tag's ``bbox`` attribute into a ``BBox``.

    The attribute is split on commas and each piece is converted to ``float``.
    The values are assigned positionally to the ``BBox`` fields, i.e. in the
    order left, bottom, right, top. A tag without a ``bbox`` attribute yields
    an all-zero box.

    Raises:
        ValueError: if a piece cannot be converted to ``float``.
        TypeError: if the attribute does not contain exactly four values.
    """
    raw_bbox = tag.attrib.get("bbox", "0,0,0,0")
    return BBox(*(float(piece) for piece in raw_bbox.split(",")))

def bounding_bbox(first: BBox, last: BBox) -> BBox:
    """Combine two boxes into one spanning box.

    The left/bottom edges are taken from ``first`` and the right/top edges from
    ``last``. No min/max is applied, so the result only encloses both boxes when
    ``first`` is the left/bottom-most one and ``last`` the right/top-most one.
    """
    return BBox(
        left=first.left,
        bottom=first.bottom,
        right=last.right,
        top=last.top,
    )

In [None]:
class TextLine(NamedTuple):
    """A parsed ``<textline>``: its box, owning textbox id, text, font and size."""

    bbox: BBox
    parent_id: int
    text: str
    font: str
    size: float


class TextBox(NamedTuple):
    """A textbox record: its box, id and the text lines it holds."""

    bbox: BBox
    id: int
    lines: List[TextLine]

def process_textbox(tag: Selector):
    """Placeholder for textbox parsing; not implemented yet, always returns ``None``."""

def process_textline(textline: Selector) -> TextLine:
    """Build a ``TextLine`` record from a ``<textline>`` selector.

    - ``bbox``: parsed from the tag's own ``bbox`` attribute.
    - ``parent_id``: the ``id`` of the enclosing ``<textbox>``, as an ``int``.
    - ``text``: the text of all ``<text>`` children joined together and stripped.
    - ``font`` / ``size``: the first ``font`` / ``size`` attribute found on the
      ``<text>`` children (``size`` is converted to ``float``).

    Raises:
        TypeError: if the parent id or the size attribute is missing
            (``int(None)`` / ``float(None)``).
    """
    return TextLine(
        bbox=process_bbox(textline),
        parent_id=int(textline.xpath("parent::textbox/@id").get()),
        text="".join(textline.xpath("./text/text()").getall()).strip(),
        font=textline.xpath("text/@font").get(),
        size=float(textline.xpath("text/@size").get()),
    )

In [None]:
class LinePart(NamedTuple):
    """Bounding box of a single ``<line>`` tag."""

    bbox: BBox


class Separator(NamedTuple):
    """Bounding box spanning a run of line parts that are treated as one separator."""

    bbox: BBox

def process_linepart(linepart: Selector) -> LinePart:
    """Wrap the bounding box of a ``<line>`` selector in a ``LinePart``."""
    return LinePart(bbox=process_bbox(linepart))

def process_separator(lineparts: List[LinePart]) -> Separator:
    """Merge a run of line parts into one ``Separator``.

    Only the first and last parts contribute: the separator's box is
    ``bounding_bbox(first.bbox, last.bbox)``.

    Raises:
        IndexError: if ``lineparts`` is empty.
    """
    first_part, last_part = lineparts[0], lineparts[-1]
    return Separator(bounding_bbox(first_part.bbox, last_part.bbox))

In [None]:
# Prefix of the text line after which a page's transaction components are collected.
details_sentinel = "Details of your account"
# A page component is either a parsed text line or a separator built from <line> tags.
Component = TextLine|Separator
# A list of page components.
Components = List[Component]

def process_page_components(page_tag: Selector) -> Components:
    """Collect the text lines and separators that follow the sentinel line on a page.

    Steps:
      1. Parse every ``<textline>`` that sits in a non-vertical ``<textbox>`` and
         has at least one ``<text>`` child with ``size >= 8``. The last text line
         whose text starts with ``details_sentinel`` becomes the sentinel.
      2. Parse every ``<line>`` tag whose bottom is not above the sentinel's bottom
         and group consecutive ones sharing the same bottom coordinate into a
         single ``Separator``.
      3. Sort all components top-to-bottom, then left-to-right, and return the
         ones that come after the sentinel.

    Args:
        page_tag: selector for one ``<page>`` element.

    Returns:
        The sorted components located after the sentinel (sentinel excluded).

    Raises:
        ValueError: if no text line on the page starts with ``details_sentinel``.
    """
    page_id = page_tag.attrib.get("id", "0")
    log.info(f"Processing page {page_id}")

    page_transaction_components = []
    sentinel_found = None

    # Text lines in non-vertical textboxes having at least one <text> child of size >= 8
    for textline_tag in page_tag.xpath(".//textbox[not(@wmode = 'vertical')]/textline[text/@size >= 8]"):
        textline = process_textline(textline_tag)
        log.info(f"Textline: {textline.bbox}; {textline.font}@{textline.size}\n{textline.text}")
        page_transaction_components.append(textline)

        if textline.text.startswith(details_sentinel):
            log.info(f"sentinel found: {textline.text!r}")
            sentinel_found = textline

    # Without a sentinel there is nothing to anchor the result on. Fail with a clear
    # message instead of the opaque ValueError that list.index(None) would raise later.
    if sentinel_found is None:
        raise ValueError(f"Sentinel {details_sentinel!r} not found on page {page_id}")

    def flush_separator(parts: List[LinePart]) -> None:
        # Turn the accumulated run of line parts into one Separator component.
        sep = process_separator(parts)
        log.info(f"Separator: {sep}")
        page_transaction_components.append(sep)

    sep_lines = []
    for line_tag in page_tag.xpath(".//line"):
        linepart = process_linepart(line_tag)

        # Ignore lines that are not below the sentinel.
        # Vertical coordinates start from the bottom of the page.
        if linepart.bbox.bottom > sentinel_found.bbox.bottom:
            continue

        # Parts sharing the bottom coordinate of the run's first part belong together;
        # anything else closes the current run and starts a new one.
        if not sep_lines or sep_lines[0].bbox.bottom == linepart.bbox.bottom:
            sep_lines.append(linepart)
        else:
            flush_separator(sep_lines)
            sep_lines = [linepart]

    # Emit the run that was still open when the <line> tags ran out.
    if sep_lines:
        flush_separator(sep_lines)

    # Sort the parsed transaction components vertically (inversed), then horizontally
    page_transaction_components.sort(key=lambda item: (-(item.bbox.bottom), item.bbox.left))

    # Take all items after the sentinel (it is always present: it was appended above)
    sentinel_index = page_transaction_components.index(sentinel_found)
    return page_transaction_components[sentinel_index+1:]

In [None]:
# Names of the transaction columns, in the order the header row is validated against.
transaction_fields = ["date", "description", "withdrawals", "deposits", "balance"]
# Python type of each column, index-aligned with `transaction_fields`.
transaction_field_types = [str, str, Decimal, Decimal, Decimal]
# Record type with one field per column, built from the two lists above.
Transaction = NamedTuple(
    "Transaction",
    [
        (field_name, field_type)
        for field_name, field_type in zip(transaction_fields, transaction_field_types)
    ],
)

def take_transaction_components(components: Iterator[Component]) -> Components:
    """
    Pull the components of one transaction off the front of an iterator.

    Items are consumed until the first Separator is reached (or the iterator
    is exhausted). The Separator itself is consumed from the iterator but is
    not part of the returned list.
    """
    return list(takewhile(lambda candidate: type(candidate) is not Separator, components))

def validate_transaction_headers(header_text: List[TextLine]) -> bool:
    """Check that a row of components is the expected transaction header row.

    The row is valid when it has exactly one entry per name in
    ``transaction_fields``, every entry is a ``TextLine``, and each entry's
    lower-cased text starts with the field name at the same position.

    Diagnostics go through the notebook's ``log`` instead of ``print`` so they
    obey the configured log level.

    Args:
        header_text: candidate header components, in row order.

    Returns:
        ``True`` when the row matches the expected headers, ``False`` otherwise.
    """
    if len(header_text) != len(transaction_fields):
        log.warning(f"header mismatch: {header_text!r} {transaction_fields!r}")
        return False

    # Ensure headers are Text types
    if not all(type(header) == TextLine for header in header_text):
        return False

    # Ensure all header text fields match expected transaction fields, position by position.
    # TODO: sort headers and fields?
    fields_match = [
        header.text.lower().startswith(fieldname)
        for header, fieldname in zip(header_text, transaction_fields)
    ]
    log.debug(f"fields_match: {fields_match}")
    return all(fields_match)

def map_component_to_field(
    header_fields: dict[str, TextLine],
    component: TextLine,
    threshold: float = 25,
) -> tuple[str, TextLine]:
    """Decide which transaction field a component belongs to, based on header alignment.

    Fields are tried in ``transaction_fields`` order. A component matches a field
    when one of its horizontal edges lies within ``threshold`` of the same edge of
    that field's header: the right edge is tried first for ``Decimal`` fields
    (numeric columns are right-aligned), then the left edge for every field.

    Args:
        header_fields: mapping of field name to the header component of that column.
            Fields missing from the mapping are skipped.
        component: the component to classify.
        threshold: maximum edge distance (exclusive) still counted as aligned.

    Returns:
        ``(field_name, component)``; ``field_name`` is ``""`` when nothing matched.
    """

    def is_aligned(edge: str, header: TextLine) -> bool:
        # Compare the same bbox edge ("left" / "right") of the header and the component.
        return abs(getattr(header.bbox, edge) - getattr(component.bbox, edge)) < threshold

    for field, field_type in zip(transaction_fields, transaction_field_types):
        header_component = header_fields.get(field)
        if header_component is None:
            continue

        # Numeric fields will be right-aligned, so check if right bounds are close
        if field_type == Decimal and is_aligned("right", header_component):
            return field, component
        if is_aligned("left", header_component):
            return field, component

    return "", component

def page_components_to_transactions(page_components: Components) -> List[tuple[str, TextLine]]:
    """Map the first row after the header row to transaction field names.

    The components up to the first Separator are taken as the header row and
    validated. The components up to the next Separator are then each paired
    with the field whose header they align with.

    NOTE(review): only the first row after the header is processed and the result
    is a list of ``(field_name, component)`` pairs, not ``Transaction`` records —
    confirm whether iterating over all rows / building ``Transaction`` is intended.

    Args:
        page_components: sorted components of a page, starting at the header row.

    Returns:
        One ``(field_name, component)`` pair per component of the first row;
        ``field_name`` is ``""`` for components that align with no header.

    Raises:
        ValueError: if the first row is not a valid header row.
    """
    it_components = iter(page_components)

    # First line should be headers
    header_components = take_transaction_components(it_components)
    if not validate_transaction_headers(header_components):
        raise ValueError(f"Invalid headers! {header_components}")

    # Map fields to header components: {"date": TextLine, "description": TextLine, ...}
    header_fields = dict(zip(transaction_fields, header_components))

    # The shared iterator continues right after the header row's separator.
    return [
        map_component_to_field(header_fields, component)
        for component in take_transaction_components(it_components)
    ]

# Try the pipeline on page 1: select the page, ...
p = s.xpath("./page[@id=1]")
# ... collect the components that follow the sentinel line, ...
lc = process_page_components(p)
# ... dump them for inspection, ...
print(lc)
# ... and display the field mapping of the first row after the header.
page_components_to_transactions(lc)

In [None]:
# Plain-text dump of page 1: text lines are printed on one line separated by spaces,
# and every Separator starts a new line showing its repr.
for page in s.xpath("./page[@id=1]"):
    page_id = page.attrib.get("id", "0")
    print(f"Page {page_id}\n")
    # Components located after the sentinel line, already sorted top-to-bottom.
    page_components = process_page_components(page)
    for item in page_components:
        if type(item) == Separator:
            print(f"\n{item}")
        elif type(item) == TextLine:
            print(item.text, end=" ")
    # Visual divider after the page.
    print("\n" + "-"*20 + "\n")