In [125]:
from typing import NamedTuple
import xml.etree.ElementTree as ET
from itertools import accumulate, starmap
from functools import reduce
import random
from parsel import Selector

In [176]:
class BBox(NamedTuple):
    """Axis-aligned bounding box stored in the order left, bottom, right, top."""

    left: float
    bottom: float
    right: float
    top: float

    def to_attr(self) -> str:
        """Return the box as a ``bbox`` attribute value.

        The four coordinates are written in field order, each with three
        decimal places, separated by commas (e.g. ``"90.000,10.000,96.510,19.000"``).
        """
        return ",".join(f"{coord:.3f}" for coord in self)

    @staticmethod
    def process_tag(tag: "Selector") -> "BBox":
        """Build a ``BBox`` from the ``bbox`` attribute of ``tag``.

        ``tag`` only needs an ``attrib`` mapping. The attribute is expected to
        hold 4 comma-separated coordinates in field order: left, bottom, right,
        top. A missing attribute yields an all-zero box.

        Raises:
            ValueError: if a coordinate is not a valid float.
            TypeError: if the attribute does not hold exactly 4 coordinates.
        """
        # The annotations are quoted because ``BBox`` does not exist yet while
        # its own class body is executing; unquoted, the definition raises
        # NameError on a fresh kernel.
        str_coords = tag.attrib.get("bbox", "0,0,0,0").split(",")
        return BBox(*map(float, str_coords))

def gen_textline(text: str, left: float, bottom: float, font: str, size: float, **attrs):
    """Build a synthetic ``<textline>`` element with one ``<text>`` child per character.

    Args:
        text: Characters to lay out from left to right.
        left: x coordinate where the first character starts.
        bottom: y coordinate of the bottom edge of the line.
        font: Value of the ``font`` attribute on every glyph.
        size: Font size; written as the ``size`` attribute and used as the
            line height (``top = bottom + size``).
        **attrs: Extra attributes copied onto every glyph. Values are
            converted with ``str`` so non-string values can be serialised.

    Returns:
        The ``textline`` element. Its ``bbox`` attribute spans from ``left``
        to the right edge of the last advancing character. Spaces and newlines
        become bare ``<text>`` children without attributes; a space still
        advances the cursor, a newline does not.
    """
    # ElementTree can only serialise string attribute values.
    tag_attrs = {"font": font, "size": str(size)}
    tag_attrs.update((name, str(value)) for name, value in attrs.items())

    top = bottom + size
    cursor = left
    textline_tag = ET.Element("textline")
    for char in text:
        # Drawn for every character, newlines included, so the sequence of
        # random draws under a given seed is unchanged.
        # NOTE(review): the 2.3-7.2 range looks like it brackets the "i" and "m"
        # widths measured elsewhere in this notebook - confirm.
        width = random.uniform(2.3, 7.2)
        right = cursor + width

        if char in (" ", "\n"):
            text_tag = ET.Element("text")
            if char == " ":
                cursor = right
        else:
            glyph_box = BBox(cursor, bottom, right, top)
            # Pass an explicit attrib dict: a caller attribute named "bbox" or
            # "attrib" would otherwise collide with Element's keyword arguments.
            text_tag = ET.Element("text", attrib={**tag_attrs, "bbox": glyph_box.to_attr()})
            cursor = right

        text_tag.text = char
        textline_tag.append(text_tag)

    textline_tag.set("bbox", BBox(left, bottom, cursor, top).to_attr())
    return textline_tag

# Demo: generate one sample line and pretty-print the resulting XML.
attribs = dict(
    left=90.0,
    bottom=10.0,
    font="CSBRTR+MetaBookLF-Roman",
    size=9.0,
    colourspace="DeviceGray",
    ncolour="0",
)
tl = gen_textline("Hello world\n", **attribs)
ET.indent(tl)  # re-indents the tree in place
ET.dump(tl)

<textline bbox="90.000,10.000,143.713,19.000">
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="90.000,10.000,96.510,19.000">H</text>
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="96.510,10.000,102.295,19.000">e</text>
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="102.295,10.000,104.842,19.000">l</text>
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="104.842,10.000,111.960,19.000">l</text>
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="111.960,10.000,115.360,19.000">o</text>
  <text> </text>
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="119.426,10.000,123.344,19.000">w</text>
  <text font="CSBRTR+MetaBookLF-Roman" size="9.0" colourspace="DeviceGray" ncolour="0" bbox="123.344,10.000,127.066,19.000">o</text>
  <text 

In [81]:
# Experiment: keyword arguments to ET.Element become XML attributes, so
# text=" " produces a `text` attribute rather than text content. The child is
# added through an iterator passed to extend().
e = ET.Element("test")
tt_child = ET.Element("tt", text=" ")
e.extend(iter([tt_child]))
ET.dump(e)

<test><tt text=" " /></test>


In [167]:
# Sample top/bottom values and a starting box for the experiments below.
t, b = 9.0, 1.0
bb = BBox(left=0.0, bottom=b, right=100.0, top=t)
def mf(left, bottom, right, top):
    """Return the box that follows the given one on the same line.

    The new box starts at ``right``, is a random 2.3-7.2 units wide and keeps
    the same ``bottom`` and ``top``. ``left`` is accepted but not used.
    """
    next_right = right + random.uniform(2.3, 7.2)
    return BBox(right, bottom, next_right, top)

def w_gen():
    """Draw one random glyph width between 2.3 and 7.2."""
    return random.uniform(2.3, 7.2)


# Running total of random widths paired with each character of "Hello":
# every entry is (cumulative width so far, character).
list(
    accumulate(
        ((w_gen(), ch) for ch in "Hello"),
        lambda running, item: (running[0] + item[0], item[1]),
    )
)

[(3.843886154991215, 'H'),
 (9.281238908400304, 'e'),
 (13.258205084992266, 'l'),
 (17.064033711544806, 'l'),
 (20.835528910542003, 'o')]

In [42]:
# Widths (right minus left) of two sample glyph boxes, labelled "i" and "m".
i_char = tuple(float(coord) for coord in "97.190,272.163,99.575,281.163".split(","))
print(f"i width: {i_char[2]-i_char[0]:.3f}")
m_char = tuple(float(coord) for coord in "90.000,272.163,97.191,281.163".split(","))
print(f"m width: {m_char[2]-m_char[0]:.3f}")

i width: 2.385
m width: 7.191


In [98]:
# Read the whole XML document and wrap it in an XML-mode selector.
xml_path = "test_may_2023_new.xml"
with open(xml_path) as xml_fp:
    xml_text = xml_fp.read()

s = Selector(text=xml_text, type="xml")
s

<Selector query=None data='<pages>\n  <page id="1" bbox="0.000,0....'>

In [168]:
class mapping_key(NamedTuple):
    """One observed glyph: its character, a "<font>@<size>" label and its width."""

    char: str
    font: str
    w: float


def _to_mapping_key(tag) -> mapping_key:
    """Summarise one ``<text bbox=...>`` element as a ``mapping_key``."""
    # Keep only the part of the font name after the last "+"
    # (presumably this drops a font-subset prefix - confirm).
    font_name = tag.attrib.get("font").split("+")[-1]
    font_size = float(tag.attrib.get("size"))
    bbox = BBox.process_tag(tag)
    return mapping_key(
        tag.xpath("text()").get(),
        f"{font_name}@{font_size}",
        round(bbox.right - bbox.left, 3),
    )


# Materialised as a list so `keys` can be inspected again after grouping,
# instead of being left behind as an exhausted iterator.
keys = [_to_mapping_key(tag) for tag in s.xpath(".//text[@bbox]")]

# Group in one pass: char -> font label -> set of observed widths.
# Updating in place avoids copying the whole dict and set for every glyph.
widths_by_char = {}
for mpk in sorted(keys, key=lambda mpk: mpk.char):
    widths_by_char.setdefault(mpk.char, {}).setdefault(mpk.font, set()).add(mpk.w)
widths_by_char

{'#': {'MetaBookLF-Roman@9.0': {4.959}},
 '$': {'MetaBoldLF-Roman@8.0': {4.672},
  'MetaBookLF-Roman@9.0': {5.238},
  'MetaBoldLF-Roman@9.0': {5.256}},
 '(': {'Utopia-Regular@8.5': {2.975}, 'MetaBoldLF-Roman@8.0': {2.496}},
 ')': {'Utopia-Regular@8.5': {2.975}, 'MetaBoldLF-Roman@8.0': {2.48}},
 '*': {'C39-Medium24ptLJ3A@14.768': {26.0}},
 '+': {'MetaBookLF-Roman@9.0': {5.085}},
 ',': {'Utopia-Regular@9.0': {2.385},
  'MetaBookLF-Roman@9.0': {2.583},
  'MetaBoldLF-Roman@9.0': {2.556},
  'Utopia-Regular@6.0': {1.59}},
 '-': {'Utopia-Regular@8.5': {3.332},
  'Utopia-Bold@9.0': {3.528},
  'MetaBookLF-Roman@9.0': {2.727},
  'Helvetica@2.664': {8.0},
  'Utopia-Bold@14.0': {5.488},
  'Utopia-Regular@6.0': {2.352}},
 '.': {'Utopia-Regular@8.0': {2.12},
  'Utopia-Regular@8.5': {2.253},
  'MetaBookLF-Roman@9.0': {2.538},
  'MetaBoldLF-Roman@9.0': {2.484},
  'Utopia-Regular@6.0': {1.59}},
 '/': {'Utopia-Regular@8.5': {3.91}},
 '0': {'Utopia-Regular@8.0': {4.24},
  'Utopia-Regular@9.0': {4.77},
  

In [129]:
def reduce_dict(d, kv):
    """Fold step that groups values by key.

    Appends ``kv[1]`` to the list stored under ``kv[0]`` in ``d``, creating the
    list if the key is new. ``d`` is modified in place and returned, so the
    function can be passed straight to ``reduce``.
    """
    key, val = kv[0], kv[1]
    d.setdefault(key, []).append(val)
    return d

def group_pair(d, kv):
    """Non-mutating fold step: return a new dict with ``kv[1]`` appended under ``kv[0]``.

    The key is stored exactly as given. Storing it as a formatted string while
    looking it up unformatted meant non-string keys never matched and earlier
    values were dropped.
    """
    key, val = kv[0], kv[1]
    return d | {key: d.get(key, []) + [val]}


pairs_grouped = reduce(group_pair, [('a', 1), ('a', 2), ('d', 5), ('b', 1), ('d', 1)], {})
pairs_grouped

{'a': [1, 2], 'd': [5, 1], 'b': [1]}