In [2]:
import lxml.etree as ET
from tqdm import tqdm

import sys
print(sys.version)

3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]


In [3]:
# Parse the OCR export once; the page-building cell below walks it with the
# XPath "/DjVuXML/BODY/OBJECT".
xml = ET.parse("cua_scriboniilarg00scri_djvu.xml")

In [33]:
from collections import namedtuple
from statistics import median, stdev
from typing import List

# Box of a single word; the fields are filled, in this order, from the first
# four comma-separated values of an element's "coords" attribute.
Bbox = namedtuple("Bbox", ["left", "bottom", "right", "top"])
# A word: its stripped text together with its Bbox.
Word = namedtuple("Word", ["text", "bbox"])


class Line:
    """A sequence of words sharing a line number and a paragraph number.

    Horizontal extent is the min/max over the words, while the vertical
    coordinates are the median of the words' ``bottom`` / ``top`` values.
    """

    def __init__(self, no, p_no):
        """
        :param no: line number, stored as ``id``
        :param p_no: paragraph number, stored as ``paragraph_id``
        """
        self.id = no
        self.paragraph_id = p_no
        self.words = []

    def _word_edges(self, side):
        """Return the ``side`` coordinate of every word's bbox, in order."""
        return [getattr(word.bbox, side) for word in self.words]

    def add_word(self, element):
        """Append a Word built from ``element``.

        :param element: object exposing ``text`` (a string) and
            ``attrib["coords"]`` (comma-separated numbers; only the first
            four are used)
        """
        text = element.text.strip()
        raw_coords = element.attrib["coords"].split(",")
        box = Bbox(*map(float, raw_coords[:4]))
        self.words.append(Word(text, box))

    @property
    def bottom(self) -> float:
        """Median ``bottom`` coordinate of the words."""
        return median(self._word_edges("bottom"))

    @property
    def top(self) -> float:
        """Median ``top`` coordinate of the words."""
        return median(self._word_edges("top"))

    @property
    def left(self) -> float:
        """Smallest ``left`` coordinate among the words."""
        return min(self._word_edges("left"))

    @property
    def right(self) -> float:
        """Largest ``right`` coordinate among the words."""
        return max(self._word_edges("right"))

    @property
    def coords(self) -> List[float]:
        """``[left, bottom, right, top]`` of the line."""
        return [self.left, self.bottom, self.right, self.top]

    @property
    def text(self):
        """Texts of the words joined by single spaces."""
        return " ".join(word.text for word in self.words)

    def __repr__(self):
        return "<Line>" + self.text + "</Line>"
    
class Page:
    """A page: an ordered list of lines plus statistics on their spacing.

    Blocks are detected from the differences between the ``bottom`` values
    of consecutive lines: a difference larger than ``median + stdev`` of all
    differences on the page starts a new block.
    """

    def __init__(self, no, uri):
        """
        :param no: page number, stored as ``id``
        :param uri: address associated with the page, stored as ``uri``
        """
        self.lines = []
        self.uri = uri
        self.id = no
        # Cache for ``distances``; None until the property is first read.
        self._distances = None

    def add_line(self, line):
        """Append ``line`` (an object exposing ``bottom``) to the page."""
        self.lines.append(line)

    @property
    def distances(self) -> List[float]:
        """Differences between the ``bottom`` of each line and the previous one.

        Holds ``len(self.lines) - 1`` values (none for pages with fewer than
        two lines). The result is cached and recomputed whenever the number
        of lines no longer matches the cached list, so adding lines after a
        first read cannot leave stale values behind.
        """
        expected = max(len(self.lines) - 1, 0)
        if self._distances is None or len(self._distances) != expected:
            bottoms = [line.bottom for line in self.lines]
            self._distances = [
                current - previous
                for previous, current in zip(bottoms, bottoms[1:])
            ]
        return self._distances

    @property
    def median(self) -> float:
        """Median of ``distances``.

        Goes through the ``distances`` property rather than the raw cache,
        so it works even when nothing has populated the cache yet.
        """
        return median(self.distances)

    def __len__(self):
        """Number of lines on the page."""
        return len(self.lines)

    @property
    def stdev(self):
        """Sample standard deviation of ``distances``.

        :raises statistics.StatisticsError: with fewer than two distances
        """
        return stdev(self.distances)

    def is_break(self, distance) -> bool:
        """Tell whether ``distance`` is larger than ``median + stdev``.

        With fewer than two distances there is no spread to compare against
        (``stdev`` would raise), so nothing is considered a break.
        """
        if len(self.distances) < 2:
            return False
        return distance > (self.stdev + self.median)

    def iter_lines(self):
        """Yield ``(starts_block, line, distance)`` for every line.

        ``distance`` is None for the first line, which always starts a block.
        """
        for line, dist in zip(self.lines, [None] + self.distances):
            yield (
                dist is None or self.is_break(dist),  # does it break ?
                line,
                dist
            )

    @property
    def nb_blocks(self):
        """Number of blocks on the page.

        Pages of at most two lines report their line count; otherwise this
        is the number of lines flagged as starting a block by ``iter_lines``.
        """
        if len(self) <= 2:
            return len(self)
        return sum(
            1
            for is_breaking, _, _ in self.iter_lines()
            if is_breaking
        )

    def iter_blocks(self):
        """Yield the lines grouped into blocks (lists of consecutive lines)."""
        block = []
        for is_breaking, line, _ in self.iter_lines():
            if is_breaking and block:
                yield block
                block = []
            block.append(line)
        if block:
            yield block
    
pages = []
for page_no, page_element in enumerate(xml.xpath("/DjVuXML/BODY/OBJECT")):
    uri = f"https://archive.org/details/cuascriboniilarg00scri/page/n{page_no}/mode/1up"
    page = Page(page_no, uri=uri)

    # Lines are numbered from 1 over the whole page; a LINE element without
    # any WORD is skipped and does not consume a number.
    line_no = 1
    for p_no, paragraph in enumerate(page_element.xpath(".//PARAGRAPH")):
        for line_element in paragraph.xpath(".//LINE"):
            words = line_element.xpath(".//WORD")
            if not words:
                continue
            line = Line(line_no, p_no)
            for word in words:
                line.add_word(word)
            page.add_line(line)
            line_no += 1

    # Dump pages that split into more than three blocks, flagging every line
    # that starts a block, then print a separator for every page.
    nb_breaks = page.nb_blocks
    if len(page) > 2 and nb_breaks > 3:
        for is_breaking, line, dist in page.iter_lines():
            print(is_breaking, line.text)
    print("\n=================\n")
    pages.append(page)













True — VII —
True reliquisse ingenue fateor; itaque opto ; ut tandem aut
False Ruellii codex iterum aut si quis alius vetustior et inte-
False grior aetatem tulit in medium proferatur. Quo facto eam
False quoque quaestionem profligatum iri spero ; quae me multum
False diuque torsit deliberantem ; utrum Scribonius hunc librum
False de conpositionibus medicamentorum an ; quod mihi placuit,
False conpositiones inscripserit. Illam enim inscriptionem ; quae
False in fronte editionis principis legitur, a Ruellio ; non ab ipso
False auctore profectam esse putaverim, qui quin potius de
False conpositione medicamentorum, ut Aldus eumque secutus
False Stephanus suo Marte emendaverunt ; scripturus fuerit ; equi-
False dem non dubito ; sicuti Galenum libros decem tceql gvv-
False &i<5Ecog cpccQ^ittKcov xcov KCittt xoitovQ et septem 7CSQL Gvv&e-
False GEcog cpciQ[ittKCQv xcov %ccxtt yivr) inscripsisse constat. Qua
False de causa mihi verisimilius esse visum est alteram inscrip-
False ti













































In [54]:
from xml.sax.saxutils import escape

def write_block(f, block, name=""):
    """Write ``block`` to ``f`` as an element named after ``name`` upper-cased.

    The element first holds a ``FULL_TEXT`` child with one escaped text row
    per line, then one ``LINE`` element per line in which every word is
    wrapped in ``WORD``.

    :param f: object with a ``write`` method accepting strings
    :param block: iterable of lines, each exposing ``words`` (objects with
        a ``text`` string); it is traversed twice
    :param name: tag name, upper-cased before use
    """
    tag = name.upper()

    f.write("\t<" + tag + ">\n")

    # Plain-text rendition of the block, one row per line.
    f.write("\t\t<FULL_TEXT>\n")
    for line in block:
        plain = " ".join(word.text for word in line.words)
        f.write("\t\t\t" + escape(plain) + "\n")
    f.write("\t\t</FULL_TEXT>\n")

    # Word-level rendition of the same lines.
    for line in block:
        tagged = " ".join(
            "<WORD>" + escape(word.text) + "</WORD>" for word in line.words
        )
        f.write("\t\t<LINE>" + tagged + "</LINE>\n")

    f.write("\t</" + tag + ">\n")

# Serialise every page into "scribonius_largus.xml".
# The encoding is given explicitly: the OCR text contains non-ASCII characters,
# so relying on the platform default could fail or produce a file that is not
# UTF-8.
with open("scribonius_largus.xml", "w", encoding="utf-8") as f:
    f.write("<TEXT>\n")
    for page in pages:
        f.write(f'<PAGE n="{page.id}" type="scan" uri="{escape(page.uri)}">\n')
        if page.nb_blocks >= 3:
            # With at least three blocks, the first is written as the header,
            # the second as the body and every remaining one as a note.
            header, body, *notes = page.iter_blocks()
            write_block(f, header, name="header")
            write_block(f, body, name="body")
            for note in notes:
                write_block(f, note, name="note")
        else:
            # Too few blocks to tell the parts apart: dump all lines together.
            write_block(f, page.lines, name="UNQUALIFIED")
        f.write("\n</PAGE>\n")
    f.write("</TEXT>\n")

In [145]:
# NOTE(review): this reads "scribonius_largus_blocks.xml", whereas the cell
# above writes "scribonius_largus.xml" - presumably a manually corrected copy;
# confirm where it comes from so the notebook can be re-run from scratch.
new_xml = ET.parse("scribonius_largus_blocks.xml")

# NOTE(review): NewWord is not referenced anywhere in the visible notebook.
NewWord = namedtuple("NewWord", ["text", "page", "line"])

class NewLine:
    """A line of word elements, split into an optional number and its text.

    When the line has more than one word and its first (or else its last)
    word is numeric, that word is taken as the line number and the other
    words form the text.
    """

    def __init__(self, w_elements, page_no, line_no):
        """
        :param w_elements: list of word objects exposing a ``text`` string
        :param page_no: number of the page the line belongs to
        :param line_no: index of the line within its page
        """
        self.page_no = page_no
        self.line_no = line_no
        self.words = w_elements

        self._text = []
        self._num = None
        # Explicit flag: an empty ``_text`` is a legitimate parsed state
        # (e.g. after ``get_and_pop_first`` removed the only word) and must
        # not trigger a new split, which would bring popped words back.
        self._parsed = False

    def line_and_text(self):
        """Return ``(number, words)``, splitting the line on first use.

        ``number`` is the text of the numeric first/last word, or None.
        ``words`` is ``self.words`` itself when there is no number, otherwise
        a copy of it without the number word.
        """
        if not self._parsed:
            words = self.words
            if len(words) <= 1:
                self._num, self._text = None, words
            elif words[0].text.isnumeric():
                self._num, self._text = words[0].text, words[1:]
            elif words[-1].text.isnumeric():
                self._num, self._text = words[-1].text, words[:-1]
            else:
                self._num, self._text = None, words
            self._parsed = True
        return self._num, self._text

    def __repr__(self):
        no, text = self.line_and_text()
        if no:
            return f'<lb n="{no}" />{" ".join([w.text for w in text])}'
        return " ".join([w.text for w in text])

    def get_and_pop_first(self):
        """Remove and return the first text word.

        :raises IndexError: when the line has no text word left
        """
        _, text = self.line_and_text()
        return text.pop(0)

    def last_word(self):
        """Return the last text word, or None when the line has no text.

        The line is split on demand, so the answer does not depend on
        ``line_and_text`` having been called beforehand.
        """
        _, text = self.line_and_text()
        if text:
            return text[-1]
        return None
      
#NewLine = namedtuple("NewLine", ["words", "page", "line"])

# Collect the BODY lines of pages 8 (included) to 116 (excluded).
lines = []
all_pages = new_xml.xpath("//PAGE")
for page in all_pages:
    page_num = int(page.attrib["n"])
    if not (8 <= page_num < 116):
        continue
    for line_no, line in enumerate(page.xpath("./BODY/LINE")):
        words = line.xpath(".//WORD")
        if len(words) == 0:
            # A LINE without any WORD is unexpected: show it and stop.
            print(ET.tostring(line))
            raise Exception
        lines.append(NewLine(words, page_num, line_no))

print(len(lines))

# State carried across iterations by the writing loop further down.
last_p = None
last_l = None

import regex

# Patterns are raw strings so that "\w", "\W" and "\s" reach the regex engine
# as written instead of being parsed as (invalid) string escape sequences.

# One or more of the capital letters X, V, I, L, C, D - presumably a Roman
# numeral. NOTE(review): not referenced in the visible part of the notebook.
NUMBER = regex.compile(r"[XVILCD]+")
# Heuristic used by render_word to flag a token as Greek: at the start of the
# token, either lower-case letters followed by upper-case ones, or word
# characters interrupted by non-word characters; in both cases followed by
# one non-word character.
GREEK_WORD = regex.compile(r"([a-z]+[A-Z]+[a-z]*|\w+\W+\w+)\W\s*")

def render_word(word):
    """Return ``word`` escaped for XML, wrapped in ``<foreign>`` if it looks Greek.

    The sequences ")>" and "<(" are first replaced by "❱" and "❰". A token
    matching ``GREEK_WORD`` is printed (to review what the heuristic
    catches) and wrapped in ``<foreign xml:lang="grc">``.

    :param word: text of a single word
    :return: XML-escaped markup for the word
    """
    word = word.replace(")>", "❱").replace("<(", "❰")
    if GREEK_WORD.match(word):
        print(word)
        return f'<foreign xml:lang="grc">{escape(word)}</foreign>'
    return escape(word)
    
# Write the collected lines to "scribonius_largus_tei.xml": a <pb/> whenever
# the page changes, a <lb/> before every line, then the rendered words.
# The encoding is given explicitly because render_word emits non-ASCII
# characters ("❱", "❰"), which the platform default encoding may not support.
# NOTE: the hyphenation step below mutates the word elements and pops words
# from the following line, so this cell is not idempotent - rebuild `lines`
# before running it again.
with open("scribonius_largus_tei.xml", "w", encoding="utf-8") as f:
    for line, line_n1 in zip(lines, lines[1:]+[None]):
        _, words = line.line_and_text()

        if last_p != line.page_no:
            f.write(f'\t<pb n="{line.page_no}" facs="https://archive.org/details/cuascriboniilarg00scri/page/n{line.page_no}/mode/1up"/>\n')
            last_p = line.page_no

        f.write(f'\t<lb n="{line.line_no+1}" />\n')

        # A line ending with a hyphenated word: drop the hyphen and glue on
        # the first word of the next line, which is removed from that line.
        if line_n1 is not None and \
           line.last_word() is not None and \
           line.last_word().text.endswith("-"):

            words[-1].text = words[-1].text[:-1] + line_n1.get_and_pop_first().text

        # Separate name for the rendered text so the loop variable `line`
        # is not shadowed by a string.
        rendered = " "
        for word in words:
            rendered += render_word(word.text) + " "
        f.write("\t"+rendered+"\n")

3144
7tdQ"r\
7tccQ<x6Kev<x>v,
6%ot(o^atLKovg
tioXvvevqov,
7tEQi%Qi6t<x
i-rjQocp&cd(ACav
e?sent,
l%iky\nxi%ovg
anoTco(iccnKovg
di%ta^vov
xoloKvv&idct
uq6svi%ov,
nsqmlv^ivov,
6%oIotc£v8qiov,
fiotuvYi,
6't-vTQicpvXkov
T)pp<i£
tpL^iv&iov
ccql<jtoXo%icc
1%ks%uCxi%ov
xctQ%ivcQ[icc
aq<ssvi%ov
oTtiG&orovov,
pei*unguere.
