# The PageXML Document Model and Parsing PageXML files

In [1]:
from pagexml.parser import parse_pagexml_file
from pagexml.helper.pagexml_helper import pretty_print_textregion

# location of the example PageXML file, relative to the working directory
data_dir = '../data'
filepath = f'{data_dir}/example.xml'
# parse the file into a scan object (a PageXMLScan, see the type(scan) cell below)
scan = parse_pagexml_file(filepath)
# print the text of the scan line by line; the leading whitespace presumably
# mirrors each line's horizontal position on the page -- TODO confirm
pretty_print_textregion(scan)

                                              f. 1.

               Inventaris ende beschrijvinge van
                  naerbeschreven Cleedinge van
                 Linne ende Wollen, Item huijsraet
                  naergelaten by Janneken de Veel
                  oude Vrijster alhier overleden;
                  die volgens hare dispositie van
                  uterste wille, door d' eersame

                 Sr. hans van Essen Bouckhouder
                 vande Westinde. Compe. alhier In
                  Amstm. versonden sullen werden
                  naer Ryssel aen Beatricx de
                 Veel suster vande voorn. Janneken
                  de Veel sa: Beschreven door
                 mij Henrick Schaeff Nots etc.
                  in presentie vande ondergess.
                  getuijgen. desen 1e. Januarij
                  @ 1639.

 vijff floppen.
 Twaelf kragen soo goede als quade.
  Twaelf slaeplakens. soo goede als quade
  Twee oude brabantsche Vrouwen houven met har

In [2]:
# the parser returns a PageXMLScan from the physical document model module
type(scan)

pagexml.model.physical_document_model.PageXMLScan

In [3]:
# get a dict/JSON representation of the scan content
# NOTE: `json` is accessed as an attribute, not called. The dict contains Python
# datetime objects (see 'Created' / 'LastChange'), so it presumably needs a
# conversion step before it can be passed to json.dumps -- TODO confirm
scan.json

{'id': 'a30768000008.jpg',
 'type': ['pagexml_doc', 'text_region', 'scan'],
 'metadata': {'Creator': 'prov=University of Rostock/Institute of Mathematics/CITlab/Tobias Gruening/tobias.gruening@uni-rostock.de:name=/net_tf/LA73_249_0mod360.pb:de.uros.citlab.segmentation.CITlab_LA_ML:v=?0.1\nprov=University of Rostock/Institute of Mathematics/CITlab/Tobias Gruening/tobias.gruening@uni-rostock.de:name=/net_tf/LA73_249_0mod360.pb:de.uro.citlab.module.la.core.CITlab_LA_ML:v=?0.1\nTRP',
  'Created': datetime.datetime(2018, 7, 19, 10, 33, 16, 721000, tzinfo=tzoffset(None, 7200)),
  'LastChange': datetime.datetime(2018, 12, 10, 17, 9, 22, 188000, tzinfo=tzoffset(None, 3600)),
  'filename': '../data/example.xml'},
 'reading_order': {0: 'r1', 1: 'r2'},
 'coords': [(0, 0), (5429, 0), (5429, 4059), (0, 4059)],
 'text_regions': [{'id': 'r1',
   'type': ['pagexml_doc', 'text_region'],
   'metadata': {'reading_order': {'index': '0'}},
   'coords': [(2890, 476), (2890, 632), (2930, 632), (2930, 476)],


In [4]:
# the id of this scan is an image filename; presumably it is taken from the
# image filename recorded in the PageXML file -- TODO confirm
scan.id

'a30768000008.jpg'

In [5]:
# A scan exposes its types three ways: as a list (`type`), as a set of the
# same values (`types`), and as a single main type (`main_type`).
# One value per output line.
print(scan.type, scan.types, scan.main_type, sep='\n')


['pagexml_doc', 'text_region', 'scan']
{'pagexml_doc', 'text_region', 'scan'}
scan


In [6]:
# show the scan metadata extracted from the PageXML file
# (the 'filename' entry here equals the path that was passed to parse_pagexml_file)
scan.metadata

{'Creator': 'prov=University of Rostock/Institute of Mathematics/CITlab/Tobias Gruening/tobias.gruening@uni-rostock.de:name=/net_tf/LA73_249_0mod360.pb:de.uros.citlab.segmentation.CITlab_LA_ML:v=?0.1\nprov=University of Rostock/Institute of Mathematics/CITlab/Tobias Gruening/tobias.gruening@uni-rostock.de:name=/net_tf/LA73_249_0mod360.pb:de.uro.citlab.module.la.core.CITlab_LA_ML:v=?0.1\nTRP',
 'Created': datetime.datetime(2018, 7, 19, 10, 33, 16, 721000, tzinfo=tzoffset(None, 7200)),
 'LastChange': datetime.datetime(2018, 12, 10, 17, 9, 22, 188000, tzinfo=tzoffset(None, 3600)),
 'filename': '../data/example.xml'}

In [7]:
# show reading order of text regions if this is indicated in the PageXML file
# reading_order maps a position (0, 1, ...) to a text region id; for this scan,
# scan.text_regions lists the regions in that same order
print(scan.reading_order)
for tr in scan.text_regions:
    print('text region id:', tr.id)

{0: 'r1', 1: 'r2'}
text region id: r1
text region id: r2


In [8]:
# Print id and coordinates for the scan itself, followed by each of its text regions.
for doc in [scan, *scan.text_regions]:
    print(doc.id, doc.coords)

a30768000008.jpg Coords(points="0,0 5429,0 5429,4059 0,4059")
r1 Coords(points="2890,476 2890,632 2930,632 2930,476")
r2 Coords(points="3334,251 3334,3722 5172,3722 5172,251")


In [9]:
# Print the element counts of the whole scan first, then those of every text region.
for element in (scan, *scan.text_regions):
    print(element.id, element.stats)

a30768000008.jpg {'lines': 39, 'words': 155, 'text_regions': 2, 'columns': 0, 'extra': 0, 'pages': 0}
r1 {'lines': 1, 'words': 0, 'text_regions': 0}
r2 {'lines': 38, 'words': 155, 'text_regions': 0}


In [10]:
# show the number of lines in the whole scan; this matches the 'lines' count in
# scan.stats and is the sum over the two text regions (1 + 38)
scan.num_lines

39

In [11]:
# get_lines() returns the lines as PageXMLTextLine objects rather than as text,
# so the output is a list of object reprs
scan.get_lines()

[<pagexml.model.physical_document_model.PageXMLTextLine at 0x126b127d0>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12860>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12890>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12950>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b129e0>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12a70>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12b00>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12b90>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12c20>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12cb0>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12d40>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12dd0>,
 <pagexml.model.physical_document_model.PageXMLTextLine at 0x126b12e60>,
 <pagexml.model.physical_document_model.PageXMLText

In [12]:
# the `lines` attribute is empty for this scan even though get_lines() returns
# lines: presumably `lines` only holds lines attached directly to the scan, while
# the lines here sit inside its text regions -- TODO confirm
scan.lines

[]

In [13]:
# number of words in the scan; matches the 'words' count in scan.stats
scan.num_words

155

In [14]:
# get_words() returns the words as a flat list of plain strings (not word objects)
scan.get_words()

['f.',
 '1.',
 'Inventaris',
 'ende',
 'beschrijvinge',
 'van',
 'naerbeschreven',
 'Cleedinge',
 'van',
 'Linne',
 'ende',
 'Wollen,',
 'Item',
 'huijsraet',
 'naergelaten',
 'by',
 'Janneken',
 'de',
 'Veel',
 'oude',
 'Vrijster',
 'alhier',
 'overleden;',
 'die',
 'volgens',
 'hare',
 'dispositie',
 'van',
 'uterste',
 'wille,',
 'door',
 "d'",
 'eersame',
 'Sr.',
 'hans',
 'van',
 'Essen',
 'Bouckhouder',
 'vande',
 'Westinde.',
 'Compe.',
 'alhier',
 'In',
 'Amstm.',
 'versonden',
 'sullen',
 'werden',
 'naer',
 'Ryssel',
 'aen',
 'Beatricx',
 'de',
 'Veel',
 'suster',
 'vande',
 'voorn.',
 'Janneken',
 'de',
 'Veel',
 'sa:',
 'Beschreven',
 'door',
 'mij',
 'Henrick',
 'Schaeff',
 'Nots',
 'etc.',
 'in',
 'presentie',
 'vande',
 'ondergess.',
 'getuijgen.',
 'desen',
 '1e.',
 'Januarij',
 '@',
 '1639.',
 'vijff',
 'floppen.',
 'Twaelf',
 'kragen',
 'soo',
 'goede',
 'als',
 'quade.',
 'Twaelf',
 'slaeplakens.',
 'soo',
 'goede',
 'als',
 'quade',
 'Twee',
 'oude',
 'brabantsche',

In [15]:
# number of text regions in the scan; matches the 'text_regions' count in scan.stats
scan.num_text_regions

2

In [16]:
# the text regions themselves, as PageXMLTextRegion objects
scan.text_regions

[<pagexml.model.physical_document_model.PageXMLTextRegion at 0x126b12680>,
 <pagexml.model.physical_document_model.PageXMLTextRegion at 0x126b12800>]

In [17]:
# for this scan the result is the same two objects as scan.text_regions (identical
# addresses in the output); presumably the two only differ when text regions are
# nested inside other text regions -- TODO confirm
scan.get_inner_text_regions()

[<pagexml.model.physical_document_model.PageXMLTextRegion at 0x126b12680>,
 <pagexml.model.physical_document_model.PageXMLTextRegion at 0x126b12800>]

In [18]:
# presumably orders the text regions according to scan.reading_order; for this
# scan the result is in the same order as scan.text_regions
scan.get_text_regions_in_reading_order()

[<pagexml.model.physical_document_model.PageXMLTextRegion at 0x126b12680>,
 <pagexml.model.physical_document_model.PageXMLTextRegion at 0x126b12800>]

In [19]:
# by default, a scan has no columns and pages, as they are not part of the PageXML spec
# (both attributes are empty lists for a freshly parsed scan, as the output shows)
scan.columns, scan.pages

([], [])

In [20]:
# pages on its own: the same empty list already shown together with columns above
scan.pages

[]

In [21]:
# no output is displayed for this cell, which means the expression evaluated to
# None: this scan has no orientation value
scan.orientation

In [22]:
# Print the parent of the scan and of each text region. In the recorded output
# every parent is None, including those of the text regions; presumably the
# parser does not set parent links -- TODO confirm
for node in [scan] + list(scan.text_regions):
    print(node.id, node.parent)

a30768000008.jpg None
r1 None
r2 None
