Doe een SAX-style parse van een TEI document(-set) en bepaal alle unieke patronen van relevante inputwaarden voor untanngle. Relevant in de zin van 'hiermee moet kunnen worden bepaald welke handlercode dit SAX-event moet triggeren'.

In [1]:
import sys
sys.path.append('../../packages')

import glob
from pprint import pprint
import tei.util as tei

# Glob pattern selecting the TEI letter files to analyse.
path = '../../data/vgtest/let*.xml'
# Output directory (the JSON dump cells further down spell this path out literally).
datadir = '../../data/output/'

# Namespace prefixes in the '{uri}' form in which they appear in element tags.
tei_namespace = '{http://www.tei-c.org/ns/1.0}'
vangogh_namespace = '{http://www.vangoghletters.org/ns/}'

def get_file_sequence_for_dir(path):
    """Return all file names matching the glob pattern *path*, sorted ascending."""
    matches = glob.glob(path)
    matches.sort()
    return matches

# Collect the input files and let the tei helper determine every unique
# element context that occurs in them.
tei_file_names = get_file_sequence_for_dir(path)
# As printed below, each result is a dict with the keys 'tag', 'parent' and
# 'parent_attrib_keys'; later cells attach a 'handler_dispatcher' to some of them.
unique_results = tei.find_unique_contexts(tei_file_names)
print(f"There are {len(unique_results)} unique (tag,parent,parent-attr-key) tuples:")
pprint(unique_results)

There are 63 unique (tag,parent,parent-attr-key) tuples:
[{'parent': None,
  'parent_attrib_keys': None,
  'tag': '{http://www.tei-c.org/ns/1.0}TEI'},
 {'parent': '{http://www.tei-c.org/ns/1.0}TEI',
  'parent_attrib_keys': [],
  'tag': '{http://www.tei-c.org/ns/1.0}teiHeader'},
 {'parent': '{http://www.tei-c.org/ns/1.0}teiHeader',
  'parent_attrib_keys': [],
  'tag': '{http://www.tei-c.org/ns/1.0}fileDesc'},
 {'parent': '{http://www.tei-c.org/ns/1.0}fileDesc',
  'parent_attrib_keys': [],
  'tag': '{http://www.tei-c.org/ns/1.0}titleStmt'},
 {'parent': '{http://www.tei-c.org/ns/1.0}titleStmt',
  'parent_attrib_keys': [],
  'tag': '{http://www.tei-c.org/ns/1.0}title'},
 {'parent': '{http://www.tei-c.org/ns/1.0}titleStmt',
  'parent_attrib_keys': [],
  'tag': '{http://www.tei-c.org/ns/1.0}editor'},
 {'parent': '{http://www.tei-c.org/ns/1.0}fileDesc',
  'parent_attrib_keys': [],
  'tag': '{http://www.tei-c.org/ns/1.0}publicationStmt'},
 {'parent': '{http://www.tei-c.org/ns/1.0}publicationSt

Hierboven heb ik alle unieke typen events gevonden, en daar allereerst 'ab' (anonymous block) event typen uitgefilterd. 'ab' komt dus voor binnen div's met een 'type' attribuut en binnen notes (met attributes n, id en target).

Volgende stap: vul deze 'records' handmatig aan met benodigde info voor verdere untanngling, inclusief de namen van handler-functies.
Extra info: elt_type (text_container, milestone, ..., wellicht al vastgelegd door koppeling aan handler), resource_id, annotation_type (label, is eigenlijk 'tag'), custom_info

In [2]:
import uuid

# Handler state shared between events: for every annotation type the index of
# the text segment at which the most recent element of that type began / ended.
_last_begin_indexes = {}
_last_end_indexes = {}


def text_handler(event, type, action, resource_id, parent, text_segments, annotations):
    """Store an element's text as one segment and annotate the span it covers.

    On ``'start'`` the current segment count is remembered as the begin index
    for *type*.  On ``'end'`` all non-empty, stripped text pieces of *event*
    (including text that follows nested elements such as page breaks) are
    joined with single spaces and appended to *text_segments* as one segment,
    after which an annotation running from the remembered begin index to the
    new segment is appended to *annotations*.

    Args:
        event: the element being handled; must provide ``itertext()``.
        type: annotation label, also the key under which indexes are kept.
        action: ``'start'`` or ``'end'``; other values only produce the trace.
        resource_id: identifier stored in the annotation.
        parent: parent element; only used in the trace output.
        text_segments: segment store providing ``len()``, ``append()`` and
            the ``_anchors`` sequence (a private attribute of that object).
        annotations: list that receives the annotation dict.
    """
    print('called text_handler with:')
    print(f"type: {type}\naction: {action}\nresource_id: {resource_id}\nparent: {parent}")

    if action == 'start':
        _last_begin_indexes[type] = text_segments.len()
    elif action == 'end':
        stripped = (part.strip() for part in event.itertext())
        line = ' '.join(part for part in stripped if part)
        text_segments.append(line)

        end_index = text_segments.len() - 1
        _last_end_indexes[type] = end_index
        # An 'end' without a recorded 'start' (e.g. an end-only parse) used to
        # raise KeyError; annotate just the segment added above instead.
        begin_index = _last_begin_indexes.get(type, end_index)
        annotations.append({'resource_id': resource_id,
                            'label': type,
                            'begin_anchor': text_segments._anchors[begin_index],
                            'end_anchor': text_segments._anchors[end_index],
                            'id': 'annot_' + str(uuid.uuid4())})
    return

def annotate_parent_handler(event, type, action, resource_id, parent, text, annotations):
    """Placeholder handler: accepts the common handler arguments and does nothing."""
    # Trace output, currently switched off:
    #    print('called annotate_parent_handler with:')
    #    print(f"type: {type}\naction: {action}\nresource_id: {resource_id}\nparent: {parent}")
    return None


def page_handler(event, type, action, resource_id, parent, text_segments, annotations):
    """Close the running page and open a new one when a page break starts.

    <pb /> is a milestone tag, so only the 'start' event is acted upon; see
    https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-pb.html

    The boundary is the index of the last segment stored so far (0 when there
    are none yet).  If a page is already open, a 'page' annotation from its
    start index to the boundary is appended to *annotations*; the boundary
    then becomes the start index of the new page, kept under the key 'page'
    in the module-level ``_last_begin_indexes``.

    NOTE(review): the new page therefore starts at the very segment index at
    which the previous page ended - confirm that this overlap is intended.
    """
    print('page_begin: ')
    print(f'type=[{type}], action=[{action}], resource_id=[{resource_id}], parent=[{parent}]')

    if action != 'start':
        return

    boundary = max(text_segments.len() - 1, 0)
    if 'page' not in _last_begin_indexes:
        print(f'Beginning new page at {boundary}')
    else:
        anchors = text_segments._anchors
        annotations.append({
            'resource_id': resource_id,
            'label': 'page',
            'begin_anchor': anchors[_last_begin_indexes['page']],
            'end_anchor': anchors[boundary],
            'id': 'annot_' + str(uuid.uuid4()),
        })
    _last_begin_indexes['page'] = boundary


# Dispatch tables: each entry couples a 'condition' to the 'handler' that
# handle_element calls when the condition holds.  'element': 'parent' selects
# the parent of the current element as the node the condition is tested on.

# Page breaks: handled only when the parent has type="original".
page_dispatcher = [
    {
        'condition':{
            'element': 'parent',
            'attribute': 'type',
            'value': 'original'
        },
        'handler': page_handler
    }
]

# Dispatch on the parent's 'type' attribute: 'original' stores and annotates
# the text, 'translation' goes to the (currently no-op) parent annotator.
handler_dispatcher = [
    {
        'condition':{
            'element': 'parent',
            'attribute': 'type',
            'value': 'original'
        },
        'handler': text_handler
    },
    {
        'condition': {
            'element': 'parent',
            'attribute': 'type',
            'value': 'translation'            
        },
        'handler': annotate_parent_handler        
    }
]

# Dispatch on the parent's tag instead of an attribute; handle_element
# compares tei_namespace + 'xml_tag' with the parent's tag.
handler_dispatcher_p = [
    {
        'condition':{
            'element': 'parent',
            'xml_tag': 'note',
        },
        'handler': text_handler
    }
]

Select the records for a given tag and associate them with the matching handler dispatcher.

In [3]:
# Select the unique-context records whose tag is the TEI <pb> element.
filtered = [rec for rec in unique_results if rec['tag'] == '{http://www.tei-c.org/ns/1.0}pb']
filtered


[{'tag': '{http://www.tei-c.org/ns/1.0}pb',
  'parent': '{http://www.tei-c.org/ns/1.0}div',
  'parent_attrib_keys': ['type']}]

In [4]:
# Attach the page dispatcher to the pb record.  `filtered` holds the same dict
# objects as unique_results, so the record in unique_results is updated too.
filtered[0]['handler_dispatcher'] = page_dispatcher
filtered

[{'tag': '{http://www.tei-c.org/ns/1.0}pb',
  'parent': '{http://www.tei-c.org/ns/1.0}div',
  'parent_attrib_keys': ['type'],
  'handler_dispatcher': [{'condition': {'element': 'parent',
     'attribute': 'type',
     'value': 'original'},
    'handler': <function __main__.page_handler(event, type, action, resource_id, parent, text_segments, annotations)>}]}]

In [5]:
# Select the unique-context records whose tag is the TEI <ab> element.
filtered = [rec for rec in unique_results if rec['tag'] == '{http://www.tei-c.org/ns/1.0}ab']
filtered

[{'tag': '{http://www.tei-c.org/ns/1.0}ab',
  'parent': '{http://www.tei-c.org/ns/1.0}div',
  'parent_attrib_keys': ['type']},
 {'tag': '{http://www.tei-c.org/ns/1.0}ab',
  'parent': '{http://www.tei-c.org/ns/1.0}note',
  'parent_attrib_keys': ['n',
   '{http://www.w3.org/XML/1998/namespace}id',
   'target']}]

In [6]:
# <ab> inside a <div>: dispatch on the div's 'type' attribute.
filtered[0]['handler_dispatcher'] = handler_dispatcher
# Earlier single-condition variant, superseded by the dispatcher list above:
#filtered[0]['condition'] = {
#                'element': 'parent',
#                'attribute': 'type',
#                'value': 'original'
#            }
#filtered[0]['handler'] = text_handler

# <ab> inside a <note>: this one will not do anything for now, because the
# conditions will not match (the note parent has no 'type' attribute key).
filtered[1]['handler_dispatcher'] = handler_dispatcher

filtered[0:2]

[{'tag': '{http://www.tei-c.org/ns/1.0}ab',
  'parent': '{http://www.tei-c.org/ns/1.0}div',
  'parent_attrib_keys': ['type'],
  'handler_dispatcher': [{'condition': {'element': 'parent',
     'attribute': 'type',
     'value': 'original'},
    'handler': <function __main__.text_handler(event, type, action, resource_id, parent, text_segments, annotations)>},
   {'condition': {'element': 'parent',
     'attribute': 'type',
     'value': 'translation'},
    'handler': <function __main__.annotate_parent_handler(event, type, action, resource_id, parent, text, annotations)>}]},
 {'tag': '{http://www.tei-c.org/ns/1.0}ab',
  'parent': '{http://www.tei-c.org/ns/1.0}note',
  'parent_attrib_keys': ['n',
   '{http://www.w3.org/XML/1998/namespace}id',
   'target'],
  'handler_dispatcher': [{'condition': {'element': 'parent',
     'attribute': 'type',
     'value': 'original'},
    'handler': <function __main__.text_handler(event, type, action, resource_id, parent, text_segments, annotations)>},
   

In [7]:
# Select the unique-context records whose tag is the TEI <p> element.
filtered = [rec for rec in unique_results if rec['tag'] == '{http://www.tei-c.org/ns/1.0}p']
# Attaching the tag-based dispatcher to the <p>-in-<note> record is disabled for now:
#filtered[2]['handler_dispatcher'] = handler_dispatcher_p  # HDJ - disabled

filtered

[{'tag': '{http://www.tei-c.org/ns/1.0}p',
  'parent': '{http://www.tei-c.org/ns/1.0}licence',
  'parent_attrib_keys': ['target']},
 {'tag': '{http://www.tei-c.org/ns/1.0}p',
  'parent': '{http://www.vangoghletters.org/ns/}letContents',
  'parent_attrib_keys': []},
 {'tag': '{http://www.tei-c.org/ns/1.0}p',
  'parent': '{http://www.tei-c.org/ns/1.0}note',
  'parent_attrib_keys': ['type', '{http://www.w3.org/XML/1998/namespace}id']}]

Vraag is nu, of deze subset van unique_results voldoende info bevat om het untanngle-proces te sturen. Volgende stap is dus kijken in hoeverre ik de oude untanngle code voor TEI hieraan kan aanpassen.

In [8]:
import os
from textservice import segmentedtext

# Per-layer begin/end segment indexes of the older, hard-coded untangling code.
# The dispatcher-based handlers (text_handler, page_handler) do not read these;
# they keep their state in _last_begin_indexes / _last_end_indexes instead.
_last_page_begin_index = 0
_last_section_begin_index = -1
_last_chapter_begin_index = -1
_last_paragraph_begin_index = -1
_last_head_begin_index = -1

_last_page_end_index = -1
_last_section_end_index = -1
_last_chapter_end_index = -1
_last_paragraph_end_index = -1
_last_head_end_index = -1

# Identifier of the most recently opened page in the older code path.
_last_page_id = ""

def get_input_pattern_from(action, elem):
    """Describe *elem* in the same shape as the records in ``unique_results``.

    Args:
        action: parse event name; not part of the pattern (currently unused).
        elem: element providing ``tag`` and ``getparent()``; its parent, when
            present, must provide ``tag`` and ``attrib``.

    Returns:
        dict with the keys 'tag', 'parent' (the parent's tag) and
        'parent_attrib_keys' (list of the parent's attribute names).  The
        last two are ``None`` when *elem* has no parent.
    """
    parent = elem.getparent()
    if parent is None:
        parent_tag = None
        parent_keys = None
    else:
        parent_tag = parent.tag
        # Always a real list, so the pattern compares equal to the list-valued
        # records even when ``attrib.keys()`` returns a view object.
        parent_keys = list(parent.attrib.keys())

    return {
        'tag': elem.tag,
        'parent': parent_tag,
        'parent_attrib_keys': parent_keys,
    }

# handle each of the elements in the hierarchy according to 'layer type'
def handle_element(resource_id, action, e, text_segments, annotations):
    """Dispatch one parse event to the handlers registered for its context.

    The element's input pattern (tag, parent tag, parent attribute keys) is
    looked up in ``unique_results``.  For every matching record that carries a
    'handler_dispatcher', each entry's condition is tested against the node
    the condition selects, and the entry's handler is called when it holds.

    Supported conditions (each is tested on its own, so an entry carrying
    both kinds may invoke its handler twice):
      * 'attribute' + 'value': the node has that attribute with that value;
      * 'xml_tag': the node's tag equals ``tei_namespace`` + that name.

    Args:
        resource_id: identifier of the resource being processed.
        action: parse event name, passed on to the handler.
        e: the element of the event; must provide ``getparent()``.
        text_segments: segment store, passed on to the handler.
        annotations: annotation list, passed on to the handler.

    NOTE: the hard-coded per-tag handling that used to follow the dispatch
    loop sat behind an unconditional ``return`` (and referred to an undefined
    name ``text``); it could never run and is therefore no longer included.
    """
    input_pattern = get_input_pattern_from(action, e)

    for pattern in unique_results:
        if not all(item in pattern.items() for item in input_pattern.items()):
            continue

        # handler_dispatcher is a list of condition/handler dicts
        for handler_entry in pattern.get('handler_dispatcher', ()):
            condition = handler_entry['condition']
            handler = handler_entry['handler']

            # Only 'parent' is a known node selector so far.
            xml_element = e.getparent() if condition['element'] == 'parent' else None
            if xml_element is None:
                # No node to test (root element or unknown selector): skip the
                # entry rather than failing on attribute access on None.
                continue
            print(xml_element.tag)

            if 'attribute' in condition:
                attribute = condition['attribute']
                value = condition['value']
                if attribute in xml_element.attrib and xml_element.attrib[attribute] == value:
                    handler(e, pattern['tag'], action, resource_id,
                            xml_element, text_segments, annotations)

            if 'xml_tag' in condition:
                element_tag = condition['xml_tag']
                print(f"value from condition: {tei_namespace+element_tag}, parent tag: {xml_element.tag}")
                if tei_namespace + element_tag == xml_element.tag:
                    handler(e, pattern['tag'], action, resource_id,
                            xml_element, text_segments, annotations)
    return


def traverse(resource_id, node, text_segments, annotations):
    """Feed every parse event of *node* to ``handle_element``, then close the last page.

    Args:
        resource_id: identifier of the resource being processed.
        node: iterable yielding ``(action, element)`` pairs.
        text_segments: segment store providing ``len()`` and ``_anchors``.
        annotations: list that receives the annotation dicts.

    After the last event, a page that is still open (its start index is kept
    under 'page' in ``_last_begin_indexes``) is annotated up to the last
    stored segment.  The entry is removed afterwards so that it cannot leak
    into the next resource.
    """
    for action, elem in node:
        handle_element(resource_id, action, elem, text_segments, annotations)

    # pop() instead of indexing: a document without any handled page break
    # never sets the 'page' key, and indexing it raised KeyError.
    page_start = _last_begin_indexes.pop('page', None)
    if page_start is not None:
        print(f'flushing last page at {page_start}')
        annotations.append({
            'resource_id': resource_id,
            'label': 'page',
            'begin_anchor': text_segments._anchors[page_start],
            'end_anchor': text_segments._anchors[text_segments.len() - 1],
            'id': 'annot_' + str(uuid.uuid4()),
        })

    return
    
# Process per file, properly concatenate results, maintaining proper referencing the baseline text elements
# Results accumulated over all input files: one segmented text object per
# resource, and a single flat list of annotations.
all_textelements=[]  # segmentedtext.SplittableSegmentedText()
all_annotations=[]

for filename in get_file_sequence_for_dir(path):
    # The resource id is the file name without directory and extension.
    basename = os.path.basename(filename)
    resource_id = os.path.splitext(basename)[0]
    print(f'processing resource: {resource_id}')
    source_data = tei.get_root_tree_element(filename)
    # Fresh text container and annotation list per resource.
    text_segments = segmentedtext.SplittableSegmentedText(resource_id)
    annotation_array = []

    traverse(resource_id,source_data,text_segments,annotation_array)

    all_textelements.append(text_segments)
    all_annotations.extend(annotation_array)

# Quick inspection of the result.
# NOTE(review): this indexes the first resource (fails when no file matched)
# and prints the annotations of the LAST processed file only, not
# all_annotations - confirm that this is what is wanted.
print(all_textelements[0])
print(annotation_array)
    

processing resource: let001
{http://www.tei-c.org/ns/1.0}div
page_begin: 
type=[{http://www.tei-c.org/ns/1.0}pb], action=[start], resource_id=[let001], parent=[<Element {http://www.tei-c.org/ns/1.0}div at 0x7f8bd81d27c0>]
Beginning new page at 0
{http://www.tei-c.org/ns/1.0}div
page_begin: 
type=[{http://www.tei-c.org/ns/1.0}pb], action=[end], resource_id=[let001], parent=[<Element {http://www.tei-c.org/ns/1.0}div at 0x7f8bd81d27c0>]
{http://www.tei-c.org/ns/1.0}div
called text_handler with:
type: {http://www.tei-c.org/ns/1.0}ab
action: start
resource_id: let001
parent: <Element {http://www.tei-c.org/ns/1.0}div at 0x7f8bd81d27c0>
{http://www.tei-c.org/ns/1.0}div
{http://www.tei-c.org/ns/1.0}div
called text_handler with:
type: {http://www.tei-c.org/ns/1.0}ab
action: end
resource_id: let001
parent: <Element {http://www.tei-c.org/ns/1.0}div at 0x7f8bd81d27c0>
{http://www.tei-c.org/ns/1.0}div
{http://www.tei-c.org/ns/1.0}div
called text_handler with:
type: {http://www.tei-c.org/ns/1.0}ab
a

In [13]:
import json
# Write all segmented texts to JSON, serialised through the project's SegmentEncoder.
with open('../../data/output/vgtxt.json', 'w') as txtfile:
    json.dump(all_textelements, txtfile, cls=segmentedtext.SegmentEncoder)

In [14]:
# Write all annotations to JSON, serialised through the project's AnchorEncoder.
with open('../../data/output/vgann.json', 'w') as annfile:
    json.dump(all_annotations, annfile, cls=segmentedtext.AnchorEncoder)