Doe een SAX-style parse van een TEI document(-set) en bepaal alle unieke patronen van relevante inputwaarden voor untanngle. Relevant in de zin van 'hiermee moet kunnen worden bepaald welke handlercode dit SAX-event moet triggeren'.

In [None]:
import sys
sys.path.append('../../packages')

import glob
from pprint import pprint
import tei.util as tei

# Glob pattern that is handed to get_file_sequence_for_dir() to select the TEI input files.
path = '../../data/vangogh/let*.xml'
# Output directory; not referenced by the code visible in this notebook section.
datadir = '../../data/output/'

# Namespace prefixes in '{uri}' form; tei_namespace is prepended to local tag
# names when element tags are compared further down.
tei_namespace = '{http://www.tei-c.org/ns/1.0}'
vangogh_namespace = '{http://www.vangoghletters.org/ns/}'

def get_file_sequence_for_dir(path):
    """Return the file names matching the glob pattern *path*, sorted ascending."""
    return sorted(glob.glob(path))

# Collect the sorted input files and hand them to tei.find_unique_contexts.
# The result is kept in a notebook-global because later cells filter it on
# rec['tag'] and attach handler dispatch tables to individual records.
tei_file_names = get_file_sequence_for_dir(path)
unique_results = tei.find_unique_contexts(tei_file_names)
print(f"There are {len(unique_results)} unique (tag,parent,parent-attr-key) tuples:")
pprint(unique_results)

Hierboven heb ik alle unieke typen events gevonden, en daar allereerst 'ab' (anonymous block) event typen uitgefilterd. 'ab' komt dus voor binnen div's met een 'type' attribuut en binnen notes (met attributes n, id en target).

Volgende stap: vul deze 'records' handmatig aan met benodigde info voor verdere untanngling, inclusief de namen van handler-functies.
Extra info: elt_type (text_container, milestone, ..., wellicht al vastgelegd door koppeling aan handler), resource_id, annotation_type (label, is eigenlijk 'tag'), custom_info

In [None]:
import uuid

# State shared between text_handler() calls, keyed by the handler's `type` argument:
# the begin dict stores text_segments.len() seen at the 'start' action, the end
# dict stores text_segments.len()-1 after the text was appended at the 'end' action.
_last_begin_indexes = {}
_last_end_indexes = {}

def text_handler(event, type, action, resource_id, parent, text_segments, annotations):
    """Record the text of an element and annotate the segment range it spans.

    On a 'start' action the current length of ``text_segments`` is remembered
    as the begin index for ``type``.  On an 'end' action every fragment
    yielded by ``event.itertext()`` is stripped, the fragments are joined with
    single spaces and appended to ``text_segments`` as one segment; then an
    annotation dict (resource id, label, begin/end anchor, generated id) is
    appended to ``annotations``.

    ``parent`` is only echoed in the debug output.  The begin/end indexes live
    in the module-level ``_last_begin_indexes`` / ``_last_end_indexes`` dicts,
    which are mutated in place.  Returns None.
    """
    print('called text_handler with:')
    print(f"type: {type}\naction: {action}\nresource_id: {resource_id}\nparent: {parent}")

    if action == 'start':
        _last_begin_indexes[type] = text_segments.len()
    elif action == 'end':
        # Leaf text element: gather all of its text, including text that
        # follows possible pb's inside it.
        line = ' '.join(fragment.strip() for fragment in event.itertext())
        print(f"line: [{line}]")
        text_segments.append(line)

        # The segment just appended is the last one covered by this element.
        _last_end_indexes[type] = text_segments.len() - 1
        anchors = text_segments._anchors
        annotations.append({
            'resource_id': resource_id,
            'label': type,
            'begin_anchor': anchors[_last_begin_indexes[type]],
            'end_anchor': anchors[_last_end_indexes[type]],
            'id': 'annot_' + str(uuid.uuid4()),
        })

def annotate_parent_handler(event, type, action, resource_id, parent, text, annotations):
    """Placeholder handler: announce the call on stdout and do nothing else.

    Takes the same seven positional arguments that the dispatcher passes to
    every handler; none of them is used yet.  Always returns None.
    """
    # The detailed dump of type/action/resource_id/parent is currently disabled.
    print('called annotate_parent_handler with:')

# Dispatch table that is attached to matching unique_results records.
# Each entry pairs a condition with the handler to call when the condition holds.
# As interpreted by handle_element():
#   'element'   - which XML element the condition is tested against; 'parent'
#                 (the parent of the current element) is the only value it resolves
#   'attribute' - attribute that must be present on that element ...
#   'value'     - ... with exactly this value
handler_dispatcher = [
    {
        'condition':{
            'element': 'parent',
            'attribute': 'type',
            'value': 'original'
        },
        'handler': text_handler
    },
    {
        'condition': {
            'element': 'parent',
            'attribute': 'type',
            'value': 'translation'
        },
        'handler': annotate_parent_handler
    }
]

# Dispatch table for 'p' records. The 'xml_tag' condition is checked by
# handle_element() by comparing tei_namespace + xml_tag with the tag of the
# selected element (here: the parent), so this fires for a 'p' inside a TEI 'note'.
handler_dispatcher_p = [
    {
        'condition':{
            'element': 'parent',
            'xml_tag': 'note',
        },
        'handler': text_handler
    }
]

Select the relevant records and associate them with the handler_dispatcher.

In [None]:
# Keep only the records whose tag is the TEI 'ab' element.
filtered = [rec for rec in unique_results if rec['tag'] == '{http://www.tei-c.org/ns/1.0}ab']
# Bare expression: displayed as the cell output when run in a notebook.
filtered

In [None]:
# Attach the attribute-based dispatch table to the first 'ab' record. The entries of
# `filtered` are the same dict objects as in unique_results, so the new key is
# visible to code that iterates unique_results.
filtered[0]['handler_dispatcher'] = handler_dispatcher
# Disabled alternative that stored a single condition/handler pair directly on the record:
#filtered[0]['condition'] = {
#                'element': 'parent',
#                'attribute': 'type',
#                'value': 'original'
#            }
#filtered[0]['handler'] = text_handler

# this one will not do anything for now, because the conditions will not match
filtered[1]['handler_dispatcher'] = handler_dispatcher

# NOTE(review): the hard-coded indexes 0 and 1 depend on the order of unique_results.
filtered[0:2]

In [None]:
# Rebind `filtered` to the records whose tag is the TEI 'p' element and attach the
# tag-based dispatch table to the third one.
# NOTE(review): index 2 is assumed to be the record for 'p' inside 'note'; this
# depends on the order of unique_results - confirm.
filtered = [rec for rec in unique_results if rec['tag'] == '{http://www.tei-c.org/ns/1.0}p']
filtered[2]['handler_dispatcher'] = handler_dispatcher_p

filtered

Vraag is nu, of deze subset van unique_results voldoende info bevat om het untanngle-proces te sturen. Volgende stap is dus kijken in hoeverre ik de oude untanngle code voor TEI hieraan kan aanpassen.

In [None]:
from textservice import segmentedtext

# Module-level begin indexes for the page, section, chapter, paragraph and head layers.
# NOTE(review): -1 presumably means "not set yet"; only the page begin starts at 0 - confirm.
_last_page_begin_index = 0
_last_section_begin_index = -1
_last_chapter_begin_index = -1
_last_paragraph_begin_index = -1
_last_head_begin_index = -1

# Matching end indexes for the same layers.
_last_page_end_index = -1
_last_section_end_index = -1
_last_chapter_end_index = -1
_last_paragraph_end_index = -1
_last_head_end_index = -1

# Id of the most recent page; starts out empty.
_last_page_id = ""

def get_input_pattern_from(action, elem):
    """Build the lookup pattern that describes the context of *elem*.

    Args:
        action: Name of the parse event. It is not part of the pattern (the
            'event' key is left out) but stays in the signature for callers.
        elem: Element exposing ``tag`` and ``getparent()``; a parent, if
            present, must expose ``tag`` and ``attrib``.

    Returns:
        A dict with the keys 'tag', 'parent' and 'parent_attrib_keys'. Both
        parent entries are None when *elem* has no parent.
    """
    # Look the parent up once and test it by identity, so that an element type
    # with a custom __eq__ cannot be mistaken for "no parent".
    parent = elem.getparent()
    parent_tag = None if parent is None else parent.tag
    parent_keys = None if parent_tag is None else parent.attrib.keys()

    return {
        'tag': elem.tag,
        'parent': parent_tag,
        'parent_attrib_keys': parent_keys,
    }

# handle each of the elements in the hierarchy according to 'layer type'
# handle each of the elements in the hierarchy according to 'layer type'
def handle_element(resource_id, action, e, text_segments, annotations):
    """Dispatch one parse event to the handlers registered for its input pattern.

    The input pattern of *e* (tag, parent tag, parent attribute keys) is
    compared with every record in the module-level ``unique_results``. For each
    record that contains all key/value pairs of the pattern and carries a
    'handler_dispatcher' list, the conditions in that list are evaluated and
    the handler of every satisfied condition is called.

    Args:
        resource_id: Identifier passed through to the handlers.
        action: Parse event name (e.g. 'start' or 'end'), passed to the handlers.
        e: The element the event refers to; must expose ``getparent()``.
        text_segments: Text container passed through to the handlers.
        annotations: Annotation list passed through to the handlers.

    Returns:
        None.

    Note:
        The tag-specific index bookkeeping that used to follow the dispatch
        loop was unreachable: it sat behind an unconditional ``return`` and
        referred to an undefined name ``text``. It has been removed together
        with the ``global`` declarations that only served it.
    """
    # determine input_pattern from action and elem
    input_pattern = get_input_pattern_from(action, e)

    # find matching patterns in unique_results
    for pattern in unique_results:
        if not all(item in pattern.items() for item in input_pattern.items()):
            continue

        # handler_dispatcher is a list of condition/handler dicts; records
        # without one have no handlers assigned.
        for handler_entry in pattern.get('handler_dispatcher', ()):
            condition = handler_entry['condition']
            handler = handler_entry['handler']

            # 'parent' is the only element selector that is resolved.
            xml_element = e.getparent() if condition['element'] == 'parent' else None
            if xml_element is None:
                # No element to test the condition against (no parent, or an
                # unsupported selector): the condition cannot match.
                continue
            print(xml_element.tag)

            # temporary if's, fix by introducing a class/function that checks a condition
            if 'attribute' in condition:
                attribute = condition['attribute']
                value = condition['value']

                if attribute in xml_element.attrib and xml_element.attrib[attribute] == value:
                    handler(e, pattern['tag'], action, resource_id,
                            xml_element, text_segments, annotations)
            if 'xml_tag' in condition:
                element_tag = condition['xml_tag']
                print(f"value from condition: {tei_namespace+element_tag}, parent tag: {xml_element.tag}")
                if tei_namespace+element_tag == xml_element.tag:
                    handler(e, pattern['tag'], action, resource_id,
                            xml_element, text_segments, annotations)

def traverse(resource_id,node,text_segments,annotations):
    """Feed every (action, element) pair produced by *node* to handle_element.

    *resource_id*, *text_segments* and *annotations* are passed through
    unchanged on every call.  Returns None.
    """
    for event_pair in node:
        event_action, element = event_pair
        handle_element(resource_id, event_action, element, text_segments, annotations)
    
# Process per file, properly concatenate results, maintaining proper referencing the baseline text elements
resource_id = 'let005'
all_textelements=segmentedtext.SplittableSegmentedText(resource_id)

all_annotations=[]

# for f_name in get_file_sequence_for_container(resource_id):
for f_name in ['../../data/vangogh/let005.xml']:
    text_segments = segmentedtext.SplittableSegmentedText()
    annotation_array = []
            
    source_data = tei.get_root_tree_element(f_name)

    traverse(f_name,source_data,text_segments,annotation_array)
    
    all_textelements.extend(text_segments)       
    all_annotations.extend(annotation_array)
    
    print(all_textelements)
    print(annotation_array)
    