Volgende stappen
- los probleem met JSON serializable op: SegmentedText is niet JSON serializable
- maak resty package nav recipe 11.5 uit Python Cookbook
- schrijf handler functies voor aservice REST calls, in de aservice package, op basis huidige functies
- instantieer een PathDispatcher
- registreer REST calls bij die bij de resty PathDispatcher
- start een simpele service

In [1]:
from segmentedtext import tservice

In [2]:
import lxml
from lxml import etree
import uuid

path = 'bosb002graa04_01.xml'

_last_page_begin_index = 0
_last_section_begin_index = -1
_last_chapter_begin_index = -1
_last_paragraph_begin_index = -1
_last_head_begin_index = -1

_last_page_end_index = -1
_last_section_end_index = -1
_last_chapter_end_index = -1
_last_paragraph_end_index = -1
_last_head_end_index = -1

_last_page_id = ""

all_textelements=tservice.SegmentedText()
all_annotations=[]

def get_file_sequence_for_container(text_container):
    return [path]

def get_root_tree_element(file):
    # use iterparse to traverse the xml hierarchy, depth first, post order
    return etree.iterparse(path, events=('start','end'))

# handle each of the elements in the hierarchy according to 'layer type'
def handle_element(action,e,text,annotations):  
    global _last_page_begin_index
    global _last_section_begin_index
    global _last_chapter_begin_index
    global _last_paragraph_begin_index
    global _last_head_begin_index

    global _last_page_end_index
    global _last_section_end_index
    global _last_chapter_end_index
    global _last_paragraph_end_index
    global _last_head_end_index
    
    global _last_page_id

    if action == 'start':
        # store last begin indexes
        if e.tag == 'p':
            _last_paragraph_begin_index = text.len()               
        elif e.tag == 'div' and e.get('type') == 'chapter':
            _last_chapter_begin_index = text.len()
        elif e.tag == 'div' and e.get('type') == 'section':
            _last_section_begin_index = text.len()
        elif e.tag == 'head':
            _last_head_begin_index = text.len()
    elif action == 'end':
        if e.tag == 'p': 
            # leaf text element, add to all_textelements, also include text after possible pb's
            for index, t in enumerate(e.itertext()):
                text.append(t.strip())
                if index > 0: # assume: caused by pb contained within p. Update page end.
                    _last_page_end_index = text.len()-1
            
            _last_paragraph_end_index = text.len()-1

            if _last_paragraph_begin_index <= _last_paragraph_end_index:
                annotations.append({'label':'paragraph','begin_anchor': text._anchors[_last_paragraph_begin_index],\
                            'end_anchor':text._anchors[_last_paragraph_end_index],'id': 'annot_'+str(uuid.uuid4())})
        elif e.tag == 'head':
            # leaf text element, add to all_textelements
            text.append(e.text)
            
            _last_head_end_index = text.len()-1
            annotations.append({'label':'head','begin_anchor': text._anchors[_last_head_begin_index],\
                            'end_anchor':text._anchors[_last_head_end_index],'id': 'annot_'+str(uuid.uuid4())}) 
        elif e.tag == 'div' and e.get('type') == 'chapter':
            _last_chapter_end_index = text.len()-1
            annotations.append({'label':'chapter','begin_anchor': text._anchors[_last_chapter_begin_index],\
                            'end_anchor':text._anchors[_last_chapter_end_index],'id': 'annot_'+str(uuid.uuid4())})            
        elif e.tag == 'div' and e.get('type') == 'section':
            _last_section_end_index = text.len()-1
            annotations.append({'label':'section','begin_anchor': text._anchors[_last_section_begin_index],\
                            'end_anchor':text._anchors[_last_section_end_index],'id': 'annot_'+str(uuid.uuid4())})               
        elif e.tag == 'pb':
            # first store the 'previous' page, then store begin and end of currently closed page
            annotations.append({'label':'page','begin_anchor': text._anchors[_last_page_begin_index],\
                            'end_anchor':text._anchors[_last_page_end_index],'id': _last_page_id}) 
            _last_page_begin_index = _last_page_end_index
            _last_page_end_index = text.len()-1 
            _last_page_id = f"page-{e.get('n')}"
            
    return        

def traverse(node,node_label,text,annotations):
    for action, elem in node:
        handle_element(action,elem,text,annotations)  
    return
    
# Process per file, properly concatenate results, maintaining proper referencing the baseline text elements
for f_name in get_file_sequence_for_container('just one tei document'):
    text_segments = tservice.SegmentedText()
    annotation_array = []
            
    source_data = get_root_tree_element(f_name)

    traverse(source_data,'',text_segments,annotation_array)
           
    # properly concatenate annotation info taking ongoing line indexes into account - trivial, do not apply in this case
#    for ai in annotation_array:
#        ai['begin_index'] += len(all_textlines)
#        ai['end_index'] += len(all_textlines)
    
    all_textelements.extend(text_segments)       
    all_annotations.extend(annotation_array)
    

In [3]:
all_textelements.len()

3130

In [5]:
# return generator for annotations with a specific label
#def get_annotations_of_type(type,annotations):
#    return (a for a in annotations if a['label'] == type)

from annotation import asearch

In [6]:
sample_para = [paras for paras in asearch.get_annotations_of_type('paragraph', all_annotations)][103]
print(sample_para['begin_anchor'])
print(sample_para['end_anchor'])

anchor_7203bec7-fe3f-411d-bb3c-40d5ae717b58
anchor_7203bec7-fe3f-411d-bb3c-40d5ae717b58


In [7]:
all_textelements.slice(sample_para['begin_anchor'], sample_para['end_anchor'])

['‘Die goede Geertrui!’ zeide Elisabeth glimlachend tot Courtenay; ‘zij is zoo ingenomen met den titel, dien mijn vader haar gaf, toen zij mijne min werd, dat zij zelfs hare gehechtheid aan mij onder de stijve windsels der étiquette verbergt. Maar ik schaam mij, ik ben wel egoïstisch, beste Graaf! ik heb er tot hiertoe slechts aan gedacht u mijne eigene grieven te verhalen, en was daarna zoo verdiept in het luisteren naar u, dat ik er bij vergat, hoe vermoeid gij moet zijn en hoe verlangend naar eenige verkwikking.’']

In [8]:
begin_anchor = all_textelements.split(sample_para['begin_anchor'], 11)
end_anchor = all_textelements.split(begin_anchor, 8)

In [9]:
all_textelements.slice(begin_anchor, begin_anchor)

['Geertrui']

In [10]:
all_annotations.append({'label':'entity','begin_anchor': begin_anchor,'end_anchor': begin_anchor,\
                    'id': 'annot_'+str(uuid.uuid4()), 'entity_type': 'per', 'entity_text': 'Geertrui'})

In [11]:
entities = [ents for ents in aservice.get_annotations_of_type('entity', all_annotations)]
entities

NameError: name 'aservice' is not defined

In [11]:
# print all annotations overlapping with the entity
for a in asearch.get_annotations_overlapping_with(entities[0]['begin_anchor'],entities[0]['end_anchor'],all_annotations):
    print(a)

{'label': 'page', 'begin_anchor': anchor_3400da08-0fc8-4ed4-b528-90315d841a54, 'end_anchor': anchor_94940e52-52bf-482f-a3b4-5ff7455d929d, 'id': 'page-22'}
{'label': 'section', 'begin_anchor': anchor_95e714bc-5817-4189-9b17-b3ae955ca505, 'end_anchor': anchor_eec91b82-55d7-4dae-bf79-16a322b2dde4, 'id': 'annot_055acf1c-b61c-4ea1-a6ab-68a42da405bf'}
{'label': 'chapter', 'begin_anchor': anchor_95e714bc-5817-4189-9b17-b3ae955ca505, 'end_anchor': anchor_eec91b82-55d7-4dae-bf79-16a322b2dde4, 'id': 'annot_fc88f57f-789b-439a-9f00-981ca52b5097'}
{'label': 'entity', 'begin_anchor': anchor_ea2fe414-8ca0-48c3-b535-ea281d4ab038, 'end_anchor': anchor_ea2fe414-8ca0-48c3-b535-ea281d4ab038, 'id': 'annot_41001761-c619-4bf1-85a1-cb42c23c0a74', 'entity_type': 'per', 'entity_text': 'Geertrui'}


Gebruik bovenstaande voor het visualiseren van alle tekst voor een pagina, of alle heads (vanaf het midden van het boek)

In [12]:
for a in asearch.get_annotations_of_type('page', all_annotations):
    if a['id'] == 'page-116':
        print(a)

{'label': 'page', 'begin_anchor': anchor_9ad40db4-7699-4233-bbcf-d5069c397b2a, 'end_anchor': anchor_80e670e7-37a5-4f65-913d-db306436c3f3, 'id': 'page-116'}


In [13]:
p116 = [p for p in asearch.get_annotations_of_type('page', all_annotations) if p['id'] == 'page-116'][0]

for t in all_textelements.slice(p116['begin_anchor'],p116['end_anchor']):
    print(f"{t}\n")

dedigen. Het ware hem wellust geweest, slechts eens den zoeten naam der geliefde voor een vertrouwd oor te mogen uitspreken. En toch moest hij laaghartig en zelfzuchtig schijnen, en toch gevoelde hij dat de ander niet meer gelooven kon aan de goedheid van zijn hart, toen hij antwoordde, zooals zijn gegeven woord het eischte:

‘Neen, Arundel, neen! ik kan haar niet helpen; voor haar vermag ik niets; vooralsnog niets;... misschien in het vervolg;... wie weet.... geloof mij, mijn vriend, laat ons geduld hebben.’

‘Dus heeft de beklagenswaardige ook u verloren,’ hervatte de Groot-Kamerheer koel en met stugheid. ‘Ik moet u nog met eene vraag lastig vallen, Mylord; het is de laatste over dit onderwerp. Zult gij mij tegen zijn, zoo ik in haar voordeel spreek?’

‘Op mijn ridderwoord, neen!’ riep Devonshire schielijk; zijne oogen schitterden van vreugde over eene uitkomst, die zoozeer met zijne wenschen instemde en hij stond op en vatte Arundel bij de hand. Zoo brandend jaagde het bloed hem doo

In [14]:
print(all_textelements.toJSON())

{
  "_anchors": [
    {
      "identifier": "anchor_704433d1-a605-483f-8739-e386aa8e5766",
      "sequence_number": 0
    },
    {
      "identifier": "anchor_2ddb3fad-35f0-4bac-b685-2db83da99379",
      "sequence_number": 1
    },
    {
      "identifier": "anchor_f9c57f93-3242-4106-9a78-c1a482d7cdc6",
      "sequence_number": 2
    },
    {
      "identifier": "anchor_1646e191-35fe-457f-bac4-9e06809db6f0",
      "sequence_number": 3
    },
    {
      "identifier": "anchor_d3693a52-4944-472a-8760-d4ff771512b2",
      "sequence_number": 4
    },
    {
      "identifier": "anchor_58c7cb79-7067-4f93-a4fc-1b82f473e1d0",
      "sequence_number": 5
    },
    {
      "identifier": "anchor_490ac7f7-6f20-4739-9a4b-6efc6d1183a1",
      "sequence_number": 6
    },
    {
      "identifier": "anchor_ca040678-f89b-4e0e-892b-2f7b8dc0372e",
      "sequence_number": 7
    },
    {
      "identifier": "anchor_04204992-7fa2-4f3b-911c-4d7afa380109",
      "sequence_number": 8
    },
    {
      "identi

In [15]:
import json

# write all_textlines to a file
with open('all_tei_textlines.txt', 'w') as filehandle:
    json.dump(all_textelements, filehandle)

TypeError: Object of type SegmentedText is not JSON serializable

In [16]:
import json
for a in asearch.get_annotations_of_type('page', all_annotations):
    print(json.dumps(a, sort_keys=False, indent=2))

TypeError: Object of type Anchor is not JSON serializable

In [17]:
for a in asearch.get_annotations_of_type('paragraph', all_annotations):
    print(json.dumps(a, sort_keys=False, indent=2))

TypeError: Object of type Anchor is not JSON serializable

In [None]:
from resty import resty
from wsgiref.simple_server import make_server

dispatcher = resty.PathDispatcher()
dispatcher.register('GET','/annotations', asearch.annots_of_type_handler)

httpd = make_server('', 8080, dispatcher)
print('serving on port 8080...')
httpd.serve_forever()

register method: GET, path: /annotations, function: <function annots_of_type_handler at 0x7fb7c83cf680>
{('get', '/annotations'): <function annots_of_type_handler at 0x7fb7c83cf680>}
serving on port 8080...
/annotations
get
<function annots_of_type_handler at 0x7fb7c83cf680>


127.0.0.1 - - [04/Feb/2021 17:38:45] "GET /annotations?type=page HTTP/1.1" 200 4


In [1]:
from annotation import aservice
from flask import Flask

In [3]:
%tb

SystemExit: 1

In [2]:
import requests

In [3]:
response = requests.get("http://localhost:5000")

ConnectionError: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb8f81f8a10>: Failed to establish a new connection: [Errno 61] Connection refused'))