In [59]:
import sys
sys.path.append('../../packages')

from textservice import segmentedtext
from annotation import asearch

# Locations of the input data for the 1728 volume, relative to this notebook.
datadir = '../../data/1728/11jan22/'
# Sub-folder of datadir with the session JSON files (globbed in get_file_sequence_for_container).
sessions_folder = 'CAF-sessions-1728-110122/'
# Sub-folder of datadir with the resolution JSON files (globbed in get_resolution_files_for_container).
resolutions_folder = 'CAF-resolutions-1728-110122/'
# File names of the text store and annotation store; not referenced in the cells visible here.
text_store = '1728-textstore-220413.json'
annotation_store = '1728-annotationstore-220413.json'
# Identifier written into the 'resource_id' field of every annotation created in this notebook.
resource_id = 'volume-1728'

In [2]:
import json
import glob
import re

# read files

# Volume-wide accumulators: the text segments of all session files (in file order) and all
# annotations built for them. Both are filled by the per-file loop further down in this cell.
all_textlines=segmentedtext.IndexedSegmentedText(resource_id)
all_annotations=[]

def text_region_handler(node, begin_index, end_index, annotations):
    """Append a 'scan' and a 'page' annotation derived from a text region's metadata.

    Nothing is appended when the region's metadata has no 'iiif_url'. Because every
    text region of the same scan/page yields its own pair, the result contains
    duplicates that have to be merged in a later step.

    Args:
        node: text region dict; reads node['metadata'] ('iiif_url', 'scan_id',
            'page_id') and node['coords'].
        begin_index: begin anchor stored on both annotations.
        end_index: end anchor stored on both annotations.
        annotations: list that receives the new annotation dicts (mutated in place).
    """
    meta = node['metadata']
    if 'iiif_url' not in meta:
        return

    # scan annotation: carries the image url of the region
    annotations.append({
        'resource_id': resource_id,
        'label': 'scan',
        'iiif_url': meta['iiif_url'],
        'begin_anchor': begin_index,
        'end_anchor': end_index,
        'id': meta['scan_id'],
    })

    # page annotation: carries the region coordinates and both ids
    annotations.append({
        'resource_id': resource_id,
        'label': 'page',
        'begin_anchor': begin_index,
        'end_anchor': end_index,
        'id': meta['page_id'],
        'metadata': {'page_id': meta['page_id'], 'scan_id': meta['scan_id']},
        'coords': node['coords'],
    })

def test_handler(node):
    print(f"test_handler called for {node['id']}")
    return

# Per-node-type configuration, keyed by node label.
#   child_key / child_type : key under which a node stores its list of children, and the label of
#                            those children; both None for leaf types (read by traverse / res_traverse)
#   extra_fields           : node fields that are copied as-is onto the annotation
#   additional_processing  : optional callback invoked by traverse / res_traverse for nodes of this type
#   img_region_source      : not read by any code in the visible cells — presumably consumed by a
#                            later step; TODO confirm
# Note: "page", "attendant" and "scan" only define img_region_source, so they cannot be passed to
# traverse / res_traverse, which index 'child_key', 'child_type' and 'extra_fields' directly.
untanngle_config = {
    "session": {
        "child_key": "text_regions",
        "child_type": "text_region",
        "extra_fields": ["evidence"],
        "img_region_source": "text_regions"
    },
    "text_region": {
        "child_key": "lines",
        "child_type": "line",
        "extra_fields": [],
        "additional_processing": text_region_handler,
        "img_region_source": "present"
    },
    "line": {
        "child_key": None,
        "child_type": None,
        "extra_fields": ["baseline"],
        "img_region_source": "construct"
    },
    "resolution": {
        "child_key": "paragraphs",
        "child_type": "republic_paragraph",
        "extra_fields": ["evidence"],
        "img_region_source": "line"
    },
    "republic_paragraph": {
        "child_key": None,
        "child_type":None,
        "extra_fields": ["line_ranges"],
        "img_region_source": "line"
    },
    "attendance_list": {
        "child_key": "paragraphs",
        "child_type": "republic_paragraph",
        "extra_fields": ["attendance_spans"],
        "img_region_source": "line"
    },
    "page": {
        "img_region_source": "merge_regions"
    },
    "attendant": {
        "img_region_source": "line"
    },
    "scan": {
        "img_region_source": "present"
    }
}

# We want to load 'text containers' that contain more or less contiguous text and are as long as practically
# possible. Container size is determined by pragmatic reasons, e.g. technical (performance) or user driven
# (corresponding with all scans in a book or volume). This function returns all component files IN TEXT ORDER.
# Examples: sorted list of files, part of IIIF manifest.

def get_file_sequence_for_container(text_container):
    """Return the session file names of the container, sorted by name.

    The glob pattern is built from the module-level datadir and sessions_folder;
    the text_container argument is not used to select files.
    """
    # single-session alternative: "session-1728-01-15-num*.json"
    pattern = datadir + sessions_folder + "session-*-num*.json"
    session_file_names = glob.glob(pattern)
    session_file_names.sort()
    return session_file_names

# Many file types contain a hierarchy of ordered text and/or annotation elements of different types. Some form of
# depth-first, post order traversal is necessary. Examples: processing a json hierarchy with dictionaries
# and lists (republic) or parsing TEI XML (DBNL document).

def traverse(node,node_label,text,annotations):
    """Walk a node hierarchy depth-first, collecting text segments and annotations.

    Children are visited before the annotation of their parent is appended
    (post order). For every node one annotation dict is appended whose
    'begin_anchor' / 'end_anchor' are segment indexes into `text`.

    Args:
        node: dict describing the current node; must have 'id', the configured
            extra fields and either children (under the configured child key)
            or a 'text' entry.
        node_label: key into untanngle_config selecting the node type.
        text: segmented text object offering len() and append(); receives one
            segment per leaf node.
        annotations: list that receives the annotation dicts (mutated in place).
    """
    # find the list that represents the children, each child is a dict
    config = untanngle_config[node_label]
    key_of_children = config['child_key']
    type_of_children = config['child_type']

    coords = node.get('coords')
    metadata = node.get('metadata')

    begin_index = text.len()
    annotation_info = {'resource_id': resource_id, 'label': node_label, 'coords': coords,\
                        'metadata': metadata, 'id':node['id'], 'begin_anchor': begin_index}

    # add selected extra_fields to annotation_info
    for f in config['extra_fields']:
        annotation_info[f] = node[f]

    children = [] if key_of_children is None else node[key_of_children]
    if len(children) == 0:        # if no children, do your 'leaf node thing'
        # a leaf covers exactly the one segment it is about to append
        end_index = begin_index

        node_text = node['text']
        if node_text is None:
            node_text = '\n'

        text.append(node_text)
    else:                         # if non-leaf node, first visit children
        for child in children:
            traverse(child,type_of_children,text,annotations)

        # index of the last segment added by the children
        end_index = text.len()-1

    annotation_info['end_anchor'] = end_index
    annotations.append(annotation_info)

    # end_index is bound in both branches above, so the callback can also be
    # invoked for a childless node of a type that registers a handler
    if 'additional_processing' in config:
        config['additional_processing'](node, begin_index, end_index, annotations)

    return

# In case of presence of a hierarchical structure, processing/traversal typically starts from a root element.

def get_root_tree_element(file):
    """Parse the JSON file at `file` and return the value stored under '_source'."""
    with open(file, 'r') as session_fh:
        session_data = json.load(session_fh)
    return session_data['_source']

# Rudimentary version of a scanpage_handler

def deduplicate_annotations(a_array, type):
    """Merge all annotations with label `type` that share an 'id' into one annotation.

    For every id the merged annotation is a copy of the member with the lowest
    'begin_anchor', with 'end_anchor' replaced by the highest 'end_anchor' in
    the group. The list is modified in place: annotations of other labels keep
    their order, the annotations of `type` are dropped and the merged ones are
    appended at the end, sorted by id.

    Args:
        a_array: list of annotation dicts (each needs 'label'; those of `type`
            also need 'id', 'begin_anchor' and 'end_anchor').
        type: label of the annotations to merge.
    """
    # use groupBy on a list of dicts (see Python Cookbook 1.15)
    from itertools import groupby
    from operator import itemgetter

    # groupby only groups consecutive items, so sort on id first (stable sort)
    typed_annots = sorted((ann for ann in a_array if ann['label'] == type),
                          key=itemgetter('id'))

    aggregated_typed_annots = []
    for _, items in groupby(typed_annots, key=itemgetter('id')):
        # the group iterator can be consumed only once, and it is needed twice
        itemlist = list(items)

        merged = min(itemlist, key=itemgetter('begin_anchor')).copy()
        merged['end_anchor'] = max(item['end_anchor'] for item in itemlist)
        aggregated_typed_annots.append(merged)

    # drop the old annotations in a single pass; calling list.remove() once per
    # annotation is quadratic in the size of a_array
    a_array[:] = [ann for ann in a_array if ann['label'] != type]
    a_array.extend(aggregated_typed_annots)

    return

def correct_scan_imageurls(a_array):
    """Rewrite the 'iiif_url' of every 'scan' annotation in place.

    A url part of the form '<n>,<n>,<n>,<n>/full' is replaced by 'full/,<last n>';
    annotations with another label are left untouched.
    """
    scans = list(filter(lambda ann: ann['label'] == 'scan', a_array))
    region_and_size = re.compile(r'(\d+),(\d+),(\d+),(\d+)/(full)')

    for scan in scans:
        scan['iiif_url'] = region_and_size.sub(r'\5/,\4', scan['iiif_url'])

# Process the session files one by one and concatenate the results, shifting the anchors so that
# they keep referring to the right text segments in the volume-wide text.
for f_name in get_file_sequence_for_container(resource_id):
    # file-local containers: the anchors produced by traverse() index into text_array
    text_array = segmentedtext.IndexedSegmentedText()
    annotation_array = []
            
    source_data = get_root_tree_element(f_name)

    traverse(source_data,'session',text_array,annotation_array)
           
    # properly concatenate annotation info taking ongoing line indexes into account
    # (all_textlines is only extended after this loop, so every annotation gets the same shift)
    for ai in annotation_array:
        ai['begin_anchor'] += all_textlines.len()
        ai['end_anchor'] += all_textlines.len()
    
    all_textlines.extend(text_array)
    all_annotations.extend(annotation_array)

# text_region_handler adds a scan and a page annotation for every text region; merge them per id
deduplicate_annotations(all_annotations, 'scan') 
correct_scan_imageurls(all_annotations)
    
deduplicate_annotations(all_annotations, 'page')
    
# sanity check: print all session annotations
for a in asearch.get_annotations_of_type('session', all_annotations):
    print(a)

{'resource_id': 'volume-1728', 'label': 'session', 'coords': None, 'metadata': {'id': 'session-1728-01-02-num-1', 'type': 'session', 'inventory_num': 3783, 'session_date': '1728-01-02', 'session_year': 1728, 'session_month': 1, 'session_day': 2, 'session_weekday': 'Veneris', 'date_shift_status': 'normal', 'session_num': 1, 'president': None, 'attendants_list_id': None, 'resolution_ids': [], 'is_workday': True, 'has_session_date_element': True, 'lines_include_rest_day': False, 'text_page_num': [1, 2, 3, 4, 5, 6], 'index_timestamp': '2022-01-10T14:33:04.522221'}, 'id': 'session-1728-01-02-num-1', 'begin_anchor': 0, 'evidence': [{'type': 'PhraseMatch', 'phrase': 'Jovis den 1 Januarii', 'variant': 'Jovis den 1 Januarii', 'string': 'Jovis den 1. Januarii', 'offset': 0, 'label': 'session_date', 'ignorecase': False, 'text_id': 'NL-HaNA_1.01.02_3783_0051-line-2938-1582-522-64', 'match_scores': {'char_match': 1.0, 'ngram_match': 0.9, 'levenshtein_similarity': 0.9523809523809523}}, {'type': 'Phr

In [3]:
all_textlines.len()

107212

In [4]:
len(all_annotations)

111036

In [5]:
# Receives the annotations built by res_traverse; their anchors are line ids at this point.
resolution_annotations=[]

def get_resolution_files_for_container(text_container):
    """Return the resolution file names of the container, sorted by name.

    The glob pattern is built from the module-level datadir and resolutions_folder;
    the text_container argument is not used to select files.
    """
    # single-session alternative: 'session-1728-01-15-num*-resolutions.json'
    pattern = datadir + resolutions_folder + 'session-*-resolutions.json'
    resolution_file_names = glob.glob(pattern)
    resolution_file_names.sort()
    return resolution_file_names

# def res_traverse(node, node_label):
def res_traverse(node):
    """Build annotations for a resolution-file node and its children.

    The node type is the last entry of node['type']. Children are visited first,
    then one annotation for the node itself is appended to the module-level
    resolution_annotations list. Its 'begin_anchor' / 'end_anchor' are the line
    ids of the first and last associated line (not yet segment indexes).

    A node without associated lines is skipped: a leaf with empty 'line_ranges',
    or a parent none of whose children has any line range.

    Args:
        node: dict with 'type', 'id', 'metadata', the configured extra fields
            and either children (under the configured child key) or
            'line_ranges'.
    """
    node_label = node['type'][-1]
    config = untanngle_config[node_label]
    key_of_children = config['child_key']

    children = [] if key_of_children is None else node[key_of_children]

    if len(children) == 0:        # if no children, do your 'leaf node thing'
        line_ranges = node.get('line_ranges') or []
        if len(line_ranges) == 0:  # no associated lines, skip this node
            return
        begin_line_id = line_ranges[0]['line_id']
        end_line_id = line_ranges[-1]['line_id']
    else:  # if non-leaf node, first visit children
        for child in children:
            res_traverse(child)

        # children without lines are skipped themselves, so they must not be
        # used to anchor the parent either (indexing [0] / [-1] would fail)
        anchored = [child for child in children if child.get('line_ranges')]
        if not anchored:
            return
        begin_line_id = anchored[0]['line_ranges'][0]['line_id']
        end_line_id = anchored[-1]['line_ranges'][-1]['line_id']

    if 'additional_processing' in config:
        # NOTE(review): traverse() calls these callbacks with
        # (node, begin_index, end_index, annotations); here only the node is passed.
        # No node type seen in the resolution output registers a callback — confirm.
        config['additional_processing'](node)

    annotation_info = {'resource_id': resource_id, 'label' : node_label,\
                        'begin_anchor' : begin_line_id, \
                        'end_anchor': end_line_id, \
                        'metadata': node['metadata'], \
                        'id': node['id']}

    # report attendance lists that have no spans at all
    if node_label == 'attendance_list' and node['attendance_spans'] == []:
        print(node['id'])

    # add selected extra_fields to annotation_info
    for f in config['extra_fields']:
        annotation_info[f] = node[f]

    resolution_annotations.append(annotation_info)

    return

# In case of presence of a hierarchical structure, processing/traversal typically starts from a root element.

def get_res_root_element(file):
    """Parse the JSON file at `file` and return the list stored under ['hits']['hits']."""
    with open(file, 'r') as resolution_fh:
        resolution_data = json.load(resolution_fh)
    return resolution_data['hits']['hits']

# Build annotations for every hit in every resolutions file of the container.
for f_name in get_resolution_files_for_container(resource_id):    
    # get list of resolution 'hits'
    hits = get_res_root_element(f_name)
    for hit in hits:
        # each hit corresponds with a resolution
        # NOTE(review): resolution_line_ids is assigned here but never read in the visible cells
        resolution_line_ids = []
        # res_traverse(hit['_source'],'resolution')       
        res_traverse(hit['_source'])

session-1728-01-10-num-1-attendance_list
session-1728-01-23-num-1-attendance_list
session-1728-02-10-num-1-attendance_list
session-1728-02-17-num-1-attendance_list
session-1728-02-19-num-1-attendance_list
session-1728-03-01-num-1-attendance_list
session-1728-03-25-num-1-attendance_list
session-1728-03-30-num-1-attendance_list
session-1728-04-14-num-1-attendance_list
session-1728-04-17-num-1-attendance_list
session-1728-05-04-num-1-attendance_list
session-1728-05-05-num-1-attendance_list
session-1728-05-10-num-1-attendance_list
session-1728-05-25-num-1-attendance_list
session-1728-05-26-num-1-attendance_list
session-1728-06-03-num-1-attendance_list
session-1728-07-09-num-1-attendance_list
session-1728-07-10-num-1-attendance_list
session-1728-07-16-num-1-attendance_list
session-1728-08-03-num-1-attendance_list
session-1728-08-24-num-1-attendance_list
session-1728-08-27-num-1-attendance_list
session-1728-09-28-num-1-attendance_list
session-1728-10-02-num-1-attendance_list
session-1728-11-

In [6]:
len(resolution_annotations)

8232

In [7]:
resolution_annotations[20:30]

[{'resource_id': 'volume-1728',
  'label': 'republic_paragraph',
  'begin_anchor': 'NL-HaNA_1.01.02_3783_0053-line-2775-958-886-99',
  'end_anchor': 'NL-HaNA_1.01.02_3783_0053-line-2758-1440-586-60',
  'metadata': {'inventory_num': 3783,
   'source_id': 'session-1728-01-02-num-1',
   'type': 'resolution_paragraph',
   'id': 'session-1728-01-02-num-1-para-20',
   'text_page_num': [5],
   'page_num': [105],
   'start_offset': 14956,
   'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/2646,363,1117,3121/full/0/default.jpg',
   'doc_id': 'session-1728-01-02-num-1',
   'paragraph_index': 0},
  'id': 'session-1728-01-02-num-1-para-20',
  'line_ranges': [{'start': 0,
    'end': 37,
    'line_id': 'NL-HaNA_1.01.02_3783_0053-line-2775-958-886-99',
    'text_page_num': 5,
    'page_num': 105},
   {'start': 37,
    'end': 75,
    'line_id': 'NL-HaNA_1.01.02_3783_0053-line-2881-1004-780-68',
    'text_page_num': 5,
    'page_num': 105},
   {'start': 

In [8]:
# lookup table: id of every 'line' annotation -> index of its text segment (its begin_anchor)
line_ids_vs_indexes = {
    annot['id']: annot['begin_anchor']
    for annot in all_annotations
    if annot['label'] == 'line'
}

In [9]:
len(line_ids_vs_indexes)

102936

In [10]:
# Replace the line-id anchors of the resolution annotations by segment indexes.
# Annotations whose line ids are unknown get anchors 0/0 and are counted as errors.
# NOTE: this cell is not idempotent — after a first run the anchors are integers,
# so a second run fails every lookup. Rebuild resolution_annotations before re-running.
num_errors = 0
for res in resolution_annotations:
    try:
        # res['begin_anchor'] = re.sub(r'-column-', r'-col-', res['begin_anchor'])
        # res['end_anchor'] = re.sub(r'-column-', r'-col-', res['end_anchor'])
        # resolve both ids before assigning, so a failing end lookup cannot
        # leave the annotation half converted
        resolved_begin = line_ids_vs_indexes[res['begin_anchor']]
        resolved_end = line_ids_vs_indexes[res['end_anchor']]
    except KeyError:
        # only a missing id is an expected failure; anything else should surface
        res['begin_anchor'] = 0
        res['end_anchor'] = 0
        num_errors += 1
    else:
        res['begin_anchor'] = resolved_begin
        res['end_anchor'] = resolved_end

if num_errors > 0:
    print(f"number of lookup errors: {num_errors}")
else:
    print("no lookup errors")

no lookup errors


In [11]:
resolution_annotations[20:30]

[{'resource_id': 'volume-1728',
  'label': 'republic_paragraph',
  'begin_anchor': 390,
  'end_anchor': 400,
  'metadata': {'inventory_num': 3783,
   'source_id': 'session-1728-01-02-num-1',
   'type': 'resolution_paragraph',
   'id': 'session-1728-01-02-num-1-para-20',
   'text_page_num': [5],
   'page_num': [105],
   'start_offset': 14956,
   'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/2646,363,1117,3121/full/0/default.jpg',
   'doc_id': 'session-1728-01-02-num-1',
   'paragraph_index': 0},
  'id': 'session-1728-01-02-num-1-para-20',
  'line_ranges': [{'start': 0,
    'end': 37,
    'line_id': 'NL-HaNA_1.01.02_3783_0053-line-2775-958-886-99',
    'text_page_num': 5,
    'page_num': 105},
   {'start': 37,
    'end': 75,
    'line_id': 'NL-HaNA_1.01.02_3783_0053-line-2881-1004-780-68',
    'text_page_num': 5,
    'page_num': 105},
   {'start': 75,
    'end': 111,
    'line_id': 'NL-HaNA_1.01.02_3783_0053-line-2876-1054-782-58',
    '

In [12]:
all_annotations.extend(resolution_annotations)

In [13]:
len(all_annotations)

119268

Neem een voorbeeld-attendance_list, zoek de betekenis van de start_offsets uit en controleer de alignment van de attendants.

In [14]:
# Example case: the session of 25 February 1728 and the attendance list(s) overlapping its anchor range.
sess_02_25 = asearch.get_annotation_by_id('session-1728-02-25-num-1', all_annotations)
att_list_02_25 = asearch.get_annotations_of_type_overlapping('attendance_list', \
                            sess_02_25['begin_anchor'], sess_02_25['end_anchor'], all_annotations, resource_id)

# convert to a list so it can be indexed and displayed
alist = list(att_list_02_25)
# text segments between the anchors of the first attendance list
lines = all_textlines.slice(alist[0]['begin_anchor'], alist[0]['end_anchor'])
print(lines)
alist

['Mercurii den 25. Februarii', '1728.', 'PRAESIDE,', 'Den Heere Bout.', 'PRAESENTIBUS,', 'De Heeren Van Lynden, van Singendonck,', 'yvan Heuckelom, van Heeckeren tot den', "Brandtzenborgh, van W'ynbergen , met", 'een extraordinaris Gedeputeerde uyt de Pro-', 'vincie van Gelderlandt.', 'Van Wassenaer, vanden Boetzelaar, van-', 'der Dussen, Bors van Waveren, van', 'Cattenborgh, Veckhoven, Raadtpensiona-', 'ris van Slingelandt.', 'Noey, van Hoorn , met tqwee extraordina-', 'ris Gedeputeerden uyt de Provincie van', 'Zeelandt.', 'Taats van Amerongen, van Rensqwoude.', 'Van Sehwartzenbergh, Vegilin.', 'Van Haarsolte, van Isselmuden.', 'Van Tamminga.', 'DE Resrolutien , gueeren zeno-', 'men, zyn gelesen en geresumeert,', 'gelijck oock geresumeert ende ge-', 'arresteert zyn de Depesches daar uyt resul-', 'teerende.']


[{'resource_id': 'volume-1728',
  'label': 'attendance_list',
  'begin_anchor': 14337,
  'end_anchor': 14362,
  'metadata': {'inventory_num': 3783,
   'source_id': 'session-1728-02-25-num-1',
   'type': 'attendance_list',
   'id': 'session-1728-02-25-num-1-attendance_list',
   'session_date': '1728-02-25',
   'session_id': 'session-1728-02-25-num-1',
   'session_num': 1,
   'president': None,
   'session_year': 1728,
   'session_month': 2,
   'session_day': 25,
   'session_weekday': 'Mercurii',
   'text_page_num': [128],
   'index_timestamp': '2022-01-10T16:47:08.510847'},
  'id': 'session-1728-02-25-num-1-attendance_list',
  'attendance_spans': [{'offset': 0,
    'end': 32,
    'class': 'preamble',
    'pattern': 'Mercurii den 25. Februarii 1728.',
    'delegate_id': 0,
    'delegate_name': '',
    'delegate_score': 0},
   {'offset': 43,
    'end': 52,
    'class': 'heere',
    'pattern': 'Den Heere',
    'delegate_id': 0,
    'delegate_name': '',
    'delegate_score': 0},
   {'offset

In [15]:
# Extra check: inspect the republic_paragraph annotations that overlap the attendance list
paras_02_25 = asearch.get_annotations_of_type_overlapping('republic_paragraph', \
                            alist[0]['begin_anchor'], alist[0]['end_anchor'], all_annotations, resource_id)
para_list_02_25 = list(paras_02_25)
para_list_02_25

[{'resource_id': 'volume-1728',
  'label': 'republic_paragraph',
  'begin_anchor': 14337,
  'end_anchor': 14357,
  'metadata': {'inventory_num': 3783,
   'source_id': 'session-1728-02-25-num-1',
   'type': 'resolution_paragraph',
   'id': 'session-1728-02-25-num-1-para-1',
   'text_page_num': [128],
   'page_num': [228],
   'start_offset': 0,
   'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0115.jpg/308,2111,1045,1285/full/0/default.jpg',
   'doc_id': 'session-1728-02-25-num-1',
   'paragraph_index': 0},
  'id': 'session-1728-02-25-num-1-para-1',
  'line_ranges': [{'start': 0,
    'end': 27,
    'line_id': 'NL-HaNA_1.01.02_3783_0115-line-495-2211-634-78',
    'text_page_num': 128,
    'page_num': 228},
   {'start': 27,
    'end': 33,
    'line_id': 'NL-HaNA_1.01.02_3783_0115-line-735-2286-143-57',
    'text_page_num': 128,
    'page_num': 228},
   {'start': 33,
    'end': 43,
    'line_id': 'NL-HaNA_1.01.02_3783_0115-line-541-2358-521-61',
    

In [16]:
# NOTE(review): overlapping_02_25 / overlapping_list are built but not used in the visible cells
overlapping_02_25 = asearch.get_annotations_overlapping_with(sess_02_25['begin_anchor'], sess_02_25['end_anchor'], all_annotations, resource_id)
overlapping_list = list(overlapping_02_25)

# For every paragraph, print its 'start_offset' metadata next to the distance (in text segments)
# between its begin anchor and the begin anchor of the session.
for a in para_list_02_25:
    print(f"{a['label']}:\t{a['id']}\t{a['begin_anchor']}")
    if 'metadata' in a and 'start_offset' in a['metadata']:
        print(f"\t{a['metadata']['start_offset']}")
        print(f"\t{a['begin_anchor'] - sess_02_25['begin_anchor']}")

republic_paragraph:	session-1728-02-25-num-1-para-1	14337
	0
	0
republic_paragraph:	session-1728-02-25-num-1-para-2	14358
	581
	21


In [17]:
all_textlines.slice(14337, 14337+581)

['Mercurii den 25. Februarii',
 '1728.',
 'PRAESIDE,',
 'Den Heere Bout.',
 'PRAESENTIBUS,',
 'De Heeren Van Lynden, van Singendonck,',
 'yvan Heuckelom, van Heeckeren tot den',
 "Brandtzenborgh, van W'ynbergen , met",
 'een extraordinaris Gedeputeerde uyt de Pro-',
 'vincie van Gelderlandt.',
 'Van Wassenaer, vanden Boetzelaar, van-',
 'der Dussen, Bors van Waveren, van',
 'Cattenborgh, Veckhoven, Raadtpensiona-',
 'ris van Slingelandt.',
 'Noey, van Hoorn , met tqwee extraordina-',
 'ris Gedeputeerden uyt de Provincie van',
 'Zeelandt.',
 'Taats van Amerongen, van Rensqwoude.',
 'Van Sehwartzenbergh, Vegilin.',
 'Van Haarsolte, van Isselmuden.',
 'Van Tamminga.',
 'DE Resrolutien , gueeren zeno-',
 'men, zyn gelesen en geresumeert,',
 'gelijck oock geresumeert ende ge-',
 'arresteert zyn de Depesches daar uyt resul-',
 'teerende.',
 'ONtfangen een Missive van den Resi-',
 'dent Rumpf , geschreven te Dresden',
 'den aghtienden deser loopende maandt,',
 'houdende advertentie. WAAR op',

Merk op bij bovenstaande: para-1 overlapt met de echte deelnemerslijst, para-2 betreft al resolutietekst.

Stappen:
- vraag de span van van der Dussen op (gaat over line grenzen) - delegate_id 398 onder attendance_list-attendance_spans
- neem daarvan 'offset' en 'end'
- doorloop paragraphs tot 'offset' en 'end' voorbij zijn
- gebruik 'start_offset' en 'line_ranges' uit paragraphs om line_id's te bepalen
- bepaal hieruit begin_anchor en end_anchor bij resp offset en end
- bepaal daarnaast de lokale char offsets (binnen de regel)

In [18]:
# select the span(s) of the first attendance list that carry delegate_id 398
vdDussen_span = [span for span in alist[0]['attendance_spans'] if span['delegate_id'] == 398]
vdDussen_span

[{'offset': 287,
  'end': 300,
  'class': 'delegate',
  'pattern': 'vander Dussen',
  'delegate_id': 398,
  'delegate_name': 'Duijvensz.',
  'delegate_score': 0}]

In [19]:
# Prototype: map the character span of one attendant onto line ids plus character offsets.
char_ptr = 0   # running sum of the 'end' of the last line range of the paragraphs visited so far
last_end = 0   # 'end' of the most recently visited line range
begin_anchor_vdd = ''
end_anchor_vdd = ''

for p in para_list_02_25:
    char_ptr += last_end
    for lr in p['line_ranges']:
        last_end = lr['end']
        # print(f"{lr['start']}\t{lr['start'] + char_ptr}")
        
        att_begin = vdDussen_span[0]['offset']
        att_end = vdDussen_span[0]['end']
        # line boundaries shifted by char_ptr, so they can be compared with the span offsets
        line_begin = lr['start'] + char_ptr
        line_end = lr['end'] + char_ptr
        
        if att_begin >= line_begin and att_begin < line_end:
            begin_anchor_vdd = lr['line_id']
            # NOTE(review): subtracts lr['start'], not line_begin; both are equal only while char_ptr is 0
            begin_char_offset_vdd = att_begin - lr['start']
            print(f"\n{begin_anchor_vdd}\t begin_char_offset: {begin_char_offset_vdd}")
            
        if att_end >= line_begin and att_end < line_end:
            end_anchor_vdd = lr['line_id']
            end_char_offset_vdd = att_end - lr['start']
            print(f"\n{end_anchor_vdd}\t end_char_offset: {end_char_offset_vdd}")
            # leaves the inner loop only; the remaining paragraphs are still scanned
            break



NL-HaNA_1.01.02_3783_0115-line-412-2760-841-56	 begin_char_offset: 34

NL-HaNA_1.01.02_3783_0115-line-458-2812-795-54	 end_char_offset: 10


In [20]:
print(line_ids_vs_indexes[begin_anchor_vdd])
print(line_ids_vs_indexes[end_anchor_vdd])

14347
14348


In [21]:
all_textlines.slice(14347, 14348)

['Van Wassenaer, vanden Boetzelaar, van-', 'der Dussen, Bors van Waveren, van']

Doe bovenstaande voor alle spans in attendance_spans van alist[0]. Verpak dit eerst in een functie.

In [22]:
def collect_attendant_info(span, paras, line_index=None):
    """Map the character span of an attendance span onto text lines.

    The span's 'offset' and 'end' are interpreted as character positions in the
    text formed by the line ranges of `paras`, taken in order: the 'start' /
    'end' of a line range are shifted by the total length of all preceding
    paragraphs before they are compared with the span.

    Args:
        span: dict with 'offset' (first character) and 'end' (treated as
            exclusive end position).
        paras: paragraph annotations, each with a 'line_ranges' list of dicts
            holding 'start', 'end' and 'line_id'.
        line_index: mapping from line id to segment index; defaults to the
            module-level line_ids_vs_indexes.

    Returns:
        A dict with 'begin_anchor' / 'end_anchor' (segment indexes of the lines
        in which the span begins and ends) and 'begin_char_offset' /
        'end_char_offset' (character positions relative to the start of those
        lines), or None when the span cannot be located in the paragraphs.

    Raises:
        KeyError: if a matching line id is missing from `line_index`.
    """
    if line_index is None:
        line_index = line_ids_vs_indexes

    att_begin = span['offset']
    att_end = span['end']

    char_ptr = 0              # total length of the paragraphs handled so far
    begin_anchor = None
    begin_char_offset = None
    last_line = None          # (line_id, line_begin, line_end) of the last line visited

    for p in paras:
        line_ranges = p['line_ranges']

        for lr in line_ranges:
            line_begin = lr['start'] + char_ptr
            line_end = lr['end'] + char_ptr
            last_line = (lr['line_id'], line_begin, line_end)

            if line_begin <= att_begin < line_end:
                begin_anchor = line_index[lr['line_id']]
                # relative to the shifted line start, so the offset is also
                # correct for lines in the second and later paragraphs
                begin_char_offset = att_begin - line_begin

            # NOTE(review): a span that ends exactly on a line boundary is
            # anchored to the following line with end_char_offset 0 (unchanged
            # from the earlier behaviour) — confirm this is what consumers expect.
            if line_begin <= att_end < line_end:
                if begin_anchor is None:
                    # end located without a begin: the span does not fit the text
                    return None
                return {'begin_anchor': begin_anchor,
                        'end_anchor': line_index[lr['line_id']],
                        'begin_char_offset': begin_char_offset,
                        'end_char_offset': att_end - line_begin}

        # advance by the length of this paragraph only; a paragraph without
        # line ranges must not add the previous paragraph's length again
        if line_ranges:
            char_ptr += line_ranges[-1]['end']

    # the exclusive end of a span that runs up to the very end of the text is
    # not inside any line range; it ends on the last line
    if begin_anchor is not None and last_line is not None and att_end == last_line[2]:
        return {'begin_anchor': begin_anchor,
                'end_anchor': line_index[last_line[0]],
                'begin_char_offset': begin_char_offset,
                'end_char_offset': att_end - last_line[1]}

    return None

In [23]:
# Show, for every span of the example attendance list, where it lands in the text.
for span in alist[0]['attendance_spans']:
    a_info = collect_attendant_info(span, para_list_02_25)
    print(span['pattern'])
    print(a_info)
    # collect_attendant_info returns None for a span it cannot locate
    if a_info is not None:
        print(all_textlines.slice(a_info['begin_anchor'], a_info['end_anchor']))
    print('\n')

Mercurii den 25. Februarii 1728.
{'begin_anchor': 14337, 'end_anchor': 14338, 'begin_char_offset': 0, 'end_char_offset': 5}
['Mercurii den 25. Februarii', '1728.']


Den Heere
{'begin_anchor': 14340, 'end_anchor': 14340, 'begin_char_offset': 0, 'end_char_offset': 9}
['Den Heere Bout.']


Bout
{'begin_anchor': 14340, 'end_anchor': 14340, 'begin_char_offset': 10, 'end_char_offset': 14}
['Den Heere Bout.']


PRAESENTIBUS
{'begin_anchor': 14341, 'end_anchor': 14341, 'begin_char_offset': 0, 'end_char_offset': 12}
['PRAESENTIBUS,']


De Heeren
{'begin_anchor': 14342, 'end_anchor': 14342, 'begin_char_offset': 0, 'end_char_offset': 9}
['De Heeren Van Lynden, van Singendonck,']


met een extraordinaris Gedeputeerde uyt de Provincie van Gelderlandt
{'begin_anchor': 14344, 'end_anchor': 14346, 'begin_char_offset': 33, 'end_char_offset': 22}
["Brandtzenborgh, van W'ynbergen , met", 'een extraordinaris Gedeputeerde uyt de Pro-', 'vincie van Gelderlandt.']


Raadtpensionaris
{'begin_anchor': 14349, 

Bepaal welke classes er zijn, en welke ik als attendants uit wil filteren

In [24]:
# count how often every span class occurs over all attendance lists
classes = {}

att_lists = asearch.get_annotations_of_type('attendance_list', all_annotations, resource_id) 
for al in att_lists:
    for span in al['attendance_spans']:
        c = span['class']
        classes[c] = classes.get(c, 0) + 1

print(classes)

{'nihil': 44, 'heere': 558, 'president': 234, 'presentibus': 267, 'delegate': 4104, 'preamble': 212, 'province': 305, 'resolution_summarized': 211, 'resumption': 57, 'raadpensionaris': 107, 'pre': 14}


In [25]:
# span classes for which create_attendants_for_attlist builds an 'attendant' annotation
attendant_classes = ('president', 'delegate', 'raadpensionaris')

Maak nu daadwerkelijk attendant annotations aan, eerst voor 25 februari, daarna voor alle sessions

In [26]:
def create_attendants_for_attlist(attlist, session_id, resource_id):
    """Build 'attendant' annotations for the spans of one attendance list.

    Only spans whose 'class' is in the module-level `attendant_classes` are
    used. Each such span is located by `collect_attendant_info`, using the
    'republic_paragraph' annotations (searched in the module-level
    `all_annotations`) that overlap the anchor range of the attendance list.
    Spans that cannot be located are reported on stdout and skipped.

    Args:
        attlist: attendance_list annotation; must provide 'attendance_spans',
            'begin_anchor' and 'end_anchor'.
        session_id: id of the session the list belongs to; used as prefix of
            the generated attendant ids.
        resource_id: resource id stored on every attendant and passed to the
            paragraph search.

    Returns:
        list of attendant annotation dicts (possibly empty).
    """
    spans = attlist['attendance_spans']

    # Paragraphs are looked up on the anchor range of the attendance list itself.
    paras = list(asearch.get_annotations_of_type_overlapping(
        'republic_paragraph', attlist['begin_anchor'], attlist['end_anchor'],
        all_annotations, resource_id))

    attendant_annots = []
    for index, s in enumerate(spans):
        if s['class'] not in attendant_classes:
            continue

        a_info = collect_attendant_info(s, paras)
        if a_info is None:    # span not matching with text of paras
            print(f"span does not match: {s} for {session_id}")
            continue

        attendant_annots.append({
            'resource_id': resource_id,
            'label': 'attendant',
            # index is the position in the complete span list, so ids have
            # gaps where other classes or unmatched spans were skipped
            'id': session_id + '-attendant-' + str(index),
            'metadata': s,
            'begin_anchor': a_info['begin_anchor'],
            'end_anchor': a_info['end_anchor'],
            'begin_char_offset': a_info['begin_char_offset'],
            'end_char_offset': a_info['end_char_offset'],
        })

    return attendant_annots

In [27]:
# Trial run on a single attendance list before processing all sessions.
atts = create_attendants_for_attlist(alist[0], sess_02_25['id'], resource_id)
len(atts)

21

In [28]:
# Fetch the attendance list of the 7 April session by id and build its attendants.
al_04_07 = asearch.get_annotation_by_id('session-1728-04-07-num-1-attendance_list', all_annotations)

atts = create_attendants_for_attlist(al_04_07, 'session-1728-04-07-num-1', resource_id)
atts

span does not match: {'offset': 623, 'end': 627, 'class': 'delegate', 'pattern': 'Noey', 'delegate_id': 13278, 'delegate_name': 'Boner', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 818, 'end': 826, 'class': 'delegate', 'pattern': 'Schurman', 'delegate_id': 19130, 'delegate_name': 'Coulman', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 500, 'end': 513, 'class': 'delegate', 'pattern': 'vander Dussen', 'delegate_id': 398, 'delegate_name': 'Duijvensz.', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 593, 'end': 609, 'class': 'delegate', 'pattern': 'dat de Visschers', 'delegate_id': 385, 'delegate_name': 'Visscher', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 754, 'end': 766, 'class': 'delegate', 'pattern': 'van Goslinga', 'delegate_id': 13050, 'delegate_name': 'van Goslinga', 'delegate_score': 0} for session-1728-04-07-num-1


[{'resource_id': 'volume-1728',
  'label': 'attendant',
  'id': 'session-1728-04-07-num-1-attendant-5',
  'metadata': {'offset': 291,
   'end': 301,
   'class': 'delegate',
   'pattern': 'Van Lynden',
   'delegate_id': 18599,
   'delegate_name': 'van Lynden',
   'delegate_score': 0},
  'begin_anchor': 27661,
  'end_anchor': 27661,
  'begin_char_offset': 1,
  'end_char_offset': 11},
 {'resource_id': 'volume-1728',
  'label': 'attendant',
  'id': 'session-1728-04-07-num-1-attendant-6',
  'metadata': {'offset': 0,
   'end': 14,
   'class': 'delegate',
   'pattern': 'Mercuria den 7',
   'delegate_id': 17094,
   'delegate_name': 'Velters',
   'delegate_score': 0},
  'begin_anchor': 27651,
  'end_anchor': 27651,
  'begin_char_offset': 0,
  'end_char_offset': 14},
 {'resource_id': 'volume-1728',
  'label': 'attendant',
  'id': 'session-1728-04-07-num-1-attendant-10',
  'metadata': {'offset': 303,
   'end': 316,
   'class': 'delegate',
   'pattern': 'van Heuckelom',
   'delegate_id': 19092,
  

In [29]:
# Show the single text line stored at anchor 27661.
all_textlines.slice(27661, 27661)

['De Heeren Van Lynden, van Heuckelom,']

In [30]:
# Show the text lines covered by the 7 April attendance list.
all_textlines.slice(al_04_07['begin_anchor'], al_04_07['end_anchor'])

['Mercuria den 7. April',
 'en verstaan, dat Copie van de voorschreve',
 '1728.',
 'Requeste gesonden sal werden aan de Be-',
 'PRAESIDE,',
 'windthebheren van de Oost-Indische Com-',
 'Den Heere Van Wassenaer.',
 'pagnie ter preesidiale Kamere in Zeelandt,',
 'PRAESENTIBUS.',
 'om der selver bericht daar op ten spoedig h-',
 'De Heeren Van Lynden, van Heuckelom,',
 'sten aan haar Hoogh Mogende te laten toe-',
 'van Wynbergen , met twee extraordinaris',
 'komen.',
 'Gedeputeerden uyt de Provincie van Gel-',
 'derlandt.']

In [31]:
# check the neighbours of 04_07: is the text garbled there as well?

al_04_06 = asearch.get_annotation_by_id('session-1728-04-06-num-1-attendance_list', all_annotations)
al_04_09 = asearch.get_annotation_by_id('session-1728-04-09-num-1-attendance_list', all_annotations)

In [32]:
# Show the text lines covered by the 6 April attendance list.
all_textlines.slice(al_04_06['begin_anchor'], al_04_06['end_anchor'])

['1728.',
 'PRAESIDE,',
 'Den Heere Van Hassenaer.',
 'PRAESENTIBUS,',
 'De Heeren Van Lynden, van Heuckelom,',
 "van B'ynbergen, met twee extraordinaris",
 'Gedeputeerden uyt de Provincie van Gel-',
 'derlundt.',
 'Vanden Boetzelaar, Eelbo, vander Dussen,',
 'Bors van Waveren, Raadtpensionaris van',
 'Slingelandt.',
 'Bout, Noey, van Hoorn.',
 'Taats van Amerongen.',
 'Van Schwartzenbergh, van Goslinga , Ve-',
 'gilin, Schurman.',
 'Vriesen.',
 'Van Tamminga.',
 'DE Resolutien, gisteren geno-',
 'men, zyn gelelen en geresumeert ,',
 'gelyck oock geresumeert en gear-',
 'resteert zyn de Depesches daar uyt resul-',
 'reerende.']

In [33]:
# Show the text lines covered by the 9 April attendance list.
all_textlines.slice(al_04_09['begin_anchor'], al_04_09['end_anchor'])

['Veneris den 9. April',
 '1728.',
 'PRAESIDE,)',
 'Den Heere Van Wassenaer.',
 'PRAESENTIBUS,',
 'De Heeren Van Lynden, van Heuckelom,',
 "wan W'ynbergen , met een extraordinaris",
 'Gedeputeerde uyt de Provincie van Gelder-',
 'landt.',
 'Eelbo, vander Dussen, Bors van Wave-',
 'ren.',
 'Bout, Noey, van Hoorn , met een extraor-',
 'dinaris Gedeputeerde uyt de Provincie van',
 'Zeelandt.',
 'Taats van Amerongen.',
 'Van Schwartzenbergb, vnn Gossiaga, Vegi-',
 'lin, Schurman.',
 'Vriesen.',
 'Van Tamminga.',
 'DE Relosutien, gisteren genso-',
 'men, zyn gelesen en geresumeert,',
 'gelijck oock geresumeert ende gear-',
 'resteert zyn de Depesches daar uyt resulte-',
 'rende.']

In [34]:
# Apparently some sessions have no attendance_list; print the ids of those sessions.

for sess in asearch.get_annotations_of_type('session', all_annotations, resource_id):
    alists = list(asearch.get_annotations_of_type_overlapping(
        'attendance_list', sess['begin_anchor'], sess['end_anchor'],
        all_annotations, resource_id))
    if not alists:
        print(sess['id'])

session-1728-03-17-num-1
session-1728-04-08-num-1
session-1728-05-22-num-1
session-1728-08-03-num-1
session-1728-11-18-num-1
session-1728-12-30-num-1


In [35]:
# Build attendant annotations for every attendance list in the volume.
attendant_annotations = []
for al in asearch.get_annotations_of_type('attendance_list', all_annotations, resource_id):
    # the attendance list carries the id of its session in its metadata
    session_id = al['metadata']['session_id']
    atts = create_attendants_for_attlist(al, session_id, resource_id)
    attendant_annotations.extend(atts)

span does not match: {'offset': 623, 'end': 627, 'class': 'delegate', 'pattern': 'Noey', 'delegate_id': 13278, 'delegate_name': 'Boner', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 818, 'end': 826, 'class': 'delegate', 'pattern': 'Schurman', 'delegate_id': 19130, 'delegate_name': 'Coulman', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 500, 'end': 513, 'class': 'delegate', 'pattern': 'vander Dussen', 'delegate_id': 398, 'delegate_name': 'Duijvensz.', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 593, 'end': 609, 'class': 'delegate', 'pattern': 'dat de Visschers', 'delegate_id': 385, 'delegate_name': 'Visscher', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match: {'offset': 754, 'end': 766, 'class': 'delegate', 'pattern': 'van Goslinga', 'delegate_id': 13050, 'delegate_name': 'van Goslinga', 'delegate_score': 0} for session-1728-04-07-num-1
span does not match

In [36]:
# Total number of attendant annotations created.
len(attendant_annotations)

4412

In [37]:
# NOTE: extend() mutates all_annotations in place, so re-running this cell
# appends the same attendant annotations a second time.
all_annotations.extend(attendant_annotations)

In [38]:
# Size of the annotation list after adding the attendants.
len(all_annotations)

123680

In [39]:
# overview: number of annotations per label (CAF sessions and resolutions indexes
# plus the annotations derived in this notebook)

numbers_per_type = {}
for a in all_annotations:
    lbl = a['label']
    # labels not seen before start counting from 0
    numbers_per_type[lbl] = numbers_per_type.get(lbl, 0) + 1

numbers_per_type

{'line': 107212,
 'text_region': 2140,
 'session': 308,
 'scan': 479,
 'page': 897,
 'republic_paragraph': 4503,
 'resolution': 3426,
 'attendance_list': 303,
 'attendant': 4412}

Check per annotation type how to best determine the IIIF url for the narrowest enclosing bounding box

In [40]:
# Fetch the 25 February session and all line annotations overlapping its anchor range.
sess_02_25 = asearch.get_annotation_by_id('session-1728-02-25-num-1', all_annotations)
annots = list(asearch.get_annotations_of_type_overlapping('line', \
                            sess_02_25['begin_anchor'], sess_02_25['end_anchor'], all_annotations, resource_id))
annots

[{'resource_id': 'volume-1728',
  'label': 'line',
  'coords': [[495, 2231],
   [515, 2211],
   [534, 2211],
   [541, 2218],
   [590, 2221],
   [601, 2232],
   [1111, 2232],
   [1129, 2248],
   [1129, 2278],
   [1124, 2272],
   [1100, 2279],
   [870, 2276],
   [851, 2289],
   [838, 2276],
   [543, 2279],
   [505, 2278],
   [495, 2271]],
  'metadata': {'type': 'line',
   'parent_type': 'column',
   'parent_id': 'NL-HaNA_1.01.02_3783_0115-column-364-442-880-2838',
   'text_region_id': 'NL-HaNA_1.01.02_3783_0115-text_region-364-442-880-2838',
   'extra_id': 'NL-HaNA_1.01.02_3783_0115-text_region-364-442-880-2838',
   'scan_id': 'NL-HaNA_1.01.02_3783_0115',
   'column_id': 'NL-HaNA_1.01.02_3783_0115-column-364-442-880-2838',
   'page_id': 'NL-HaNA_1.01.02_3783_0115-page-228',
   'id': 'NL-HaNA_1.01.02_3783_0115-line-495-2211-634-78'},
  'id': 'NL-HaNA_1.01.02_3783_0115-line-495-2211-634-78',
  'begin_anchor': 14337,
  'baseline': [[505, 2274], [1120, 2271]],
  'end_anchor': 14337},
 {'reso

Check in hoeverre iiif_urls van text_regions accuraat zijn. Doe dit voor de annotaties van enkele scans

In [41]:
# Scan annotations overlapping the anchor range of the 25 February session.
scan_annots = list(asearch.get_annotations_of_type_overlapping('scan', \
                            sess_02_25['begin_anchor'], sess_02_25['end_anchor'], all_annotations, resource_id))
scan_annots

[{'resource_id': 'volume-1728',
  'label': 'scan',
  'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0115.jpg/full/,1931/0/default.jpg',
  'begin_anchor': 14302,
  'end_anchor': 14529,
  'id': 'NL-HaNA_1.01.02_3783_0115'},
 {'resource_id': 'volume-1728',
  'label': 'scan',
  'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0116.jpg/full/,2227/0/default.jpg',
  'begin_anchor': 14530,
  'end_anchor': 14754,
  'id': 'NL-HaNA_1.01.02_3783_0116'}]

In [42]:
# Text_region annotations overlapping the second scan found above (scan_annots[1]).
tr_annots = list(asearch.get_annotations_of_type_overlapping('text_region', \
                            scan_annots[1]['begin_anchor'], scan_annots[1]['end_anchor'], all_annotations, resource_id))
tr_annots

[{'resource_id': 'volume-1728',
  'label': 'text_region',
  'coords': [[349, 2096],
   [352, 465],
   [379, 444],
   [665, 440],
   [791, 439],
   [1206, 439],
   [1245, 458],
   [1250, 602],
   [1254, 748],
   [1254, 1017],
   [1250, 2318],
   [1248, 2415],
   [987, 2466],
   [448, 2461],
   [352, 2454],
   [349, 2263]],
  'metadata': {'structure': {'type': 'resolution'},
   'type': 'resolution',
   'parent_type': 'page',
   'parent_id': 'NL-HaNA_1.01.02_3783_0116-page-230',
   'text_region_id': None,
   'scan_id': 'NL-HaNA_1.01.02_3783_0116',
   'page_id': 'NL-HaNA_1.01.02_3783_0116-page-230',
   'id': 'NL-HaNA_1.01.02_3783_0116-text_region-349-439-905-2027',
   'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0116.jpg/249,339,1105,2227/full/0/default.jpg',
   'page_num': 230,
   'text_page_num': 130},
  'id': 'NL-HaNA_1.01.02_3783_0116-text_region-349-439-905-2027',
  'begin_anchor': 14530,
  'end_anchor': 14569},
 {'resource_id': 'volume-1728'

In [43]:
# try to compose a page iiif_url from its constituent text_regions

page_annots = list(asearch.get_annotations_of_type_overlapping('page', \
                            sess_02_25['begin_anchor'], sess_02_25['end_anchor'], all_annotations, resource_id))
len(page_annots)

3

In [44]:
# take page 129, the second one in the list
p_129 = page_annots[1]

# text_regions overlapping the anchor range of that page
overlapping_trs = list(asearch.get_annotations_of_type_overlapping('text_region', \
                            p_129['begin_anchor'], p_129['end_anchor'], all_annotations, resource_id))
len(overlapping_trs)

2

In [45]:
# iiif_urls stored in the metadata of the text_regions overlapping page 129
urls = [tr['metadata']['iiif_url'] for tr in overlapping_trs]
urls

['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0115.jpg/2424,345,1128,3115/full/0/default.jpg',
 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0115.jpg/3339,344,1122,3103/full/0/default.jpg']

In [46]:
# IIIF region "x,y,w,h" directly after the image file name.
# Groups: 1 = '.jpg/', 2 = x (left), 3 = y (top), 4 = width, 5 = height.
region_pattern = re.compile(r'(\.jpg/)(\d+),(\d+),(\d+),(\d+)')
# Group 2 is the image identifier: the part between the server prefix and the region.
image_id_pattern = re.compile(r'(images\.diginfra\.net/iiif/)(.*)(\/)(\d+),(\d+),(\d+),(\d+)')

# assume that iiif_urls refer to the same image resource
def union_of_iiif_urls(urls):
    """Return an IIIF url whose region is the bounding box of the regions in `urls`.

    The image identifier of every url is compared with that of the first url.
    On the first mismatch a message is printed and that url and all following
    ones are ignored, so the result then covers only the urls before it.

    Args:
        urls: non-empty sequence of IIIF urls that contain an 'x,y,w,h' region
            directly after the '.jpg/' of the image file name.

    Returns:
        urls[0] with its region replaced by 'left,top,width,height' of the union.

    Raises:
        ValueError: if `urls` is empty.
        AttributeError: if a url matches neither pattern (`search` returns None).
    """
    if not urls:
        raise ValueError('union_of_iiif_urls needs at least one url')

    # image identifier all urls have to share
    img_id = image_id_pattern.search(urls[0]).group(2)

    # for each url, find left, right, top, bottom
    boxes = []
    for url in urls:
        if image_id_pattern.search(url).group(2) != img_id:
            print(f'url refers to other image than {img_id}')
            print(f'\t{urls[0]}')
            print(f'\t{url}')
            break

        left, top, width, height = (int(v) for v in region_pattern.search(url).groups()[1:])
        boxes.append({"left": left, "right": left + width,
                      "top": top, "bottom": top + height})

    min_left = min(box['left'] for box in boxes)
    max_right = max(box['right'] for box in boxes)
    min_top = min(box['top'] for box in boxes)
    max_bottom = max(box['bottom'] for box in boxes)
    height = max_bottom - min_top
    width = max_right - min_left

    # construct iiif_url by replacing the region part of the first input url;
    # only the region after '.jpg/' is touched, never other digit groups in the url
    bounding_region_str = f"{min_left},{min_top},{width},{height}"
    return region_pattern.sub(lambda m: m.group(1) + bounding_region_str, urls[0], count=1)

In [47]:
# Merge the regions of the urls collected for page 129 into one bounding url.
union_of_iiif_urls(urls)

'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0115.jpg/2424,344,2037,3116/full/0/default.jpg'

Bovenstaande functie ga ik alleen toepassen voor regions op hetzelfde image. Veiligheidshalve is een check ingebouwd op identieke 'image_identifiers' (het deel tussen het 'server'- en het 'region'-deel van de url). Bij ongelijkheid print de functie een melding en slaat ze de resterende urls over; er wordt (nog) geen exception gegooid. --> DONE

Pas dit vervolgens toe voor page annotations en test.

In [48]:
# fetch all page annotations
pg_annots = list(asearch.get_annotations_of_type('page', all_annotations, resource_id))

for pa in pg_annots:
    # per page, fetch all overlapping text_regions
    overlapping_regions = list(asearch.get_annotations_of_type_overlapping('text_region', \
                            pa['begin_anchor'], pa['end_anchor'], all_annotations, resource_id))
    
    # collect their iiif_urls and merge them into a single bounding url
    # NOTE(review): the cell output reports text_regions of a neighbouring scan for
    # some pages; union_of_iiif_urls stops at the first such url, so the resulting
    # bounding box may not cover the whole page -- confirm whether that is acceptable.
    urls = [tr['metadata']['iiif_url'] for tr in overlapping_regions]
    bounding_url = union_of_iiif_urls(urls)
    region_links = [bounding_url]
    
    # stored on the annotation dict itself
    pa['region_links'] = region_links

pg_annots

url refers to other image than NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0123.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0123.jpg/274,341,1102,3104/full/0/default.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0122.jpg/2426,2369,2017,1092/full/0/default.jpg
url refers to other image than NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0143.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0143.jpg/350,412,1105,3085/full/0/default.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0142.jpg/2495,2331,1979,1076/full/0/default.jpg
url refers to other image than NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0143.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0143.jpg/2453,323,1128,3115/full/0/default.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0142.jpg/2495,2331,1979,1076/full/0/default.jpg
url refers to o

url refers to other image than NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0354.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0354.jpg/291,302,1117,2296/full/0/default.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0355.jpg/260,328,1129,3077/full/0/default.jpg
url refers to other image than NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0354.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0354.jpg/2440,299,1110,3135/full/0/default.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0355.jpg/260,328,1129,3077/full/0/default.jpg
url refers to other image than NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0355.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0355.jpg/260,328,1129,3077/full/0/default.jpg
	https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0354.jpg/307,2388,1097,979/full/0/default.jpg
url refers to other i

[{'resource_id': 'volume-1728',
  'label': 'page',
  'begin_anchor': 0,
  'end_anchor': 27,
  'id': 'NL-HaNA_1.01.02_3783_0051-page-101',
  'metadata': {'page_id': 'NL-HaNA_1.01.02_3783_0051-page-101',
   'scan_id': 'NL-HaNA_1.01.02_3783_0051'},
  'coords': [[2745, 3144],
   [2800, 2014],
   [2938, 1604],
   [2958, 1584],
   [3032, 1582],
   [3268, 1583],
   [3417, 1584],
   [3445, 1585],
   [3460, 1595],
   [3644, 2092],
   [3644, 2134],
   [3642, 2235],
   [3639, 2280],
   [3238, 3186],
   [2894, 3186],
   [2745, 3170]],
  'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/2645,725,1727,2561/full/0/default.jpg']},
 {'resource_id': 'volume-1728',
  'label': 'page',
  'begin_anchor': 28,
  'end_anchor': 143,
  'id': 'NL-HaNA_1.01.02_3783_0052-page-102',
  'metadata': {'page_id': 'NL-HaNA_1.01.02_3783_0052-page-102',
   'scan_id': 'NL-HaNA_1.01.02_3783_0052'},
  'coords': [[508, 3249],
   [508, 3353],
   [548, 3370],
   [914, 3371],
   

In [49]:
# fetch all session annotations
s_annots = list(asearch.get_annotations_of_type('session', all_annotations, resource_id))

for s in s_annots:
    # text_regions overlapping this session, ordered by ascending begin_anchor
    overlapping_regions = sorted(
        asearch.get_annotations_of_type_overlapping(
            'text_region', s['begin_anchor'], s['end_anchor'], all_annotations, resource_id),
        key=lambda r_ann: r_ann['begin_anchor'])

    # their iiif_urls, in that order, become the session's 'region_links'
    urls = [tr['metadata']['iiif_url'] for tr in overlapping_regions]
    s['region_links'] = urls

s_annots

[{'resource_id': 'volume-1728',
  'label': 'session',
  'coords': None,
  'metadata': {'id': 'session-1728-01-02-num-1',
   'type': 'session',
   'inventory_num': 3783,
   'session_date': '1728-01-02',
   'session_year': 1728,
   'session_month': 1,
   'session_day': 2,
   'session_weekday': 'Veneris',
   'date_shift_status': 'normal',
   'session_num': 1,
   'president': None,
   'attendants_list_id': None,
   'resolution_ids': [],
   'is_workday': True,
   'has_session_date_element': True,
   'lines_include_rest_day': False,
   'text_page_num': [1, 2, 3, 4, 5, 6],
   'index_timestamp': '2022-01-10T14:33:04.522221'},
  'id': 'session-1728-01-02-num-1',
  'begin_anchor': 0,
  'evidence': [{'type': 'PhraseMatch',
    'phrase': 'Jovis den 1 Januarii',
    'variant': 'Jovis den 1 Januarii',
    'string': 'Jovis den 1. Januarii',
    'offset': 0,
    'label': 'session_date',
    'ignorecase': False,
    'text_id': 'NL-HaNA_1.01.02_3783_0051-line-2938-1582-522-64',
    'match_scores': {'cha

In [50]:
# resolutie region_links, op basis van lines.

def get_bounding_box_for_coords(coords):
    """Return the axis-aligned bounding box of a sequence of [x, y] points.

    The result is a dict with 'left', 'top', 'right', 'bottom' (extreme x and
    y values) plus 'width' (right - left) and 'height' (bottom - top).
    An empty `coords` makes min() raise ValueError.
    """
    xs = [point[0] for point in coords]
    ys = [point[1] for point in coords]

    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)

    return dict(
        left=left,
        top=top,
        right=right,
        bottom=bottom,
        width=right - left,
        height=bottom - top,
    )

def get_bounding_box_for_annots(annotations):
    """Return the bounding box enclosing the 'coords' of all given annotations.

    Annotations without usable coords are ignored: a missing 'coords' key, but
    also 'coords' that is None or empty (session annotations carry
    'coords': None). When no annotation contributes a box, all values are 0.

    Returns:
        dict with 'left', 'top', 'right', 'bottom', 'width' and 'height'.
    """
    ann_bounds = [get_bounding_box_for_coords(a['coords'])
                  for a in annotations if a.get('coords')]

    if ann_bounds:
        min_left = min(ab['left'] for ab in ann_bounds)
        max_right = max(ab['right'] for ab in ann_bounds)
        min_top = min(ab['top'] for ab in ann_bounds)
        max_bottom = max(ab['bottom'] for ab in ann_bounds)
    else:
        # nothing to enclose: degenerate box at the origin
        min_left = max_right = min_top = max_bottom = 0

    return {
        'left': min_left,
        'top': min_top,
        'right': max_right,
        'bottom': max_bottom,
        'width': max_right - min_left,
        'height': max_bottom - min_top
    }
    
# fetch all resolutions
r_annots = list(asearch.get_annotations_of_type('resolution', all_annotations, resource_id))

# exploratory run: only the first 15 resolutions are processed, results are printed
for num, res in enumerate(r_annots[:15]):
    print('\n')
    print(res['id'])
    
    # for each resolution, fetch the overlapping text_regions, ordered by begin_anchor
    overlapping_regions = list(asearch.get_annotations_of_type_overlapping('text_region', \
                            res['begin_anchor'], res['end_anchor'], all_annotations, resource_id))   
    overlapping_regions.sort(key=lambda reg_ann: reg_ann['begin_anchor'])
    print(f"num of overlapping_regions: {len(overlapping_regions)}")

    lines_in_resolution = list(asearch.get_annotations_of_type_overlapping('line', \
                            res['begin_anchor'], res['end_anchor'], all_annotations, resource_id))
    
    # per text_region, determine the bounding box of the lines that also overlap the RESOLUTION
    for tr in overlapping_regions:
        lines_in_region = list(asearch.get_annotations_of_type_overlapping('line', \
                            tr['begin_anchor'], tr['end_anchor'], all_annotations, resource_id))
        
        # lines that are in both the resolution and this text_region
        lines_in_intersection = [l for l in lines_in_resolution if l in lines_in_region]
        
        # determine box enclosing line boxes
        print(f"num of lines_in_intersection: {len(lines_in_intersection)}")
        bb = get_bounding_box_for_annots(lines_in_intersection)
        
        # convert to an iiif_url by substituting the region in the text_region's url;
        # the url is only printed here, nothing is stored on the resolution
        bb_str = f"{bb['left']},{bb['top']},{bb['width']},{bb['height']}"
        region_url = re.sub(r'(\d+),(\d+),(\d+),(\d+)', rf'{bb_str}', tr['metadata']['iiif_url'])
        print(region_url)
        
        # debugging aid: dump the lines for the resolution at index 9
        if num == 9:
            print(lines_in_intersection)



session-1728-01-02-num-1-resolution-1
num of overlapping_regions: 1
num of lines_in_intersection: 26
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/515,454,907,1269/full/0/default.jpg


session-1728-01-02-num-1-resolution-2
num of overlapping_regions: 1
num of lines_in_intersection: 23
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/510,1753,912,1135/full/0/default.jpg


session-1728-01-02-num-1-resolution-3
num of overlapping_regions: 2
num of lines_in_intersection: 10
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/508,2920,909,451/full/0/default.jpg
num of lines_in_intersection: 2
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/1440,464,900,106/full/0/default.jpg


session-1728-01-02-num-1-resolution-4
num of overlapping_regions: 1
num of lines_in_intersection: 10
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_

[{'resource_id': 'volume-1728', 'label': 'line', 'coords': [[1445, 799], [1465, 779], [1664, 783], [1759, 776], [1769, 786], [1938, 782], [2000, 790], [2283, 791], [2296, 802], [2326, 809], [2326, 839], [2321, 833], [2227, 836], [2214, 846], [2202, 836], [1935, 833], [1909, 842], [1897, 832], [1863, 831], [1838, 841], [1826, 830], [1752, 828], [1698, 838], [1677, 826], [1542, 823], [1493, 872], [1455, 872], [1445, 862]], 'metadata': {'type': 'line', 'parent_type': 'column', 'parent_id': 'NL-HaNA_1.01.02_3783_0053-column-1445-492-881-2887', 'text_region_id': 'NL-HaNA_1.01.02_3783_0053-text_region-1445-492-881-2887', 'extra_id': 'NL-HaNA_1.01.02_3783_0053-text_region-1445-492-881-2887', 'scan_id': 'NL-HaNA_1.01.02_3783_0053', 'column_id': 'NL-HaNA_1.01.02_3783_0053-column-1445-492-881-2887', 'page_id': 'NL-HaNA_1.01.02_3783_0053-page-104', 'id': 'NL-HaNA_1.01.02_3783_0053-line-1445-776-881-96'}, 'id': 'NL-HaNA_1.01.02_3783_0053-line-1445-776-881-96', 'begin_anchor': 326, 'baseline': [[14

num of overlapping_regions: 2
num of lines_in_intersection: 38
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/2746,1525,912,1859/full/0/default.jpg
num of lines_in_intersection: 9
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/3681,466,910,455/full/0/default.jpg


session-1728-01-02-num-1-resolution-13
num of overlapping_regions: 2
num of lines_in_intersection: 50
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/3664,953,924,2431/full/0/default.jpg
num of lines_in_intersection: 25
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0054.jpg/532,438,931,1274/full/0/default.jpg


session-1728-01-03-num-1-resolution-1
num of overlapping_regions: 1
num of lines_in_intersection: 5
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0054.jpg/523,2850,902,258/full/0/default.jpg


session-1728-01-03-num-1-resolution-2
num of overlapping_regi

Probeer een nieuwe strategie, die waarschijnlijk efficiënter is (want ze bespaart coord->bounding box transformaties) en die tussentijds beter te evalueren is: bereken eerst iiif_urls voor alle lines, en bepaal daarna voor alle line-based regions de iiif_url m.b.v. de functie union_of_iiif_urls.

In [51]:
# fixed parts of the iiif urls built below: server prefix and the suffix after the region
iiif_base = 'https://images.diginfra.net/iiif/'
iiif_extension = '/full/0/default.jpg'

# fetch all lines
line_annots = list(asearch.get_annotations_of_type('line', all_annotations, resource_id))

# add iiif region_links to all line annotations
for line in line_annots:
    coords = line['coords']
    bb = get_bounding_box_for_coords(coords)
    bb_str = f"{bb['left']},{bb['top']},{bb['width']},{bb['height']}"
    scan_id = line['metadata']['scan_id']
    # the image path is derived from the '_'-separated parts of the scan id:
    # '<part0>_<part1>/<part2>/<scan_id>.jpg'
    items = scan_id.split('_')
    
    region_url = f"{iiif_base}{items[0]}_{items[1]}/{items[2]}/{scan_id}.jpg/{bb_str}{iiif_extension}"    
    region_links = [region_url]    
    line['region_links'] = region_links

STATUS: iiif_urls toegevoegd aan alle lines. Bepaal nu daaruit de region_links voor resolutions, attendance_lists, etc

In [52]:
# annotation types whose region_links are derived from the region_links of their lines
# NOTE(review): 'session' is included, so the 'region_links' assigned to sessions in an
# earlier cell are replaced here -- confirm this is intended.
line_based_types = ['session', 'republic_paragraph', 'resolution', 'attendance_list', 'attendant']

# process all annotations, one annotation type at a time
for ann_type in line_based_types:
    print(f"Starting with annotation type {ann_type}")
    annots = list(asearch.get_annotations_of_type(ann_type, all_annotations, resource_id))

    for num, ann in enumerate(annots):
        ann_region_links = []
    
        # for each annotation, fetch the overlapping text_regions, ordered by begin_anchor
        overlapping_regions = list(asearch.get_annotations_of_type_overlapping('text_region', \
                                ann['begin_anchor'], ann['end_anchor'], all_annotations, resource_id))   
        overlapping_regions.sort(key=lambda reg_ann: reg_ann['begin_anchor'])

        lines_in_annotation = list(asearch.get_annotations_of_type_overlapping('line', \
                                ann['begin_anchor'], ann['end_anchor'], all_annotations, resource_id))
    
        # per text_region, determine the bounding region of the lines that also overlap the ANNOTATION
        for tr in overlapping_regions:
            lines_in_region = list(asearch.get_annotations_of_type_overlapping('line', \
                                tr['begin_anchor'], tr['end_anchor'], all_annotations, resource_id))
        
            # lines that are in both the annotation and this text_region
            lines_in_intersection = [l for l in lines_in_annotation if l in lines_in_region]
        
            # determine iiif url region enclosing the line boxes, assume each line has only one url
            urls = [l['region_links'][0] for l in lines_in_intersection]
            region_url = union_of_iiif_urls(urls)
            ann_region_links.append(region_url)
        
            # generate output to report potential issues with layout of text_regions
            # (the width checked is that of the merged line region computed above, not of
            # the text_region's own box; 1000 is presumably a pixel threshold -- TODO confirm)
            region_string = region_pattern.search(region_url) 
            width = int(region_string.group(4))        
            if width > 1000:
                print(f"potential error in layout, width of text_region {tr['id']} too large: {width}")
        # one url per overlapping text_region, in begin_anchor order
        ann['region_links'] = ann_region_links
        if num % 100 == 0:
            print(f"{num} annotations of type {ann_type} processed")

Starting with annotation type session
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0053-text_region-532-489-1794-2907 too large: 1794
0 annotations of type session processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0068-text_region-413-435-1827-2909 too large: 1827
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0080-text_region-455-417-1829-2904 too large: 1829
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0087-text_region-2660-1892-1796-1474 too large: 1796
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0091-text_region-2663-446-1816-2905 too large: 1816
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0092-text_region-2678-780-1833-2573 too large: 1833
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0097-text_region-477-438-1836-2911 too large: 1836
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0119-text

potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0097-text_region-477-438-1836-2911 too large: 1836
500 annotations of type republic_paragraph processed
600 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0119-text_region-2511-457-1825-2902 too large: 1825
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0122-column-2524-458-1821-2894 too large: 1817
700 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0122-column-2524-458-1821-2894 too large: 1817
800 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0142-column-2549-418-1809-2902 too large: 1779
900 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0142-column-2549-418-1809-2902 too large: 1779
1000 annotations of type repub

2400 annotations of type republic_paragraph processed
2500 annotations of type republic_paragraph processed
2600 annotations of type republic_paragraph processed
2700 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0342-text_region-2484-421-1145-2924 too large: 1145
2800 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0353-text_region-363-469-1824-2931 too large: 1824
2900 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0357-text_region-350-452-1851-2914 too large: 1825
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0363-text_region-379-480-1824-2914 too large: 1824
3000 annotations of type republic_paragraph processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0377-text_region-449-434-1816-2914 too large: 1816
3100 annotations of t

potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0161-column-335-390-1834-2945 too large: 1815
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0161-column-335-390-1834-2945 too large: 1810
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0161-column-335-390-1834-2945 too large: 1831
800 annotations of type resolution processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0173-column-2557-421-1799-2918 too large: 1813
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0173-column-2557-421-1799-2918 too large: 1811
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0173-column-2557-421-1799-2918 too large: 1845
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0173-column-2557-421-1799-2918 too large: 1811
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0173-column-2557-421-1799-2918 too large: 1845
potential error in layout, wid

3300 annotations of type resolution processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0525-text_region-2436-505-1795-2906 too large: 1795
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0524-column-2411-470-2063-2895 too large: 1810
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0525-column-2447-498-902-2894 too large: 1795
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0525-column-2447-498-902-2894 too large: 1795
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0530-column-388-410-1899-2893 too large: 1805
3400 annotations of type resolution processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0530-column-388-410-1899-2893 too large: 1805
Starting with annotation type attendance_list
0 annotations of type attendance_list processed
potential error in layout, width of text_region NL-HaNA_1.01.02_3783_0122-column-2524-458-1821-2894 too large: 1817
p

Overblijvend: text_regions

In [53]:
# Gather every text_region annotation and store the IIIF URL found in its
# metadata as a single-element 'region_links' list on the annotation itself.
region_annots = list(asearch.get_annotations_of_type('text_region', all_annotations, resource_id))
for region in region_annots:
    region.update(region_links=[region['metadata']['iiif_url']])

# Print the first ten annotations for a quick visual check.
for preview in region_annots[:10]:
    print(preview)

{'resource_id': 'volume-1728', 'label': 'text_region', 'coords': [[2745, 3144], [2800, 2014], [2938, 1604], [2958, 1584], [3032, 1582], [3268, 1583], [3417, 1584], [3445, 1585], [3460, 1595], [3644, 2092], [3644, 2134], [3642, 2235], [3639, 2280], [3238, 3186], [2894, 3186], [2745, 3170]], 'metadata': {'page_id': 'NL-HaNA_1.01.02_3783_0051-page-101', 'scan_id': 'NL-HaNA_1.01.02_3783_0051', 'type': ['column', 'pagexml_doc', 'text_region'], 'id': 'NL-HaNA_1.01.02_3783_0051-text_region-2745-1582-899-1604', 'median_normal_left': 3667, 'median_normal_right': 4272, 'median_normal_width': 605, 'median_normal_length': 30, 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/2645,1482,1099,1804/full/0/default.jpg', 'parent_type': 'page', 'parent_id': 'NL-HaNA_1.01.02_3783_0051-page-101', 'page_num': 101, 'text_page_num': 1}, 'id': 'NL-HaNA_1.01.02_3783_0051-text_region-2745-1582-899-1604', 'begin_anchor': 0, 'end_anchor': 17, 'region_links': ['https:/

In [54]:
# Print every annotation that has no 'region_links' key yet, followed by
# the number of such annotations.
missing_links = [annot for annot in all_annotations if 'region_links' not in annot]
for annot in missing_links:
    print(annot)

num = len(missing_links)
print(num)

{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/full/,1804/0/default.jpg', 'begin_anchor': 0, 'end_anchor': 27, 'id': 'NL-HaNA_1.01.02_3783_0051'}
{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/full/,3117/0/default.jpg', 'begin_anchor': 28, 'end_anchor': 261, 'id': 'NL-HaNA_1.01.02_3783_0052'}
{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/full/,3056/0/default.jpg', 'begin_anchor': 262, 'end_anchor': 497, 'id': 'NL-HaNA_1.01.02_3783_0053'}
{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0054.jpg/full/,1474/0/default.jpg', 'begin_anchor': 498, 'end_anchor': 723, 'id': 'NL-HaNA_1.01.02_3783_0054'}
{'resource_id': 

In [56]:
# Materialise the scan annotations and display how many there are.
scan_annots = [*asearch.get_annotations_of_type('scan', all_annotations, resource_id)]
len(scan_annots)

479

In [57]:
# Gather every scan annotation and store its own 'iiif_url' as a
# single-element 'region_links' list on the annotation itself.
scan_annots = list(asearch.get_annotations_of_type('scan', all_annotations, resource_id))
for scan in scan_annots:
    scan.update(region_links=[scan['iiif_url']])

# Print the first ten annotations for a quick visual check.
for preview in scan_annots[:10]:
    print(preview)

{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/full/,1804/0/default.jpg', 'begin_anchor': 0, 'end_anchor': 27, 'id': 'NL-HaNA_1.01.02_3783_0051', 'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/full/,1804/0/default.jpg']}
{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/full/,3117/0/default.jpg', 'begin_anchor': 28, 'end_anchor': 261, 'id': 'NL-HaNA_1.01.02_3783_0052', 'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0052.jpg/full/,3117/0/default.jpg']}
{'resource_id': 'volume-1728', 'label': 'scan', 'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0053.jpg/full/,3056/0/default.jpg', 'begin_anchor': 262, 'end_anchor': 497, 'id': 'NL-HaNA_1.01.02_3783_0053', 're

In [58]:
# Re-check: print every annotation that still has no 'region_links' key,
# followed by the number of such annotations.
missing_links = [annot for annot in all_annotations if 'region_links' not in annot]
for annot in missing_links:
    print(annot)

num = len(missing_links)
print(num)

0


Sla resultaten op in json stores

In [66]:
def add_segmented_text_to_store(segmented_text, store_name):
    """Append a segmented text to a JSON text store, creating the store if needed.

    The store is a JSON object whose '_resources' key holds a list. The
    existing store (if any) is read, `segmented_text` is appended to that
    list, and the whole store is written back.

    Args:
        segmented_text: object to append; it is serialised with
            `segmentedtext.SegmentEncoder`.
        store_name: file name of the store; it is appended to the
            module-level `datadir` to form the path.

    Raises:
        Any exception raised while parsing the existing store or while
        serialising the new content. In the latter case the file on disk
        is left untouched.
    """
    store_path = datadir + store_name

    try:
        with open(store_path, 'r', encoding='utf-8') as filehandle:
            data = json.load(filehandle)
    except FileNotFoundError:
        # No store yet: start with an empty resource list.
        data = {'_resources': []}

    data['_resources'].append(segmented_text)

    # Serialise completely *before* opening the file for writing. Mode 'w'
    # truncates immediately, so an encoder error raised halfway through a
    # streaming json.dump would otherwise destroy the existing store.
    serialised = json.dumps(data, indent=4, cls=segmentedtext.SegmentEncoder)

    with open(store_path, 'w', encoding='utf-8') as filehandle:
        filehandle.write(serialised)

In [67]:
# Persist the collected text lines in the text store file.
add_segmented_text_to_store(segmented_text=all_textlines, store_name=text_store)

In [70]:
def add_annotations_to_store(annotations, store_name):
    """Append annotations to a JSON annotation store, creating the store if needed.

    The store is a JSON list. The existing store (if any) is read, the
    given annotations are appended to it, and the whole list is written back.

    Args:
        annotations: iterable of annotations to append; they are serialised
            with `segmentedtext.AnchorEncoder`.
        store_name: file name of the store; it is appended to the
            module-level `datadir` to form the path.

    Raises:
        Any exception raised while parsing the existing store or while
        serialising the new content. In the latter case the file on disk
        is left untouched.
    """
    store_path = datadir + store_name

    try:
        with open(store_path, 'r', encoding='utf-8') as filehandle:
            data = json.load(filehandle)
    except FileNotFoundError:
        # No store yet: start with an empty annotation list.
        data = []

    data.extend(annotations)

    # Serialise completely *before* opening the file for writing. Mode 'w'
    # truncates immediately, so an encoder error raised halfway through a
    # streaming json.dump would otherwise destroy the existing store.
    serialised = json.dumps(data, indent=4, cls=segmentedtext.AnchorEncoder)

    with open(store_path, 'w', encoding='utf-8') as filehandle:
        filehandle.write(serialised)

In [71]:
# Persist the collected annotations in the annotation store file.
add_annotations_to_store(annotations=all_annotations, store_name=annotation_store)