In [1]:
import sys
sys.path.append('../../packages')

from textservice import segmentedtext
from annotation import asearch

datadir = '../data/'
text_store = '1728-06-19-textstore.json'
annotation_store = '1728-06-19-annotationstore.json'
resource_id = 'volume-1728'

In [2]:
import json
import glob
import re

# read files

all_textlines=segmentedtext.IndexedSegmentedText(resource_id)
all_annotations=[]

# We want to load 'text containers' that contain more or less contiguous text and are as long as practically
# possible. Container size is determined by pragmatic reasons, e.g. technical (performance) or user driven
# (corresponding with all scans in a book or volume). This functions returns all component files IN TEXT ORDER.
# Examples: sorted list of files, part of IIIF manifest.

def get_file_sequence_for_container(text_container):
    path = "../../data/1728/CAF-sessions-1728/meeting-*-session*.json"
    session_file_names = (f for f in glob.glob(path))
    return sorted(session_file_names)

# Many file types contain a hierarchy of ordered text and/or annotation elements of different types. Some form of
# depth-first, post order traversal is necessary. Examples: processing a json hierarchy with dictionaries
# and lists (republic) or parsing TEI XML (DBNL document).

def traverse(node,node_label,text,annotations):
    # find the list that represents the children, each child is a dict, assume first list is the correct one
    children = []
    label_of_children = ''
    for key,val in node.items():
        if (type(val) == list):
            children = val
            label_of_children = key
            break 
    
    if 'coords' in node:
        coords = node['coords']
    else:
        coords = None
    
    begin_index = text.len()
    annotation_info = {'resource_id': resource_id, 'label' : node_label,'image_coords': coords,'begin_anchor' : begin_index}
    if len(children) == 0:        # if no children, do your 'leaf node thing'
        annotation_info['id'] = node['metadata']['id']
        annotation_info['end_anchor'] = text.len()
        node_text = node['text']
        
        if node_text is None:
            node_text = '\n'

        text.append(node_text)
    else:                         # if non-leaf node, first visit children     
        for child in children:
            traverse(child,label_of_children,text,annotations)
        
        # ONDERSTAANDE IS SMERIG, hangt van onzekere aannames af
        for k in node['metadata'].keys():
            idkey = ''
            if k.endswith('id'):
                idkey = k
                break
        annotation_info['id'] = node['metadata'][idkey]
        
        end_index = text.len()-1
        annotation_info['end_anchor'] = end_index    # after child text segments are added
        
        if node_label == 'sessions':
            annotation_info['session_date'] = node['metadata']['meeting_date']
            annotation_info['session_year'] = node['metadata']['meeting_year']
            annotation_info['session_weekday'] = node['metadata']['meeting_weekday']
            annotation_info['president'] = node['metadata']['president']
        
        # if node contains iiif_url, create extra annotation_info for 'scanpage'
        if 'iiif_url' in node['metadata']:
            scan_annot_info = {'resource_id': resource_id, 'label':'scanpage','iiif_url':node['metadata']['iiif_url'],\
                               'begin_anchor':begin_index,'end_anchor':end_index}
            scan_annot_info['scan_id'] = node['metadata']['scan_id']
            annotations.append(scan_annot_info)     
        
    annotations.append(annotation_info)
    return

# In case of presence of a hierarchical structure, processing/traversal typically starts from a root element.

def get_root_tree_element(file):
    with open(file, 'r') as myfile:
        session_file=myfile.read() 
        
    session_data = json.loads(session_file)      
    return session_data['_source']

# Rudimentary version of a scanpage_handler

def deduplicate_scanpage_annotations(a_array):
    # use a generator to create a list of only scanpage annotation_info dicts
    scan_page_annots = [ann_info for ann_info in a_array if ann_info['label'] == 'scanpage']
        
    # use groupBy on a list of dicts (zie Python cookbook 1.15)
    from operator import itemgetter
    from itertools import groupby

    # first sort on scan_num
    scan_page_annots.sort(key=itemgetter('scan_id'))

    # iterate in groups
    aggregated_scan_annots = []

    for scan_id, items in groupby(scan_page_annots, key=itemgetter('scan_id')):
        # first, convert the 'items' iterator to a list, to able to use it twice (iterators can be used once)
        itemlist = list(items)
    
        # copy the item with the lowest begin_index
        aggr_scan_annot = min(itemlist, key=itemgetter('begin_anchor')).copy()
    
        # replace 'end_anchor' with the highest end_index in the group
        max_end_index = max(itemlist, key=itemgetter('end_anchor'))['end_anchor']
        aggr_scan_annot['end_anchor'] = max_end_index
        
        # add to result
        aggregated_scan_annots.append(aggr_scan_annot)

#    for scan_ann in aggregated_scan_annots:
#        scan_ann['iiif_url'] = re.sub(r'(\d+),(\d+),(\d+),(\d+)/(full)', r'\5/,\4', scan_ann['iiif_url'])
    
    a_array = [ann for ann in a_array if ann not in scan_page_annots]
    a_array.extend(aggregated_scan_annots)
           
    return

def correct_scanpage_imageurls(a_array):
    scan_page_annots = [ann_info for ann_info in a_array if ann_info['label'] == 'scanpage']
    
    for scan_ann in scan_page_annots:
        scan_ann['iiif_url'] = re.sub(r'(\d+),(\d+),(\d+),(\d+)/(full)', r'\5/,\4', scan_ann['iiif_url'])
        
    return

# Rudimentary version of a page_handler

def add_page_annotations(source_data, ann_array):
    page_data = source_data['page_versions']
        
    # generator
    page_identifiers = (pg['page_id'] for pg in page_data)
    page_annots = [{'resource_id': resource_id, 'label' : 'pages','id' : page_id} for page_id in page_identifiers]
        
    for pa in page_annots:
        scan_num = int(re.search(r'(\d+)-page-', pa['id']).group(1))
        scanpage_for_scan_num = [ai for ai in annotation_array if 'scan_num' in ai.keys() and ai['scan_num'] == \
                                 scan_num]
        pa['begin_anchor'] = scanpage_for_scan_num[0]['begin_anchor']
        pa['end_anchor'] = scanpage_for_scan_num[0]['end_anchor']
        pa['indexesByContainment'] = True
     
    ann_array.extend(page_annots)
    return

# Process per file, properly concatenate results, maintaining proper referencing the baseline text elements
for f_name in get_file_sequence_for_container(resource_id):
    text_array = segmentedtext.IndexedSegmentedText()
    annotation_array = []
            
    source_data = get_root_tree_element(f_name)

    traverse(source_data,'sessions',text_array,annotation_array)
        
    scanpages = deduplicate_scanpage_annotations(annotation_array) 
    correct_scanpage_imageurls(annotation_array)
    
    # add_page_annotations(source_data, annotation_array)
           
    # properly concatenate annotation info taking ongoing line indexes into account
    for ai in annotation_array:
        ai['begin_anchor'] += all_textlines.len()
        ai['end_anchor'] += all_textlines.len()
    
    all_textlines.extend(text_array)
    
    if f_name == '../../data/1728/CAF-sessions-1728/meeting-1728-06-19-session-1.json':
        all_annotations.extend(annotation_array)

In [3]:
all_textlines.len()

114074

In [4]:
len(all_annotations)

1445

In [5]:
resolution_annotations=[]

def get_resolution_files_for_container(text_container):
    path = "../../data/1728/CAF-resolutions-1728/1728-06-19-resolutions.json"
    resolution_file_names = (f for f in glob.glob(path))
    return sorted(resolution_file_names)

def res_traverse(node, line_ids):
    # find the list that represents the children, each child is a dict, assume first list is the correct one
    children = []
    label_of_children = ''
    
    # assume, first list in dict are the children
    for key,val in node.items():
        # HACK ALERT! assumption that first list contains children has exceptions
        if type(val) == list and key != 'paragraphs' and key != 'evidence':
            children = val
            label_of_children = key
            break 
    
    if len(children) == 0:        # if no children, do your 'leaf node thing'
        line_ids.append(node['metadata']['id'])
    else:                         # if non-leaf node, first visit children     
        for child in children:
            res_traverse(child,line_ids)
                            
    return

# In case of presence of a hierarchical structure, processing/traversal typically starts from a root element.

def get_res_root_element(file):
    with open(file, 'r') as myfile:
        resolution_file=myfile.read() 
        
    resolution_data = json.loads(resolution_file)      
    return resolution_data['hits']['hits']

for f_name in get_resolution_files_for_container(resource_id):    
    # get list of resolution 'hits'
    hits = get_res_root_element(f_name)
    for hit in hits:
        # each hit corresponds with a resolution
        resolution_line_ids = []
        res_traverse(hit['_source'],resolution_line_ids)
        
        resolution_info = {'resource_id': resource_id, 'label' : 'resolutions','begin_anchor' : resolution_line_ids[0], \
                                      'end_anchor': resolution_line_ids[len(resolution_line_ids)-1], 'id': hit['_id']}
        resolution_info['proposition_type'] = hit['_source']['metadata']['proposition_type']
        resolution_annotations.append(resolution_info)

In [6]:
len(resolution_annotations)

10

In [7]:
resolution_annotations[:3]

[{'resource_id': 'volume-1728',
  'label': 'resolutions',
  'begin_anchor': 'NL-HaNA_1.01.02_3783_0286-page-571-column-1-tr-1-line-0',
  'end_anchor': 'NL-HaNA_1.01.02_3783_0286-page-571-column-1-tr-1-line-11',
  'id': 'meeting-1728-06-19-session-1-resolution-14',
  'proposition_type': 'missive'},
 {'resource_id': 'volume-1728',
  'label': 'resolutions',
  'begin_anchor': 'NL-HaNA_1.01.02_3783_0286-page-571-column-1-tr-2-line-0',
  'end_anchor': 'NL-HaNA_1.01.02_3783_0287-page-572-column-0-tr-0-line-23',
  'id': 'meeting-1728-06-19-session-1-resolution-15',
  'proposition_type': 'missive'},
 {'resource_id': 'volume-1728',
  'label': 'resolutions',
  'begin_anchor': 'NL-HaNA_1.01.02_3783_0287-page-572-column-0-tr-1-line-0',
  'end_anchor': 'NL-HaNA_1.01.02_3783_0287-page-572-column-0-tr-1-line-22',
  'id': 'meeting-1728-06-19-session-1-resolution-16',
  'proposition_type': 'requeste'}]

In [8]:
line_ids_vs_indexes = {}
for line in all_annotations:
    if line['label'] == 'lines':
        line_ids_vs_indexes.update({line['id'] : line['begin_anchor']})

In [9]:
len(line_ids_vs_indexes)

708

In [10]:
num_errors = 0
for res in resolution_annotations:    
    try:
        # res['begin_anchor'] = re.sub(r'-column-', r'-col-', res['begin_anchor'])
        # res['end_anchor'] = re.sub(r'-column-', r'-col-', res['end_anchor'])
        res['begin_anchor'] = line_ids_vs_indexes[res['begin_anchor']]
        res['end_anchor'] = line_ids_vs_indexes[res['end_anchor']]
    except:
        res['begin_anchor'] = 0
        res['end_anchor'] = 0
        num_errors += 1
        
if num_errors > 0:
    print(f"number of lookup errors: {num_errors}")

In [11]:
resolution_annotations[:3]

[{'resource_id': 'volume-1728',
  'label': 'resolutions',
  'begin_anchor': 56314,
  'end_anchor': 56325,
  'id': 'meeting-1728-06-19-session-1-resolution-14',
  'proposition_type': 'missive'},
 {'resource_id': 'volume-1728',
  'label': 'resolutions',
  'begin_anchor': 56326,
  'end_anchor': 56354,
  'id': 'meeting-1728-06-19-session-1-resolution-15',
  'proposition_type': 'missive'},
 {'resource_id': 'volume-1728',
  'label': 'resolutions',
  'begin_anchor': 56355,
  'end_anchor': 56377,
  'id': 'meeting-1728-06-19-session-1-resolution-16',
  'proposition_type': 'requeste'}]

In [12]:
all_annotations.extend(resolution_annotations)

In [13]:
len(all_annotations)

1455

In [14]:
sessions = asearch.get_annotations_of_type('sessions', all_annotations, resource_id) 
for s in sessions:
    print(s)

{'resource_id': 'volume-1728', 'label': 'sessions', 'image_coords': None, 'begin_anchor': 55952, 'id': 'meeting-1728-06-19-session-1', 'end_anchor': 56659, 'session_date': '1728-06-19', 'session_year': 1728, 'session_weekday': 'Sabbathi', 'president': ''}


Reconstrueer attendant_lists uit sessions en resolutions

Lees 1728_pres.json in uit Rik's bestand

In [23]:
with open('../../data/1728/CAF-json/1728_pres.json', 'r') as filehandle:
    pres_data = json.loads(filehandle.read())
    
pres_data

[{'metadata': {'inventory_num': 3783,
   'meeting_lines': [],
   'coords': [],
   'text': '',
   'zittingsdag_id': 'meeting-1728-01-01-session-1',
   'url': None},
  'spans': []},
 {'metadata': {'inventory_num': 3783,
   'meeting_lines': ['Jovis den 1, Januarii',
    '1728',
    'Zynde Nieuwejaarsdagh.',
    'Nibil attum eft.',
    'Veneris den 2. Januari',
    '1728.',
    'PR&ESIDE,',
    'Den Heere #an HWalenaer.',
    'PRA&SENTIBUS,',
    'De Heeren Van Singendonck , van Heecke',
    'ren tot Barlham , van Heeckeren tot den',
    'Brandtzenborgh, van Wynbergen, Torck,',
    'Umbgroeven , met een extraordinaris Ge-',
    'deputeerde uyt de Provincie van Gelder',
    'landt.',
    'Six, Vander Dufen , Bers van Waveren,',
    'van Mar(eveen, de Raadt.',
    'Valters, Noey, van Hoorn , met een extra',
    'ordinaris Gedeputeerde uyt de Provincie van',
    'Zeeland.',
    'Pan Renswoude.',
    'Van Schwartzenbergh, Fegilin.',
    'Van Haar[olte.',
    'Van Tamminga.',
    '2E, Keoutien 

In [24]:
att_lists = []

for a in asearch.get_annotations_of_type('sessions', all_annotations):
    attendants_list = {'resource_id': resource_id, 'label': 'attendantslists'}
    attendants_list['begin_anchor'] = a['begin_anchor']
    
    # approximate end of attendants_list by taking begin of first resolution in the session
    overlapping_resolutions = asearch.get_annotations_of_type_overlapping('resolutions',\
                                a['begin_anchor'], a['end_anchor'], all_annotations, resource_id)    
    reslist = list(overlapping_resolutions)
    
    # remove resolutions that are not aligned with text lines
    reslist = [r for r in reslist if r['end_anchor'] != 0]
    
    if len(reslist) == 0:
        attendants_list['end_anchor'] = a['end_anchor']
    else:
        attendants_list['end_anchor'] = min([r['begin_anchor'] for r in reslist]) - 1
    
    attendants_list['id'] = a['id'] + '-attendantslist-1'
    attendants_list['session_id'] = a['id']
    att_lists.append(attendants_list)  

Doorloop  pres_data per zittingsdag. 1) vul attendents_list voor die zittingsdag aan met metadata 2) maak attendants voor iedere aangetroffen span. Reken offset + end om naar line_id(s)

In [25]:
def get_lines_for_offset(offset, end, meeting_lines, offset_in_resource):
    result = []
    line_start = 0
    for line_index, line_text in enumerate(meeting_lines):
        line_end = line_start + len(line_text)+1
        if (offset >= line_start and offset < line_end) or\
            (end >= line_start and end < line_end) or\
            (offset <= line_start and end >= line_end):
            
            result.append(line_index)
        line_start = line_end
    result = [line_index + offset_in_resource for line_index in result]
    return result

In [26]:
def create_attendants_for_attlist(attlist, att_list_annot, session_id):
    attendant_annots = []
    spans = attlist['spans']
    for index, s in enumerate(spans):
        attendant = {'resource_id': resource_id, 'label': 'attendants'}
        attendant['id'] = session_id + '-attendant-' + str(index)
        attendant['metadata'] = s

        overlapping_lines = get_lines_for_offset(s['offset'], s['end'], attlist['metadata']['meeting_lines'], \
                            att_list_annot['begin_anchor'])

        attendant['begin_anchor'] = min(overlapping_lines)
        attendant['end_anchor'] = max(overlapping_lines)
        
        attendant_annots.append(attendant)
    return attendant_annots

In [29]:
attendant_annots = []

for attlist in pres_data[1:]: # skip the first, seems to be non-existing
    session_id = attlist['metadata']['zittingsdag_id']
    if session_id == 'meeting-1728-06-19-session-1':
        sess_annot = asearch.get_annotation_by_id(session_id, all_annotations)
        att_list_annot = list(asearch.get_annotations_of_type_overlapping('attendantslists', \
                                sess_annot['begin_anchor'], sess_annot['end_anchor'], att_lists, resource_id))[0]
        results = create_attendants_for_attlist(attlist, att_list_annot, session_id)
        attendant_annots.extend(results)
    
attendant_annots[:3]

[{'resource_id': 'volume-1728',
  'label': 'attendants',
  'id': 'meeting-1728-06-19-session-1-attendant-0',
  'metadata': {'offset': 53,
   'end': 61,
   'class': 'delegate',
   'pattern': 'Glinfra.',
   'delegate_id': 13046,
   'delegate_name': 'van Glinstra',
   'delegate_score': 0.0},
  'begin_anchor': 55955,
  'end_anchor': 55955},
 {'resource_id': 'volume-1728',
  'label': 'attendants',
  'id': 'meeting-1728-06-19-session-1-attendant-1',
  'metadata': {'offset': 90,
   'end': 100,
   'class': 'delegate',
   'pattern': 'Heuckelom,',
   'delegate_id': 19778,
   'delegate_name': 'van Hoeckelom',
   'delegate_score': 0.0},
  'begin_anchor': 55957,
  'end_anchor': 55957},
 {'resource_id': 'volume-1728',
  'label': 'attendants',
  'id': 'meeting-1728-06-19-session-1-attendant-2',
  'metadata': {'offset': 101,
   'end': 109,
   'class': 'delegate',
   'pattern': 'Coulman,',
   'delegate_id': 19130,
   'delegate_name': 'Coulman',
   'delegate_score': 0.0},
  'begin_anchor': 55957,
  'end

In [30]:
len(attendant_annots)

16

In [31]:
all_annotations.extend(att_lists)
all_annotations.extend(attendant_annots)

In [32]:
len(all_annotations)

1472

In [33]:
# voor iedere annotatie, bepaal image_range en voeg deze toe

def get_bounding_box_for(annotations): 
    ann_list = list(annotations) # because a generator can only be used once
    
    min_left = min([ann['image_coords']['left'] for ann in ann_list if 'image_coords' in ann])
    max_right = max([ann['image_coords']['right'] for ann in ann_list if 'image_coords' in ann])
    min_top = min([ann['image_coords']['top'] for ann in ann_list if 'image_coords' in ann])
    max_bottom = max([ann['image_coords']['bottom'] for ann in ann_list if 'image_coords' in ann])
    height = max_bottom - min_top
    width = max_right - min_left

    return {'left': min_left, 'right': max_right, 'top': min_top, 'bottom': max_bottom, 'height': height, 'width': width}

def add_image_range(ann):
    ann['image_range'] = []
    
    ann_begin=ann['begin_anchor']
    ann_end=ann['end_anchor']
        
    # loop over scans die overlappen met de annotatie
    for a in asearch.get_annotations_of_type_overlapping('scanpage',ann_begin,ann_end,all_annotations, resource_id):
        bounding_boxes = []
        image_url = a['iiif_url']
        
        scan_begin=a['begin_anchor']
        scan_end=a['end_anchor']
        
        # loop over alle kolommen op de betreffende scan. Per kolom, bereken bounding box voor 
        # overlappende resolutieregels
        for clm in asearch.get_annotations_of_type_overlapping('columns',scan_begin,scan_end,all_annotations, resource_id):
            clm_begin=clm['begin_anchor']
            clm_end=clm['end_anchor']
            
            # bepaal overlap_begin en overlap_end indexes voor kolom
            overlap_begin=max(ann_begin, clm_begin)
            overlap_end=min(ann_end, clm_end)
                        
            # bepaal hieruit de bounding box coords voor deze kolom
            if overlap_end-overlap_begin >= 0: # resolution and column are overlapping
                bounding_box=get_bounding_box_for(asearch.get_annotations_of_type_overlapping('lines',\
                                                        overlap_begin,overlap_end,all_annotations, resource_id))
                bounding_boxes.append(bounding_box)
        
        ann['image_range'].append((image_url, bounding_boxes))
    return       

In [34]:
def add_region_links(ann):
    region_links = []
    try:
        for image_url, regions in ann['image_range']:
            for coords in regions:
                # construct iiif_url from image_url and coords
                coord_str = f"{coords['left']},{coords['top']},{coords['width']},{coords['height']}"
                region_url = re.sub(r'(full)/(,\d+)', rf'{coord_str}/\1', image_url)
                region_links.append(region_url)
    except:
        print('error: annotation without image range')
        
    ann['region_links'] = region_links
    return

In [35]:
# for a in asearch.get_annotations_of_type('attendantslists', all_annotations):
#    add_image_range(a)  
#    add_region_links(a)
    
# for a in asearch.get_annotations_of_type('resolutions', all_annotations):
#    add_image_range(a)  
#    add_region_links(a)

for a in all_annotations:
    add_image_range(a)
    add_region_links(a)

In [41]:
for a in asearch.get_annotations_of_type('lines', all_annotations):
    print(a)

{'resource_id': 'volume-1728', 'label': 'lines', 'image_coords': {'left': 1451, 'right': 1962, 'top': 2124, 'bottom': 2184, 'height': 60, 'width': 511}, 'begin_anchor': 55952, 'id': 'NL-HaNA_1.01.02_3783_0285-page-568-column-1-tr-2-line-0', 'end_anchor': 55952, 'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0285.jpg/full/,1316/0/default.jpg', [{'left': 1451, 'right': 1962, 'top': 2124, 'bottom': 2184, 'height': 60, 'width': 511}])], 'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0285.jpg/1451,2124,511,60/full/0/default.jpg']}
{'resource_id': 'volume-1728', 'label': 'lines', 'image_coords': {'left': 1645, 'right': 1787, 'top': 2183, 'bottom': 2244, 'height': 61, 'width': 142}, 'begin_anchor': 55953, 'id': 'NL-HaNA_1.01.02_3783_0285-page-568-column-1-tr-2-line-1', 'end_anchor': 55953, 'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0285.jpg/full/,1316/0

In [31]:
for a in asearch.get_annotations_of_type('resolutions', all_annotations):
    print(a)

{'resource_id': 'volume-1728', 'label': 'resolutions', 'begin_anchor': 434, 'end_anchor': 441, 'id': 'meeting-1728-01-02-session-1-resolution-1', 'proposition_type': 'missive', 'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/full/,2000/0/default.jpg', [{'left': 2814, 'right': 3628, 'top': 3223, 'bottom': 3385, 'height': 162, 'width': 814}]), ('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/full/,1949/0/default.jpg', [{'left': 3685, 'right': 4568, 'top': 1582, 'bottom': 1789, 'height': 207, 'width': 883}])], 'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/2814,3223,814,162/full/0/default.jpg', 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0051.jpg/3685,1582,883,207/full/0/default.jpg']}
{'resource_id': 'volume-1728', 'label': 'resolutions', 'begin_anchor': 442, 'end_anchor': 446, 'id': 'meeting-1728-01-02-session-1

{'resource_id': 'volume-1728', 'label': 'resolutions', 'begin_anchor': 86484, 'end_anchor': 86521, 'id': 'meeting-1728-09-18-session-1-resolution-5', 'proposition_type': None, 'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0416.jpg/full/,3107/0/default.jpg', [{'left': 2455, 'right': 3328, 'top': 3086, 'bottom': 3348, 'height': 262, 'width': 873}]), ('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0416.jpg/full/,3098/0/default.jpg', [{'left': 3324, 'right': 4224, 'top': 437, 'bottom': 1991, 'height': 1554, 'width': 900}])], 'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0416.jpg/2455,3086,873,262/full/0/default.jpg', 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3783/NL-HaNA_1.01.02_3783_0416.jpg/3324,437,900,1554/full/0/default.jpg']}
{'resource_id': 'volume-1728', 'label': 'resolutions', 'begin_anchor': 86523, 'end_anchor': 86537, 'id': 'meeting-1728-09-18-sessio

In [33]:
def add_segmented_text_to_store(segmented_text, store_name):
    try:
        with open(datadir+store_name, 'r') as filehandle:
            data = json.loads(filehandle.read())
    except FileNotFoundError:
        data = {'_resources' : []}
        
    data['_resources'].append(segmented_text)
    
    with open(datadir+store_name, 'w') as filehandle:
        json.dump(data, filehandle, indent=4, cls=segmentedtext.SegmentEncoder)     

In [34]:
add_segmented_text_to_store(all_textlines, text_store)

In [48]:
def add_annotations_to_store(annotations, store_name):
    try:
        with open('../../data/'+store_name, 'r') as filehandle:
            data = json.loads(filehandle.read())
    except FileNotFoundError:
        data = []
        
    data.extend(annotations)
    
    with open('../../data/'+store_name, 'w') as filehandle:
        json.dump(data, filehandle, indent=4, cls=segmentedtext.AnchorEncoder)   

In [49]:
add_annotations_to_store(all_annotations, annotation_store)

In [50]:
len(all_annotations)

1472

In [32]:
all_textlines.slice(1005, 1012)

['\\Ntfangen een Miflive van den Refi-',
 'J dent van Affendelft , gefchreven te Kop-',
 'penhage den {even en twintighften',
 'der voorlede maandt , geaddrefTeert aan',
 'den Griffier Fagel, houdende adverten-',
 'tie.',
 'WAAR op geen refolutie is',
 'gevallen.']

In [None]:
for a in asearch.get_annotations_of_type('columns',all_annotations, resource_id):
    print(a)