Experiment, sheet 3:
- Voeg extra image_coords toe en bijbehorende zoekvraag.
- Scheiden generiek en case-specifiek in de code. 
- Verwijs naar een resolutie of named entity m.b.v. een resolvable web annotation.
- Indexeer naar MTAS.

In [3]:
import json
import glob
import re

# Locate the session files to read.
path = "../data/sessions/meeting-1705*"
session_file_names = list(glob.glob(path))

# Accumulators filled while the session files are processed:
# one entry per text line, and one dict per annotation.
all_textlines, all_annotations = [], []

def traverse(node,node_label,text,annotations):
    """Walk a nested dict depth-first, collecting text lines and annotations.

    The children of ``node`` are taken to be its first list-valued entry; the
    key of that entry becomes the label of each child. A node without such a
    (non-empty) list is treated as a leaf.

    Args:
        node: dict for the current node. Leaves must have ``'id'`` and
            ``'text'``; non-leaves must have a ``'metadata'`` dict containing
            a key that ends in ``'id'``. ``'coords'`` is optional.
        node_label: label stored on the annotation created for ``node``.
        text: list that receives one entry per leaf (mutated in place).
        annotations: list that receives the annotation dicts (mutated in
            place). Children are appended before their parent.

    Raises:
        KeyError: if a required key is missing, or if the metadata of a
            non-leaf node has no key ending in ``'id'``.
    """
    # Find the list that represents the children; each child is a dict.
    # Assumes the first list-valued entry is the right one.
    children = []
    label_of_children = ''
    for key, val in node.items():
        if isinstance(val, list):
            children = val
            label_of_children = key
            break

    begin_index = len(text)
    annotation_info = {
        'label': node_label,
        'image_coords': node.get('coords'),  # None when the node has no coords
        'begin_index': begin_index,
    }

    if not children:
        # Leaf: contributes exactly one text line, so begin and end coincide.
        annotation_info['id'] = node['id']
        annotation_info['end_index'] = begin_index
        node_text = node['text']
        text.append('\n' if node_text is None else node_text)
    else:
        # Non-leaf: visit the children first so their text is in place.
        for child in children:
            traverse(child, label_of_children, text, annotations)

        # Fragile heuristic kept from the original: the node id is the value
        # of the first metadata key that ends in 'id'.
        metadata = node['metadata']
        id_key = next((k for k in metadata if k.endswith('id')), None)
        if id_key is None:
            raise KeyError(
                f"no metadata key ending in 'id' found for {node_label!r} node"
            )
        annotation_info['id'] = metadata[id_key]

        # Inclusive index of the last line added by the children.
        end_index = len(text) - 1
        annotation_info['end_index'] = end_index

        # A node that carries an iiif_url also yields a 'scanpage' annotation
        # covering the same index range.
        if 'iiif_url' in metadata:
            annotations.append({
                'label': 'scanpage',
                'iiif_url': metadata['iiif_url'],
                'begin_index': begin_index,
                'end_index': end_index,
                'scan_num': metadata['scan_num'],
            })

    annotations.append(annotation_info)

# Imported once up front instead of on every loop iteration.
from itertools import groupby
from operator import itemgetter

# Process the session files in sorted name order, appending each file's text
# lines and annotations to all_textlines / all_annotations.
for sf_name in sorted(session_file_names):
    text_array = []
    annotation_array = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(sf_name, 'r', encoding='utf-8') as myfile:
        session_file = myfile.read()

    session_data = json.loads(session_file)
    source_data = session_data['_source']

    traverse(source_data, 'sessions', text_array, annotation_array)

    # Collect the 'scanpage' annotations so that those sharing a scan_num
    # can be merged into one.
    scan_page_annots = [ann_info for ann_info in annotation_array if ann_info['label'] == 'scanpage']

    # groupby only groups adjacent items, so sort on scan_num first
    # (see Python Cookbook 1.15).
    scan_page_annots.sort(key=itemgetter('scan_num'))

    aggregated_scan_annots = []
    for scan_num, items in groupby(scan_page_annots, key=itemgetter('scan_num')):
        # Materialise the group: the iterator can be consumed only once.
        itemlist = list(items)

        # Start from a copy of the item with the lowest begin_index ...
        aggr_scan_annot = min(itemlist, key=itemgetter('begin_index')).copy()
        # ... and stretch it to the highest end_index in the group.
        aggr_scan_annot['end_index'] = max(item['end_index'] for item in itemlist)

        aggregated_scan_annots.append(aggr_scan_annot)

    # Rewrite the "<n>,<n>,<n>,<n>/full" part of each url to "full/,<4th number>".
    # NOTE(review): presumably this turns an IIIF region crop into a full-image
    # request scaled to that height - confirm against the image server.
    for scan_ann in aggregated_scan_annots:
        scan_ann['iiif_url'] = re.sub(r'(\d+),(\d+),(\d+),(\d+)/(full)', r'\5/,\4', scan_ann['iiif_url'])

    # Replace the per-node scanpage annotations by the aggregated ones.
    # Filtering on the label selects exactly the entries gathered in
    # scan_page_annots, without a dict-by-dict membership scan.
    annotation_array = [ann for ann in annotation_array if ann['label'] != 'scanpage']
    annotation_array.extend(aggregated_scan_annots)

    # Add one 'pages' annotation per entry in page_versions.
    page_data = session_data['_source']['page_versions']
    page_annots = [{'label' : 'pages','id' : pg['page_id']} for pg in page_data]

    for pa in page_annots:
        # The scan number is the run of digits right before "-page-" in the id.
        scan_num = int(re.search(r'(\d+)-page-', pa['id']).group(1))
        scanpage_for_scan_num = [ai for ai in annotation_array
                                 if 'scan_num' in ai and ai['scan_num'] == scan_num]
        # The page takes over the index range of its (first matching) scan.
        pa['begin_index'] = scanpage_for_scan_num[0]['begin_index']
        pa['end_index'] = scanpage_for_scan_num[0]['end_index']
        pa['indexesByContainment'] = True

    annotation_array.extend(page_annots)

    # Shift the per-file indexes by the number of lines collected so far, so
    # they point into the concatenated all_textlines.
    offset = len(all_textlines)
    for ai in annotation_array:
        ai['begin_index'] += offset
        ai['end_index'] += offset

    all_textlines.extend(text_array)
    all_annotations.extend(annotation_array)

In [4]:
# Inspect the last 20 entries of all_annotations.
all_annotations[-20:]

[{'label': 'lines',
  'image_coords': {'left': 2591,
   'right': 3313,
   'top': 2226,
   'bottom': 2302,
   'height': 76,
   'width': 722},
  'begin_index': 4670,
  'id': 'NL-HaNA_1.01.02_3760_0032-page-63-col-0-tr-3-line-2',
  'end_index': 4670},
 {'label': 'lines',
  'image_coords': {'left': 2586,
   'right': 3416,
   'top': 2274,
   'bottom': 2351,
   'height': 77,
   'width': 830},
  'begin_index': 4671,
  'id': 'NL-HaNA_1.01.02_3760_0032-page-63-col-0-tr-3-line-3',
  'end_index': 4671},
 {'label': 'lines',
  'image_coords': {'left': 2636,
   'right': 3247,
   'top': 2324,
   'bottom': 2396,
   'height': 72,
   'width': 611},
  'begin_index': 4672,
  'id': 'NL-HaNA_1.01.02_3760_0032-page-63-col-0-tr-3-line-4',
  'end_index': 4672},
 {'label': 'lines',
  'image_coords': {'left': 2590,
   'right': 3417,
   'top': 2371,
   'bottom': 2449,
   'height': 78,
   'width': 827},
  'begin_index': 4673,
  'id': 'NL-HaNA_1.01.02_3760_0032-page-63-col-0-tr-3-line-5',
  'end_index': 4673},
 {'l

Lokaliseer een resolutietekst, liefst over kolom- en/of paginagrenzen heen, en representeer die als annotation.

In [5]:
# Print every collected text line prefixed with its index in all_textlines.
for i, line in enumerate(all_textlines):
    print(f"{i}: {line}")

0: Jovis den 1. Januari 1705.
1: Nihil actum est.
2: 

3: 7 61
4: Veneris den 2. Januarii 1705.
5: 1951
6: P R ES I D E,
7: vooila
8: 7 In:
9: 5
10: De Heer van Gent.
11: P R AE S E.N T I B U S,
12: De Heeren Ham.
13: V. anden Honert ‚ Heemskerck, Meerens , Raedt-
14: Pensionaris Heynsius,
15: Becker , met een extraordinaris Gedeputeerde uyt
16: de Provincie van Zeelandt.
17: Wellandt, Renswoude , Velthuysen, met een ex-
18: traordinaris Gedeputeerde uy de Provincie van
19: Utrecht.
20: Dau Tour.
21: Wichers, Gockinga.
22: ai
23: a
24: Rinm gE Resolutien eergisteren genomen
25: jp En geleen en geresumeert , ge-
26: |
27: A He steert zyn de Depesches daer uyt re-
28: a ) sulterende.
29: Ontfangen een Missive van den Resident Bil-
30: derbeeck, geschreven tot Keulen den dertigsten
31: der voorlede maendt ‚ houdende advertentie :
32: Waer op geen resolutie is gevallen.
33: Ontfangen een Missive van haer Hoogh Mo-
34: gende Gecommitteerde Hulft i geschreven
35: 

36: 120 D noen A
37: 9
38:

1958: tedoen debefcheren ter: fomme vat vyf hondert
1959: ouldene tebetalen uvt de penninger ‘van de ge
1960: ® orale Colle&e. daer van derrfelven:t fijnef-tydt
1961: flverrekenen. volgens cm in éonforiniteyt van
1962: d
1963: de ordres vanhet Landt:
1964: Ts ter Vergaderinge gelefen de Requefte: van
1965: Ta Wimer. Baron van Pallandt: Generaek Mai:
1966: san de Infanterve:, verfoedkende; dat: haer:
1967: Hosok Movende oeliefden aen hem: Supplidnt te
1968: veeg leren Dermiké bom vooreen weeet vier:4
1969: vof om fine afireste; mogen gaen verrichte
1970: Aer hat noodichwefenfoude. > Waer opgedex
1971: Vhoreert zude svoereevondenen verftacm; ‘dat
1972: Oonveva devoortchreve Reduêfte gefondén-fat
1973: walter zen den Grive-van Noyelles;' Generael:
1974: van Ae Tafonterve tegenwidordighzcommdnde
1975: vende. 40 de Moefel- om aën haer Hoogh Mo:
1976: dende Act Afs-advis daer onsite laten toko:
1977: Bot
1978: fr
1979: OTE
1980: men,
1981: Te ter Verraderinge gelefen dé Requefte van
1982: He

3643: 

3644: Waer op gedelibereert zynde is goetgevonden
3645: ende verstaen , dat Copye van het voorschreve
3646: C onsent gesonden sal werden aen den Raedt van
3647: State , om te strecken tot der selver narich-
3648: tinge ; ende dat dien onvermindert de Heeren
3649: S taten van de hoogh-gemelde Provincie by Mif-
3650: sive sullen werden versocht in de voorschreve Pe-
3651: titie van driemael hondert en vyftigh duysent gul-
3652: dens niet alleen maer oock in de Petitie van twee
3653: mael hondert duysendt guldens suppletoir tot de
3654: Recrutes te willen consenteren dat de hoogh-
3655: gemelde Heeren Staten willen considereren dat
3656: aen de spoedige recruteringe en herstellinge van
3657: de Militie ten hooghsten veel gelegen is; dat de
3658: Vyanden om hare Militie te herstelien alle beden-
3659: kelijcke en extraordinaris middelen gebruycken,
3660: en de gantsche vrucht van de victorien door Go-
3661: des zegen in de voorlede C ampagne verkregen sal
3662: verloren gaen, in ge

In [6]:
# (begin_index, end_index) pairs into all_textlines, one per resolution.
resolution_segments = [(97,126),(127,147),(2669,2685),(2813,2825),(3068,3093)]

# One 'resolutions' annotation per segment; ids are numbered from 1.
resolution_annotations = [
    {
        'label': 'resolutions',
        'begin_index': seg_begin,
        'end_index': seg_end,
        'id': f'NL-HaNA_1.01.02_3760_resolution-{id_suffix}',
    }
    for id_suffix, (seg_begin, seg_end) in enumerate(resolution_segments, start=1)
]

resolution_annotations

[{'label': 'resolutions',
  'begin_index': 97,
  'end_index': 126,
  'id': 'NL-HaNA_1.01.02_3760_resolution-1'},
 {'label': 'resolutions',
  'begin_index': 127,
  'end_index': 147,
  'id': 'NL-HaNA_1.01.02_3760_resolution-2'},
 {'label': 'resolutions',
  'begin_index': 2669,
  'end_index': 2685,
  'id': 'NL-HaNA_1.01.02_3760_resolution-3'},
 {'label': 'resolutions',
  'begin_index': 2813,
  'end_index': 2825,
  'id': 'NL-HaNA_1.01.02_3760_resolution-4'},
 {'label': 'resolutions',
  'begin_index': 3068,
  'end_index': 3093,
  'id': 'NL-HaNA_1.01.02_3760_resolution-5'}]

In [7]:
# Append the resolution annotations to the shared annotation list.
all_annotations.extend(resolution_annotations)

In [8]:
def annotations_at_index(index,label=None):
    """Return the annotations in ``all_annotations`` that cover ``index``.

    An annotation covers ``index`` when
    ``begin_index <= index <= end_index`` (both bounds inclusive).

    Args:
        index: position in the list of text lines.
        label: when given, only annotations with this ``'label'`` are
            returned; ``None`` (the default) returns all labels.

    Returns:
        A list of matching annotation dicts, in ``all_annotations`` order.
    """
    return [
        ann_info
        for ann_info in all_annotations
        if ann_info['begin_index'] <= index <= ann_info['end_index']
        and (label is None or ann_info['label'] == label)
    ]

In [9]:
# Show every annotation whose index range contains text line 2675.
annotations_at_index(2675)

[{'label': 'lines',
  'image_coords': {'left': 1390,
   'right': 2278,
   'top': 955,
   'bottom': 1015,
   'height': 60,
   'width': 888},
  'begin_index': 2675,
  'id': 'NL-HaNA_1.01.02_3760_0022-page-42-col-1-tr-0-line-12',
  'end_index': 2675},
 {'label': 'textregions',
  'image_coords': {'left': 1380,
   'right': 2294,
   'top': 372,
   'bottom': 3290,
   'height': 2918,
   'width': 914},
  'begin_index': 2663,
  'id': 'NL-HaNA_1.01.02_3760_0022-page-42-col-1-tr-0',
  'end_index': 2724},
 {'label': 'columns',
  'image_coords': {'left': 1380,
   'right': 2294,
   'top': 372,
   'bottom': 3290,
   'height': 2918,
   'width': 914},
  'begin_index': 2663,
  'id': 'NL-HaNA_1.01.02_3760_0022-page-42-col-1',
  'end_index': 2724},
 {'label': 'sessions',
  'image_coords': None,
  'begin_index': 2568,
  'id': 'meeting-1705-01-07-session-1',
  'end_index': 3093},
 {'label': 'scanpage',
  'iiif_url': 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0022.jpg/full/,31

Zoek alle annotaties die overlappen met een interval. Dat interval kan ontleend worden aan begin_index en end_index van een resolutie.

In [10]:
def get_annotations_of_type(type,annotations):
    """Lazily yield the annotations whose 'label' equals ``type``.

    Returns a generator; nothing is evaluated until it is iterated.
    """
    wanted_label = type
    return (
        candidate
        for candidate in annotations
        if candidate['label'] == wanted_label
    )

In [11]:
# List every annotation labelled 'resolutions'.
for resolution in get_annotations_of_type('resolutions', all_annotations):
    print(resolution)

{'label': 'resolutions', 'begin_index': 97, 'end_index': 126, 'id': 'NL-HaNA_1.01.02_3760_resolution-1'}
{'label': 'resolutions', 'begin_index': 127, 'end_index': 147, 'id': 'NL-HaNA_1.01.02_3760_resolution-2'}
{'label': 'resolutions', 'begin_index': 2669, 'end_index': 2685, 'id': 'NL-HaNA_1.01.02_3760_resolution-3'}
{'label': 'resolutions', 'begin_index': 2813, 'end_index': 2825, 'id': 'NL-HaNA_1.01.02_3760_resolution-4'}
{'label': 'resolutions', 'begin_index': 3068, 'end_index': 3093, 'id': 'NL-HaNA_1.01.02_3760_resolution-5'}


In [12]:
def get_annotations_overlapping_with(begin,end,annotations):
    """Lazily yield the annotations that overlap the interval [begin, end].

    Both the query interval and the annotations' ``begin_index`` /
    ``end_index`` are treated as inclusive, so an annotation that shares only
    a single boundary index with the interval counts as overlapping.

    Args:
        begin: first index of the query interval (expected ``begin <= end``).
        end: last index of the query interval, inclusive.
        annotations: iterable of dicts with 'begin_index' and 'end_index'.

    Returns:
        A generator over the overlapping annotations, in input order.
    """
    # Two inclusive intervals overlap when each one starts no later than the
    # other one ends.
    return (
        a for a in annotations
        if a['begin_index'] <= end and a['end_index'] >= begin
    )

In [13]:
# Print every annotation that overlaps the index range 2813-2825.
for overlapping in get_annotations_overlapping_with(2813,2825,all_annotations):
    print(overlapping)

{'label': 'lines', 'image_coords': {'left': 3575, 'right': 4413, 'top': 1561, 'bottom': 1657, 'height': 96, 'width': 838}, 'begin_index': 2813, 'id': 'NL-HaNA_1.01.02_3760_0022-page-43-col-1-tr-2-line-15', 'end_index': 2813}
{'label': 'lines', 'image_coords': {'left': 3522, 'right': 4413, 'top': 1609, 'bottom': 1702, 'height': 93, 'width': 891}, 'begin_index': 2814, 'id': 'NL-HaNA_1.01.02_3760_0022-page-43-col-1-tr-2-line-16', 'end_index': 2814}
{'label': 'lines', 'image_coords': {'left': 3517, 'right': 4413, 'top': 1655, 'bottom': 1749, 'height': 94, 'width': 896}, 'begin_index': 2815, 'id': 'NL-HaNA_1.01.02_3760_0022-page-43-col-1-tr-2-line-17', 'end_index': 2815}
{'label': 'lines', 'image_coords': {'left': 3515, 'right': 4414, 'top': 1703, 'bottom': 1799, 'height': 96, 'width': 899}, 'begin_index': 2816, 'id': 'NL-HaNA_1.01.02_3760_0022-page-43-col-1-tr-2-line-18', 'end_index': 2816}
{'label': 'lines', 'image_coords': {'left': 3518, 'right': 4406, 'top': 1753, 'bottom': 1848, 'heigh

Combineer beide: alle annotaties van een bepaald type, binnen een interval

In [14]:
def get_annotations_of_type_overlapping(type,begin,end,annotations):
    """Return the annotations labelled ``type`` that overlap begin..end.

    Feeds the result of get_annotations_overlapping_with straight into
    get_annotations_of_type and returns whatever the latter returns.
    """
    in_range = get_annotations_overlapping_with(begin, end, annotations)
    return get_annotations_of_type(type, in_range)

In [15]:
# Print the resolutions that overlap the index range 2599-2850.
for resolution in get_annotations_of_type_overlapping('resolutions',2599,2850,all_annotations):
    print(resolution)

{'label': 'resolutions', 'begin_index': 2669, 'end_index': 2685, 'id': 'NL-HaNA_1.01.02_3760_resolution-3'}
{'label': 'resolutions', 'begin_index': 2813, 'end_index': 2825, 'id': 'NL-HaNA_1.01.02_3760_resolution-4'}


In [16]:
def get_textlines_between(begin,end,annotations): 
    """Return (index, text) tuples for the 'lines' annotations overlapping begin..end.

    The text is looked up in the module-level ``all_textlines`` using each
    line annotation's ``begin_index``.
    """
    line_annots = get_annotations_of_type_overlapping('lines', begin, end, annotations)
    return [
        (line_annot['begin_index'], all_textlines[line_annot['begin_index']])
        for line_annot in line_annots
    ]
        

In [17]:
# Print the (index, text) pairs for the text segment of a resolution.
for numbered_line in get_textlines_between(2813,2825,all_annotations):
    print(numbered_line)

(2813, 'Is ter Vergaderinge gelesen de Requeste van')
(2814, 'Arnoldus Boomhouwer , Auditeur tot Roer')
(2815, 'monde, versoeckende, om redenen in de voor.')
(2816, 'schreve Requeste geësprimeert, dat haer Hoogh')
(2817, 'Mogende geliefden hem Suppliant te begunse,')
(2818, 'gen met de Fourier- ofte Biljet meesters pact')
(2819, 'binnen de Stadt Rroermonde, ende aen hen dae,')
(2820, 'van te verleenen Commissie in forma, Waer ey')
(2821, 'gedelibereert aynde is goetgevonden ende yer.')
(2822, 'staen, dat Copye van de voorschreve Request')
(2823, 'gesonden sal werden aen de Magistraet van koe')
(2824, 'inonde » om der selver b erie ih t daer op en ie')
(2825, 'Hoogh Mogende te laten toekomen.')


Vraag nu de tekst op voor de kolom waar deze resolutie deel van uitmaakt

In [18]:
# The resolution boundaries 2813 and 2825 were obtained earlier via
# 'get_annotations_of_type'.
begin = 2813
end = 2825

# First fetch the columns that overlap with those boundaries.
column_annots = get_annotations_of_type_overlapping('columns',begin,end,all_annotations)

# For each column: print its id and index range, followed by its text lines.
for col_annot in column_annots:
    print(f"column id: {col_annot['id']}")
    col_begin, col_end = col_annot['begin_index'], col_annot['end_index']
    print(f"{col_begin}   {col_end}\n")

    for numbered_line in get_textlines_between(col_begin, col_end, all_annotations):
        print(numbered_line)

column id: NL-HaNA_1.01.02_3760_0022-page-43-col-1
2785   2850

(2785, '(29')
(2786, 'ai')
(2787, 'Den 7. Januarth,')
(2788, 'H')
(2789, 'Gul-')
(2790, '1')
(2791, 'het besolsieitere nde bet iinge Y oe')
(2792, 'ges Waer op gedelibel ee zynde is goetge.')
(2793, 'vonden ende verstaen , mite desen te onsen.')
(2794, 'teren in het voorschreve versoeck, ende wend.')
(2795, 'dienvolgende aen den Suppliant gesccordee')
(2796, 'permissie en verlof voor den tydt van wee maen.')
(2797, 'den ì')
(2798, 'ts er Vergaderinge gelesen de Requeste van')
(2799, 'Frangois d le Serra, Lieut enant Collon l van a')
(2800, 'Regiment van den Colsonel Henry de Caris, ver.')
(2801, '\n')
(2802, 'qucste geësprimeert, dat haer Hoogh Mogen.')
(2803, 'de geliefden a en hem Sapp lane te permis ven')
(2804, '\n')
(2805, 'Collonel op approbatie ende aggreatie van haa,')
(2806, 'Hoogh Mogende aen een ander bequzem Of')
(2807, '\n')
(2808, 'gedelibereert ynde is goetgevonden ende ye')
(2809, 'sen , dat Copye van de vo

Volgende stap: definiëren van named entities. Voorlopig alleen door te verwijzen naar de index van de line(s) waar deze entities op voorkomen.

In [19]:
# (begin_index, end_index, entity_type, entity_text) per named entity;
# the indexes refer to the text line(s) on which the entity occurs.
entity_segments = [
    (2814, 2814, 'per', 'Arnoldus Boomhouwer'),
    (2832, 2833, 'per', 'van Rabenpeé'),
    (2819, 2819, 'loc', 'Roermond'),
]

# One 'entities' annotation per segment; ids are numbered from 1.
entity_annotations = [
    {
        'label': 'entities',
        'begin_index': ent_begin,
        'end_index': ent_end,
        'id': f'NL-HaNA_1.01.02_3760_entity-{id_suffix}',
        'entity_type': ent_type,
        'entity_text': ent_text,
    }
    for id_suffix, (ent_begin, ent_end, ent_type, ent_text) in enumerate(entity_segments, start=1)
]

entity_annotations

[{'label': 'entities',
  'begin_index': 2814,
  'end_index': 2814,
  'id': 'NL-HaNA_1.01.02_3760_entity-1',
  'entity_type': 'per',
  'entity_text': 'Arnoldus Boomhouwer'},
 {'label': 'entities',
  'begin_index': 2832,
  'end_index': 2833,
  'id': 'NL-HaNA_1.01.02_3760_entity-2',
  'entity_type': 'per',
  'entity_text': 'van Rabenpeé'},
 {'label': 'entities',
  'begin_index': 2819,
  'end_index': 2819,
  'id': 'NL-HaNA_1.01.02_3760_entity-3',
  'entity_type': 'loc',
  'entity_text': 'Roermond'}]

In [20]:
# Append the entity annotations to the shared annotation list.
all_annotations.extend(entity_annotations)

In [21]:
# List every annotation labelled 'entities'.
for entity in get_annotations_of_type('entities', all_annotations):
    print(entity)

{'label': 'entities', 'begin_index': 2814, 'end_index': 2814, 'id': 'NL-HaNA_1.01.02_3760_entity-1', 'entity_type': 'per', 'entity_text': 'Arnoldus Boomhouwer'}
{'label': 'entities', 'begin_index': 2832, 'end_index': 2833, 'id': 'NL-HaNA_1.01.02_3760_entity-2', 'entity_type': 'per', 'entity_text': 'van Rabenpeé'}
{'label': 'entities', 'begin_index': 2819, 'end_index': 2819, 'id': 'NL-HaNA_1.01.02_3760_entity-3', 'entity_type': 'loc', 'entity_text': 'Roermond'}


In [22]:
# Print the text lines 2832-2833, the index range of an entity.
for numbered_line in get_textlines_between(2832,2833,all_annotations):
    print(numbered_line)

(2832, 'werden gedepescheert voor den ris. van Ra.')
(2833, 'benpeé, Groot Jagermeester , voor den Goy,')


In [23]:
# Print text line 2819, the index range of an entity.
for numbered_line in get_textlines_between(2819,2819,all_annotations):
    print(numbered_line)

(2819, 'binnen de Stadt Rroermonde, ende aen hen dae,')


In [24]:
# Goal: get all entities for a specific session.
# Step 1: find the session that contains the example resolution (2813 to 2825).
for session in get_annotations_of_type_overlapping('sessions',2599,2850,all_annotations):
    print(session)

{'label': 'sessions', 'image_coords': None, 'begin_index': 2568, 'id': 'meeting-1705-01-07-session-1', 'end_index': 3093}


In [25]:
# Step 2: print the entities that overlap the index range 2568-3093.
for entity in get_annotations_of_type_overlapping('entities',2568,3093,all_annotations):
    print(entity)

{'label': 'entities', 'begin_index': 2814, 'end_index': 2814, 'id': 'NL-HaNA_1.01.02_3760_entity-1', 'entity_type': 'per', 'entity_text': 'Arnoldus Boomhouwer'}
{'label': 'entities', 'begin_index': 2832, 'end_index': 2833, 'id': 'NL-HaNA_1.01.02_3760_entity-2', 'entity_type': 'per', 'entity_text': 'van Rabenpeé'}
{'label': 'entities', 'begin_index': 2819, 'end_index': 2819, 'id': 'NL-HaNA_1.01.02_3760_entity-3', 'entity_type': 'loc', 'entity_text': 'Roermond'}


Experiment: voeg een 'image_range' element toe voor iedere resolutie

In [26]:
# Display the resolution annotations before image ranges are added.
resolution_annotations

[{'label': 'resolutions',
  'begin_index': 97,
  'end_index': 126,
  'id': 'NL-HaNA_1.01.02_3760_resolution-1'},
 {'label': 'resolutions',
  'begin_index': 127,
  'end_index': 147,
  'id': 'NL-HaNA_1.01.02_3760_resolution-2'},
 {'label': 'resolutions',
  'begin_index': 2669,
  'end_index': 2685,
  'id': 'NL-HaNA_1.01.02_3760_resolution-3'},
 {'label': 'resolutions',
  'begin_index': 2813,
  'end_index': 2825,
  'id': 'NL-HaNA_1.01.02_3760_resolution-4'},
 {'label': 'resolutions',
  'begin_index': 3068,
  'end_index': 3093,
  'id': 'NL-HaNA_1.01.02_3760_resolution-5'}]

In [27]:
# voor iedere resolutie, bepaal image_range en voeg deze toe

def get_bounding_box_for(annotations): 
    """Return the smallest box that encloses the image_coords of ``annotations``.

    Annotations without an ``'image_coords'`` entry, or whose entry is
    ``None``, are ignored.

    Args:
        annotations: iterable (a one-shot generator is fine) of annotation
            dicts; each ``image_coords`` dict must provide 'left', 'right',
            'top' and 'bottom'.

    Returns:
        A dict with keys 'left', 'right', 'top', 'bottom', 'height', 'width'.

    Raises:
        ValueError: if no annotation carries usable image_coords.
    """
    # Materialise once: the input may be a generator that can be read only once.
    boxes = [
        ann['image_coords']
        for ann in annotations
        if ann.get('image_coords') is not None
    ]
    if not boxes:
        raise ValueError("cannot compute a bounding box: no annotation has image_coords")

    min_left = min(box['left'] for box in boxes)
    max_right = max(box['right'] for box in boxes)
    min_top = min(box['top'] for box in boxes)
    max_bottom = max(box['bottom'] for box in boxes)

    return {
        'left': min_left,
        'right': max_right,
        'top': min_top,
        'bottom': max_bottom,
        'height': max_bottom - min_top,
        'width': max_right - min_left,
    }

# For every resolution, build 'image_range': one (scan url, bounding boxes)
# tuple per scan that overlaps the resolution.
for r in resolution_annotations:
    r['image_range'] = []

    res_begin = r['begin_index']
    res_end = r['end_index']
    print(f"\nbegin of resolution: {res_begin}, end of resolution: {res_end}")

    # Loop over the scans that overlap with the resolution.
    for a in get_annotations_of_type_overlapping('scanpage',res_begin,res_end,all_annotations):
        bounding_boxes = []
        image_url = a['iiif_url']
        print(image_url)

        scan_begin, scan_end = a['begin_index'], a['end_index']

        # Loop over all columns on this scan; per column, compute a bounding
        # box around the resolution lines that fall inside it.
        for clm in get_annotations_of_type_overlapping('columns',scan_begin,scan_end,all_annotations):
            # Index range shared by the resolution and this column.
            overlap_begin = max(res_begin, clm['begin_index'])
            overlap_end = min(res_end, clm['end_index'])

            if overlap_begin > overlap_end:
                # Resolution and column do not overlap.
                continue

            shared_lines = get_annotations_of_type_overlapping(
                'lines', overlap_begin, overlap_end, all_annotations)
            bounding_boxes.append(get_bounding_box_for(shared_lines))

        r['image_range'].append((image_url, bounding_boxes))
        print(r['image_range'])
                


begin of resolution: 97, end of resolution: 126
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg
[('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg', [{'left': 405, 'right': 1310, 'top': 1036, 'bottom': 2451, 'height': 1415, 'width': 905}])]

begin of resolution: 127, end of resolution: 147
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg
[('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg', [{'left': 420, 'right': 1317, 'top': 2436, 'bottom': 3273, 'height': 837, 'width': 897}, {'left': 1361, 'right': 2264, 'top': 349, 'bottom': 557, 'height': 208, 'width': 903}])]

begin of resolution: 2669, end of resolution: 2685
https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0022.jpg/full/,3130/0/default.jpg
[('https://images.diginf

Tussenconclusie: het is mogelijk voor een resolutie alle omsluitende rectangles af te leiden, zelfs als de resolutietekst over meerdere kolommen en/of meerdere scans doorloopt. In principe zijn uit deze image_ranges IIIF image urls af te leiden voor de resolutieonderdelen op de scans, of de hele scans zijn op te halen, en de resolutiedelen daarop zijn te omkaderen.

Omdat het leuk is: genereer nog even een list van IIIF image urls per resolutie (en zet ze dan in je browser even naast elkaar).

In [28]:
# Display the resolution annotations, now including 'image_range'.
resolution_annotations

[{'label': 'resolutions',
  'begin_index': 97,
  'end_index': 126,
  'id': 'NL-HaNA_1.01.02_3760_resolution-1',
  'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg',
    [{'left': 405,
      'right': 1310,
      'top': 1036,
      'bottom': 2451,
      'height': 1415,
      'width': 905}])]},
 {'label': 'resolutions',
  'begin_index': 127,
  'end_index': 147,
  'id': 'NL-HaNA_1.01.02_3760_resolution-2',
  'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg',
    [{'left': 420,
      'right': 1317,
      'top': 2436,
      'bottom': 3273,
      'height': 837,
      'width': 897},
     {'left': 1361,
      'right': 2264,
      'top': 349,
      'bottom': 557,
      'height': 208,
      'width': 903}])]},
 {'label': 'resolutions',
  'begin_index': 2669,
  'end_index': 2685,
  'id': 'NL-HaNA_1.01.02_3760_resolution-3',
  'image_range': [('h

In [29]:
# For every resolution, derive one url per bounding box in its image_range by
# replacing the "full/,<digits>" part of the scan url with
# "<left>,<top>,<width>,<height>/full".
for res in resolution_annotations:
    res['region_links'] = [
        re.sub(
            r'(full)/(,\d+)',
            rf'{box["left"]},{box["top"]},{box["width"]},{box["height"]}/\1',
            scan_url,
        )
        for scan_url, boxes in res['image_range']
        for box in boxes
    ]

In [30]:
# Display the resolution annotations, now including 'region_links'.
resolution_annotations

[{'label': 'resolutions',
  'begin_index': 97,
  'end_index': 126,
  'id': 'NL-HaNA_1.01.02_3760_resolution-1',
  'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg',
    [{'left': 405,
      'right': 1310,
      'top': 1036,
      'bottom': 2451,
      'height': 1415,
      'width': 905}])],
  'region_links': ['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/405,1036,905,1415/full/0/default.jpg']},
 {'label': 'resolutions',
  'begin_index': 127,
  'end_index': 147,
  'id': 'NL-HaNA_1.01.02_3760_resolution-2',
  'image_range': [('https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg',
    [{'left': 420,
      'right': 1317,
      'top': 2436,
      'bottom': 3273,
      'height': 837,
      'width': 897},
     {'left': 1361,
      'right': 2264,
      'top': 349,
      'bottom': 557,
      'height': 208,
      'width': 90

In [31]:
# Print the region links of every resolution annotation.
for resolution in get_annotations_of_type('resolutions', all_annotations):
    print(resolution['region_links'])

['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/405,1036,905,1415/full/0/default.jpg']
['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/420,2436,897,837/full/0/default.jpg', 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/1361,349,903,208/full/0/default.jpg']
['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0022.jpg/1388,666,903,796/full/0/default.jpg']
['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0022.jpg/3505,1561,909,658/full/0/default.jpg']
['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0023.jpg/3483,2145,925,1181/full/0/default.jpg', 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0024.jpg/372,370,880,112/full/0/default.jpg']


In [32]:
def get_annotation_by_id(id, annotations):
    """Return the first annotation in ``annotations`` whose 'id' equals ``id``.

    Annotations without an 'id' key are skipped. Raises StopIteration when no
    annotation matches.
    """
    def has_requested_id(candidate):
        return 'id' in candidate and candidate['id'] == id

    return next(filter(has_requested_id, annotations))

In [33]:
# Print the region links of the resolution(s) covering text line 130.
for resolution in get_annotations_of_type_overlapping('resolutions',130,130,all_annotations):
    print(resolution['region_links'])

['https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/420,2436,897,837/full/0/default.jpg', 'https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/1361,349,903,208/full/0/default.jpg']


In [34]:
# Print the (index, text) pairs for text lines 127-147.
for numbered_line in get_textlines_between(127,147,all_annotations):
    print(numbered_line)

(127, '0, de Requeste van Paulus de Noter ‚ Schip-')
(128, 'per, woonende tot Brussel, is na voorgdende de-')
(129, 'liberatie goetgevofiden en verstaer, dat ten be-')
(130, 'hoeve van den Suppliant een Pasport sal werden')
(131, 'oedepescheert, om met sijn Schip direct en son-')
(132, 'der overschepinge van Brussel uyt dese Landen te')
(133, 'mogen afhalen 1000 sacken wit gerafineert Zout,')
(134, 'ende de selve na Brussel te mogen uytvoeren,')
(135, 'mits betalende ’sLandts gerechtieheyt, mitsga-')
(136, 'ders de Spaensche rechten van het PortSt. Maria')
(137, 'aen den Ontfanger Flinck. Endesal Exeract van')
(138, 'dese haer Hoogh Mogende Resolutie gesonden')
(139, 'werden aen het Cellegie ter A dmiralieeye in Zee-')
(140, 'lande, en het selve aengeschreven soodanige or-')
(141, 'dre te stellen , en die voorsseninge te doen dat')
(142, 'de voorschreve quantiteyt Zout onverhindert en')
(143, 'sonder eenige molestatie op der elver Comptoi-')
(144, 'ren mogen passeren. De Heeren Gedeput

In [35]:
# Pretty-print every resolution annotation as indented JSON, keeping key order.
for resolution in get_annotations_of_type('resolutions', all_annotations):
    print(json.dumps(resolution, indent=2, sort_keys=False))

{
  "label": "resolutions",
  "begin_index": 97,
  "end_index": 126,
  "id": "NL-HaNA_1.01.02_3760_resolution-1",
  "image_range": [
    [
      "https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg",
      [
        {
          "left": 405,
          "right": 1310,
          "top": 1036,
          "bottom": 2451,
          "height": 1415,
          "width": 905
        }
      ]
    ]
  ],
  "region_links": [
    "https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/405,1036,905,1415/full/0/default.jpg"
  ]
}
{
  "label": "resolutions",
  "begin_index": 127,
  "end_index": 147,
  "id": "NL-HaNA_1.01.02_3760_resolution-2",
  "image_range": [
    [
      "https://images.diginfra.net/iiif/NL-HaNA_1.01.02/3760/NL-HaNA_1.01.02_3760_0009.jpg/full/,3114/0/default.jpg",
      [
        {
          "left": 420,
          "right": 1317,
          "top": 2436,
          "bottom": 3273,
          "height": 837,

In [36]:
# Inspect the last 10 entries of all_textlines.
all_textlines[-10:]

['Renswoude , Ve elthuysen.',
 'Du our,',
 'il',
 'P',
 'Wichers, Gockinga.',
 'bn',
 'In',
 'nn',
 '\n',
 'AE Resolutien eergisteren genomen, zijn']

In [37]:
# Store all_textlines on disk, serialised as JSON.
with open('../data/all_textlines.txt', 'w') as out_file:
    json.dump(all_textlines, fp=out_file)