In [2]:
# Intall the library to access arkindex exports files https://doc.teklia.com/arkindex_export/
!pip install arkindex-export




In [8]:
from arkindex_export import open_database, Element, Metadata, Transcription
from arkindex_export.queries import list_children
from pathlib import Path

# Open the Arkindex SQLite export (path is relative to the notebook's working directory).
export_path = Path("cke-atelier-pictoria-18092025-20251127-090658.sqlite")
open_database(export_path)


In [4]:
# Summary statistics of the export: element counts per type, then metadata counts.
# Labels are printed verbatim (note that the first one has no trailing colon).
element_counts = (
    ("Number of folders", "folder"),
    ("Number of pages:", "page"),
    ("Number of photographs:", "photograph"),
    ("Number of persons:", "person"),
)
for label, element_type in element_counts:
    print(label, Element.select().where(Element.type == element_type).count())

print("Number of metadata:", Metadata.select().count())
for metadata_name in ("Title", "Creator"):
    print(
        f"Number of metadata {metadata_name}:",
        Metadata.select().where(Metadata.name == metadata_name).count(),
    )


Number of folders 2
Number of pages: 191
Number of photographs: 84
Number of persons: 0
Number of metadata: 225
Number of metadata Title: 2
Number of metadata Creator: 5


In [5]:
# For every 'folder' element, report how many 'page' elements list_children returns for it.
folders = Element.select().where(Element.type == 'folder')
for folder in folders:
    folder_pages = list_children(folder.id).where(Element.type == 'page')
    page_count = folder_pages.count()
    print(f"Folder '{folder.name}' has {page_count} pages.")

Folder 'BnF, département Société de Géographie, SG WD-346' has 100 pages.
Folder 'BnF, département Société de Géographie, SG WD-232 (RES)' has 91 pages.


In [6]:
# shapely must be available in the kernel environment (pip install shapely)
from shapely import Polygon
import ast

# Compute the min, max and average bounding-box size of the photographs.
# Loop on all pages and measure every photograph returned by list_children for each page.

# for the documentation about element objects, see the code :
#  https://gitlab.teklia.com/arkindex/export/-/blob/master/arkindex_export/models.py?ref_type=heads#L86

sizes = []
for page in Element.select().where(Element.type == 'page'):
    photographs = list_children(page.id).where(Element.type == 'photograph')
    for photo in photographs:
        if not photo.polygon:
            # No stored polygon: nothing to measure, and ast.literal_eval would raise on it.
            continue
        # photo.polygon is a string holding a list of points; parse it and let
        # shapely compute the axis-aligned bounding box (minx, miny, maxx, maxy).
        polygon = Polygon(ast.literal_eval(photo.polygon))
        minx, miny, maxx, maxy = polygon.bounds
        sizes.append((maxx - minx, maxy - miny))

if sizes:
    # Transpose the (width, height) pairs into two parallel sequences.
    widths, heights = zip(*sizes)
    print("Photograph width - min:", min(widths), "max:", max(widths), "avg:", sum(widths)/len(widths))
    print("Photograph height - min:", min(heights), "max:", max(heights), "avg:", sum(heights)/len(heights))
else:
    # zip(*[]) yields nothing, so unpacking into two names would raise ValueError.
    print("No photograph with a polygon was found under any page.")


Photograph width - min: 2915.0 max: 5761.0 avg: 3361.4285714285716
Photograph height - min: 2367.0 max: 6133.0 avg: 3570.1666666666665


In [None]:

# Loop on all the pages and pair each photograph with the text lines whose polygon intersects it.
for page in Element.select().where(Element.type == 'page'):
    # Run each query once and keep the rows: len() then replaces the repeated
    # .count() calls, each of which would issue an extra SQL query.
    photographs = list(list_children(page.id).where(Element.type == 'photograph'))
    text_lines = list(list_children(page.id).where(Element.type == 'text_line'))

    # Nothing can be associated unless the page has both photographs and text lines.
    if not photographs or not text_lines:
        continue
    print(f"Page '{page.name}' has {len(photographs)} photographs and {len(text_lines)} text lines.")

    # Parse every text-line polygon once per page instead of once per photograph.
    # Elements without a stored polygon are skipped (ast.literal_eval would raise on them).
    text_shapes = [
        (text_line, Polygon(ast.literal_eval(text_line.polygon)))
        for text_line in text_lines
        if text_line.polygon
    ]

    # Look for the text lines that overlap each photograph.
    for photo in photographs:
        if not photo.polygon:
            continue
        photo_polygon = Polygon(ast.literal_eval(photo.polygon))
        associated_texts = []
        for text_line, text_polygon in text_shapes:
            if photo_polygon.intersects(text_polygon):
                # Collect the text of every transcription attached to the overlapping line.
                transcriptions = Transcription.select().where(Transcription.element == text_line.id)
                associated_texts.extend(transcription.text for transcription in transcriptions)
        if associated_texts:
            print(f"Photograph '{photo.id}' on page '{page.id}' has associated texts: {associated_texts}")


Page 'NP' has 1 photographs and 4 text lines.
Photograph 'a7294f33-f383-4033-90c4-ad1035170f10' on page '38d93943-9c6c-4e6b-a141-7e41d845596f' has associated texts: ['A 137 FUKUDA AT NAGASAKI.']
Page 'NP' has 1 photographs and 4 text lines.
Photograph '1288c0e6-b611-4eb2-a45a-bdc161c2256b' on page '060d3dd2-836a-46fe-92ae-d6a9d4eaa9c1' has associated texts: ['4. KAGO, TRAVELLING CHAIR HAKONE ROAD.', 'W/d- 346 (27)', 'Wd 346']
Page 'NP' has 1 photographs and 4 text lines.
Photograph '0b098574-b98c-48fc-bc84-6d54a1a1802a' on page '4b7b2aa6-e2f8-4f16-8350-033909c45ca2' has associated texts: ['213. GROUP OF CHILDREN.', 'Wd 346']
Page 'NP' has 1 photographs and 3 text lines.
Photograph 'c98d1b84-2877-40e5-b998-e888d959fe3f' on page '06d217d7-613c-4bbf-ad62-6ce3eb15b150' has associated texts: ['106. Betto, Groom, Who Runs With Your Horse.']
Page 'NP' has 1 photographs and 7 text lines.
Photograph '66fe040d-fdc5-4dcb-8e4c-a3b6c9e7899a' on page '07e6c6d3-ecbb-4605-a63b-715f42b65594' has associ