Skip to content

Commit

Permalink
Document.get_default_image_group()
Browse files Browse the repository at this point in the history
  • Loading branch information
hnesk committed Oct 14, 2020
1 parent 89b5d0e commit 0a96543
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 2 deletions.
27 changes: 26 additions & 1 deletion ocrd_browser/model/document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import shutil

from ocrd import Workspace, Resolver
Expand Down Expand Up @@ -226,7 +227,6 @@ def get_image_paths(self, file_group: str) -> Dict[str, Path]:
More precisely: fast = Faster than iterating over page_ids and using mets.get_physical_page_for_file for each entry
"""
log = getLogger('ocrd_browser.model.document.Document.get_image_paths')

image_paths = {}
file_index = self.get_file_index()
for page_id in self.page_ids:
Expand All @@ -238,6 +238,31 @@ def get_image_paths(self, file_group: str) -> Dict[str, Path]:
image_paths[page_id] = None
return image_paths

def get_default_image_group(self, preferred_image_file_groups: Optional[List] = None) -> Optional[str]:
    """
    Return the best-scoring file group from ``self.file_groups_and_mimetypes``

    Each ``(file_group, mimetype)`` pair gets a weight:

    * ``+0.5`` if the major part of the mimetype is ``image``
    * ``+(len(preferred_image_file_groups) - i)`` if the file group fully matches the
      ``i``-th regular expression in ``preferred_image_file_groups`` (only the first
      matching pattern counts, so earlier patterns are worth more)
    * ``-0.00001`` per character of the file group name, so shorter names win ties

    Non-image file groups are only ranked lower, not excluded, so one of them is
    returned if nothing better exists.

    :param preferred_image_file_groups: regular expressions, ordered from most to least preferred
    :return: the file group with the highest weight (the first one seen on ties),
             or ``None`` if there are no file groups at all
    """
    preferences = preferred_image_file_groups or []

    def weight(file_group: str, mimetype: str) -> float:
        score = 0.0
        if mimetype.split('/')[0] == 'image':
            # prefer images
            score += 0.5
        for rank, pattern in enumerate(preferences):
            if re.fullmatch(pattern, file_group):
                # prefer matches earlier in the list
                score += len(preferences) - rank
                break
        # prefer shorter `file_group`s
        score -= len(file_group) * 0.00001
        return score

    # max() returns the first maximal entry, which is the same entry a stable
    # descending sort would have put in front
    best = max(self.file_groups_and_mimetypes, key=lambda entry: weight(*entry), default=None)
    return None if best is None else best[0]


def get_unused_page_id(self, template_page_id: str = 'PAGE_{page_nr}') -> Tuple[str, int]:
"""
Finds a page_nr that yields an unused page_id for the workspace and returns page_id, page_nr
Expand Down
24 changes: 23 additions & 1 deletion tests/model/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,28 @@ def test_get_page_index(self):
self.assertEqual(3, len(page17))
self.assertEqual(2, len(alto))

def test_get_image_paths(self):
    """get_image_paths('OCR-D-IMG') yields one entry per page id, each pointing to the expected file"""
    paths_by_page = Document.load(self.path).get_image_paths('OCR-D-IMG')
    self.assertEqual(2, len(paths_by_page))
    expected_names = {
        'PHYS_0017': 'INPUT_0017.tif',
        'PHYS_0020': 'INPUT_0020.tif',
    }
    for page_id, file_name in expected_names.items():
        self.assertEqual(file_name, paths_by_page[page_id].name)

def test_get_default_image_group(self):
    """An exact preference listed first is expected to win over a later wildcard pattern"""
    mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
    preferences = ['OCR-D-IMG-BIN', 'OCR-D-IMG.*']
    chosen = Document.load(mets_path).get_default_image_group(preferences)
    self.assertEqual('OCR-D-IMG-BIN', chosen)

def test_get_default_image_group_no_preference(self):
    """Without preferences, 'OCR-D-IMG' is expected for the complex Kant workspace"""
    mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
    chosen = Document.load(mets_path).get_default_image_group()
    self.assertEqual('OCR-D-IMG', chosen)

def test_get_default_image_group_with_missing_ocr_d_img(self):
    """Without preferences, 'OCR-D-IMG-PNG' is expected for the no_ocrd_d_img_group workspace"""
    mets_path = ASSETS_PATH / '../example/workspaces/no_ocrd_d_img_group/mets.xml'
    chosen = Document.load(mets_path).get_default_image_group()
    self.assertEqual('OCR-D-IMG-PNG', chosen)

def test_path_string(self):
doc = Document.load(self.path)
self.assertEqual(ASSETS_PATH / 'kant_aufklaerung_1784/data/lala.xml', doc.path('lala.xml'))
Expand Down Expand Up @@ -116,7 +138,7 @@ def test_page_for_id_with_nothing_for_page_and_fileGrp(self):
https://github.com/hnesk/browse-ocrd/issues/4
"""
doc = Document.load(ASSETS_PATH / '../bad/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml')
doc = Document.load(ASSETS_PATH / '../example/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml')
with self.assertLogs('ocrd_browser.model.document', level='WARNING') as log_watch:
page = doc.page_for_id('PHYS_0020', 'OCR-D-GT-PAGE')
self.assertIsNone(page)
Expand Down

0 comments on commit 0a96543

Please sign in to comment.