Document.get_default_image_group()

hnesk · Oct 14, 2020 · 0a96543 · 0a96543
1 parent 89b5d0e
commit 0a96543
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 2 deletions.
diff --git a/ocrd_browser/model/document.py b/ocrd_browser/model/document.py
@@ -1,3 +1,4 @@
+import re
 import shutil
 
 from ocrd import Workspace, Resolver
@@ -226,7 +227,6 @@ def get_image_paths(self, file_group: str) -> Dict[str, Path]:
         More precisely:  fast = Faster than iterating over page_ids and using mets.get_physical_page_for_file for each entry
         """
         log = getLogger('ocrd_browser.model.document.Document.get_image_paths')
-
         image_paths = {}
         file_index = self.get_file_index()
         for page_id in self.page_ids:
@@ -238,6 +238,31 @@ def get_image_paths(self, file_group: str) -> Dict[str, Path]:
                 image_paths[page_id] = None
         return image_paths
 
+    def get_default_image_group(self, preferred_image_file_groups: Optional[List] = None) -> Optional[str]:
+        """
+        Returns the best ranked file group of the document, or None if it has no file groups
+
+        Ranking: a regex full match in preferred_image_file_groups counts most (earlier patterns
+        count more), an image/* mimetype adds a smaller bonus and shorter names break ties.
+        """
+        patterns = preferred_image_file_groups or []
+
+        def rank(entry: Tuple[str, str]) -> float:
+            group, mimetype = entry
+            # Bonus for images; stays below 1, the weight of the last preferred pattern
+            score = 0.5 if mimetype.split('/')[0] == 'image' else 0.0
+            for position, pattern in enumerate(patterns):
+                if re.fullmatch(pattern, group):
+                    # Only the first matching pattern counts
+                    score += len(patterns) - position
+                    break
+            # Tiny penalty per character so shorter names win between otherwise equal groups
+            return score - len(group) * 0.00001
+
+        candidates = list(self.file_groups_and_mimetypes)
+        return max(candidates, key=rank)[0] if candidates else None
+
+
     def get_unused_page_id(self, template_page_id: str = 'PAGE_{page_nr}') -> Tuple[str, int]:
         """
         Finds a page_nr that yields an unused page_id for the workspace and returns page_id, page_nr

diff --git a/tests/model/test_document.py b/tests/model/test_document.py
@@ -38,6 +38,28 @@ def test_get_page_index(self):
         self.assertEqual(3, len(page17))
         self.assertEqual(2, len(alto))
 
+    def test_get_image_paths(self):
+        # 'OCR-D-IMG' is expected to provide one image path per page id, two pages in total
+        paths_by_page = Document.load(self.path).get_image_paths('OCR-D-IMG')
+        self.assertEqual(2, len(paths_by_page))
+        for page_id, file_name in (('PHYS_0017', 'INPUT_0017.tif'), ('PHYS_0020', 'INPUT_0020.tif')):
+            self.assertEqual(file_name, paths_by_page[page_id].name)
+
+    def test_get_default_image_group(self):
+        # Both patterns match 'OCR-D-IMG-BIN'; the group named by the first pattern is expected
+        document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')
+        self.assertEqual('OCR-D-IMG-BIN', document.get_default_image_group(['OCR-D-IMG-BIN', 'OCR-D-IMG.*']))
+
+    def test_get_default_image_group_no_preference(self):
+        # Without any preferences 'OCR-D-IMG' is expected to be chosen for this workspace
+        document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')
+        self.assertEqual('OCR-D-IMG', document.get_default_image_group())
+
+    def test_get_default_image_group_with_missing_ocr_d_img(self):
+        # Workspace presumably has no 'OCR-D-IMG' group (going by its name); expects 'OCR-D-IMG-PNG'
+        document = Document.load(ASSETS_PATH / '../example/workspaces/no_ocrd_d_img_group/mets.xml')
+        self.assertEqual('OCR-D-IMG-PNG', document.get_default_image_group())
+
     def test_path_string(self):
         doc = Document.load(self.path)
         self.assertEqual(ASSETS_PATH / 'kant_aufklaerung_1784/data/lala.xml', doc.path('lala.xml'))
@@ -116,7 +138,7 @@ def test_page_for_id_with_nothing_for_page_and_fileGrp(self):
 
         https://github.com/hnesk/browse-ocrd/issues/4
         """
-        doc = Document.load(ASSETS_PATH / '../bad/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml')
+        doc = Document.load(ASSETS_PATH / '../example/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml')
         with self.assertLogs('ocrd_browser.model.document', level='WARNING') as log_watch:
             page = doc.page_for_id('PHYS_0020', 'OCR-D-GT-PAGE')
         self.assertIsNone(page)