Moved and fixed get_image_paths to cope with missing images

in preparation for #7
hnesk · Oct 14, 2020 · cdd6d39 · cdd6d39
1 parent 097c9ba
commit cdd6d39
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 22 deletions.
diff --git a/ocrd_browser/model/document.py b/ocrd_browser/model/document.py
@@ -24,7 +24,6 @@
 
 import cv2
 
-log = getLogger(__name__)
 
 EventCallBack = Optional[Callable[[str, Any], None]]
 
@@ -70,6 +69,7 @@ def clone(cls, mets_url: Union[Path, str], emitter: EventCallBack = None) -> 'Do
         """
         Clones a project (mets.xml and all used files) to a temporary directory for editing
         """
+        log = getLogger('ocrd_browser.model.document.Document.clone')
         mets_url = cls._strip_local(mets_url, disallow_remote=False)
         temporary_workspace = mkdtemp(prefix='browse-ocrd-clone-')
         # TODO download = False and lazy loading would be nice for responsiveness
@@ -144,7 +144,7 @@ def _tree(self) -> ElementTree:
         # noinspection PyProtectedMember
         return self.workspace.mets._tree
 
-    def xpath(self, xpath) -> Any:
+    def xpath(self, xpath: str) -> Any:
         return self._tree.getroot().xpath(xpath, namespaces=NS)
 
     @property
@@ -202,10 +202,14 @@ def get_file_index(self) -> Dict[str, OcrdFile]:
         page17 = [file for file in file_index.values() if file.static_page_id == 'PHYS_0017']
 
         """
-        file_index = {file.ID: file for file in self.workspace.mets.find_files()}
-        file_pointers = self.xpath('mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr')
+        log = getLogger('ocrd_browser.model.document.Document.get_file_index')
+        file_index = {}
+        for file in self.workspace.mets.find_files():
+            file.static_page_id = None
+            file_index[file.ID] = file
+
+        file_pointers: List[Element] = self.xpath('mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr')
         for file_pointer in file_pointers:
-            file_pointer: Element
             file_id = file_pointer.get('FILEID')
             page_id = file_pointer.getparent().get('ID')
             if file_id in file_index:
@@ -215,6 +219,25 @@ def get_file_index(self) -> Dict[str, OcrdFile]:
 
         return file_index
 
+    def get_image_paths(self, file_group: str) -> Dict[str, Optional[Path]]:
+        """
+        Builds a Dict page_id -> Path of the page's image in file_group, or None unless exactly one image exists
+
+        Fast: the file index is grouped by page in one pass instead of calling mets.get_physical_page_for_file per page_id
+        """
+        log = getLogger('ocrd_browser.model.document.Document.get_image_paths')
+        images_by_page: Dict[Optional[str], List[OcrdFile]] = {}
+        for file in self.get_file_index().values():
+            if file.fileGrp == file_group:
+                images_by_page.setdefault(file.static_page_id, []).append(file)
+        image_paths: Dict[str, Optional[Path]] = {}
+        for page_id in self.page_ids:
+            images = images_by_page.get(page_id, [])
+            if len(images) != 1:  # missing or ambiguous image: keep the page, but without a path
+                log.warning('Found %d images for PAGE %s and fileGrp %s, expected 1', len(images), page_id, file_group)
+            image_paths[page_id] = self.path(images[0]) if len(images) == 1 else None
+        return image_paths
+
     def get_unused_page_id(self, template_page_id: str = 'PAGE_{page_nr}') -> Tuple[str, int]:
         """
         Finds a page_nr that yields an unused page_id for the workspace and returns page_id, page_nr
@@ -250,11 +273,12 @@ def display_id_range(self, page_id: str, page_qty: int) -> List[str]:
 
     def page_for_id(self, page_id: str, file_group: str = None) -> Optional['Page']:
         """
-        Find the Page object for page_id and filegroup
+        Find the Page object for page_id and file_group
 
         If no PAGE-XML is found finds a single image file in the group, PAGE-XML will be populated automatically by page_from_image
         This is modelled after Processor.input_files https://github.com/OCR-D/core/pull/556/
         """
+        log = getLogger('ocrd_browser.model.document.Document.page_for_id')
         page_files = self.files_for_page_id(page_id, file_group, MIMETYPE_PAGE)
         if not page_files:
             page_files = self.files_for_page_id(page_id, file_group, mimetype="//image/.*")

diff --git a/ocrd_browser/ui/page_store.py b/ocrd_browser/ui/page_store.py
@@ -4,6 +4,8 @@
 from itertools import count
 from pathlib import Path
 
+from ocrd_utils import getLogger
+
 from ocrd_browser.util.image import cv_to_pixbuf, cv_scale
 from ocrd_browser.model import Document, DEFAULT_FILE_GROUP
 from .icon_store import LazyLoadingListStore
@@ -15,7 +17,6 @@
 ChangeList = Union[List[str], Dict[str, str]]
 Column = NewType('Column', int)
 
-
 class PageListStore(LazyLoadingListStore):
     """
     PageListStore is a GTK.ListStore for use with GTK.IconView and works as an adapter to ocrd_browser.model.Document
@@ -50,11 +51,13 @@ def __init__(self, document: Document):
         self.loading_image_pixbuf = GdkPixbuf.Pixbuf.new_from_resource(
             '/org/readmachine/ocrd-browser/icons/loading.png')
 
-        file_lookup = self._get_image_paths(self.document)
+        # TODO: do not hardcode DEFAULT_FILE_GROUP = 'OCR-D-IMG', see https://github.com/hnesk/browse-ocrd/issues/7#issuecomment-707851109
+        file_group = DEFAULT_FILE_GROUP
+        file_lookup = document.get_image_paths(file_group)
         order = count(start=1)
         for page_id in self.document.page_ids:
-            file = str(file_lookup[page_id])
-            self.append((page_id, '', file, None, next(order)))
+            file = file_lookup[page_id]
+            self.append((page_id, '', str(file) if file else None, None, next(order)))
 
         GLib.timeout_add(10, self.start_loading)
 
@@ -139,16 +142,6 @@ def _reordered(old_to_new_ids: Dict[str, str]) -> None:
         }
         handler[subtype](changes)
 
-    @staticmethod
-    def _get_image_paths(document: Document, file_group: str = 'OCR-D-IMG') -> Dict[str, Path]:
-        """
-        Builds a Dict ID->Path for all page_ids fast
-        """
-        images = list(document.workspace.mets.find_files(fileGrp=file_group))
-        page_ids = document.workspace.mets.get_physical_pages(for_fileIds=[image.ID for image in images])
-        file_paths = [document.path(image.url) for image in images]
-        return dict(zip(page_ids, file_paths))
-
     def _init_row(self, row: Gtk.TreeModelRow) -> None:
         row[1] = 'Loading {}'.format(row[self.COLUMN_FILENAME])
         row[3] = self.loading_image_pixbuf
@@ -164,5 +157,8 @@ def _load_row(row: Gtk.TreeModelRow) -> Gtk.TreeModelRow:
     @staticmethod
     def _hash_row(row: Gtk.TreeModelRow) -> str:
         file = row[PageListStore.COLUMN_FILENAME]
-        modified_time = os.path.getmtime(file)
-        return '{}:{}'.format(file, modified_time)
+        if file is not None:
+            modified_time = os.path.getmtime(file)
+            return '{}:{}'.format(file, modified_time)
+        else:
+            return ''