Skip to content

Commit

Permalink
Moved and fixed get_image_paths to cope with missing images
Browse files Browse the repository at this point in the history
in preparation for #7
  • Loading branch information
hnesk committed Oct 14, 2020
1 parent 097c9ba commit cdd6d39
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 22 deletions.
36 changes: 30 additions & 6 deletions ocrd_browser/model/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

import cv2

log = getLogger(__name__)

EventCallBack = Optional[Callable[[str, Any], None]]

Expand Down Expand Up @@ -70,6 +69,7 @@ def clone(cls, mets_url: Union[Path, str], emitter: EventCallBack = None) -> 'Do
"""
Clones a project (mets.xml and all used files) to a temporary directory for editing
"""
log = getLogger('ocrd_browser.model.document.Document.clone')
mets_url = cls._strip_local(mets_url, disallow_remote=False)
temporary_workspace = mkdtemp(prefix='browse-ocrd-clone-')
# TODO download = False and lazy loading would be nice for responsiveness
Expand Down Expand Up @@ -144,7 +144,7 @@ def _tree(self) -> ElementTree:
# noinspection PyProtectedMember
return self.workspace.mets._tree

def xpath(self, xpath: str) -> Any:
    """
    Evaluate an XPath expression on the root element of the METS tree

    The expression is evaluated with the module's ``NS`` namespace mapping, so the
    prefixes defined there (e.g. ``mets:``) can be used in ``xpath``.
    The result is whatever the underlying ``xpath()`` call of the root element returns.
    """
    return self._tree.getroot().xpath(xpath, namespaces=NS)

@property
Expand Down Expand Up @@ -202,10 +202,14 @@ def get_file_index(self) -> Dict[str, OcrdFile]:
page17 = [file for file in file_index.values() if file.static_page_id == 'PHYS_0017']
"""
file_index = {file.ID: file for file in self.workspace.mets.find_files()}
file_pointers = self.xpath('mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr')
log = getLogger('ocrd_browser.model.document.Document.get_file_index')
file_index = {}
for file in self.workspace.mets.find_files():
file.static_page_id = None
file_index[file.ID] = file

file_pointers: List[Element] = self.xpath('mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr')
for file_pointer in file_pointers:
file_pointer: Element
file_id = file_pointer.get('FILEID')
page_id = file_pointer.getparent().get('ID')
if file_id in file_index:
Expand All @@ -215,6 +219,25 @@ def get_file_index(self) -> Dict[str, OcrdFile]:

return file_index

def get_image_paths(self, file_group: str) -> Dict[str, Optional[Path]]:
    """
    Builds a Dict page_id -> image path for all page_ids fast

    More precisely: the file index is built and scanned only once and grouped by page,
    which is faster than iterating over page_ids and using
    mets.get_physical_page_for_file (or a scan of the whole index) for each entry.

    The result has one entry per id in ``self.page_ids``, in that order. The value is
    ``self.path(file)`` if exactly one file of ``file_group`` is linked to the page,
    otherwise (no image or more than one image) it is ``None`` and a warning is logged.
    """
    log = getLogger('ocrd_browser.model.document.Document.get_image_paths')

    # Single pass over the index: collect the files of the requested group per page,
    # keeping the index order inside each group.
    images_by_page: Dict[Optional[str], list] = {}
    for image in self.get_file_index().values():
        if image.fileGrp == file_group:
            images_by_page.setdefault(image.static_page_id, []).append(image)

    image_paths: Dict[str, Optional[Path]] = {}
    for page_id in self.page_ids:
        images = images_by_page.get(page_id, ())
        if len(images) == 1:
            image_paths[page_id] = self.path(images[0])
        else:
            # Missing or ambiguous image: keep the page in the result so callers can
            # still look up every page_id, but mark it as having no usable image.
            log.warning('Found %d images for PAGE %s and fileGrp %s, expected 1', len(images), page_id, file_group)
            image_paths[page_id] = None
    return image_paths

def get_unused_page_id(self, template_page_id: str = 'PAGE_{page_nr}') -> Tuple[str, int]:
"""
Finds a page_nr that yields an unused page_id for the workspace and returns page_id, page_nr
Expand Down Expand Up @@ -250,11 +273,12 @@ def display_id_range(self, page_id: str, page_qty: int) -> List[str]:

def page_for_id(self, page_id: str, file_group: str = None) -> Optional['Page']:
"""
Find the Page object for page_id and filegroup
Find the Page object for page_id and file_group
If no PAGE-XML is found, a single image file in the group is used instead; PAGE-XML will be populated automatically by page_from_image
This is modelled after Processor.input_files https://github.com/OCR-D/core/pull/556/
"""
log = getLogger('ocrd_browser.model.document.Document.page_for_id')
page_files = self.files_for_page_id(page_id, file_group, MIMETYPE_PAGE)
if not page_files:
page_files = self.files_for_page_id(page_id, file_group, mimetype="//image/.*")
Expand Down
28 changes: 12 additions & 16 deletions ocrd_browser/ui/page_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from itertools import count
from pathlib import Path

from ocrd_utils import getLogger

from ocrd_browser.util.image import cv_to_pixbuf, cv_scale
from ocrd_browser.model import Document, DEFAULT_FILE_GROUP
from .icon_store import LazyLoadingListStore
Expand All @@ -15,7 +17,6 @@
ChangeList = Union[List[str], Dict[str, str]]
Column = NewType('Column', int)


class PageListStore(LazyLoadingListStore):
"""
PageListStore is a GTK.ListStore for use with GTK.IconView and works as an adapter to ocrd_browser.model.Document
Expand Down Expand Up @@ -50,11 +51,13 @@ def __init__(self, document: Document):
self.loading_image_pixbuf = GdkPixbuf.Pixbuf.new_from_resource(
'/org/readmachine/ocrd-browser/icons/loading.png')

file_lookup = self._get_image_paths(self.document)
# TODO: do not hardcode DEFAULT_FILE_GROUP = 'OCR-D-IMG', see https://github.com/hnesk/browse-ocrd/issues/7#issuecomment-707851109
file_group = DEFAULT_FILE_GROUP
file_lookup = document.get_image_paths(file_group)
order = count(start=1)
for page_id in self.document.page_ids:
file = str(file_lookup[page_id])
self.append((page_id, '', file, None, next(order)))
file = file_lookup[page_id]
self.append((page_id, '', str(file) if file else None, None, next(order)))

GLib.timeout_add(10, self.start_loading)

Expand Down Expand Up @@ -139,16 +142,6 @@ def _reordered(old_to_new_ids: Dict[str, str]) -> None:
}
handler[subtype](changes)

@staticmethod
def _get_image_paths(document: Document, file_group: str = 'OCR-D-IMG') -> Dict[str, Path]:
    """
    Quickly builds a Dict page_id -> image path for the files of one file group

    All files of ``file_group`` are fetched from the METS, their physical page ids are
    requested in a single call and both lists are paired up by position.
    NOTE(review): this presumes get_physical_pages yields exactly one page id per
    image, in the same order as the images - confirm against OcrdMets.
    """
    mets = document.workspace.mets
    image_files = list(mets.find_files(fileGrp=file_group))
    image_ids = [image_file.ID for image_file in image_files]
    page_ids = mets.get_physical_pages(for_fileIds=image_ids)
    image_paths = [document.path(image_file.url) for image_file in image_files]
    return {page_id: image_path for page_id, image_path in zip(page_ids, image_paths)}

def _init_row(self, row: Gtk.TreeModelRow) -> None:
row[1] = 'Loading {}'.format(row[self.COLUMN_FILENAME])
row[3] = self.loading_image_pixbuf
Expand All @@ -164,5 +157,8 @@ def _load_row(row: Gtk.TreeModelRow) -> Gtk.TreeModelRow:
@staticmethod
def _hash_row(row: Gtk.TreeModelRow) -> str:
    """
    Builds a string identifying the current state of the file shown in row

    Returns '<filename>:<modification time>' for rows that have a file and an empty
    string for rows whose filename column is None (page without an image).
    """
    file = row[PageListStore.COLUMN_FILENAME]
    # Guard first: os.path.getmtime must never be called for a row without a file
    if file is None:
        return ''
    modified_time = os.path.getmtime(file)
    return '{}:{}'.format(file, modified_time)

0 comments on commit cdd6d39

Please sign in to comment.