rename: transkribus_{fixer,to_prima}, fix #7

kba · Jan 11, 2022 · ac01e9c · ac01e9c
1 parent 1bf0e34
commit ac01e9c
Show file tree

Hide file tree

Showing 10 changed files with 62 additions and 62 deletions.
diff --git a/README.md b/README.md
@@ -1,13 +1,13 @@
-# transkribus-fixer
+# transkribus-to-prima
 
-> Transforms Transkribus' flavor of PAGE-XML to standard PAGE-XML
+> Transforms Transkribus' [flavor of PAGE-XML](https://gitlab.com/readcoop/transkribus/TranskribusCore/-/blob/master/src/main/resources/xsd/pagecontent_extension.xsd) to [standard PAGE-XML](https://ocr-d.de/en/gt-guidelines/trans/trPage.html)
 
 ## Installation
 
 From PyPI:
 
 ```
-pip install transkribus_fixer
+pip install transkribus-to-prima
 ```
 
 From repo root:

diff --git a/setup.py b/setup.py
@@ -1,21 +1,21 @@
 from setuptools import setup
 
 setup(
-    name='transkribus_fixer',
+    name='transkribus-to-prima',
     version='0.0.1',
-    author="kba",
+    author="kba, bertsky",
     author_email="unixprog@gmail.com",
-    url="https://github.com/kba/transkribus-fixer",
+    # repository name uses hyphens throughout, like the distribution name
+    url="https://github.com/kba/transkribus-to-prima",
     license='Apache License 2.0',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
     install_requires=open('requirements.txt').read().split('\n'),
-    packages=['transkribus_fixer'],
+    packages=['transkribus_to_prima'],
     entry_points={
         'console_scripts': [
-            'transkribus-fixer=transkribus_fixer.cli:cli',
-            'page-fix-coordinates=transkribus_fixer.cli_coordinate_fixer:cli',
-            'page-dimensions-from-image=transkribus_fixer.set_dimensions_from_image:cli'
+            # script names are hyphenated; module paths must match the files in
+            # transkribus_to_prima/ (cli_coordinate_fixer.py keeps its file name)
+            'transkribus-to-prima=transkribus_to_prima.cli:cli',
+            'page-fix-coordinates=transkribus_to_prima.cli_coordinate_fixer:cli',
+            'page-dimensions-from-image=transkribus_to_prima.set_dimensions_from_image:cli'
         ]
     },
 )
diff --git a/transkribus_fixer/cli.py b/transkribus_fixer/cli.py
diff --git a/transkribus_fixer/__init__.py → transkribus_to_prima/__init__.py b/transkribus_fixer/__init__.py → transkribus_to_prima/__init__.py
diff --git a/transkribus_to_prima/cli.py b/transkribus_to_prima/cli.py
@@ -0,0 +1,43 @@
+from pkg_resources import resource_filename
+from click import command, Choice, File, argument, option
+from lxml import etree as ET
+
+from .convert import TranskribusToPrima, NS
+
+# Names of all available conversions, discovered from the ``convert_*`` methods
+# of TranskribusToPrima (with the ``convert_`` prefix stripped).
+_CONVERT_PREFIX = 'convert_'
+CONVERTERS = [func[len(_CONVERT_PREFIX):] for func in dir(TranskribusToPrima)
+              if func.startswith(_CONVERT_PREFIX) and callable(getattr(TranskribusToPrima, func))]
+# One "name: description" help entry per conversion, taken from the method docstrings.
+# ``__doc__`` is None when a docstring is missing or stripped (``python -OO``), so fall
+# back to an empty description instead of raising TypeError at import time.
+CONVERTERDOCS = [func + ': ' + (getattr(TranskribusToPrima, _CONVERT_PREFIX + func).__doc__ or '')
+                 for func in CONVERTERS]
+# 'namespace' is not a convert_* method; it is offered as an additional choice here.
+CONVERTERS.append('namespace')
+CONVERTERDOCS.append('namespace: Also convert PAGE namespace version from 2013 to 2019.')
+
+
+# Command-line entry point.
+# '--converters' is the correctly spelled option; the original misspelling '--convertes'
+# stays available as an alias, and the explicit parameter name 'convertes' keeps the
+# callback signature unchanged.
+@command(context_settings={'help_option_names': ['-h', '--help']})
+@option('-f', '--converters', '--convertes', 'convertes',
+        help="Conversions to apply. Repeatable [default: all].\n\n" + "\n\n".join(CONVERTERDOCS),
+        default=CONVERTERS, type=Choice(CONVERTERS), multiple=True)
+@option('-I', '--prefer-imgurl', help="use TranskribusMetadata/@imgUrl for @imageFilename if available", is_flag=True)
+@option('-V', '--validate', help="Validate output against schema.", is_flag=True)
+@argument('input-file', type=File('r'), nargs=1)
+@argument('output-file', default='-', type=File('w'), nargs=1)
+def cli(convertes, prefer_imgurl, validate, input_file, output_file):
+    """
+    Transform (Transkribus PAGE) INPUT_FILE to (PRImA PAGE) OUTPUT_FILE under the chosen converters.
+    """
+    converter = TranskribusToPrima(ET.parse(input_file), prefer_imgurl)
+    # Every choice except 'namespace' maps to a convert_<name> method on the converter;
+    # the methods are called in the order the choices were given.
+    for name in convertes:
+        if name != 'namespace':
+            getattr(converter, f'convert_{name}')()
+    as_str = converter.tostring()
+    convert_namespace = 'namespace' in convertes
+    if convert_namespace:
+        # The namespace conversion is a plain textual replacement on the serialized
+        # document: every occurrence of the 2013 URI becomes the 2019 URI.
+        as_str = as_str.replace(NS['p2013'], NS['p2019'])
+    if validate:
+        # Use the 2013 schema only if the namespace was not converted and the root
+        # element really is in the 2013 namespace; otherwise use the 2019 schema.
+        root_tag = converter.tree.getroot().tag
+        if not convert_namespace and root_tag == "{%s}PcGts" % NS['p2013']:
+            xsd_path = resource_filename(__name__, 'page2013.xsd')
+        else:
+            xsd_path = resource_filename(__name__, 'page2019.xsd')
+        schema = ET.XMLSchema(ET.parse(xsd_path))
+        # Validate the serialized result rather than converter.tree, so that the
+        # textual namespace replacement above is taken into account.
+        schema.assertValid(ET.fromstring(as_str.encode('utf-8')))
+    # Output is written only after validation (if requested) has passed.
+    output_file.write(as_str)
+
+# Script entry point: click fills in the parameters of cli() from the command line,
+# which is why pylint's missing-argument warning is suppressed below.
+if __name__ == "__main__":
+    cli() # pylint: disable=no-value-for-parameter
diff --git a/transkribus_fixer/cli_coordinate_fixer.py → transkribus_to_prima/cli_coordinate_fixer.py b/transkribus_fixer/cli_coordinate_fixer.py → transkribus_to_prima/cli_coordinate_fixer.py
@@ -1,7 +1,7 @@
 from click import command, argument
 from lxml import etree as ET
 
-from .fixer import TranskribusFixer, NS
+from .convert import TranskribusToPrima, NS
 
 @command()
 @argument('infile')

diff --git a/transkribus_fixer/fixer.py → transkribus_to_prima/convert.py b/transkribus_fixer/fixer.py → transkribus_to_prima/convert.py
@@ -4,7 +4,7 @@
 NS2019 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'
 NS = {'p2013': NS2013, 'p2019': NS2019}
 
-class TranskribusFixer():
+class TranskribusToPrima():
     """
     Translates Transkribus variant of PAGE to standard-conformant PAGE
     """
@@ -13,7 +13,7 @@ def __init__(self, tree, prefer_imgurl=False):
         self.tree = tree
         self.prefer_imgurl = prefer_imgurl
 
-    def fix_metadata(self):
+    def convert_metadata(self):
         """Remove any Metadata/TranskribusMetadata"""
         el_page = self.tree.find('{*}Page')
         el_metadata = self.tree.find('{*}Metadata')
@@ -24,7 +24,7 @@ def fix_metadata(self):
                 el_page.attrib['imageFilename'] = el_metadata.attrib['imgUrl']
             el_metadata.getparent().remove(el_metadata)
 
-    def fix_reading_order(self):
+    def convert_reading_order(self):
         """Convert ???"""
         ro = self.tree.xpath('//*[local-name()="ReadingOrder"]/*[local-name()="OrderedGroup"]')[0]
         relations = self.tree.xpath('//*[local-name()="Relations"]')
@@ -44,7 +44,7 @@ def fix_reading_order(self):
         if not relations.findall('*'):
             relations.getparent().remove(relations)
 
-    def fix_table(self):
+    def convert_table(self):
         """Convert each TableRegion/TableCell into a TableRegion/TextRegion, writing row/col index/span as new TableCellRole accordingly"""
         for el_table in self.tree.xpath('//*[local-name()="TableRegion"]'):
             for el_cell in el_table.xpath('*[local-name()="TableCell"]'):
@@ -64,7 +64,7 @@ def fix_table(self):
                 el_roles = el_region.find('{*}Roles')
                 if el_roles is None:
                     el_roles = ET.SubElement(el_region, '{%s}Roles' % NS2013)
-                # NS2013 does not have TableCellRole, so we implicitly rely on the namespace fixer here
+                # NS2013 does not have TableCellRole, so we implicitly rely on the namespace converter here
                 el_tablecellrole = ET.SubElement(el_roles, '{%s}TableCellRole' % NS2013)
                 el_tablecellrole.set('rowIndex', el_cell.get('row'))
                 el_tablecellrole.set('columnIndex', el_cell.get('col'))
@@ -76,7 +76,7 @@ def fix_table(self):
                            for suf in ['Region', 'TextLine', 'TextEquiv', 'TextStyle']):
                         el_region.append(node)
 
-    def fix_textequiv(self):
+    def convert_textequiv(self):
         """Convert any //TextEquiv/UnicodeAlternative into additional ../TextEquiv/Unicode"""
         for el_te in self.tree.xpath('//*[local-name()="TextEquiv"]'):
             for el_uni in el_te.xpath('//*[local-name()="UnicodeAlternative"]'):
@@ -85,7 +85,7 @@ def fix_textequiv(self):
                 el_tenewuni = ET.SubElement(el_tenew, '{%s}Unicode' % NS2013)
                 el_tenewuni.text = el_uni.text
 
-    def fix_image_transform(self):
+    def convert_image_transform(self):
         """Convert Page/@image(Rotation|Translation|Scaling) to Labels"""
         el_page = self.tree.find('{*}Page')
         el_labels = el_page.find('{*}Labels')
@@ -111,7 +111,7 @@ def new_label(typ, val):
             if label in el_page.attrib:
                 new_label(label, el_page.attrib.pop(label))
 
-    def fix_tag_property_link(self):
+    def convert_tag_property_link(self):
         """Remove Tag, Property and Link elements wherever they appear"""
         # all known under PageType, RegionType, TextLineType, WordType, GlyphType
         # Tag known under TextEquivType

diff --git a/transkribus_fixer/page2013.xsd → transkribus_to_prima/page2013.xsd b/transkribus_fixer/page2013.xsd → transkribus_to_prima/page2013.xsd
diff --git a/transkribus_fixer/page2019.xsd → transkribus_to_prima/page2019.xsd b/transkribus_fixer/page2019.xsd → transkribus_to_prima/page2019.xsd
diff --git a/...kribus_fixer/set_dimensions_from_image.py → ...bus_to_prima/set_dimensions_from_image.py b/...kribus_fixer/set_dimensions_from_image.py → ...bus_to_prima/set_dimensions_from_image.py