Skip to content

Commit

Permalink
rename: transkribus_{fixer,to_prima}, fix #7
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Jan 11, 2022
1 parent 1bf0e34 commit ac01e9c
Show file tree
Hide file tree
Showing 10 changed files with 62 additions and 62 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# transkribus-fixer
# transkribus-to-prima

> Transforms Transkribus' flavor of PAGE-XML to standard PAGE-XML
> Transforms Transkribus' [flavor of PAGE-XML](https://gitlab.com/readcoop/transkribus/TranskribusCore/-/blob/master/src/main/resources/xsd/pagecontent_extension.xsd) to [standard PAGE-XML](https://ocr-d.de/en/gt-guidelines/trans/trPage.html)
## Installation

From PyPI:

```
pip install transkribus_fixer
pip install transkribus-to-prima
```

From repo root:
Expand Down
14 changes: 7 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
from setuptools import setup

setup(
name='transkribus_fixer',
name='transkribus-to-prima',
version='0.0.1',
author="kba",
author="kba, bertsky",
author_email="unixprog@gmail.com",
url="https://github.com/kba/transkribus-fixer",
url="https://github.com/kba/transkribus-to_prima",
license='Apache License 2.0',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
install_requires=open('requirements.txt').read().split('\n'),
packages=['transkribus_fixer'],
packages=['transkribus_to_prima'],
entry_points={
'console_scripts': [
'transkribus-fixer=transkribus_fixer.cli:cli',
'page-fix-coordinates=transkribus_fixer.cli_coordinate_fixer:cli',
'page-dimensions-from-image=transkribus_fixer.set_dimensions_from_image:cli'
'transkribus-to_prima=transkribus_to_prima.cli:cli',
'page-fix-coordinates=transkribus_to_prima.cli_coordinate_to_prima:cli',
'page-dimensions-from-image=transkribus_to_prima.set_dimensions_from_image:cli'
]
},
)
43 changes: 0 additions & 43 deletions transkribus_fixer/cli.py

This file was deleted.

File renamed without changes.
43 changes: 43 additions & 0 deletions transkribus_to_prima/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pkg_resources import resource_filename
from click import command, Choice, File, argument, option
from lxml import etree as ET

from .convert import TranskribusToPrima, NS

# Every public conversion step is a ``convert_<name>`` method on
# TranskribusToPrima; the CLI exposes ``<name>`` as a choice for -f.
_CONVERT_PREFIX = 'convert_'
CONVERTERS = [attr[len(_CONVERT_PREFIX):] for attr in dir(TranskribusToPrima)
              if attr.startswith(_CONVERT_PREFIX) and callable(getattr(TranskribusToPrima, attr))]
# One "<name>: <docstring>" help line per converter. A method without a
# docstring has ``__doc__ is None``; fall back to a placeholder so that a
# missing docstring cannot crash the module at import time.
CONVERTERDOCS = [name + ': ' + (getattr(TranskribusToPrima, _CONVERT_PREFIX + name).__doc__ or '(undocumented)')
                 for name in CONVERTERS]
# 'namespace' is not a method: it is applied by the CLI as a string
# replacement on the serialized output, so it is registered by hand.
CONVERTERS.append('namespace')
CONVERTERDOCS.append('namespace: Also convert PAGE namespace version from 2013 to 2019.')


@command(context_settings={'help_option_names': ['-h', '--help']})
@option('-f', '--convertes', help="Conversions to apply. Repeatable [default: all].\n\n" + "\n\n".join(CONVERTERDOCS),
        default=CONVERTERS, type=Choice(CONVERTERS), multiple=True)
@option('-I', '--prefer-imgurl', help="use TranskribusMetadata/@imgUrl for @imageFilename if available", is_flag=True)
@option('-V', '--validate', help="Validate output against schema.", is_flag=True)
@argument('input-file', type=File('r'), nargs=1)
@argument('output-file', default='-', type=File('w'), nargs=1)
def cli(convertes, prefer_imgurl, validate, input_file, output_file):
    """
    Transform (Transkribus PAGE) INPUT_FILE to (PRImA PAGE) OUTPUT_FILE under the chosen convertes.
    """
    # NOTE: the docstring above doubles as the --help text shown by click.
    namespace_requested = 'namespace' in convertes

    # Run every selected convert_* step on the parsed tree, in the order given.
    # 'namespace' has no method of its own and is handled after serialization.
    converter = TranskribusToPrima(ET.parse(input_file), prefer_imgurl)
    for step in convertes:
        if step == 'namespace':
            continue
        getattr(converter, f'convert_{step}')()

    as_str = converter.tostring()
    if namespace_requested:
        # The 2013 -> 2019 switch is a plain textual swap of the namespace URI.
        as_str = as_str.replace(NS['p2013'], NS['p2019'])

    if validate:
        # Use the 2013 schema only if the document is still in the 2013
        # namespace, i.e. no namespace conversion was requested and the root
        # element carries the 2013 URI; otherwise validate against 2019.
        stays_2013 = (not namespace_requested
                      and converter.tree.getroot().tag == "{%s}PcGts" % NS['p2013'])
        xsd_path = resource_filename(__name__, 'page2013.xsd' if stays_2013 else 'page2019.xsd')
        validator = ET.XMLSchema(ET.parse(xsd_path))
        # Validate the serialized string, not converter.tree: the namespace
        # replacement above exists only in the string.
        validator.assertValid(ET.fromstring(as_str.encode('utf-8')))

    output_file.write(as_str)

# Allow running this module directly as a script, in addition to the
# console-script entry point that targets ``cli``.
if __name__ == "__main__":
    # click fills in the parameters from the command line, so the call is
    # intentionally made without arguments (hence the pylint suppression).
    cli() # pylint: disable=no-value-for-parameter
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from click import command, argument
from lxml import etree as ET

from .fixer import TranskribusFixer, NS
from .convert import TranskribusToPrima, NS

@command()
@argument('infile')
Expand Down
16 changes: 8 additions & 8 deletions transkribus_fixer/fixer.py → transkribus_to_prima/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
NS2019 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'
NS = {'p2013': NS2013, 'p2019': NS2019}

class TranskribusFixer():
class TranskribusToPrima():
"""
Translates Transkribus variant of PAGE to standard-conformant PAGE
"""
Expand All @@ -13,7 +13,7 @@ def __init__(self, tree, prefer_imgurl=False):
self.tree = tree
self.prefer_imgurl = prefer_imgurl

def fix_metadata(self):
def convert_metadata(self):
"""Remove any Metadata/TranskribusMetadata"""
el_page = self.tree.find('{*}Page')
el_metadata = self.tree.find('{*}Metadata')
Expand All @@ -24,7 +24,7 @@ def fix_metadata(self):
el_page.attrib['imageFilename'] = el_metadata.attrib['imgUrl']
el_metadata.getparent().remove(el_metadata)

def fix_reading_order(self):
def convert_reading_order(self):
"""Convert ???"""
ro = self.tree.xpath('//*[local-name()="ReadingOrder"]/*[local-name()="OrderedGroup"]')[0]
relations = self.tree.xpath('//*[local-name()="Relations"]')
Expand All @@ -44,7 +44,7 @@ def fix_reading_order(self):
if not relations.findall('*'):
relations.getparent().remove(relations)

def fix_table(self):
def convert_table(self):
"""Convert each TableRegion/TableCell into a TableRegion/TextRegion, writing row/col index/span as new TableCellRole accordingly"""
for el_table in self.tree.xpath('//*[local-name()="TableRegion"]'):
for el_cell in el_table.xpath('*[local-name()="TableCell"]'):
Expand All @@ -64,7 +64,7 @@ def fix_table(self):
el_roles = el_region.find('{*}Roles')
if el_roles is None:
el_roles = ET.SubElement(el_region, '{%s}Roles' % NS2013)
# NS2013 does not have TableCellRole, so we implicitly rely on the namespace fixer here
# NS2013 does not have TableCellRole, so we implicitly rely on the namespace converter here
el_tablecellrole = ET.SubElement(el_roles, '{%s}TableCellRole' % NS2013)
el_tablecellrole.set('rowIndex', el_cell.get('row'))
el_tablecellrole.set('columnIndex', el_cell.get('col'))
Expand All @@ -76,7 +76,7 @@ def fix_table(self):
for suf in ['Region', 'TextLine', 'TextEquiv', 'TextStyle']):
el_region.append(node)

def fix_textequiv(self):
def convert_textequiv(self):
"""Convert any //TextEquiv/UnicodeAlternative into additional ../TextEquiv/Unicode"""
for el_te in self.tree.xpath('//*[local-name()="TextEquiv"]'):
for el_uni in el_te.xpath('//*[local-name()="UnicodeAlternative"]'):
Expand All @@ -85,7 +85,7 @@ def fix_textequiv(self):
el_tenewuni = ET.SubElement(el_tenew, '{%s}Unicode' % NS2013)
el_tenewuni.text = el_uni.text

def fix_image_transform(self):
def convert_image_transform(self):
"""Convert Page/@image(Rotation|Translation|Scaling) to Labels"""
el_page = self.tree.find('{*}Page')
el_labels = el_page.find('{*}Labels')
Expand All @@ -111,7 +111,7 @@ def new_label(typ, val):
if label in el_page.attrib:
new_label(label, el_page.attrib.pop(label))

def fix_tag_property_link(self):
def convert_tag_property_link(self):
"""Remove Tag, Property and Link elements whereever they appear"""
# all known under PageType, RegionType, TextLineType, WordType, GlyphType
# Tag known under TextEquivType
Expand Down
File renamed without changes.
File renamed without changes.

0 comments on commit ac01e9c

Please sign in to comment.