Skip to content

Commit

Permalink
update master (#408)
Browse files Browse the repository at this point in the history
* TLDR-549 delete custom loggers (#393)

* changed dedoc-utils version (#394)

* remove PdfTxtlayerReader from TxtLayerDetector (#395)

* Make train dataset API separated (#396)

* TLDR-584 words boldness for images (#397)

* TLDR-584 text boldness for words in images

* TLDR-582 fix pdf_txtlayer_reader bboxes for lines (labeling_mode="true") (#399)

* TLDR-585 added TEDS table benchmark (#398)

* TLDR-538 tesseract postprocessing (#388)

* TLDR-590 fix code style in scripts directory (#400)

* Add job to meet requirements of the develop branch (#401)

* TLDR-602 some fixes of web form (#402)

* Translate labeling web pages into English (#403)

* TLDR-556 tutorial how to add a new structure type (#405)

* TLDR-182 eml reader bug fix (#406)

* new version (#407)

---------

Co-authored-by: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com>
Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
Co-authored-by: Andrew Perminov <perminov@ispras.ru>
Co-authored-by: Oksana Belyaeva <belyaeva@ispras.ru>
  • Loading branch information
5 people committed Mar 5, 2024
1 parent 1888659 commit 297dec8
Show file tree
Hide file tree
Showing 234 changed files with 64,121 additions and 1,487 deletions.
6 changes: 3 additions & 3 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ max-line-length = 160
max-complexity = 13
inline-quotes = "
application-import-names = dedoc, tests
application-import-names = dedoc, tests, scripts, train_dataset
import-order-style = pycharm
exclude =
Expand All @@ -14,8 +14,6 @@ exclude =
.github,
*__init__.py,
resources,
dedoc/scripts,
examples,
venv,
build,
dedoc.egg-info
Expand All @@ -24,3 +22,5 @@ exclude =
# ANN101 - type annotations for self
ignore =
ANN101
per-file-ignores =
scripts/*:T201
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ jobs:
cd docs/source/_static/code_examples
python dedoc_usage_tutorial.py
python dedoc_add_new_doc_type_tutorial.py
python dedoc_add_new_structure_type_tutorial.py
34 changes: 34 additions & 0 deletions .github/workflows/test_labeling.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
name: CI

# Controls when the action will run.
on:
pull_request:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
push:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

jobs:
labeling:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: '3.9'
- name: Run tests for labeling
run: |
test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test
17 changes: 17 additions & 0 deletions .github/workflows/test_skip.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
name: Skip CI

on:
push:
branches:
- new_version
paths:
- 'VERSION'
- 'docs/source/changelog.rst'
workflow_dispatch:

jobs:
pipeline:
runs-on: ubuntu-latest
steps:
- name: Skip tests (only VERSION and changelog have been changed)
run: echo "This is used to meet the requirements of pull-request to the develop branch (pipeline should pass)"
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ ENV/
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/vers
ADD tests /dedoc_root/tests
ADD resources /dedoc_root/resources

CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
CMD ["python3", "/dedoc_root/dedoc/main.py"]
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0
2.1
4 changes: 2 additions & 2 deletions dedoc/api/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab
if tables is not None and len(tables) > 0:
text += "<h3> Tables: </h3>"
for table in tables:
text += __table2html(table, table2id)
text += table2html(table, table2id)
text += "<p>&nbsp;</p>"
return text

Expand Down Expand Up @@ -201,7 +201,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str:
return text.replace("\n", "<br>")


def __table2html(table: Table, table2id: Dict[str, int]) -> str:
def table2html(table: Table, table2id: Dict[str, int]) -> str:
uid = table.metadata.uid
text = f"<h4> table {table2id[uid]}:</h4>"
text += f'<table border="1" id={uid} style="border-collapse: collapse; width: 100%;">\n<tbody>\n'
Expand Down
16 changes: 10 additions & 6 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
<style>
.parameters {padding: 5px; box-shadow: 1px 1px 2px #bbbbbb; width: 70%}
.body {margin-left: 20%}
details > summary {font-style: italic; cursor: pointer}
details > summary {font-style: italic; cursor: pointer; display: list-item;}
.child.max {padding-left: 5px; flex: 1}
.parent {display: flex}
</style>
</head>

Expand Down Expand Up @@ -92,11 +94,13 @@ <h4>Attachments handling</h4>


<div class="parameters">
<h4>Tables handling</h4>
<h4>Tables handling </h4>
<details><summary>need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle</summary>
<br>
<p>
<label><input name="need_pdf_table_analysis" type="checkbox" value="true" checked> need_pdf_table_analysis</label>
<label>
<input type="hidden" name="need_pdf_table_analysis" value="false">
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
</p>

<p>
Expand Down Expand Up @@ -188,9 +192,9 @@ <h4>Other formats handling</h4>
</div>

<br>
<div class="row">
<div class="col-md-3"><input type=file name=file class="btn btn-default" data-buttonText="Choose file"></div>
<div class="col-md-2"><input type=submit value=Upload class="btn btn-default"></div>
<div class="parent">
<div class="child"><input type=file name=file class="btn btn-default" data-buttonText="Choose file"></div>
<div class="child max"><input type=submit value=Upload class="btn btn-default"></div>
</div>

</form>
Expand Down
38 changes: 0 additions & 38 deletions dedoc/api/web/train_dataset/info_labeling_mode.html

This file was deleted.

20 changes: 5 additions & 15 deletions dedoc/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import importlib.util
import logging
import os
import sys
from typing import Any, Optional

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")

Expand All @@ -13,6 +11,7 @@
# -----------------------------------------RESOURCES PATH SETTINGS----------------------------------------------------
resources_path=RESOURCES_PATH,
intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"),
table_path="/tmp/tables",

# -----------------------------------------COMMON DEBUG SETTINGS----------------------------------------------------
debug_mode=DEBUG_MODE,
Expand Down Expand Up @@ -66,20 +65,11 @@ def get_instance(cls: "Configuration") -> "Configuration":

return cls.__instance

def __init_config(self, args: Optional[Any] = None) -> None:
if args is not None and args.config_path is not None:
spec = importlib.util.spec_from_file_location("config_module", args.config_path)
config_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config_module)
self.__config = config_module._config
else:
def get_config(self) -> dict:
if self.__config is None:
self.__config = _config

def get_config(self, args: Optional[Any] = None) -> dict:
if self.__config is None or args is not None:
self.__init_config(args)
return self.__config


def get_config(args: Optional[Any] = None) -> dict:
return Configuration.get_instance().get_config(args)
def get_config() -> dict:
return Configuration.get_instance().get_config()
46 changes: 27 additions & 19 deletions dedoc/data_structures/hierarchy_level.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from functools import total_ordering
from typing import Optional

import numpy as np


@total_ordering
class HierarchyLevel:
Expand All @@ -12,7 +14,9 @@ class HierarchyLevel:
- level_1 defines primary importance (e.g. root - level_1=0, header - level_1=1, etc.);
- level_2 defines the level inside lines of equal type (e.g. for list items - "1." - level_2=1, "1.1." - level_2=2, etc.).
For the least important lines like raw_text both levels are None.
For the least important lines (line_type=raw_text) both levels are None.
See the :ref:`hierarchy level description <add_structure_type_hierarchy_level>` for more details.
"""
root = "root"
toc = "toc"
Expand Down Expand Up @@ -46,43 +50,47 @@ def __is_defined(self, other: "HierarchyLevel") -> bool:
def __eq__(self, other: "HierarchyLevel") -> bool:
"""
Defines the equality of two hierarchy levels:
- two raw text lines or lines with unknown type are equal;
- two lines with equal level_1, level_2 are equal.
- if any of the levels is None, its value is treated as +inf (all infinities compare equal)
:param other: other hierarchy level
:return: whether current hierarchy level == other hierarchy level
"""
if not isinstance(other, HierarchyLevel):
return False

if self.__is_defined(other) and (self.level_1, self.level_2) == (other.level_1, other.level_2):
return True
if self.line_type == HierarchyLevel.raw_text and other.line_type == HierarchyLevel.raw_text:
return True
if self.line_type == HierarchyLevel.unknown and other.line_type == HierarchyLevel.unknown:
return True
return False
level_1, level_2 = self.__to_number(self.level_1), self.__to_number(self.level_2)
other_level_1, other_level_2 = self.__to_number(other.level_1), self.__to_number(other.level_2)
return (level_1, level_2) == (other_level_1, other_level_2)

def __lt__(self, other: "HierarchyLevel") -> bool:
"""
Defines the comparison of hierarchy levels:
- line1 < line2 if (level_1, level_2) of line1 <= (level_1, level_2) of line2;
- line1 < line2 if line2 is raw text or unknown, and line1 has another type.
- current level < other level if (level_1, level_2) < other (level_1, level_2);
- if any of the levels is None, its value is treated as +inf (all infinities compare equal)
Else line1 >= line2.
:param other: hierarchy level of the line2
:param other: other hierarchy level
:return: whether current hierarchy level < other hierarchy level
"""
# all not None
if self.__is_defined(other):
return (self.level_1, self.level_2) < (other.level_1, other.level_2)

# all None
if self.level_1 is None and self.level_2 is None and other.level_1 is None and other.level_2 is None:
return False
if (self.level_1 is None or self.level_2 is None) and (other.level_1 is not None or other.level_2 is not None):
return False
if (self.level_1 is not None or self.level_2 is not None) and (other.level_1 is None or other.level_2 is None):
return True
return (self.level_1, self.level_2) < (other.level_1, other.level_2)

level_1, level_2 = self.__to_number(self.level_1), self.__to_number(self.level_2)
other_level_1, other_level_2 = self.__to_number(other.level_1), self.__to_number(other.level_2)

return (level_1, level_2) < (other_level_1, other_level_2)

def __str__(self) -> str:
return f"HierarchyLevel(level_1={self.level_1}, level_2={self.level_2}, can_be_multiline={self.can_be_multiline}, line_type={self.line_type})"

def __to_number(self, x: Optional[int]) -> int:
return np.inf if x is None else x

def is_raw_text(self) -> bool:
"""
Check if the line is raw text.
Expand Down
5 changes: 3 additions & 2 deletions dedoc/dedoc_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from dedoc.data_structures import ParsedDocument, UnstructuredDocument
from dedoc.manager_config import get_manager_config
from dedoc.metadata_extractors import BaseMetadataExtractor
from dedoc.train_dataset.train_dataset_utils import get_path_original_documents, save_line_with_meta
from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta
from dedoc.utils.utils import get_unique_name


Expand Down Expand Up @@ -114,7 +114,7 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
self.logger.info(f"Extract structure from file {file_name}")

if self.config.get("labeling_mode", False):
self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document)
self.__save(converted_file_path, unstructured_document)

# Step 5 - Form the output structure
parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters)
Expand All @@ -141,5 +141,6 @@ def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict:
return result_parameters

def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None:
self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}')
save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path))
shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path)))
25 changes: 3 additions & 22 deletions dedoc/main.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,7 @@
import argparse

from dedoc.api.dedoc_api import get_api, run_api # noqa
from dedoc.config import Configuration, get_config


def main() -> None:
run_api(get_api())
from dedoc.config import Configuration


if __name__ == "__main__":
parser_config = argparse.ArgumentParser()
parser_config.add_argument("-c", "--config_path", help="path to configuration file")
parser_config.add_argument("-m", "--module", help="Only for tests")
parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs="*", help="Only for tests")
parser_config.add_argument("-v", "--unitest_verbose_mode", nargs="?", help="to enable verbose mode of unittest. Only for tests")

args_config = parser_config.parse_args()
Configuration.get_instance().get_config(args_config)
config = get_config()

if config.get("labeling_mode", False):
from api.train_dataset.train_dataset_api import run_special_api # noqa
run_special_api()
else:
main()
Configuration.get_instance().get_config()
run_api(get_api())

0 comments on commit 297dec8

Please sign in to comment.