Skip to content

Commit

Permalink
Add more assertion checks to the tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Hiromu Hota authored and senwu committed Oct 6, 2020
1 parent 3735553 commit b44cdcd
Showing 1 changed file with 16 additions and 3 deletions.
19 changes: 16 additions & 3 deletions tests/parser/test_preprocessor.py
@@ -1,19 +1,32 @@
"""Unit tests for preprocessors."""
from bs4 import BeautifulSoup

from fonduer.parser.preprocessors.hocr_doc_preprocessor import HOCRDocPreprocessor


def test_hocrpreprocessor():
"""Test hOCRDocPreprocessor."""
"""Test hOCRDocPreprocessor with a simple hOCR."""
path = "tests/data/hocr_simple/md.hocr"
preprocessor = HOCRDocPreprocessor(path=path)
doc = next(iter(preprocessor))
assert doc.name == "md"
# the intermediate attribute: "fonduer" should be removed.
assert "fonduer" not in doc.text
# the number of "left" attributes equals the number of "ppageno" occurrences minus 1 (at ocr_page)
assert doc.text.count("left") == doc.text.count("ppageno") - 1 == 33


def test_hocrpreprocessor_wo_ppageno():
"""Test hOCRDocPreprocessor."""
def test_hocrpreprocessor_space_false():
"""Test hOCRDocPreprocessor with space=False."""
path = "tests/data/hocr_simple/japan.hocr"
preprocessor = HOCRDocPreprocessor(path=path, space=False)
doc = next(iter(preprocessor))
assert doc.name == "japan"
# the intermediate attribute: "fonduer" should be removed.
assert "fonduer" not in doc.text

soup = BeautifulSoup(doc.text, "lxml")
element = soup.find(id="par_1_1")

# A token cannot contain " " (whitespace) as "tokens" are delimited by a " ".
assert len(element.get("left").split()) == len(element.get("tokens").split()) == 59

0 comments on commit b44cdcd

Please sign in to comment.