From 62599b4347b2007f84610fe6763e7c5d068c0b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=20Boug=C3=A9?= Date: Tue, 16 Feb 2021 15:32:11 +0100 Subject: [PATCH 1/6] Fix NullPointer exception in PDFALTOOutlineSaxHandler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I observed the following error. févr. 12, 2021 3:20:44 PM org.grobid.core.document.Document addTokenizedDocument GRAVE: Cannot parse file: my_dir/grobid-0.6.1/grobid-home/tmp/ZW3PpM4E9g.lxml_outline.xml févr. 12, 2021 3:20:44 PM org.grobid.core.document.Document addTokenizedDocument GRAVE: Cannot parse file: my_dir/grobid-0.6.1/grobid-home/tmp/ZW3PpM4E9g.lxml_outline.xml févr. 12, 2021 3:20:46 PM org.grobid.core.engines.ProcessEngine createTraining INFOS: 2 files processed. It originates from an illegal null-pointer access in file PDFALTOOutlineSaxHandler.java. I understand that the error is not that "GRAVE". According to @kermitt2, it is simply that one of the generated XML files resulting from the PDF parsing is not valid XML (very frequent) - it has no impact because the outline file (containing the table-of-contents outline, if available, embedded in the PDF) is not exploited by GROBID for the moment - it's to allow some possible improvements in the future. The error is more of a reminder for the developers... it is the XML parser that classifies it as "GRAVE", but it would rather be INFO for us. Unfortunately, a side effect might have been overlooked. If father is null, then father.addChild(currentNode) is still called, which throws a NullPointerException. This exception is caught by catch (Exception e) at line 372 in grobid-core/src/main/java/org/grobid/core/document/Document.java where the error message "Cannot parse" is misleading. I think an additional else is just missing. 
--- .../grobid/core/sax/PDFALTOOutlineSaxHandler.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index 3e770543ef..b127d30ef9 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -92,10 +92,22 @@ public void startElement(String namespaceURI, String localName, nodes.put(currentId,currentNode); if (currentParentId != -1) { DocumentNode father = nodes.get(currentParentId); + // Correction proposed by Luc Bougé @lucbouge + // If father is null, then one should not access any method of father. + /************************* if (father == null) System.out.println("Warning, father not yet encountered! id is " + currentParentId); currentNode.setFather(father); father.addChild(currentNode); + *************************/ + if (father == null) + System.out.println("Warning, father not yet encountered! id is " + currentParentId); + else { + currentNode.setFather(father); + father.addChild(currentNode); + } + /************************/ + // End of correction } else { // parent is the root node currentNode.setFather(root); @@ -211,4 +223,4 @@ public void startElement(String namespaceURI, String localName, accumulator.setLength(0); } -} \ No newline at end of file +} From 2340aa7636773577f3fedaa88058892bf9062ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=20Boug=C3=A9?= Date: Fri, 19 Feb 2021 19:47:22 +0100 Subject: [PATCH 2/6] Update PDFALTOOutlineSaxHandler.java I cleaned up the correction as requested. Luc. 
--- .../org/grobid/core/sax/PDFALTOOutlineSaxHandler.java | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index b127d30ef9..d4ce968df7 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -92,22 +92,12 @@ public void startElement(String namespaceURI, String localName, nodes.put(currentId,currentNode); if (currentParentId != -1) { DocumentNode father = nodes.get(currentParentId); - // Correction proposed by Luc Bougé @lucbouge - // If father is null, then one should not access any method of father. - /************************* - if (father == null) - System.out.println("Warning, father not yet encountered! id is " + currentParentId); - currentNode.setFather(father); - father.addChild(currentNode); - *************************/ if (father == null) System.out.println("Warning, father not yet encountered! 
id is " + currentParentId); else { currentNode.setFather(father); father.addChild(currentNode); } - /************************/ - // End of correction } else { // parent is the root node currentNode.setFather(root); From 300efd318157817afce26c06cef0ee87bdc30e4d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 2 Mar 2021 14:49:25 +0900 Subject: [PATCH 3/6] fixing wrong assignment of father id and wrong override of label, adding a test case, rename test to match the class name --- .../grobid/core/document/DocumentNode.java | 29 ++++--- .../core/sax/PDFALTOOutlineSaxHandler.java | 20 +++-- .../sax/PDFALTOOutlineSaxHandlerTest.java | 74 ++++++++++++++++++ .../core/sax/PDFALTOOutlineSaxParserTest.java | 47 ------------ .../org/grobid/core/sax/test_outline.xml | 75 +++++++++++++++++++ 5 files changed, 180 insertions(+), 65 deletions(-) create mode 100644 grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java delete mode 100644 grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java create mode 100644 grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml diff --git a/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java b/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java index cc93f386ef..2ecb71f915 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java +++ b/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java @@ -12,32 +12,33 @@ */ public class DocumentNode { + private Integer id = null; + // Gorn address for tree structure private String address = null; - // real numbering of the section, if any - private String realNumber = null; + private String realNumber = null; // normalized numbering of the section, if any - private String normalizedNumber = null; + private String normalizedNumber = null; // the string attached to this document level, e.g. 
section title - private String label = null; + private String label = null; // list of child document nodes - private List children = null; + private List children = null; // offset relatively to the document tokenization (so token offset, NOT character offset) + public int startToken = -1; public int endToken = -1; - // coordinates of the string attached to this document level, typically where an index link // action point in the document - private BoundingBox boundingBox = null; + private BoundingBox boundingBox = null; // parent document node, if null it is a root node - private DocumentNode father = null; + private DocumentNode father = null; public DocumentNode() { } @@ -128,8 +129,8 @@ public String toString() { } public String toString(int tab) { - StringBuffer sb = new StringBuffer(); - sb.append(address + " " + label + " " + startToken + " " + endToken + "\n"); + StringBuilder sb = new StringBuilder(); + sb.append(id).append(" ").append(address).append(" ").append(label).append(" ").append(startToken).append(" ").append(endToken).append("\n"); if (children != null) { for (DocumentNode node : children) { @@ -169,6 +170,7 @@ public DocumentNode getSpanningNode(int position) { } } + /*public DocumentNode nextSlibing() { if ( (children != null) && (children.size() > 0) ) { return children.get(0); @@ -182,5 +184,12 @@ else if (father == null) { } } }*/ + public Integer getId() { + return id; + } + + public void setId(Integer id) { + this.id = id; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index d4ce968df7..57081fce97 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -32,6 +32,7 @@ public class PDFALTOOutlineSaxHandler extends DefaultHandler { private int currentParentId = -1; private Map nodes = null; + private 
Map labels = new HashMap<>(); public PDFALTOOutlineSaxHandler(Document doc) { this.doc = doc; @@ -53,11 +54,14 @@ public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { if (qName.equals("STRING")) { - label = getText(); + currentNode.setLabel(getText()); + currentNode.setBoundingBox(box); } else if (qName.equals("ITEM")) { - currentNode.setLabel(label); - currentNode.setBoundingBox(box); - } + box = null; + label = null; + } else if (qName.equals("TOCITEMLIST")) { + currentParentId = -1; + } } public void startElement(String namespaceURI, String localName, @@ -66,7 +70,7 @@ public void startElement(String namespaceURI, String localName, // this is the document root root = new DocumentNode(); nodes = new HashMap(); - } if (qName.equals("ITEM")) { + } else if (qName.equals("ITEM")) { currentNode = new DocumentNode(); // get the node id int length = atts.getLength(); @@ -78,7 +82,7 @@ public void startElement(String namespaceURI, String localName, String value = atts.getValue(i); if ((name != null) && (value != null)) { - if (name.equals("id")) { + if (name.equalsIgnoreCase("id")) { try { currentId = Integer.parseInt(value); } catch(Exception e) { @@ -88,12 +92,12 @@ public void startElement(String namespaceURI, String localName, } } } - //currentNode.setId(currentId); + currentNode.setId(currentId); nodes.put(currentId,currentNode); if (currentParentId != -1) { DocumentNode father = nodes.get(currentParentId); if (father == null) - System.out.println("Warning, father not yet encountered! id is " + currentParentId); + LOGGER.warn("Father not yet encountered! 
id is " + currentParentId); else { currentNode.setFather(father); father.addChild(currentNode); diff --git a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java new file mode 100644 index 0000000000..55ff70622a --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java @@ -0,0 +1,74 @@ +package org.grobid.core.sax; + +import org.grobid.core.document.Document; +import org.grobid.core.document.DocumentSource; +import org.grobid.core.document.DocumentNode; +import org.junit.Before; +import org.junit.Test; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import java.io.InputStream; + +import static org.easymock.EasyMock.createMock; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.nullValue; +import static org.hamcrest.collection.IsCollectionWithSize.hasSize; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; + +public class PDFALTOOutlineSaxHandlerTest { + SAXParserFactory spf = SAXParserFactory.newInstance(); + + PDFALTOOutlineSaxHandler target; + DocumentSource mockDocumentSource; + Document document; + + @Before + public void setUp() throws Exception { + + mockDocumentSource = createMock(DocumentSource.class); + + document = Document.createFromText(""); + target = new PDFALTOOutlineSaxHandler(document); + } + + @Test + public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception { + InputStream is = this.getClass().getResourceAsStream("pdfalto.xml_outline.xml"); + + SAXParser p = spf.newSAXParser(); + p.parse(is, target); + + DocumentNode root = target.getRootNode(); + assertTrue(root.getChildren().size() > 0); + assertThat(root.getChildren(), hasSize(9)); + } + + @Test + public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { + InputStream is = 
this.getClass().getResourceAsStream("test_outline.xml"); + + SAXParser p = spf.newSAXParser(); + p.parse(is, target); + + DocumentNode root = target.getRootNode(); + assertThat(root.getChildren(), hasSize(5)); + assertThat(root.getChildren().get(0).getLabel(), is("A Identification")); + assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); + assertThat(root.getChildren().get(1).getLabel(), is("B Résumé consolidé public.")); + assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); + assertThat(root.getChildren().get(2).getLabel(), is("C Mémoire scientifique en français")); + assertThat(root.getChildren().get(2).getChildren(), hasSize(6)); + assertThat(root.getChildren().get(2).getChildren().get(2).getLabel(), is("C.3 Approche scientifique et technique")); + assertThat(root.getChildren().get(3).getLabel(), is("D Liste des livrables")); + assertThat(root.getChildren().get(3).getChildren(), is(nullValue())); + assertThat(root.getChildren().get(4).getLabel(), is("E Impact du projet")); + assertThat(root.getChildren().get(4).getChildren(), hasSize(4)); + assertThat(root.getChildren().get(4).getChildren().get(1).getLabel(), is("E.2 Liste des publications et communications")); + assertThat(root.getChildren().get(4).getChildren().get(2).getLabel(), is("E.3 Liste des autres valorisations scientifiques")); + + } + +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java deleted file mode 100644 index 247425328b..0000000000 --- a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.grobid.core.sax; - -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.DocumentNode; -import org.junit.Before; -import org.junit.Test; - -import javax.xml.parsers.SAXParser; -import 
javax.xml.parsers.SAXParserFactory; - -import java.io.InputStream; - -import static org.easymock.EasyMock.createMock; -import static org.hamcrest.collection.IsCollectionWithSize.hasSize; -import static org.junit.Assert.assertThat; -import static org.junit.Assert.assertTrue; - -public class PDFALTOOutlineSaxParserTest { - SAXParserFactory spf = SAXParserFactory.newInstance(); - - PDFALTOOutlineSaxHandler target; - DocumentSource mockDocumentSource; - Document document; - - @Before - public void setUp() throws Exception { - - mockDocumentSource = createMock(DocumentSource.class); - - document = Document.createFromText(""); - target = new PDFALTOOutlineSaxHandler(document); - } - - @Test - public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception { - InputStream is = this.getClass().getResourceAsStream("pdfalto.xml_outline.xml"); - - SAXParser p = spf.newSAXParser(); - p.parse(is, target); - - DocumentNode root = target.getRootNode(); - assertTrue(root.getChildren().size() > 0); - assertThat(root.getChildren(), hasSize(9)); - } - -} \ No newline at end of file diff --git a/grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml b/grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml new file mode 100644 index 0000000000..be55ae88fa --- /dev/null +++ b/grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml @@ -0,0 +1,75 @@ + + + + + A Identification + + + + B Résumé consolidé public. 
+ + + + B.1 Résumé consolidé public en français + + + + + + C Mémoire scientifique en français + + + + C.1 Résumé du mémoire + + + + C.2 Enjeux et problématique, état de l’art + + + + C.3 Approche scientifique et technique + + + + C.4 Résultats obtenus + + + + C.5 Discussion, conclusion + + + + C.6 Références + + + + + + D Liste des livrables + + + + E Impact du projet + + + + E.1 Indicateurs d’impact + + + + E.2 Liste des publications et communications + + + + E.3 Liste des autres valorisations scientifiques + + + + E.4 Bilan et suivi des personnels recrutés en CDD (hors stagiaires) + + + + + + From 6ad419f9af11fd64faed83e8d8a8860561238672 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 12 Apr 2021 09:09:07 +0900 Subject: [PATCH 4/6] fix bounding box extraction and update test --- .../java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java | 5 ++--- .../org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java | 5 +++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index 57081fce97..9d3e2de29d 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -32,8 +32,7 @@ public class PDFALTOOutlineSaxHandler extends DefaultHandler { private int currentParentId = -1; private Map nodes = null; - private Map labels = new HashMap<>(); - + public PDFALTOOutlineSaxHandler(Document doc) { this.doc = doc; } @@ -55,8 +54,8 @@ public void endElement(java.lang.String uri, java.lang.String localName, if (qName.equals("STRING")) { currentNode.setLabel(getText()); - currentNode.setBoundingBox(box); } else if (qName.equals("ITEM")) { + currentNode.setBoundingBox(box); box = null; label = null; } else if (qName.equals("TOCITEMLIST")) { diff --git 
a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java index 55ff70622a..54ddea7695 100644 --- a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java +++ b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java @@ -57,6 +57,11 @@ public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { assertThat(root.getChildren(), hasSize(5)); assertThat(root.getChildren().get(0).getLabel(), is("A Identification")); assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); + assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(2)); + assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(68.000)); + assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(0.000)); + assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(-1.0)); + assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(-1.0)); assertThat(root.getChildren().get(1).getLabel(), is("B Résumé consolidé public.")); assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); assertThat(root.getChildren().get(2).getLabel(), is("C Mémoire scientifique en français")); From 7433c89d8d5eafe73ae9d39d7094506971fb91f2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 13 Apr 2021 09:32:31 +0900 Subject: [PATCH 5/6] update bounding box calculation for outline and management of nested items --- .../core/sax/PDFALTOOutlineSaxHandler.java | 26 +++++++++------ .../sax/PDFALTOOutlineSaxHandlerTest.java | 32 ++++++++++++++++--- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index 9d3e2de29d..3dda27b313 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ 
b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -55,11 +55,21 @@ public void endElement(java.lang.String uri, java.lang.String localName, if (qName.equals("STRING")) { currentNode.setLabel(getText()); } else if (qName.equals("ITEM")) { - currentNode.setBoundingBox(box); + //The box could come from a nested element + if (box != null) { + currentNode.setBoundingBox(box); + } + box = null; label = null; } else if (qName.equals("TOCITEMLIST")) { currentParentId = -1; + } else if (qName.equals("LINK")) { + // in case of nested item, we need to assign the box right away or we will lose it. + if (box != null) { + currentNode.setBoundingBox(box); + } + box = null; } } @@ -202,14 +212,12 @@ public void startElement(String namespaceURI, String localName, } // create the bounding box - double x = left; - double y = right; - double width = -1.0; - double height = -1.0; - if (right >= left) - width = right - left; - if (bottom >= top) - height = bottom - top; + //top is y, bottom is height, left is x, right is width. 
+ double y = top; + double height = bottom; + double x = left; + double width = right; + box = BoundingBox .fromPointAndDimensions(page, x, y, width, height); } diff --git a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java index 54ddea7695..30ba221b3d 100644 --- a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java +++ b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java @@ -55,15 +55,34 @@ public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { DocumentNode root = target.getRootNode(); assertThat(root.getChildren(), hasSize(5)); + assertThat(root.getChildren().get(0).getLabel(), is("A Identification")); assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); + // assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(2)); + assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(71.000)); + assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(68.000)); - assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(0.000)); - assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(-1.0)); - assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(-1.0)); + assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); + assertThat(root.getChildren().get(1).getLabel(), is("B Résumé consolidé public.")); assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); + // + assertThat(root.getChildren().get(1).getBoundingBox().getPage(), is(2)); + assertThat(root.getChildren().get(1).getBoundingBox().getY(), is(377.000)); + assertThat(root.getChildren().get(1).getBoundingBox().getHeight(), is(0.0)); + assertThat(root.getChildren().get(1).getBoundingBox().getX(), is(68.000)); + 
assertThat(root.getChildren().get(1).getBoundingBox().getWidth(), is(0.0)); + + assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); + assertThat(root.getChildren().get(1).getChildren().get(0).getLabel(), is("B.1 Résumé consolidé public en français")); + // + assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getPage(), is(2)); + assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getY(), is(412.000)); + assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); + assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getX(), is(68.000)); + assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); + assertThat(root.getChildren().get(2).getLabel(), is("C Mémoire scientifique en français")); assertThat(root.getChildren().get(2).getChildren(), hasSize(6)); assertThat(root.getChildren().get(2).getChildren().get(2).getLabel(), is("C.3 Approche scientifique et technique")); @@ -73,7 +92,12 @@ public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { assertThat(root.getChildren().get(4).getChildren(), hasSize(4)); assertThat(root.getChildren().get(4).getChildren().get(1).getLabel(), is("E.2 Liste des publications et communications")); assertThat(root.getChildren().get(4).getChildren().get(2).getLabel(), is("E.3 Liste des autres valorisations scientifiques")); - + // + assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getPage(), is(1)); + assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getY(), is(170.000)); + assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getHeight(), is(0.0)); + assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getX(), is(68.000)); + assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getWidth(), is(0.0)); } } \ No newline at end of file From 
01e0e4f4d87e9b7a8cf90350b4ce79db343b0971 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 13 Apr 2021 09:54:52 +0900 Subject: [PATCH 6/6] revert modification to the bounding box information mapping --- .../core/sax/PDFALTOOutlineSaxHandler.java | 12 +++--- .../sax/PDFALTOOutlineSaxHandlerTest.java | 40 +++++++++++-------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index 3dda27b313..4e37b59ed8 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -212,12 +212,14 @@ public void startElement(String namespaceURI, String localName, } // create the bounding box - //top is y, bottom is height, left is x, right is width. - double y = top; - double height = bottom; double x = left; - double width = right; - + double y = right; + double width = -1.0; + double height = -1.0; + if (right >= left) + width = right - left; + if (bottom >= top) + height = bottom - top; box = BoundingBox .fromPointAndDimensions(page, x, y, width, height); } diff --git a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java index 30ba221b3d..71102ef8a5 100644 --- a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java +++ b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java @@ -44,6 +44,14 @@ public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception { DocumentNode root = target.getRootNode(); assertTrue(root.getChildren().size() > 0); assertThat(root.getChildren(), hasSize(9)); + assertThat(root.getChildren().get(0).getLabel(), is("Abstract")); + assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); + 
assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(1)); + // +// assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(-1.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); } @Test @@ -60,28 +68,28 @@ public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); // assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(2)); - assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(71.000)); - assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); - assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(68.000)); - assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(71.000)); +// assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); assertThat(root.getChildren().get(1).getLabel(), is("B Résumé consolidé public.")); assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); // assertThat(root.getChildren().get(1).getBoundingBox().getPage(), is(2)); - assertThat(root.getChildren().get(1).getBoundingBox().getY(), is(377.000)); - assertThat(root.getChildren().get(1).getBoundingBox().getHeight(), is(0.0)); - assertThat(root.getChildren().get(1).getBoundingBox().getX(), is(68.000)); - assertThat(root.getChildren().get(1).getBoundingBox().getWidth(), is(0.0)); +// assertThat(root.getChildren().get(1).getBoundingBox().getY(), is(377.000)); +// assertThat(root.getChildren().get(1).getBoundingBox().getHeight(), is(0.0)); +// 
assertThat(root.getChildren().get(1).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(1).getBoundingBox().getWidth(), is(0.0)); assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); assertThat(root.getChildren().get(1).getChildren().get(0).getLabel(), is("B.1 Résumé consolidé public en français")); // assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getPage(), is(2)); - assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getY(), is(412.000)); - assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); - assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getX(), is(68.000)); - assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getY(), is(412.000)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); assertThat(root.getChildren().get(2).getLabel(), is("C Mémoire scientifique en français")); assertThat(root.getChildren().get(2).getChildren(), hasSize(6)); @@ -94,10 +102,10 @@ public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { assertThat(root.getChildren().get(4).getChildren().get(2).getLabel(), is("E.3 Liste des autres valorisations scientifiques")); // assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getPage(), is(1)); - assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getY(), is(170.000)); - assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getHeight(), is(0.0)); - 
assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getX(), is(68.000)); - assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getWidth(), is(0.0)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getY(), is(170.000)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getWidth(), is(0.0)); } } \ No newline at end of file