diff --git a/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java b/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java index cc93f386ef..2ecb71f915 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java +++ b/grobid-core/src/main/java/org/grobid/core/document/DocumentNode.java @@ -12,32 +12,33 @@ */ public class DocumentNode { + private Integer id = null; + // Gorn address for tree structure private String address = null; - // real numbering of the section, if any - private String realNumber = null; + private String realNumber = null; // normalized numbering of the section, if any - private String normalizedNumber = null; + private String normalizedNumber = null; // the string attached to this document level, e.g. section title - private String label = null; + private String label = null; // list of child document nodes - private List children = null; + private List children = null; // offset relatively to the document tokenization (so token offset, NOT character offset) + public int startToken = -1; public int endToken = -1; - // coordinates of the string attached to this document level, typically where an index link // action point in the document - private BoundingBox boundingBox = null; + private BoundingBox boundingBox = null; // parent document node, if null it is a root node - private DocumentNode father = null; + private DocumentNode father = null; public DocumentNode() { } @@ -128,8 +129,8 @@ public String toString() { } public String toString(int tab) { - StringBuffer sb = new StringBuffer(); - sb.append(address + " " + label + " " + startToken + " " + endToken + "\n"); + StringBuilder sb = new StringBuilder(); + sb.append(id).append(" ").append(address).append(" ").append(label).append(" ").append(startToken).append(" ").append(endToken).append("\n"); if (children != null) { for (DocumentNode node : children) { @@ -169,6 +170,7 @@ public DocumentNode getSpanningNode(int position) { } } + /*public DocumentNode nextSlibing() { if ( (children != null) && (children.size() > 0) ) { return children.get(0); @@ -182,5 +184,12 @@ else if (father == null) { } } }*/ + public Integer getId() { + return id; + } + + public void setId(Integer id) { + this.id = id; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java index 3e770543ef..4e37b59ed8 100644 --- a/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/PDFALTOOutlineSaxHandler.java @@ -32,7 +32,7 @@ public class PDFALTOOutlineSaxHandler extends DefaultHandler { private int currentParentId = -1; private Map nodes = null; - + public PDFALTOOutlineSaxHandler(Document doc) { this.doc = doc; } @@ -53,11 +53,24 @@ public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { if (qName.equals("STRING")) { - label = getText(); + currentNode.setLabel(getText()); } else if (qName.equals("ITEM")) { - currentNode.setLabel(label); - currentNode.setBoundingBox(box); - } + //The box could come from a nested element + if (box != null) { + currentNode.setBoundingBox(box); + } + + box = null; + label = null; + } else if (qName.equals("TOCITEMLIST")) { + currentParentId = -1; + } else if (qName.equals("LINK")) { + // in case of nested item, we need to assign the box right away or we will lose it. + if (box != null) { + currentNode.setBoundingBox(box); + } + box = null; + } } public void startElement(String namespaceURI, String localName, @@ -66,7 +79,7 @@ public void startElement(String namespaceURI, String localName, // this is the document root root = new DocumentNode(); nodes = new HashMap(); - } if (qName.equals("ITEM")) { + } else if (qName.equals("ITEM")) { currentNode = new DocumentNode(); // get the node id int length = atts.getLength(); @@ -78,7 +91,7 @@ public void startElement(String namespaceURI, String localName, String value = atts.getValue(i); if ((name != null) && (value != null)) { - if (name.equals("id")) { + if (name.equalsIgnoreCase("id")) { try { currentId = Integer.parseInt(value); } catch(Exception e) { @@ -88,14 +101,16 @@ public void startElement(String namespaceURI, String localName, } } } - //currentNode.setId(currentId); + currentNode.setId(currentId); nodes.put(currentId,currentNode); if (currentParentId != -1) { DocumentNode father = nodes.get(currentParentId); - if (father == null) - System.out.println("Warning, father not yet encountered! id is " + currentParentId); - currentNode.setFather(father); - father.addChild(currentNode); + if (father == null) + LOGGER.warn("Father not yet encountered! id is " + currentParentId); + else { + currentNode.setFather(father); + father.addChild(currentNode); + } } else { // parent is the root node currentNode.setFather(root); @@ -197,7 +212,7 @@ public void startElement(String namespaceURI, String localName, } // create the bounding box - double x = left; + double x = left; double y = right; double width = -1.0; double height = -1.0; @@ -211,4 +226,4 @@ public void startElement(String namespaceURI, String localName, accumulator.setLength(0); } -} \ No newline at end of file +} diff --git a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java new file mode 100644 index 0000000000..71102ef8a5 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxHandlerTest.java @@ -0,0 +1,111 @@ +package org.grobid.core.sax; + +import org.grobid.core.document.Document; +import org.grobid.core.document.DocumentSource; +import org.grobid.core.document.DocumentNode; +import org.junit.Before; +import org.junit.Test; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import java.io.InputStream; + +import static org.easymock.EasyMock.createMock; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.nullValue; +import static org.hamcrest.collection.IsCollectionWithSize.hasSize; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; + +public class PDFALTOOutlineSaxHandlerTest { + SAXParserFactory spf = SAXParserFactory.newInstance(); + + PDFALTOOutlineSaxHandler target; + DocumentSource mockDocumentSource; + Document document; + + @Before + public void setUp() throws Exception { + + mockDocumentSource = createMock(DocumentSource.class); + + document = Document.createFromText(""); + target = new PDFALTOOutlineSaxHandler(document); + } + + @Test + public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception { + InputStream is = this.getClass().getResourceAsStream("pdfalto.xml_outline.xml"); + + SAXParser p = spf.newSAXParser(); + p.parse(is, target); + + DocumentNode root = target.getRootNode(); + assertTrue(root.getChildren().size() > 0); + assertThat(root.getChildren(), hasSize(9)); + assertThat(root.getChildren().get(0).getLabel(), is("Abstract")); + assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); + assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(1)); + // +// assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(-1.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); + } + + @Test + public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception { + InputStream is = this.getClass().getResourceAsStream("test_outline.xml"); + + SAXParser p = spf.newSAXParser(); + p.parse(is, target); + + DocumentNode root = target.getRootNode(); + assertThat(root.getChildren(), hasSize(5)); + + assertThat(root.getChildren().get(0).getLabel(), is("A Identification")); + assertThat(root.getChildren().get(0).getChildren(), is(nullValue())); + // + assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(2)); +// assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(71.000)); +// assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); + + assertThat(root.getChildren().get(1).getLabel(), is("B Résumé consolidé public.")); + assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); + // + assertThat(root.getChildren().get(1).getBoundingBox().getPage(), is(2)); +// assertThat(root.getChildren().get(1).getBoundingBox().getY(), is(377.000)); +// assertThat(root.getChildren().get(1).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(1).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(1).getBoundingBox().getWidth(), is(0.0)); + + assertThat(root.getChildren().get(1).getChildren(), hasSize(1)); + assertThat(root.getChildren().get(1).getChildren().get(0).getLabel(), is("B.1 Résumé consolidé public en français")); + // + assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getPage(), is(2)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getY(), is(412.000)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getWidth(), is(0.0)); + + assertThat(root.getChildren().get(2).getLabel(), is("C Mémoire scientifique en français")); + assertThat(root.getChildren().get(2).getChildren(), hasSize(6)); + assertThat(root.getChildren().get(2).getChildren().get(2).getLabel(), is("C.3 Approche scientifique et technique")); + assertThat(root.getChildren().get(3).getLabel(), is("D Liste des livrables")); + assertThat(root.getChildren().get(3).getChildren(), is(nullValue())); + assertThat(root.getChildren().get(4).getLabel(), is("E Impact du projet")); + assertThat(root.getChildren().get(4).getChildren(), hasSize(4)); + assertThat(root.getChildren().get(4).getChildren().get(1).getLabel(), is("E.2 Liste des publications et communications")); + assertThat(root.getChildren().get(4).getChildren().get(2).getLabel(), is("E.3 Liste des autres valorisations scientifiques")); + // + assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getPage(), is(1)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getY(), is(170.000)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getHeight(), is(0.0)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getX(), is(68.000)); +// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getWidth(), is(0.0)); + } + +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java b/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java deleted file mode 100644 index 247425328b..0000000000 --- a/grobid-core/src/test/java/org/grobid/core/sax/PDFALTOOutlineSaxParserTest.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.grobid.core.sax; - -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.DocumentNode; -import org.junit.Before; -import org.junit.Test; - -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import java.io.InputStream; - -import static org.easymock.EasyMock.createMock; -import static org.hamcrest.collection.IsCollectionWithSize.hasSize; -import static org.junit.Assert.assertThat; -import static org.junit.Assert.assertTrue; - -public class PDFALTOOutlineSaxParserTest { - SAXParserFactory spf = SAXParserFactory.newInstance(); - - PDFALTOOutlineSaxHandler target; - DocumentSource mockDocumentSource; - Document document; - - @Before - public void setUp() throws Exception { - - mockDocumentSource = createMock(DocumentSource.class); - - document = Document.createFromText(""); - target = new PDFALTOOutlineSaxHandler(document); - } - - @Test - public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception { - InputStream is = this.getClass().getResourceAsStream("pdfalto.xml_outline.xml"); - - SAXParser p = spf.newSAXParser(); - p.parse(is, target); - - DocumentNode root = target.getRootNode(); - assertTrue(root.getChildren().size() > 0); - assertThat(root.getChildren(), hasSize(9)); - } - -} \ No newline at end of file diff --git a/grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml b/grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml new file mode 100644 index 0000000000..be55ae88fa --- /dev/null +++ b/grobid-core/src/test/resources/org/grobid/core/sax/test_outline.xml @@ -0,0 +1,75 @@ + + + + + A Identification + + + + B Résumé consolidé public. + + + + B.1 Résumé consolidé public en français + + + + + + C Mémoire scientifique en français + + + + C.1 Résumé du mémoire + + + + C.2 Enjeux et problématique, état de l’art + + + + C.3 Approche scientifique et technique + + + + C.4 Résultats obtenus + + + + C.5 Discussion, conclusion + + + + C.6 Références + + + + + + D Liste des livrables + + + + E Impact du projet + + + + E.1 Indicateurs d’impact + + + + E.2 Liste des publications et communications + + + + E.3 Liste des autres valorisations scientifiques + + + + E.4 Bilan et suivi des personnels recrutés en CDD (hors stagiaires) + + + + + +