Skip to content

Commit

Permalink
Merge 300efd3 into bb8bd0e
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Mar 9, 2021
2 parents bb8bd0e + 300efd3 commit d06fe29
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 69 deletions.
Expand Up @@ -12,32 +12,33 @@
*/

public class DocumentNode {
private Integer id = null;

// Gorn address for tree structure
private String address = null;

// real numbering of the section, if any
private String realNumber = null;

private String realNumber = null;
// normalized numbering of the section, if any
private String normalizedNumber = null;

private String normalizedNumber = null;
// the string attached to this document level, e.g. section title
private String label = null;

private String label = null;
// list of child document nodes
private List<DocumentNode> children = null;

private List<DocumentNode> children = null;
// offsets relative to the document tokenization (i.e. token offsets, NOT character offsets)

public int startToken = -1;
public int endToken = -1;

// coordinates of the string attached to this document level, typically the location
// that an index link action points to in the document
private BoundingBox boundingBox = null;

private BoundingBox boundingBox = null;
// parent document node, if null it is a root node
private DocumentNode father = null;

private DocumentNode father = null;
public DocumentNode() {
}

Expand Down Expand Up @@ -128,8 +129,8 @@ public String toString() {
}

public String toString(int tab) {
StringBuffer sb = new StringBuffer();
sb.append(address + " " + label + " " + startToken + " " + endToken + "\n");
StringBuilder sb = new StringBuilder();
sb.append(id).append(" ").append(address).append(" ").append(label).append(" ").append(startToken).append(" ").append(endToken).append("\n");

if (children != null) {
for (DocumentNode node : children) {
Expand Down Expand Up @@ -169,6 +170,7 @@ public DocumentNode getSpanningNode(int position) {
}
}


/*public DocumentNode nextSlibing() {
if ( (children != null) && (children.size() > 0) ) {
return children.get(0);
Expand All @@ -182,5 +184,12 @@ else if (father == null) {
}
}
}*/
/**
 * Returns the identifier of this node.
 *
 * @return the node identifier, or {@code null} if no identifier has been set
 */
public Integer getId() {
return id;
}

/**
 * Sets the identifier of this node.
 *
 * @param id the node identifier to store; the value is stored as given, without validation
 */
public void setId(Integer id) {
this.id = id;
}
}

Expand Up @@ -32,6 +32,7 @@ public class PDFALTOOutlineSaxHandler extends DefaultHandler {
private int currentParentId = -1;

private Map<Integer,DocumentNode> nodes = null;
private Map<Integer, Integer> labels = new HashMap<>();

public PDFALTOOutlineSaxHandler(Document doc) {
this.doc = doc;
Expand All @@ -53,11 +54,14 @@ public void endElement(java.lang.String uri, java.lang.String localName,
java.lang.String qName) throws SAXException {

if (qName.equals("STRING")) {
label = getText();
currentNode.setLabel(getText());
currentNode.setBoundingBox(box);
} else if (qName.equals("ITEM")) {
currentNode.setLabel(label);
currentNode.setBoundingBox(box);
}
box = null;
label = null;
} else if (qName.equals("TOCITEMLIST")) {
currentParentId = -1;
}
}

public void startElement(String namespaceURI, String localName,
Expand All @@ -66,7 +70,7 @@ public void startElement(String namespaceURI, String localName,
// this is the document root
root = new DocumentNode();
nodes = new HashMap<Integer,DocumentNode>();
} if (qName.equals("ITEM")) {
} else if (qName.equals("ITEM")) {
currentNode = new DocumentNode();
// get the node id
int length = atts.getLength();
Expand All @@ -78,7 +82,7 @@ public void startElement(String namespaceURI, String localName,
String value = atts.getValue(i);

if ((name != null) && (value != null)) {
if (name.equals("id")) {
if (name.equalsIgnoreCase("id")) {
try {
currentId = Integer.parseInt(value);
} catch(Exception e) {
Expand All @@ -88,14 +92,16 @@ public void startElement(String namespaceURI, String localName,
}
}
}
//currentNode.setId(currentId);
currentNode.setId(currentId);
nodes.put(currentId,currentNode);
if (currentParentId != -1) {
DocumentNode father = nodes.get(currentParentId);
if (father == null)
System.out.println("Warning, father not yet encountered! id is " + currentParentId);
currentNode.setFather(father);
father.addChild(currentNode);
if (father == null)
LOGGER.warn("Father not yet encountered! id is " + currentParentId);
else {
currentNode.setFather(father);
father.addChild(currentNode);
}
} else {
// parent is the root node
currentNode.setFather(root);
Expand Down Expand Up @@ -211,4 +217,4 @@ public void startElement(String namespaceURI, String localName,
accumulator.setLength(0);
}

}
}
@@ -0,0 +1,74 @@
package org.grobid.core.sax;

import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.document.DocumentNode;
import org.junit.Before;
import org.junit.Test;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import java.io.InputStream;

import static org.easymock.EasyMock.createMock;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link PDFALTOOutlineSaxHandler}: each test parses an outline XML fixture from the
 * test classpath and checks the resulting {@link DocumentNode} tree (number of children and
 * labels at each level).
 */
public class PDFALTOOutlineSaxHandlerTest {
    SAXParserFactory spf = SAXParserFactory.newInstance();

    PDFALTOOutlineSaxHandler target;
    DocumentSource mockDocumentSource;
    Document document;

    /**
     * Creates a fresh handler bound to an empty document before each test, so that state
     * accumulated while parsing one fixture cannot leak into the next test.
     */
    @Before
    public void setUp() throws Exception {
        mockDocumentSource = createMock(DocumentSource.class);

        document = Document.createFromText("");
        target = new PDFALTOOutlineSaxHandler(document);
    }

    /**
     * Parses the {@code pdfalto.xml_outline.xml} fixture and checks the number of top-level
     * outline entries.
     */
    @Test
    public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception {
        DocumentNode root = parseOutline("pdfalto.xml_outline.xml");

        // hasSize(9) also covers the "not empty" condition, so no separate size() > 0 check
        assertThat(root.getChildren(), hasSize(9));
    }

    /**
     * Parses the {@code test_outline.xml} fixture and checks that nested entries are attached
     * to the expected parent and that labels are read correctly.
     */
    @Test
    public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception {
        DocumentNode root = parseOutline("test_outline.xml");

        assertThat(root.getChildren(), hasSize(5));

        DocumentNode sectionA = root.getChildren().get(0);
        assertThat(sectionA.getLabel(), is("A Identification"));
        assertThat(sectionA.getChildren(), is(nullValue()));

        DocumentNode sectionB = root.getChildren().get(1);
        assertThat(sectionB.getLabel(), is("B Résumé consolidé public."));
        assertThat(sectionB.getChildren(), hasSize(1));

        DocumentNode sectionC = root.getChildren().get(2);
        assertThat(sectionC.getLabel(), is("C Mémoire scientifique en français"));
        assertThat(sectionC.getChildren(), hasSize(6));
        assertThat(sectionC.getChildren().get(2).getLabel(), is("C.3 Approche scientifique et technique"));

        DocumentNode sectionD = root.getChildren().get(3);
        assertThat(sectionD.getLabel(), is("D Liste des livrables"));
        assertThat(sectionD.getChildren(), is(nullValue()));

        DocumentNode sectionE = root.getChildren().get(4);
        assertThat(sectionE.getLabel(), is("E Impact du projet"));
        assertThat(sectionE.getChildren(), hasSize(4));
        assertThat(sectionE.getChildren().get(1).getLabel(), is("E.2 Liste des publications et communications"));
        assertThat(sectionE.getChildren().get(2).getLabel(), is("E.3 Liste des autres valorisations scientifiques"));
    }

    /**
     * Parses the named classpath resource with {@link #target} and returns the resulting root
     * node.
     *
     * @param resourceName name of the fixture, resolved relative to this test class
     * @return the root node built by the handler
     * @throws Exception if the parser cannot be created or the fixture cannot be parsed
     */
    private DocumentNode parseOutline(String resourceName) throws Exception {
        // try-with-resources closes the fixture stream even when parsing fails
        try (InputStream is = this.getClass().getResourceAsStream(resourceName)) {
            // fail with an explicit message instead of the parser's IllegalArgumentException
            assertTrue("Test resource not found: " + resourceName, is != null);

            SAXParser p = spf.newSAXParser();
            p.parse(is, target);
        }
        return target.getRootNode();
    }

}

This file was deleted.

@@ -0,0 +1,75 @@
<?xml version="1.0"?>
<TOCITEMS nbPages="4">
<TOCITEMLIST level="0">
<ITEM ID="0">
<STRING>A Identification</STRING>
<LINK page="2" top="71.0000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="1">
<STRING>B R&#xE9;sum&#xE9; consolid&#xE9; public.</STRING>
<LINK page="2" top="377.000" bottom="0.0000" left="68.0000" right="0.0000"/>
<TOCITEMLIST level="1" idItemParent="1">
<ITEM ID="2">
<STRING>B.1 R&#xE9;sum&#xE9; consolid&#xE9; public en fran&#xE7;ais</STRING>
<LINK page="2" top="412.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
</TOCITEMLIST>
</ITEM>
<ITEM ID="3">
<STRING>C M&#xE9;moire scientifique en fran&#xE7;ais</STRING>
<LINK page="1" top="71.0000" bottom="0.0000" left="68.0000" right="0.0000"/>
<TOCITEMLIST level="1" idItemParent="3">
<ITEM ID="4">
<STRING>C.1 R&#xE9;sum&#xE9; du m&#xE9;moire</STRING>
<LINK page="1" top="109.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="5">
<STRING>C.2 Enjeux et probl&#xE9;matique, &#xE9;tat de l&#x2019;art</STRING>
<LINK page="1" top="334.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="6">
<STRING>C.3 Approche scientifique et technique</STRING>
<LINK page="1" top="235.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="7">
<STRING>C.4 R&#xE9;sultats obtenus</STRING>
<LINK page="1" top="264.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="8">
<STRING>C.5 Discussion, conclusion</STRING>
<LINK page="1" top="357.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="9">
<STRING>C.6 R&#xE9;f&#xE9;rences</STRING>
<LINK page="1" top="582.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
</TOCITEMLIST>
</ITEM>
<ITEM ID="10">
<STRING>D Liste des livrables</STRING>
<LINK page="1" top="71.0000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="11">
<STRING>E Impact du projet</STRING>
<LINK page="1" top="649.000" bottom="0.0000" left="68.0000" right="0.0000"/>
<TOCITEMLIST level="1" idItemParent="11">
<ITEM ID="12">
<STRING>E.1 Indicateurs d&#x2019;impact</STRING>
<LINK page="1" top="368.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="13">
<STRING>E.2 Liste des publications et communications</STRING>
<LINK page="1" top="317.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="14">
<STRING>E.3 Liste des autres valorisations scientifiques</STRING>
<LINK page="1" top="170.000" bottom="0.0000" left="68.0000" right="0.0000"/>
</ITEM>
<ITEM ID="15">
<STRING>E.4 Bilan et suivi des personnels recrut&#xE9;s en CDD (hors stagiaires)</STRING>
<LINK page="1" top="318.000" bottom="0.0000" left="40.0000" right="0.0000"/>
</ITEM>
</TOCITEMLIST>
</ITEM>
</TOCITEMLIST>
</TOCITEMS>

0 comments on commit d06fe29

Please sign in to comment.