Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix NullPointer exception in PDFALTOOutlineSaxHandler #729

Merged
merged 6 commits into from
Apr 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,33 @@
*/

public class DocumentNode {
private Integer id = null;

// Gorn address for tree structure
private String address = null;

// real numbering of the section, if any
private String realNumber = null;

private String realNumber = null;
// normalized numbering of the section, if any
private String normalizedNumber = null;

private String normalizedNumber = null;
// the string attached to this document level, e.g. section title
private String label = null;

private String label = null;
// list of child document nodes
private List<DocumentNode> children = null;

private List<DocumentNode> children = null;
// offsets relative to the document tokenization (i.e. token offsets, NOT character offsets)

public int startToken = -1;
public int endToken = -1;

// coordinates of the string attached to this document level, typically where an index link
// action points in the document
private BoundingBox boundingBox = null;

private BoundingBox boundingBox = null;
// parent document node, if null it is a root node
private DocumentNode father = null;

private DocumentNode father = null;
/**
 * Creates an empty node. No field is assigned here, so every field keeps
 * the default given at its declaration (null references, token offsets of -1).
 */
public DocumentNode() {
}

Expand Down Expand Up @@ -128,8 +129,8 @@ public String toString() {
}

public String toString(int tab) {
StringBuffer sb = new StringBuffer();
sb.append(address + " " + label + " " + startToken + " " + endToken + "\n");
StringBuilder sb = new StringBuilder();
sb.append(id).append(" ").append(address).append(" ").append(label).append(" ").append(startToken).append(" ").append(endToken).append("\n");

if (children != null) {
for (DocumentNode node : children) {
Expand Down Expand Up @@ -169,6 +170,7 @@ public DocumentNode getSpanningNode(int position) {
}
}


/*public DocumentNode nextSlibing() {
if ( (children != null) && (children.size() > 0) ) {
return children.get(0);
Expand All @@ -182,5 +184,12 @@ else if (father == null) {
}
}
}*/
/**
 * Returns the identifier of this node.
 *
 * @return the node id, or null if no id has been set
 */
public Integer getId() {
return id;
}

/**
 * Sets the identifier of this node.
 *
 * @param id the node id to store; null is stored as-is
 */
public void setId(Integer id) {
this.id = id;
}
kermitt2 marked this conversation as resolved.
Show resolved Hide resolved
}

Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class PDFALTOOutlineSaxHandler extends DefaultHandler {
private int currentParentId = -1;

private Map<Integer,DocumentNode> nodes = null;

/**
 * Creates a handler bound to the given document.
 *
 * @param doc the document stored on this handler; no validation is done here
 */
public PDFALTOOutlineSaxHandler(Document doc) {
this.doc = doc;
}
Expand All @@ -53,11 +53,24 @@ public void endElement(java.lang.String uri, java.lang.String localName,
java.lang.String qName) throws SAXException {

if (qName.equals("STRING")) {
label = getText();
currentNode.setLabel(getText());
} else if (qName.equals("ITEM")) {
currentNode.setLabel(label);
currentNode.setBoundingBox(box);
}
//The box could come from a nested element
if (box != null) {
currentNode.setBoundingBox(box);
}

box = null;
label = null;
} else if (qName.equals("TOCITEMLIST")) {
currentParentId = -1;
kermitt2 marked this conversation as resolved.
Show resolved Hide resolved
} else if (qName.equals("LINK")) {
// in case of nested item, we need to assign the box right away or we will lose it.
if (box != null) {
currentNode.setBoundingBox(box);
}
box = null;
}
}

public void startElement(String namespaceURI, String localName,
Expand All @@ -66,7 +79,7 @@ public void startElement(String namespaceURI, String localName,
// this is the document root
root = new DocumentNode();
nodes = new HashMap<Integer,DocumentNode>();
} if (qName.equals("ITEM")) {
} else if (qName.equals("ITEM")) {
currentNode = new DocumentNode();
// get the node id
int length = atts.getLength();
Expand All @@ -78,7 +91,7 @@ public void startElement(String namespaceURI, String localName,
String value = atts.getValue(i);

if ((name != null) && (value != null)) {
if (name.equals("id")) {
if (name.equalsIgnoreCase("id")) {
kermitt2 marked this conversation as resolved.
Show resolved Hide resolved
try {
currentId = Integer.parseInt(value);
} catch(Exception e) {
Expand All @@ -88,14 +101,16 @@ public void startElement(String namespaceURI, String localName,
}
}
}
//currentNode.setId(currentId);
currentNode.setId(currentId);
nodes.put(currentId,currentNode);
if (currentParentId != -1) {
DocumentNode father = nodes.get(currentParentId);
if (father == null)
System.out.println("Warning, father not yet encountered! id is " + currentParentId);
currentNode.setFather(father);
father.addChild(currentNode);
if (father == null)
LOGGER.warn("Father not yet encountered! id is " + currentParentId);
else {
currentNode.setFather(father);
father.addChild(currentNode);
}
} else {
// parent is the root node
currentNode.setFather(root);
Expand Down Expand Up @@ -197,7 +212,7 @@ public void startElement(String namespaceURI, String localName,
}

// create the bounding box
double x = left;
double x = left;
double y = right;
double width = -1.0;
double height = -1.0;
Expand All @@ -211,4 +226,4 @@ public void startElement(String namespaceURI, String localName,
accumulator.setLength(0);
}

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package org.grobid.core.sax;

import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.document.DocumentNode;
import org.junit.Before;
import org.junit.Test;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import java.io.InputStream;

import static org.easymock.EasyMock.createMock;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

public class PDFALTOOutlineSaxHandlerTest {
SAXParserFactory spf = SAXParserFactory.newInstance();

PDFALTOOutlineSaxHandler target;
DocumentSource mockDocumentSource;
Document document;

@Before
public void setUp() throws Exception {

mockDocumentSource = createMock(DocumentSource.class);

document = Document.createFromText("");
target = new PDFALTOOutlineSaxHandler(document);
}

@Test
public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception {
InputStream is = this.getClass().getResourceAsStream("pdfalto.xml_outline.xml");
kermitt2 marked this conversation as resolved.
Show resolved Hide resolved

SAXParser p = spf.newSAXParser();
p.parse(is, target);

DocumentNode root = target.getRootNode();
assertTrue(root.getChildren().size() > 0);
assertThat(root.getChildren(), hasSize(9));
assertThat(root.getChildren().get(0).getLabel(), is("Abstract"));
assertThat(root.getChildren().get(0).getChildren(), is(nullValue()));
assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(1));
//<LINK page="1" top="592.00" bottom="0.00" left="0.00" right="0.00"/>
// assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(0.0));
// assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(-1.0));
// assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(0.0));
// assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0));
}

@Test
public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception {
InputStream is = this.getClass().getResourceAsStream("test_outline.xml");

SAXParser p = spf.newSAXParser();
p.parse(is, target);

DocumentNode root = target.getRootNode();
assertThat(root.getChildren(), hasSize(5));

assertThat(root.getChildren().get(0).getLabel(), is("A Identification"));
assertThat(root.getChildren().get(0).getChildren(), is(nullValue()));
//<LINK page="2" top="71.0000" bottom="0.0000" left="68.0000" right="0.0000"/>
assertThat(root.getChildren().get(0).getBoundingBox().getPage(), is(2));
// assertThat(root.getChildren().get(0).getBoundingBox().getY(), is(71.000));
// assertThat(root.getChildren().get(0).getBoundingBox().getHeight(), is(0.0));
// assertThat(root.getChildren().get(0).getBoundingBox().getX(), is(68.000));
// assertThat(root.getChildren().get(0).getBoundingBox().getWidth(), is(0.0));

assertThat(root.getChildren().get(1).getLabel(), is("B Résumé consolidé public."));
assertThat(root.getChildren().get(1).getChildren(), hasSize(1));
//<LINK page="2" top="377.000" bottom="0.0000" left="68.0000" right="0.0000"/>
assertThat(root.getChildren().get(1).getBoundingBox().getPage(), is(2));
// assertThat(root.getChildren().get(1).getBoundingBox().getY(), is(377.000));
// assertThat(root.getChildren().get(1).getBoundingBox().getHeight(), is(0.0));
// assertThat(root.getChildren().get(1).getBoundingBox().getX(), is(68.000));
// assertThat(root.getChildren().get(1).getBoundingBox().getWidth(), is(0.0));

assertThat(root.getChildren().get(1).getChildren(), hasSize(1));
assertThat(root.getChildren().get(1).getChildren().get(0).getLabel(), is("B.1 Résumé consolidé public en français"));
//<LINK page="2" top="412.000" bottom="0.0000" left="68.0000" right="0.0000"/>
assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getPage(), is(2));
// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getY(), is(412.000));
// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getHeight(), is(0.0));
// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getX(), is(68.000));
// assertThat(root.getChildren().get(1).getChildren().get(0).getBoundingBox().getWidth(), is(0.0));

assertThat(root.getChildren().get(2).getLabel(), is("C Mémoire scientifique en français"));
assertThat(root.getChildren().get(2).getChildren(), hasSize(6));
assertThat(root.getChildren().get(2).getChildren().get(2).getLabel(), is("C.3 Approche scientifique et technique"));
assertThat(root.getChildren().get(3).getLabel(), is("D Liste des livrables"));
assertThat(root.getChildren().get(3).getChildren(), is(nullValue()));
assertThat(root.getChildren().get(4).getLabel(), is("E Impact du projet"));
assertThat(root.getChildren().get(4).getChildren(), hasSize(4));
assertThat(root.getChildren().get(4).getChildren().get(1).getLabel(), is("E.2 Liste des publications et communications"));
assertThat(root.getChildren().get(4).getChildren().get(2).getLabel(), is("E.3 Liste des autres valorisations scientifiques"));
//<LINK page="1" top="170.000" bottom="0.0000" left="68.0000" right="0.0000"/>
assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getPage(), is(1));
// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getY(), is(170.000));
// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getHeight(), is(0.0));
// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getX(), is(68.000));
// assertThat(root.getChildren().get(4).getChildren().get(2).getBoundingBox().getWidth(), is(0.0));
}

}

This file was deleted.