// PDFALTOOutlineSaxHandlerTest.java
package org.grobid.core.sax;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.document.DocumentNode;
import org.junit.Before;
import org.junit.Test;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.InputStream;
import static org.easymock.EasyMock.createMock;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
/**
 * Tests for {@link PDFALTOOutlineSaxHandler}.
 *
 * <p>Each test parses an outline XML fixture loaded from the classpath (relative to this class)
 * and checks the resulting {@link DocumentNode} tree: number of entries, labels, nesting and the
 * page number of the bounding boxes. Assertions on the bounding-box coordinates are currently
 * disabled (commented out) and are kept next to the fixture {@code <LINK>} element they refer to.
 */
public class PDFALTOOutlineSaxHandlerTest {
    SAXParserFactory spf = SAXParserFactory.newInstance();

    PDFALTOOutlineSaxHandler target;
    // Created in setUp() but not referenced by any test in this class.
    DocumentSource mockDocumentSource;
    Document document;

    /**
     * Builds a fresh handler around an empty document before every test so that no outline state
     * is shared between tests.
     */
    @Before
    public void setUp() throws Exception {
        mockDocumentSource = createMock(DocumentSource.class);

        document = Document.createFromText("");
        target = new PDFALTOOutlineSaxHandler(document);
    }

    /**
     * Parses the given classpath fixture with {@link #target} and returns the outline root.
     *
     * <p>The stream is opened in a try-with-resources statement so it is closed whether parsing
     * succeeds or throws. A missing fixture yields a {@code null} stream, which try-with-resources
     * tolerates; the parser then rejects it, exactly as it did before this helper existed.
     *
     * @param resourceName fixture name, resolved relative to this test class
     * @return the root node built by the handler
     * @throws Exception if the parser cannot be created or the fixture cannot be parsed
     */
    private DocumentNode parseOutline(String resourceName) throws Exception {
        try (InputStream is = this.getClass().getResourceAsStream(resourceName)) {
            SAXParser p = spf.newSAXParser();
            p.parse(is, target);
        }
        return target.getRootNode();
    }

    /**
     * A flat outline: nine top-level entries, the first one being a leaf located on page 1.
     */
    @Test
    public void testParsing_pdf2XMLOutline_ShouldWork() throws Exception {
        DocumentNode root = parseOutline("pdfalto.xml_outline.xml");

        assertTrue(root.getChildren().size() > 0);
        assertThat(root.getChildren(), hasSize(9));

        DocumentNode first = root.getChildren().get(0);
        assertThat(first.getLabel(), is("Abstract"));
        assertThat(first.getChildren(), is(nullValue()));
        assertThat(first.getBoundingBox().getPage(), is(1));
        //<LINK page="1" top="592.00" bottom="0.00" left="0.00" right="0.00"/>
//        assertThat(first.getBoundingBox().getY(), is(0.0));
//        assertThat(first.getBoundingBox().getHeight(), is(-1.0));
//        assertThat(first.getBoundingBox().getX(), is(0.0));
//        assertThat(first.getBoundingBox().getWidth(), is(0.0));
    }

    /**
     * A nested outline with accented labels: five top-level sections, some of them leaves and
     * some with sub-sections.
     */
    @Test
    public void testParsing_pdf2XMLOutline_errorcase_ShouldWork() throws Exception {
        DocumentNode root = parseOutline("test_outline.xml");

        assertThat(root.getChildren(), hasSize(5));

        // Section A: leaf entry.
        DocumentNode sectionA = root.getChildren().get(0);
        assertThat(sectionA.getLabel(), is("A Identification"));
        assertThat(sectionA.getChildren(), is(nullValue()));
        //<LINK page="2" top="71.0000" bottom="0.0000" left="68.0000" right="0.0000"/>
        assertThat(sectionA.getBoundingBox().getPage(), is(2));
//        assertThat(sectionA.getBoundingBox().getY(), is(71.000));
//        assertThat(sectionA.getBoundingBox().getHeight(), is(0.0));
//        assertThat(sectionA.getBoundingBox().getX(), is(68.000));
//        assertThat(sectionA.getBoundingBox().getWidth(), is(0.0));

        // Section B: exactly one sub-section.
        DocumentNode sectionB = root.getChildren().get(1);
        assertThat(sectionB.getLabel(), is("B Résumé consolidé public."));
        assertThat(sectionB.getChildren(), hasSize(1));
        //<LINK page="2" top="377.000" bottom="0.0000" left="68.0000" right="0.0000"/>
        assertThat(sectionB.getBoundingBox().getPage(), is(2));
//        assertThat(sectionB.getBoundingBox().getY(), is(377.000));
//        assertThat(sectionB.getBoundingBox().getHeight(), is(0.0));
//        assertThat(sectionB.getBoundingBox().getX(), is(68.000));
//        assertThat(sectionB.getBoundingBox().getWidth(), is(0.0));

        DocumentNode sectionB1 = sectionB.getChildren().get(0);
        assertThat(sectionB1.getLabel(), is("B.1 Résumé consolidé public en français"));
        //<LINK page="2" top="412.000" bottom="0.0000" left="68.0000" right="0.0000"/>
        assertThat(sectionB1.getBoundingBox().getPage(), is(2));
//        assertThat(sectionB1.getBoundingBox().getY(), is(412.000));
//        assertThat(sectionB1.getBoundingBox().getHeight(), is(0.0));
//        assertThat(sectionB1.getBoundingBox().getX(), is(68.000));
//        assertThat(sectionB1.getBoundingBox().getWidth(), is(0.0));

        // Section C: six sub-sections.
        DocumentNode sectionC = root.getChildren().get(2);
        assertThat(sectionC.getLabel(), is("C Mémoire scientifique en français"));
        assertThat(sectionC.getChildren(), hasSize(6));
        assertThat(sectionC.getChildren().get(2).getLabel(), is("C.3 Approche scientifique et technique"));

        // Section D: leaf entry.
        DocumentNode sectionD = root.getChildren().get(3);
        assertThat(sectionD.getLabel(), is("D Liste des livrables"));
        assertThat(sectionD.getChildren(), is(nullValue()));

        // Section E: four sub-sections.
        DocumentNode sectionE = root.getChildren().get(4);
        assertThat(sectionE.getLabel(), is("E Impact du projet"));
        assertThat(sectionE.getChildren(), hasSize(4));
        assertThat(sectionE.getChildren().get(1).getLabel(), is("E.2 Liste des publications et communications"));

        DocumentNode sectionE3 = sectionE.getChildren().get(2);
        assertThat(sectionE3.getLabel(), is("E.3 Liste des autres valorisations scientifiques"));
        //<LINK page="1" top="170.000" bottom="0.0000" left="68.0000" right="0.0000"/>
        assertThat(sectionE3.getBoundingBox().getPage(), is(1));
//        assertThat(sectionE3.getBoundingBox().getY(), is(170.000));
//        assertThat(sectionE3.getBoundingBox().getHeight(), is(0.0));
//        assertThat(sectionE3.getBoundingBox().getX(), is(68.000));
//        assertThat(sectionE3.getBoundingBox().getWidth(), is(0.0));
    }
}