Skip to content
Permalink
Browse files

Fix synchronization issue due to a starting block without LayoutTokens

This issue can happen when the first block of a segmentation zone
contains only a bitmap and no LayoutToken objects
  • Loading branch information...
kermitt2 committed Dec 24, 2015
1 parent 1163c03 commit 1c3dca23405b78f9d8410987779fa4a0127743ac
@@ -711,7 +711,14 @@ static public Document generalResultSegmentation(Document doc, String labeledRes
// tokenization of the first token of the current line
String line = null;

DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
//DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
// the default first block might not contain tokens but only bitmap - in this case we move
// to the first block containing some LayoutToken objects
while(docBlocks.get(blockIndex).getTokens() == null) {
blockIndex++;
}
DocumentPointer pointerA = new DocumentPointer(doc, blockIndex, 0);

DocumentPointer currentPointer = null;
DocumentPointer lastPointer = null;

@@ -746,24 +753,25 @@ static public Document generalResultSegmentation(Document doc, String labeledRes
while( (line == null) && (blockIndex < docBlocks.size()) ) {
Block block = docBlocks.get(blockIndex);
List<LayoutToken> tokens = block.getTokens();
currentLineStartPos = block.getStartToken();
String localText = block.getText();
if ( (tokens == null) || (localText == null) || (localText.trim().length() == 0) ) {
if ( (tokens == null) || (localText == null) || (block.getStartToken() == -1) || (localText.trim().length() == 0) ) {
blockIndex++;
indexLine = 0;
if (blockIndex < docBlocks.size()) {
/*if (blockIndex < docBlocks.size()) {
block = docBlocks.get(blockIndex);
currentLineStartPos = block.getStartToken();
}
}*/
continue;
}
String[] lines = localText.split("[\\n\\r]");
if ( (lines.length == 0) || (indexLine >= lines.length)) {
blockIndex++;
indexLine = 0;
if (blockIndex < docBlocks.size()) {
/*if (blockIndex < docBlocks.size()) {
block = docBlocks.get(blockIndex);
currentLineStartPos = block.getStartToken();
}
}*/
continue;
}
else {
@@ -72,7 +72,6 @@ public void testFullTextParser() throws Exception {
getTestResourcePath();

File pdfPath = new File(testPath, "/Wang-paperAVE2008.pdf");

Document tei = GrobidFactory.getInstance().createEngine().fullTextToTEIDoc(pdfPath, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
//System.out.println(tei);
@@ -84,7 +83,6 @@ public void testFullTextParser() throws Exception {


pdfPath = new File(testPath + "/1001._0908.0054.pdf");

tei = GrobidFactory.getInstance().createEngine().fullTextToTEIDoc(pdfPath, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
//System.out.println(tei);
@@ -111,7 +109,7 @@ public void testFullTextParser() throws Exception {
}

/**
 * Shared assertions for a parsed document: checks the document/block
 * tokenization synchronization via {@code assertDocAndBlockTokenizationSync},
 * then checks that {@code doc.getTei()} is not null.
 *
 * @param doc the parsed document to check
 */
private void assertTei(Document doc) {
//assertDocAndBlockTokenizationSync(doc);
assertDocAndBlockTokenizationSync(doc);
// the TEI result must have been produced
assertNotNull(doc.getTei());
}

@@ -144,6 +142,14 @@ private void assertDocAndBlockTokenizationSync(Document doc) {
for (DocumentPiece p : parts) {
DocumentPointer startPtr = p.a;
DocumentPointer endPtr = p.b;
/*if (doc.getBlocks().get(startPtr.getBlockPtr()).getTokens() == null) {
System.out.println("block invalid: " + startPtr.getBlockPtr() + ", block size:" + doc.getBlocks().size());
if (doc.getBlocks().get(startPtr.getBlockPtr()).getTokens() == null) {
System.out.println("token list is null in the block");
System.out.println(doc.getBlocks().get(startPtr.getBlockPtr()).toString());
System.out.println("while available start token in block is " + startPtr.getTokenBlockPos());
}
}*/
assertEquals(doc.getTokenizations().get(startPtr.getTokenDocPos()), doc.getBlocks().get(startPtr.getBlockPtr()).getTokens().get(startPtr.getTokenBlockPos()));
assertEquals(doc.getTokenizations().get(endPtr.getTokenDocPos()), doc.getBlocks().get(endPtr.getBlockPtr()).getTokens().get(endPtr.getTokenBlockPos()));

0 comments on commit 1c3dca2

Please sign in to comment.
You can’t perform that action at this time.