Skip to content

Commit

Permalink
fix #424, fix labeled abstract mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Sep 12, 2019
1 parent 345c6ae commit 6a9e167
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ grobid-smecta
grobid-example
grobid-software
software-mentions
grobid-keyterm
dataseer-ml
grobid-test-ant
grobid-home/models/quantities
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1558,7 +1558,7 @@ public String postProcessVolumeBlock() {
*
* To be done: use a short text model to structure abstract
*/
final String[] ABSTRACT_PREFIXES = {"abstract", "summary", "résumé", "abrégé", "a b s t r a c t"};
public static final String[] ABSTRACT_PREFIXES = {"abstract", "summary", "résumé", "abrégé", "a b s t r a c t"};

public String cleanAbstract(String string) {

Expand Down Expand Up @@ -4242,6 +4242,9 @@ public void generalResultMapping(Document doc, String labeledResult, List<Layout
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
if (clusterLabel.equals(TaggingLabels.HEADER_INTRO)) {
break;
}
List<LayoutToken> clusterTokens = cluster.concatTokens();
List<LayoutToken> theList = labeledTokens.get(clusterLabel.getLabel());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,8 @@ private StringBuilder toTEITextPiece(StringBuilder buffer,
curDiv.appendChild(note);
} else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) {
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
//if (biblio != null)
// clusterContent = biblio.cleanAbstract(clusterContent);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,11 @@ public Document processing(DocumentSource documentSource,
if (CollectionUtils.isNotEmpty(abstractTokens)) {
Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
if (abstractProcessed != null) {
resHeader.setLabeledAbstract(abstractProcessed.getLeft());
// neutralize figure and table annotations (will be considered as paragraphs)
String labeledAbstract = abstractProcessed.getLeft();
labeledAbstract = postProcessLabeledAbstract(labeledAbstract);
//System.out.println(labeledAbstract);
resHeader.setLabeledAbstract(labeledAbstract);
resHeader.setLayoutTokensForLabel(abstractProcessed.getRight(), TaggingLabels.HEADER_ABSTRACT);
}
}
Expand Down Expand Up @@ -361,7 +365,7 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
for(LayoutToken token : tokens) {
if (currentChunk.size() != 0) {
int tokenPos = token.getOffset();
if (currentPos+1 != tokenPos) {
if (currentPos != tokenPos) {
// new chunk
tokenChunks.add(currentChunk);
currentChunk = new ArrayList<LayoutToken>();
Expand Down Expand Up @@ -400,6 +404,40 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
return Pair.of(res, layoutTokenization);
}

/**
 * Rewrites the label column of a labeled abstract so that figure and table
 * labels are replaced by paragraph labels.
 *
 * The input is read as newline-separated rows of tab-separated fields, the
 * last field of each row being the label. Rows that are blank after trimming
 * are dropped. Every kept row is re-emitted with its fields joined by tabs
 * and terminated by a newline.
 *
 * Relabeling rules, applied to the last field only:
 * - a label equal to "I-" + figure/table label becomes the plain paragraph
 *   label when the ORIGINAL label of the previous kept row ends with the
 *   paragraph label, and "I-" + paragraph label otherwise (including on the
 *   first kept row);
 * - a label equal to the plain figure/table label becomes the plain
 *   paragraph label;
 * - any other label is left untouched.
 *
 * @param labeledAbstract the labeled abstract, may be null
 * @return the rewritten labeled abstract, or null when the input is null
 */
static protected String postProcessLabeledAbstract(String labeledAbstract) {
    if (labeledAbstract == null) {
        return null;
    }

    StringBuilder output = new StringBuilder();
    // Label of the previous kept row as it was BEFORE any rewriting.
    String previousLabel = null;

    for (String row : labeledAbstract.split("\n")) {
        if (row == null || row.trim().isEmpty()) {
            continue;
        }

        String[] fields = row.split("\t");
        int labelIndex = fields.length - 1;
        String originalLabel = fields[labelIndex];

        String paragraphLabel = TaggingLabels.PARAGRAPH.getLabel();
        String figureLabel = TaggingLabels.FIGURE.getLabel();
        String tableLabel = TaggingLabels.TABLE.getLabel();

        boolean opensFigureOrTable = originalLabel.equals("I-" + figureLabel)
                || originalLabel.equals("I-" + tableLabel);

        if (opensFigureOrTable) {
            // Merge into the running paragraph only when the previous row was
            // originally a paragraph row; otherwise open a new paragraph.
            boolean followsParagraph = previousLabel != null
                    && previousLabel.endsWith(paragraphLabel);
            fields[labelIndex] = followsParagraph ? paragraphLabel : "I-" + paragraphLabel;
        } else if (originalLabel.equals(figureLabel) || originalLabel.equals(tableLabel)) {
            fields[labelIndex] = paragraphLabel;
        }

        // Re-assemble the row: tab between fields, newline at the end.
        output.append(fields[0]);
        for (int k = 1; k < fields.length; k++) {
            output.append("\t").append(fields[k]);
        }
        output.append("\n");

        previousLabel = originalLabel;
    }

    return output.toString();
}

static public Pair<String, LayoutTokenization> getBodyTextFeatured(Document doc,
SortedSet<DocumentPiece> documentBodyParts) {
if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public static void tearDown() {
//
// }

@Test
/*@Test
public void testGetDocumentPieces1() throws Exception {
Document documentMock = createMock(Document.class);
Expand Down Expand Up @@ -129,6 +129,6 @@ public void testGetDocumentPieces1() throws Exception {
assertThat(documentPieces1.get(1).getLeft().getTokenDocPos(), is(25000));
assertThat(documentPieces1.get(1).getRight().getBlockPtr(), is(3));
assertThat(documentPieces1.get(1).getRight().getTokenDocPos(), is(25088));
}
}*/

}

0 comments on commit 6a9e167

Please sign in to comment.