Skip to content

Commit

Permalink
add new hack
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Oct 15, 2022
1 parent 27a25ad commit 52352a8
Showing 1 changed file with 45 additions and 1 deletion.
Expand Up @@ -418,6 +418,51 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
layoutTokenization = layouts.getTokenization();
if ( (featuredText != null) && (featuredText.trim().length() > 0) ) {
res = label(featuredText);

// post-process the labeling to address https://github.com/kermitt2/grobid/issues/956
//
// this is specific to processShort() and a very rare case:
// When the sequence starts with a figure or a table, this is considered as invalid
// for a short text (as we expect text right at the beginning) so we re-write here
// the labels to be interpreted as paragraph (as fallback).

// NOTE: this is a temporary hack to be removed when figure and table will be removed from
// the full text model

if (!StringUtils.isEmpty(res) &&
(res.indexOf(TaggingLabels.FIGURE.getLabel()) != -1 || res.indexOf(TaggingLabels.TABLE.getLabel()) != -1)) {
// we have some figure/table around, check if there are some at the beginning of the sequence
String[] resPieces = res.split("\n");
StringBuilder newRes = new StringBuilder();
boolean inPrefix = true;
for(int i=0; i<resPieces.length; i++) {
String resPiece = resPieces[i];
if (i==0 &&
(resPiece.endsWith(TaggingLabels.FIGURE.getLabel()) || resPiece.endsWith(TaggingLabels.TABLE.getLabel()))
) {
// we need to post-process the result
resPiece= resPiece.replace(TaggingLabels.FIGURE.getLabel(), TaggingLabels.PARAGRAPH.getLabel())
.replace(TaggingLabels.TABLE.getLabel(), TaggingLabels.PARAGRAPH.getLabel());
newRes.append(resPiece).append("\n");
} else if (i==0) {
// no table or figure at the beginning, nothing need to be done
newRes.append(res);
break;
} else {
// we are post-processing, but we change just the start of the sequence
if (inPrefix &&
resPiece.endsWith(TaggingLabels.FIGURE.getLabel()) || resPiece.endsWith(TaggingLabels.TABLE.getLabel())) {
resPiece = resPiece.replace(TaggingLabels.FIGURE.getLabel(), TaggingLabels.PARAGRAPH.getLabel())
.replace(TaggingLabels.TABLE.getLabel(), TaggingLabels.PARAGRAPH.getLabel());
} else {
inPrefix = false;
}

newRes.append(resPiece).append("\n");
}
}
res = newRes.toString();
}
}
}

Expand Down Expand Up @@ -2525,7 +2570,6 @@ private void toTEI(Document doc,
teiFormatter,
resCitations,
config);

if (fundingStmt.length() > 0) {
tei.append(fundingStmt);
}
Expand Down

0 comments on commit 52352a8

Please sign in to comment.