Skip to content

Commit

Permalink
Merge pull request #959 from kermitt2/feature/funding-statement
Browse files Browse the repository at this point in the history
Add funding statement in TEI output
  • Loading branch information
kermitt2 committed Oct 19, 2022
2 parents 8e53a7d + 6c8b888 commit dab259e
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 140 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ public Document processing(DocumentSource documentSource,
if (abstractProcessed != null) {
// neutralize figure and table annotations (will be considered as paragraphs)
String labeledAbstract = abstractProcessed.getLeft();
labeledAbstract = postProcessLabeledAbstract(labeledAbstract);
labeledAbstract = postProcessFullTextLabeledText(labeledAbstract);
resHeader.setLabeledAbstract(labeledAbstract);
resHeader.setLayoutTokensForLabel(abstractProcessed.getRight(), TaggingLabels.HEADER_ABSTRACT);
}
Expand Down Expand Up @@ -418,18 +418,23 @@ public Pair<String, List<LayoutToken>> processShort(List<LayoutToken> tokens, Do
layoutTokenization = layouts.getTokenization();
if ( (featuredText != null) && (featuredText.trim().length() > 0) ) {
res = label(featuredText);
res = postProcessFullTextLabeledText(res);
}
}

return Pair.of(res, layoutTokenization);
}

static protected String postProcessLabeledAbstract(String labeledAbstract) {
if (labeledAbstract == null)
/**
* Post-process text labeled by the fulltext model for chunks that are known to contain only text (no table or figure).
* It converts table and figure labels to paragraph labels.
*/
protected static String postProcessFullTextLabeledText(String fulltextLabeledText) {
if (fulltextLabeledText == null)
return null;
StringBuilder result = new StringBuilder();

String[] lines = labeledAbstract.split("\n");
String[] lines = fulltextLabeledText.split("\n");
String previousLabel = null;
for(int i=0; i<lines.length; i++) {
String line = lines[i];
Expand Down Expand Up @@ -2466,16 +2471,20 @@ private void toTEI(Document doc,
tei.append("\t\t<back>\n");

// acknowledgement is in the back
tei.append(getSectionAsTEI("acknowledgement", "\t\t\t",doc, SegmentationLabels.ACKNOWLEDGEMENT,
teiFormatter, resCitations, config));
StringBuilder acknowledgmentStmt = getSectionAsTEI("acknowledgement", "\t\t\t", doc, SegmentationLabels.ACKNOWLEDGEMENT,
teiFormatter, resCitations, config);

if (acknowledgmentStmt.length() > 0) {
tei.append(acknowledgmentStmt);
}

// availability statements in header
StringBuilder availabilityStmt = new StringBuilder();
if (StringUtils.isNotBlank(resHeader.getAvailabilityStmt())) {
List<LayoutToken> headerAvailabilityStatementTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_AVAILABILITY);
Pair<String, List<LayoutToken>> headerAvailabilityProcessed = processShort(headerAvailabilityStatementTokens, doc);
if (headerAvailabilityProcessed != null) {
availabilityStmt = teiFormatter.processTEIDivSection("availability",
availabilityStmt = teiFormatter.processTEIDivSection("availability",
"\t\t\t",
headerAvailabilityProcessed.getLeft(),
headerAvailabilityProcessed.getRight(),
Expand All @@ -2488,7 +2497,7 @@ private void toTEI(Document doc,
}

// availability statements in non-header part
availabilityStmt = getSectionAsTEI("availability",
availabilityStmt = getSectionAsTEI("availability",
"\t\t\t",
doc,
SegmentationLabels.AVAILABILITY,
Expand All @@ -2499,6 +2508,36 @@ private void toTEI(Document doc,
tei.append(availabilityStmt.toString());
}

// funding statements in header
StringBuilder fundingStmt = new StringBuilder();
if (StringUtils.isNotBlank(resHeader.getFunding())) {
List<LayoutToken> headerFundingTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_FUNDING);
Pair<String, List<LayoutToken>> headerFundingProcessed = processShort(headerFundingTokens, doc);
if (headerFundingProcessed != null) {
fundingStmt = teiFormatter.processTEIDivSection("funding",
"\t\t\t",
headerFundingProcessed.getLeft(),
headerFundingProcessed.getRight(),
resCitations,
config);
}
if (fundingStmt.length() > 0) {
tei.append(fundingStmt.toString());
}
}

// funding statements in non-header part
fundingStmt = getSectionAsTEI("funding",
"\t\t\t",
doc,
SegmentationLabels.FUNDING,
teiFormatter,
resCitations,
config);
if (fundingStmt.length() > 0) {
tei.append(fundingStmt);
}

tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations,
tokenizationsAnnex, markerTypes, doc, config);

Expand Down Expand Up @@ -2542,6 +2581,7 @@ private StringBuilder getSectionAsTEI(String xmlType,
String resultLabelling = null;
if (StringUtils.isNotBlank(text) ) {
resultLabelling = label(text);
resultLabelling = postProcessFullTextLabeledText(resultLabelling);
}
output = teiFormatter.processTEIDivSection(xmlType, indentation, resultLabelling, tokens, resCitations, config);
}
Expand Down
Loading

0 comments on commit dab259e

Please sign in to comment.