Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added annex figures, tables, equations #738

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1131,6 +1131,9 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,
BiblioItem biblio,
List<BibDataSet> bds,
List<LayoutToken> tokenizations,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
Document doc,
GrobidAnalysisConfig config) throws Exception {
if ((result == null) || (tokenizations == null)) {
Expand All @@ -1139,7 +1142,7 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,

buffer.append("\t\t\t<div type=\"annex\">\n");
buffer = toTEITextPiece(buffer, result, biblio, bds, true,
new LayoutTokenization(tokenizations), null, null, null, doc, config);
new LayoutTokenization(tokenizations), figures, tables, equations, doc, config);
buffer.append("\t\t\t</div>\n");

return buffer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,29 +250,10 @@ else if (config.getConsolidateCitations() == 2)

// we now apply the figure and table models based on the fulltext labeled output
figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc);
// further parse the caption
for(Figure figure : figures) {
if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) {
Pair<String, List<LayoutToken>> captionProcess = processShort(figure.getCaptionLayoutTokens(), doc);
figure.setLabeledCaption(captionProcess.getLeft());
figure.setCaptionLayoutTokens(captionProcess.getRight());
}
}
postProcessFigureCaptions(figures, doc);

tables = processTables(resultBody, layoutTokenization.getTokenization(), doc);
// further parse the caption
for(Table table : tables) {
if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) {
Pair<String, List<LayoutToken>> captionProcess = processShort(table.getCaptionLayoutTokens(), doc);
table.setLabeledCaption(captionProcess.getLeft());
table.setCaptionLayoutTokens(captionProcess.getRight());
}
if ( CollectionUtils.isNotEmpty(table.getNoteLayoutTokens())) {
Pair<String, List<LayoutToken>> noteProcess = processShort(table.getNoteLayoutTokens(), doc);
table.setLabeledNote(noteProcess.getLeft());
table.setNoteLayoutTokens(noteProcess.getRight());
}
}
postProcessTableCaptions(tables, doc);

equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc);
} else {
Expand All @@ -283,6 +264,9 @@ else if (config.getConsolidateCitations() == 2)
documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX);
featSeg = getBodyTextFeatured(doc, documentBodyParts);
String resultAnnex = null;
List<Figure> annexFigures = null;
List<Table> annexTables = null;
List<Equation> annexEquations = null;
List<LayoutToken> tokenizationsBody2 = null;
if (featSeg != null && isNotEmpty(trim(featSeg.getLeft()))) {
// if featSeg is null, it usually means that no body segment is found in the
Expand All @@ -291,6 +275,14 @@ else if (config.getConsolidateCitations() == 2)
tokenizationsBody2 = featSeg.getRight().getTokenization();
resultAnnex = label(bodytext);
//System.out.println(rese);

annexFigures = processFigures(resultAnnex, tokenizationsBody2, doc);
postProcessFigureCaptions(annexFigures, doc);

annexTables = processTables(resultAnnex, tokenizationsBody2, doc);
postProcessTableCaptions(annexTables, doc);

annexEquations = processEquations(resultAnnex, tokenizationsBody2, doc);
}

// final combination
Expand All @@ -299,6 +291,7 @@ else if (config.getConsolidateCitations() == 2)
layoutTokenization, tokenizationsBody2, // tokenization for body and annex
resHeader, // header
figures, tables, equations,
annexFigures, annexTables, annexEquations,
config);
return doc;
} catch (GrobidException e) {
Expand Down Expand Up @@ -1930,6 +1923,19 @@ protected List<Figure> processFigures(String rese, List<LayoutToken> layoutToken
return results;
}

/**
 * Re-processes the caption of every figure in the given list.
 *
 * <p>For each figure whose caption layout tokens are non-empty, the tokens are
 * handed to {@code processShort}; the left element of the returned pair is stored
 * as the figure's labeled caption and the right element replaces the figure's
 * caption layout tokens. Figures without caption tokens are left untouched.
 *
 * @param figures figures to update in place; must not be {@code null}
 * @param doc     document passed through unchanged to {@code processShort}
 */
protected void postProcessFigureCaptions(
        List<Figure> figures,
        Document doc
) {
    for (Figure fig : figures) {
        if (CollectionUtils.isEmpty(fig.getCaptionLayoutTokens())) {
            // no caption tokens: nothing to label for this figure
            continue;
        }

        Pair<String, List<LayoutToken>> labeledCaption = processShort(fig.getCaptionLayoutTokens(), doc);
        fig.setLabeledCaption(labeledCaption.getLeft());
        fig.setCaptionLayoutTokens(labeledCaption.getRight());
    }
}

/**
* Create training data for the figures as identified by the full text model.
Expand Down Expand Up @@ -2103,6 +2109,24 @@ protected List<Table> processTables(String rese,
return results;
}

/**
 * Re-processes the caption and the note of every table in the given list.
 *
 * <p>For each table, the caption layout tokens and the note layout tokens are
 * handled independently: whenever one of them is non-empty it is handed to
 * {@code processShort}; the left element of the returned pair is stored as the
 * labeled caption (respectively labeled note) and the right element replaces the
 * corresponding layout tokens on the table.
 *
 * @param tables tables to update in place; must not be {@code null}
 * @param doc    document passed through unchanged to {@code processShort}
 */
protected void postProcessTableCaptions(
        List<Table> tables,
        Document doc
) {
    for (Table tbl : tables) {
        // caption: only processed when the table actually carries caption tokens
        if (CollectionUtils.isNotEmpty(tbl.getCaptionLayoutTokens())) {
            Pair<String, List<LayoutToken>> labeledCaption = processShort(tbl.getCaptionLayoutTokens(), doc);
            tbl.setLabeledCaption(labeledCaption.getLeft());
            tbl.setCaptionLayoutTokens(labeledCaption.getRight());
        }

        // note: same treatment, independent of whether a caption was present
        if (CollectionUtils.isNotEmpty(tbl.getNoteLayoutTokens())) {
            Pair<String, List<LayoutToken>> labeledNote = processShort(tbl.getNoteLayoutTokens(), doc);
            tbl.setLabeledNote(labeledNote.getLeft());
            tbl.setNoteLayoutTokens(labeledNote.getRight());
        }
    }
}

/**
* Create training data for the table as identified by the full text model.
Expand Down Expand Up @@ -2312,6 +2336,9 @@ private void toTEI(Document doc,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
List<Figure> annexFigures,
List<Table> annexTables,
List<Equation> annexEquations,
GrobidAnalysisConfig config) {
if (doc.getBlocks() == null) {
return;
Expand Down Expand Up @@ -2348,7 +2375,10 @@ private void toTEI(Document doc,
}

tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations,
tokenizationsAnnex, doc, config);
tokenizationsAnnex,
annexFigures, annexTables, annexEquations,
doc, config
);

tei = teiFormatter.toTEIReferences(tei, resCitations, config);
doc.calculateTeiIdToBibDataSets();
Expand Down