Skip to content

Commit

Permalink
Merge e6ffaec into 5d2d814
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 14, 2021
2 parents 5d2d814 + e6ffaec commit f791f95
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 17 deletions.
Expand Up @@ -1936,15 +1936,15 @@ protected List<Figure> processFigures(String rese, List<LayoutToken> layoutToken
* Create training data for the figures as identified by the full text model.
* Return the pair (TEI fragment, CRF raw data).
*/
private Pair<String,String> processTrainingDataFigures(String rese,
protected Pair<String,String> processTrainingDataFigures(String rese,
List<LayoutToken> tokenizations, String id) {
StringBuilder tei = new StringBuilder();
StringBuilder featureVector = new StringBuilder();
int nb = 0;
StringTokenizer st1 = new StringTokenizer(rese, "\n");
boolean openFigure = false;
StringBuilder figureBlock = new StringBuilder();
List<LayoutToken> tokenizationsFigure = new ArrayList<LayoutToken>();
List<LayoutToken> tokenizationsFigure = new ArrayList<>();
List<LayoutToken> tokenizationsBuffer = null;
int p = 0; // position in tokenizations
int i = 0;
Expand All @@ -1954,7 +1954,7 @@ private Pair<String,String> processTrainingDataFigures(String rese,
String token = s[0].trim();
int p0 = p;
boolean strop = false;
tokenizationsBuffer = new ArrayList<LayoutToken>();
tokenizationsBuffer = new ArrayList<>();
while ((!strop) && (p < tokenizations.size())) {
String tokOriginal = tokenizations.get(p).getText().trim();
if (openFigure)
Expand All @@ -1980,20 +1980,17 @@ private Pair<String,String> processTrainingDataFigures(String rese,
String plainLabel = GenericTaggerUtils.getPlainLabel(label);
if (label.equals("<figure>") || ((label.equals("I-<figure>") && !openFigure))) {
if (!openFigure) {
for(LayoutToken lTok : tokenizationsBuffer) {
tokenizationsFigure.add(lTok);
}
openFigure = true;
openFigure = true;
tokenizationsFigure.addAll(tokenizationsBuffer);
}
// we remove the label in the CRF row
int ind = row.lastIndexOf("\t");
figureBlock.append(row.substring(0, ind)).append("\n");
}
else if (label.equals("I-<figure>") || openFigure) {
// remove last token
figureBlock.append(row, 0, ind).append("\n");
} else if (label.equals("I-<figure>") || openFigure) {
// remove last tokens
if (tokenizationsFigure.size() > 0) {
int nbToRemove = tokenizationsBuffer.size();
for(int q=0; q<nbToRemove; q++)
for(int q = 0; q < nbToRemove; q++)
tokenizationsFigure.remove(tokenizationsFigure.size()-1);
}
// parse the recognized figure area
Expand All @@ -2014,7 +2011,7 @@ else if (label.equals("I-<figure>") || openFigure) {
// process the "accumulated" figure
Pair<String,String> trainingData = parsers.getFigureParser()
.createTrainingData(tokenizationsFigure, figureBlock.toString(), "Fig" + nb);
tokenizationsFigure = new ArrayList<LayoutToken>();
tokenizationsFigure = new ArrayList<>();
figureBlock = new StringBuilder();
if (trainingData!= null) {
if (tei.length() == 0) {
Expand All @@ -2032,16 +2029,35 @@ else if (label.equals("I-<figure>") || openFigure) {
}
int ind = row.lastIndexOf("\t");
figureBlock.append(row.substring(0, ind)).append("\n");
}
else {
} else {
openFigure = false;
}
nb++;
}
else
} else
openFigure = false;
}

// If there is still an open figure at the end of the labelled sequence, flush it
if (openFigure) {
while((tokenizationsFigure.size() > 0) &&
(tokenizationsFigure.get(0).getText().equals("\n") ||
tokenizationsFigure.get(0).getText().equals(" ")) )
tokenizationsFigure.remove(0);

// process the "accumulated" figure
Pair<String,String> trainingData = parsers.getFigureParser()
.createTrainingData(tokenizationsFigure, figureBlock.toString(), "Fig" + nb);
if (trainingData!= null) {
if (tei.length() == 0) {
tei.append(parsers.getFigureParser().getTEIHeader(id)).append("\n\n");
}
if (trainingData.getLeft() != null)
tei.append(trainingData.getLeft()).append("\n\n");
if (trainingData.getRight() != null)
featureVector.append(trainingData.getRight()).append("\n\n");
}
}

if (tei.length() != 0) {
tei.append("\n </text>\n" +
"</tei>\n");
Expand Down
@@ -1,5 +1,7 @@
package org.grobid.core.engines;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
Expand All @@ -14,6 +16,7 @@
import org.junit.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.SortedSet;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -43,6 +46,91 @@ public static void tearDown() {
GrobidFactory.reset();
}

@Test
public void testProcessTrainingDataFigures_single_figure() throws Exception {
String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-<paragraph>\n" +
"mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-<figure>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t<figure>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n\n";


Pair<String, String> stringStringPair = target.processTrainingDataFigures(rese, tokens, "123");

String tei = stringStringPair.getLeft();
String tokenisation = stringStringPair.getRight();
String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" "));

System.out.println(tei);
System.out.println(reconstructedText);
System.out.println(tokenisation);

assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO"));
assertThat(tokenisation.split("\n").length, is(13));

}

@Test
public void testProcessTrainingDataFigures_multiple_figures() throws Exception {
String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-<paragraph>\n" +
"mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-<figure>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t<figure>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-<figure>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n" +
"YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<figure>\n\n";


Pair<String, String> stringStringPair = target.processTrainingDataFigures(rese, tokens, "123");

String tei = stringStringPair.getLeft();
String tokenisation = stringStringPair.getRight();
List<String> output = new ArrayList<>();
for (String block : tokenisation.split("\n\n\n")) {
String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" "));
if (StringUtils.isNotBlank(collect)) {
output.add(collect);
}
}


System.out.println(tei);
System.out.println(output);
System.out.println(tokenisation);

assertThat(output, hasSize(2));
assertThat(output.get(0), is("FIG . 1 . λ ( T )"));
assertThat(output.get(1), is("vs . T for YBCO"));
assertThat(tokenisation.split("\n").length, is(15));

}

// @Test
// public void testProcess2() throws Exception {
// String text = "(a) shows the temperature variation of the 31 P-\n" +
Expand Down

0 comments on commit f791f95

Please sign in to comment.