Skip to content

Commit

Permalink
fixing possible miss of opened tables when the stream finishes
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 27, 2021
1 parent f5963e9 commit 2aec236
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 18 deletions.
Expand Up @@ -2113,7 +2113,7 @@ protected List<Table> processTables(String rese,
* Create training data for the table as identified by the full text model.
* Return the pair (TEI fragment, CRF raw data).
*/
private Pair<String,String> processTrainingDataTables(String rese,
protected Pair<String,String> processTrainingDataTables(String rese,
List<LayoutToken> tokenizations, String id) {
StringBuilder tei = new StringBuilder();
StringBuilder featureVector = new StringBuilder();
Expand Down Expand Up @@ -2156,19 +2156,15 @@ private Pair<String,String> processTrainingDataTables(String rese,
int ll = s.length;
String label = s[ll-1];
String plainLabel = GenericTaggerUtils.getPlainLabel(label);
if (label.equals("<table>") || (label.equals("I-<table>") && !openTable) ) {
if (label.equals("<table>") || ((label.equals("I-<table>") && !openTable) )) {
if (!openTable) {
for(LayoutToken lTok : tokenizationsBuffer) {
tokenizationsTable.add(lTok);
}
openTable = true;
}
openTable = true;
tokenizationsTable.addAll(tokenizationsBuffer); }
// we remove the label in the CRF row
int ind = row.lastIndexOf("\t");
tableBlock.append(row.substring(0, ind)).append("\n");
}
else if (label.equals("I-<table>") || openTable) {
// remove last token
} else if (label.equals("I-<table>") || openTable) {
// remove last tokens
if (tokenizationsTable.size() > 0) {
int nbToRemove = tokenizationsBuffer.size();
for(int q=0; q<nbToRemove; q++)
Expand All @@ -2191,7 +2187,7 @@ else if (label.equals("I-<table>") || openTable) {

// process the "accumulated" table
Pair<String,String> trainingData = parsers.getTableParser().createTrainingData(tokenizationsTable, tableBlock.toString(), "Fig"+nb);
tokenizationsTable = new ArrayList<LayoutToken>();
tokenizationsTable = new ArrayList<>();
tableBlock = new StringBuilder();
if (trainingData!= null) {
if (tei.length() == 0) {
Expand All @@ -2216,6 +2212,27 @@ else if (label.equals("I-<table>") || openTable) {
openTable = false;
}

// If there still an open table
if (openTable) {
while((tokenizationsTable.size() > 0) &&
(tokenizationsTable.get(0).getText().equals("\n") ||
tokenizationsTable.get(0).getText().equals(" ")) )
tokenizationsTable.remove(0);

// process the "accumulated" figure
Pair<String,String> trainingData = parsers.getTableParser()
.createTrainingData(tokenizationsTable, tableBlock.toString(), "Fig" + nb);
if (trainingData!= null) {
if (tei.length() == 0) {
tei.append(parsers.getTableParser().getTEIHeader(id)).append("\n\n");
}
if (trainingData.getLeft() != null)
tei.append(trainingData.getLeft()).append("\n\n");
if (trainingData.getRight() != null)
featureVector.append(trainingData.getRight()).append("\n\n");
}
}

if (tei.length() != 0) {
tei.append("\n </text>\n" +
"</tei>\n");
Expand Down
Expand Up @@ -75,10 +75,6 @@ public void testProcessTrainingDataFigures_single_figure() throws Exception {
String tokenisation = stringStringPair.getRight();
String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" "));

System.out.println(tei);
System.out.println(reconstructedText);
System.out.println(tokenisation);

assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO"));
assertThat(tokenisation.split("\n").length, is(13));

Expand Down Expand Up @@ -119,10 +115,81 @@ public void testProcessTrainingDataFigures_multiple_figures() throws Exception {
}
}

assertThat(output, hasSize(2));
assertThat(output.get(0), is("FIG . 1 . λ ( T )"));
assertThat(output.get(1), is("vs . T for YBCO"));
assertThat(tokenisation.split("\n").length, is(15));

System.out.println(tei);
System.out.println(output);
System.out.println(tokenisation);
}

@Test
public void testProcessTrainingDataTables_single_table() throws Exception {
String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-<paragraph>\n" +
"mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-<table>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t<table>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n\n";


Pair<String, String> stringStringPair = target.processTrainingDataTables(rese, tokens, "123");

String tei = stringStringPair.getLeft();
String tokenisation = stringStringPair.getRight();
String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" "));

assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO"));
assertThat(tokenisation.split("\n").length, is(13));

}

@Test
public void testProcessTrainingDataTable_multiple_tables() throws Exception {
String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-<paragraph>\n" +
"mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t<paragraph>\n" +
"FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-<table>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t<table>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-<table>\n" +
".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n" +
"YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t<table>\n\n";


Pair<String, String> stringStringPair = target.processTrainingDataTables(rese, tokens, "123");

String tei = stringStringPair.getLeft();
String tokenisation = stringStringPair.getRight();
List<String> output = new ArrayList<>();
for (String block : tokenisation.split("\n\n\n")) {
String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" "));
if (StringUtils.isNotBlank(collect)) {
output.add(collect);
}
}

assertThat(output, hasSize(2));
assertThat(output.get(0), is("FIG . 1 . λ ( T )"));
Expand Down

0 comments on commit 2aec236

Please sign in to comment.