From 2aec2362764f07b26ae88812b13dcae4ba636d2a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 27 Jan 2021 11:25:01 +0900 Subject: [PATCH] fixing possible miss of opened tables when the stream finishes --- .../grobid/core/engines/FullTextParser.java | 39 ++++++--- .../core/engines/FullTextParserTest.java | 81 +++++++++++++++++-- 2 files changed, 102 insertions(+), 18 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index a6cdf1fd58..e11fc015de 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -2113,7 +2113,7 @@ protected List processTables(String rese, * Create training data for the table as identified by the full text model. * Return the pair (TEI fragment, CRF raw data). */ - private Pair processTrainingDataTables(String rese, + protected Pair processTrainingDataTables(String rese, List tokenizations, String id) { StringBuilder tei = new StringBuilder(); StringBuilder featureVector = new StringBuilder(); @@ -2156,19 +2156,15 @@ private Pair processTrainingDataTables(String rese, int ll = s.length; String label = s[ll-1]; String plainLabel = GenericTaggerUtils.getPlainLabel(label); - if (label.equals("
") || (label.equals("I-
") && !openTable) ) { + if (label.equals("
") || ((label.equals("I-
") && !openTable) )) { if (!openTable) { - for(LayoutToken lTok : tokenizationsBuffer) { - tokenizationsTable.add(lTok); - } - openTable = true; - } + openTable = true; + tokenizationsTable.addAll(tokenizationsBuffer); } // we remove the label in the CRF row int ind = row.lastIndexOf("\t"); tableBlock.append(row.substring(0, ind)).append("\n"); - } - else if (label.equals("I-
") || openTable) { - // remove last token + } else if (label.equals("I-
") || openTable) { + // remove last tokens if (tokenizationsTable.size() > 0) { int nbToRemove = tokenizationsBuffer.size(); for(int q=0; q") || openTable) { // process the "accumulated" table Pair trainingData = parsers.getTableParser().createTrainingData(tokenizationsTable, tableBlock.toString(), "Fig"+nb); - tokenizationsTable = new ArrayList(); + tokenizationsTable = new ArrayList<>(); tableBlock = new StringBuilder(); if (trainingData!= null) { if (tei.length() == 0) { @@ -2216,6 +2212,27 @@ else if (label.equals("I-
") || openTable) { openTable = false; } + // If there still an open table + if (openTable) { + while((tokenizationsTable.size() > 0) && + (tokenizationsTable.get(0).getText().equals("\n") || + tokenizationsTable.get(0).getText().equals(" ")) ) + tokenizationsTable.remove(0); + + // process the "accumulated" figure + Pair trainingData = parsers.getTableParser() + .createTrainingData(tokenizationsTable, tableBlock.toString(), "Fig" + nb); + if (trainingData!= null) { + if (tei.length() == 0) { + tei.append(parsers.getTableParser().getTEIHeader(id)).append("\n\n"); + } + if (trainingData.getLeft() != null) + tei.append(trainingData.getLeft()).append("\n\n"); + if (trainingData.getRight() != null) + featureVector.append(trainingData.getRight()).append("\n\n"); + } + } + if (tei.length() != 0) { tei.append("\n \n" + "\n"); diff --git a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java index 872885bbc8..8cc785948c 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java @@ -75,10 +75,6 @@ public void testProcessTrainingDataFigures_single_figure() throws Exception { String tokenisation = stringStringPair.getRight(); String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - System.out.println(tei); - System.out.println(reconstructedText); - System.out.println(tokenisation); - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); assertThat(tokenisation.split("\n").length, is(13)); @@ -119,10 +115,81 @@ public void testProcessTrainingDataFigures_multiple_figures() throws Exception { } } + assertThat(output, hasSize(2)); + assertThat(output.get(0), is("FIG . 1 . λ ( T )")); + assertThat(output.get(1), is("vs . T for YBCO")); + assertThat(tokenisation.split("\n").length, is(15)); - System.out.println(tei); - System.out.println(output); - System.out.println(tokenisation); + } + + @Test + public void testProcessTrainingDataTables_single_table() throws Exception { + String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; + + + Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); + + String tei = stringStringPair.getLeft(); + String tokenisation = stringStringPair.getRight(); + String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); + + assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); + assertThat(tokenisation.split("\n").length, is(13)); + + } + + @Test + public void testProcessTrainingDataTable_multiple_tables() throws Exception { + String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; + + + Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); + + String tei = stringStringPair.getLeft(); + String tokenisation = stringStringPair.getRight(); + List output = new ArrayList<>(); + for (String block : tokenisation.split("\n\n\n")) { + String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); + if (StringUtils.isNotBlank(collect)) { + output.add(collect); + } + } assertThat(output, hasSize(2)); assertThat(output.get(0), is("FIG . 1 . λ ( T )"));