From 6ff15ee87db55c010b846e6a8a7120123534c7bf Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 17 Apr 2024 08:30:20 +0700 Subject: [PATCH 1/5] keep convention on the token/character calculation --- .../main/java/org/grobid/core/lexicon/Lexicon.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index 5bd5e642b9..681a0da7a7 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -1195,24 +1195,24 @@ public static List characterPositionsUrlPatternWithPdfAnnotation List urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations); + // We need to adjust the end of the positions to avoid problems with the sublist + // that is used in the following method + urlTokensPositions.stream().forEach(o -> o.end += 1); + // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text // which may be different (spaces, hypen, breakline) return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions); } /** - * This method returns the token positions in respect of the layout tokens + * This method returns the token positions in respect of the layout tokens, + * the output token offsets are (included, included) */ public static List tokenPositionUrlPatternWithPdfAnnotations( List layoutTokens, List pdfAnnotations) { - List offsetPositions = convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens); - // We need to adjust the end of the positions to avoid problems with the sublist - - offsetPositions.stream().forEach(o -> o.end += 1); - - return offsetPositions; + return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, 
pdfAnnotations), layoutTokens); } /** From 3900dc228b462dd951605ef31a7809fb18e6233c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:54:46 +0800 Subject: [PATCH 2/5] update test to follow the convention --- .../src/test/java/org/grobid/core/lexicon/LexiconTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index abf407dbbf..c70c930435 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -165,7 +165,8 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr assertThat(offsetPositions, hasSize(1)); OffsetPosition url = offsetPositions.get(0); - assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2")); + // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is inclusive, inclusive + assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2")); } @Test From ec52f13948f854fc28c44f4a37f357c1ee9f44b2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 12:50:26 +0900 Subject: [PATCH 3/5] get fixes on matchTokenAndString from PR #1099 --- .../grobid/core/utilities/TextUtilities.java | 38 +++-- .../core/utilities/TextUtilitiesTest.java | 136 +++++++++++++++++- 2 files changed, 160 insertions(+), 14 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 87224cace8..f0e6cf03af 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1557,22 +1557,25 @@ public static 
org.apache.commons.lang3.tuple.Pair matchTokenAndString(List layoutTokens, String text, List urlPositions) { + public static List matchTokenAndString(List layoutTokens, String text, List positions) { List newPositions = new ArrayList<>(); StringBuilder accumulator = new StringBuilder(); int pos = 0; + int textPositionOfToken = 0; - for (OffsetPosition urlPosition : urlPositions) { - List urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end); + for (OffsetPosition position : positions) { + List annotationTokens = layoutTokens.subList(position.start, position.end); boolean first = true; - for (int i = 0; i < urlTokens.size(); i++) { - LayoutToken token = urlTokens.get(i); + accumulator = new StringBuilder(); + for (int i = 0; i < annotationTokens.size(); i++) { + LayoutToken token = annotationTokens.get(i); if (StringUtils.isEmpty(token.getText())) continue; - int newPos = text.indexOf(token.getText(), pos); - if (newPos != -1) { + textPositionOfToken = text.indexOf(token.getText(), pos); + if (textPositionOfToken != -1) { + //We update pos only at the first token of the annotation positions if (first) { - pos = newPos; + pos = textPositionOfToken; first = false; } accumulator.append(token); @@ -1581,16 +1584,25 @@ public static List matchTokenAndString(List layoutT continue; } if (StringUtils.isNotEmpty(accumulator)) { + int accumulatorTextLength = accumulator.toString().length(); int start = text.indexOf(accumulator.toString(), pos); - newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - accumulator = new StringBuilder(); - pos = newPos; - first = true; + int end = start + accumulatorTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; break; } - pos = newPos; + pos = textPositionOfToken; } } + if (StringUtils.isNotEmpty(accumulator)) { + int annotationTextLength = accumulator.toString().length(); + int start = text.indexOf(accumulator.toString(), pos); + int end = start + 
annotationTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; + accumulator = new StringBuilder(); + } + } if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index 6303dc6450..8b53cc263e 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -13,7 +13,6 @@ import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -436,4 +435,139 @@ public void testMatchTokenAndString() throws Exception { assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); } + + + @Test + public void testMatchTokenAndString_twoElements() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. 
Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + List urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(2)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(9)); + + assertThat(inputReal.substring(url0.start, url0.end), is("This work")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception { + final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List urlTokens = Arrays.asList( + new OffsetPosition(0, 3), + new OffsetPosition(5, 8), + new OffsetPosition(10, 13), + new OffsetPosition(15, 18) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens); + + assertThat(offsetPositions, hasSize(4)); + + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(19)); + + assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(21)); + assertThat(url1.end, is(34)); + + assertThat(input.substring(url1.start, url1.end), is("Claudie Marec")); + + OffsetPosition url2 = offsetPositions.get(2); + assertThat(url2.start, is(36)); + 
assertThat(url2.end, is(49)); + + assertThat(input.substring(url2.start, url2.end), is("Claudie Marec")); + + OffsetPosition url3 = offsetPositions.get(3); + assertThat(url3.start, is(51)); + assertThat(url3.end, is(66)); + + assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception { + final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List annotationTokenPositions = Arrays.asList( + new OffsetPosition(4, 7), + new OffsetPosition(9, 12), + new OffsetPosition(15, 18), + new OffsetPosition(27, 30), + new OffsetPosition(49, 52), + new OffsetPosition(71, 74), + new OffsetPosition(103, 106), + new OffsetPosition(109, 110), + new OffsetPosition(125, 126) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions); + + assertThat(offsetPositions, hasSize(9)); + + OffsetPosition url7 = offsetPositions.get(7); + assertThat(url7.start, is(349)); + assertThat(url7.end, is(352)); + + assertThat(input.substring(url7.start, url7.end), is("IGC")); + + OffsetPosition url8 = offsetPositions.get(8); + assertThat(url8.start, is(397)); + assertThat(url8.end, is(400)); + + assertThat(input.substring(url8.start, url8.end), is("IGC")); + + } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception { + final String input = "We thank Benoit Demars for 
providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List annotationTokenPositions = Arrays.asList( + new OffsetPosition(4, 7), + new OffsetPosition(40, 41), + new OffsetPosition(62, 63), + new OffsetPosition(79, 84) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions); + + assertThat(offsetPositions, hasSize(4)); + + OffsetPosition url7 = offsetPositions.get(1); + assertThat(input.substring(url7.start, url7.end), is("NERC")); + + OffsetPosition url8 = offsetPositions.get(2); + assertThat(input.substring(url8.start, url8.end), is("ERC")); + } } From f983f2548813a02e9dc1a0b37e5fefe1eafd4abb Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 9 May 2024 12:17:59 +0900 Subject: [PATCH 4/5] Add additional test and fix to the method so that the offsets are correctly matching the real text (dehyphenised) --- .../grobid/core/document/TEIFormatter.java | 2 +- .../org/grobid/core/lexicon/LexiconTest.java | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 20a7746388..7283a2e513 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1873,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations); + List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text); 
forbiddenPositions.addAll(offsetPositionsUrls); List theSentences = diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index c70c930435..8b3b501488 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -208,4 +208,40 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC OffsetPosition url = offsetPositions.get(0); assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); } + + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { + final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" + + "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" + + "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" + + "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" + + "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" + + "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. 
\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(97); + lastTokenOfTheURL.setPage(19); + lastTokenOfTheURL.setX(465.54675000000003); + lastTokenOfTheURL.setY(404.908); + lastTokenOfTheURL.setWidth(68.727); + lastTokenOfTheURL.setHeight(9.0873); + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(19); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("http://www.gencodegenes.org/releases/"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. 
org/releases/")); + } } From 617aa16a29ccd578c5734d42c1a92fdfce01b811 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 9 May 2024 17:11:07 +0900 Subject: [PATCH 5/5] Apply url preservation also in tables description and notes --- grobid-core/src/main/java/org/grobid/core/data/Table.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 6356978837..14d468418c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } if (desc != null && config.isWithSentenceSegmentation()) { - formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); // we need a sentence segmentation of the table caption, for that we need to introduce // a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form if (noteNode != null && config.isWithSentenceSegmentation()) { // we need a sentence segmentation of the figure caption - formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } // enclose note content in a <p> element