From 6ff15ee87db55c010b846e6a8a7120123534c7bf Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Wed, 17 Apr 2024 08:30:20 +0700
Subject: [PATCH 1/5] keep convention on the token/character calculation

---
 .../main/java/org/grobid/core/lexicon/Lexicon.java | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
index 5bd5e642b9..681a0da7a7 100755
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -1195,24 +1195,24 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
 
         List<OffsetPosition> urlTokensPositions = tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations);
 
+        // Token positions are (inclusive, inclusive): shift each end by one because the following
+        // method slices the tokens with subList(start, end), whose end index is exclusive
+        urlTokensPositions.stream().forEach(o -> o.end += 1);
+
         // here we need to match the offsetPositions related to the text obtained by the layoutTokens, with the text
         // which may be different (spaces, hypen, breakline)
         return TextUtilities.matchTokenAndString(layoutTokens, text, urlTokensPositions);
     }
 
     /**
-     * This method returns the token positions in respect of the layout tokens
+     * This method returns the token positions with respect to the layout tokens;
+     * the output token offsets are (inclusive, inclusive).
      */
     public static List<OffsetPosition> tokenPositionUrlPatternWithPdfAnnotations(
         List<LayoutToken> layoutTokens,
         List<PDFAnnotation> pdfAnnotations) {
 
-        List<OffsetPosition> offsetPositions = convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens);
-        // We need to adjust the end of the positions to avoid problems with the sublist
-
-        offsetPositions.stream().forEach(o -> o.end += 1);
-
-        return offsetPositions;
+        return convertStringOffsetToTokenOffset(characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations), layoutTokens);
     }
 
     /**

From 3900dc228b462dd951605ef31a7809fb18e6233c Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Sun, 28 Apr 2024 09:54:46 +0800
Subject: [PATCH 2/5] update test to follow the convention

---
 .../src/test/java/org/grobid/core/lexicon/LexiconTest.java     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
index abf407dbbf..c70c930435 100644
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -165,7 +165,8 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr
 
         assertThat(offsetPositions, hasSize(1));
         OffsetPosition url = offsetPositions.get(0);
-        assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end)), is("https://github.com/lfoppiano/ \nsupercon2"));
+        // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is (inclusive, inclusive), while subList's end index is exclusive
+        assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2"));
     }
 
     @Test

From ec52f13948f854fc28c44f4a37f357c1ee9f44b2 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Sat, 4 May 2024 12:50:26 +0900
Subject: [PATCH 3/5] get fixes on matchTokenAndString from PR #1099

---
 .../grobid/core/utilities/TextUtilities.java  |  38 +++--
 .../core/utilities/TextUtilitiesTest.java     | 136 +++++++++++++++++-
 2 files changed, 160 insertions(+), 14 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
index 87224cace8..f0e6cf03af 100755
--- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
+++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
@@ -1557,22 +1557,25 @@ public static org.apache.commons.lang3.tuple.Pair<OffsetPosition, OffsetPosition
             return null;
     }
 
-    public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutTokens, String text, List<OffsetPosition> urlPositions) {
+    public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutTokens, String text, List<OffsetPosition> positions) {
         List<OffsetPosition> newPositions = new ArrayList<>();
         StringBuilder accumulator = new StringBuilder();
         int pos = 0;
+        int textPositionOfToken = 0;
 
-        for (OffsetPosition urlPosition : urlPositions) {
-            List<LayoutToken> urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end);
+        for (OffsetPosition position : positions) {
+            List<LayoutToken> annotationTokens = layoutTokens.subList(position.start, position.end);
             boolean first = true;
-            for (int i = 0; i < urlTokens.size(); i++) {
-                LayoutToken token = urlTokens.get(i);
+            accumulator = new StringBuilder();
+            for (int i = 0; i < annotationTokens.size(); i++) {
+                LayoutToken token = annotationTokens.get(i);
                 if (StringUtils.isEmpty(token.getText()))
                     continue;
-                int newPos = text.indexOf(token.getText(), pos);
-                if (newPos != -1) {
+                textPositionOfToken = text.indexOf(token.getText(), pos);
+                if (textPositionOfToken != -1) {
+                    //We update pos only at the first token of the annotation positions
                     if (first) {
-                        pos = newPos;
+                        pos = textPositionOfToken;
                         first = false;
                     }
                     accumulator.append(token);
@@ -1581,16 +1584,25 @@ public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutT
                         continue;
                     }
                     if (StringUtils.isNotEmpty(accumulator)) {
+                        int accumulatorTextLength = accumulator.toString().length();
                         int start = text.indexOf(accumulator.toString(), pos);
-                        newPositions.add(new OffsetPosition(start, start + accumulator.toString().length()));
-                        accumulator = new StringBuilder();
-                        pos = newPos;
-                        first = true;
+                        int end = start + accumulatorTextLength;
+                        newPositions.add(new OffsetPosition(start, end));
+                        pos = end;
                         break;
                     }
-                    pos = newPos;
+                    pos = textPositionOfToken;
                 }
             }
+            if (StringUtils.isNotEmpty(accumulator)) {
+                int annotationTextLength = accumulator.toString().length();
+                int start = text.indexOf(accumulator.toString(), pos);
+                int end = start + annotationTextLength;
+                newPositions.add(new OffsetPosition(start, end));
+                pos = end;
+                accumulator = new StringBuilder();
+            }
+
         }
         if (StringUtils.isNotEmpty(accumulator)) {
             int start = text.indexOf(accumulator.toString(), pos);
diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
index 6303dc6450..8b53cc263e 100644
--- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java
@@ -13,7 +13,6 @@
 import java.util.regex.Matcher;
 
 import static org.hamcrest.CoreMatchers.is;
-import static org.hamcrest.CoreMatchers.startsWith;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.hasSize;
 import static org.junit.Assert.*;
@@ -436,4 +435,139 @@ public void testMatchTokenAndString() throws Exception {
         assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));
 
     }
+
+
+    @Test
+    public void testMatchTokenAndString_twoElements() throws Exception {
+        final String input = "This work is available at https://github.com/lfoppiano/ \n" +
+            "supercon2. The repository contains the code of the \n" +
+            "SuperCon 2 interface, the curation workflow, and the \n" +
+            "\n" +
+            "Table 2. Data support, the number of entities for each label in \n" +
+            "each of the datasets used for evaluating the ML models. The \n" +
+            "base dataset is the original dataset described in [18], and the \n" +
+            "curation dataset is automatically collected based on the data-\n" +
+            "base corrections by the interface and manually corrected. \n" +
+            "\n";
+
+        List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";
+        List<OffsetPosition> urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23));
+
+        List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens);
+
+        assertThat(offsetPositions, hasSize(2));
+        OffsetPosition url0 = offsetPositions.get(0);
+        assertThat(url0.start, is(0));
+        assertThat(url0.end, is(9));
+
+        assertThat(inputReal.substring(url0.start, url0.end), is("This work"));
+
+        OffsetPosition url1 = offsetPositions.get(1);
+        assertThat(url1.start, is(26));
+        assertThat(url1.end, is(65));
+
+        assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));
+
+    }
+
+    @Test
+    public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception {
+        final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,";
+
+        List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        List<OffsetPosition> urlTokens = Arrays.asList(
+            new OffsetPosition(0, 3),
+            new OffsetPosition(5, 8),
+            new OffsetPosition(10, 13),
+            new OffsetPosition(15, 18)
+        );
+        
+        List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens);
+
+        assertThat(offsetPositions, hasSize(4));
+        
+        OffsetPosition url0 = offsetPositions.get(0);
+        assertThat(url0.start, is(0));
+        assertThat(url0.end, is(19));
+
+        assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne"));
+
+        OffsetPosition url1 = offsetPositions.get(1);
+        assertThat(url1.start, is(21));
+        assertThat(url1.end, is(34));
+
+        assertThat(input.substring(url1.start, url1.end), is("Claudie Marec"));
+
+        OffsetPosition url2 = offsetPositions.get(2);
+        assertThat(url2.start, is(36));
+        assertThat(url2.end, is(49));
+
+        assertThat(input.substring(url2.start, url2.end), is("Claudie Marec"));
+
+        OffsetPosition url3 = offsetPositions.get(3);
+        assertThat(url3.start, is(51));
+        assertThat(url3.end, is(66));
+
+        assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder"));
+
+    }
+
+    @Test
+    public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception {
+        final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities.";
+
+        List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        List<OffsetPosition> annotationTokenPositions = Arrays.asList(
+            new OffsetPosition(4, 7),
+            new OffsetPosition(9, 12),
+            new OffsetPosition(15, 18),
+            new OffsetPosition(27, 30),
+            new OffsetPosition(49, 52),
+            new OffsetPosition(71, 74),
+            new OffsetPosition(103, 106),
+            new OffsetPosition(109, 110),
+            new OffsetPosition(125, 126)
+        );
+
+        List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+        assertThat(offsetPositions, hasSize(9));
+
+        OffsetPosition url7 = offsetPositions.get(7);
+        assertThat(url7.start, is(349));
+        assertThat(url7.end, is(352));
+
+        assertThat(input.substring(url7.start, url7.end), is("IGC"));
+
+        OffsetPosition url8 = offsetPositions.get(8);
+        assertThat(url8.start, is(397));
+        assertThat(url8.end, is(400));
+
+        assertThat(input.substring(url8.start, url8.end), is("IGC"));
+
+    }
+
+    @Test
+    public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception {
+        final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter.";
+
+        List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        List<OffsetPosition> annotationTokenPositions = Arrays.asList(
+            new OffsetPosition(4, 7),
+            new OffsetPosition(40, 41),
+            new OffsetPosition(62, 63),
+            new OffsetPosition(79, 84)
+        );
+
+        List<OffsetPosition> offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions);
+
+        assertThat(offsetPositions, hasSize(4));
+
+        OffsetPosition url7 = offsetPositions.get(1);
+        assertThat(input.substring(url7.start, url7.end), is("NERC"));
+
+        OffsetPosition url8 = offsetPositions.get(2);
+        assertThat(input.substring(url8.start, url8.end), is("ERC"));
+    }
 }

From f983f2548813a02e9dc1a0b37e5fefe1eafd4abb Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 9 May 2024 12:17:59 +0900
Subject: [PATCH 4/5] Add additional test and fix to the method so that the
 offsets are correctly matching the real text (dehyphenised)

---
 .../grobid/core/document/TEIFormatter.java    |  2 +-
 .../org/grobid/core/lexicon/LexiconTest.java  | 36 +++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 20a7746388..7283a2e513 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1873,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
         }
 
-        List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations);
+        List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
         forbiddenPositions.addAll(offsetPositionsUrls);
 
         List<OffsetPosition> theSentences = 
diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
index c70c930435..8b3b501488 100644
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -208,4 +208,40 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
         OffsetPosition url = offsetPositions.get(0);
         assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
     }
+
+    @Test
+    public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+        final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
+            "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
+            "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
+            "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
+            "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
+            "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n";
+
+        List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        LayoutToken lastTokenOfTheURL = tokenisedInput.get(97);
+        lastTokenOfTheURL.setPage(19);
+        lastTokenOfTheURL.setX(465.54675000000003);
+        lastTokenOfTheURL.setY(404.908);
+        lastTokenOfTheURL.setWidth(68.727);
+        lastTokenOfTheURL.setHeight(9.0873);
+
+        PDFAnnotation annotation = new PDFAnnotation();
+        annotation.setPageNumber(19);
+        List<BoundingBox> boundingBoxes = new ArrayList<>();
+        boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987));
+        annotation.setBoundingBoxes(boundingBoxes);
+        annotation.setDestination("http://www.gencodegenes.org/releases/");
+        annotation.setType(PDFAnnotation.Type.URI);
+        List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+
+        //This is the actual text that is passed and is different from the layoutToken text.
+        final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38.  ";
+
+        List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal);
+
+        assertThat(offsetPositions, hasSize(1));
+        OffsetPosition url = offsetPositions.get(0);
+        assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/"));
+    }
 }

From 617aa16a29ccd578c5734d42c1a92fdfce01b811 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 9 May 2024 17:11:07 +0900
Subject: [PATCH 5/5] Apply URL preservation also to table descriptions and
 notes

---
 grobid-core/src/main/java/org/grobid/core/data/Table.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
index 6356978837..14d468418c 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Table.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                     }
 
                     if (desc != null && config.isWithSentenceSegmentation()) {
-                        formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+                        formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 
                         // we need a sentence segmentation of the table caption, for that we need to introduce 
                         // a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
                     if (noteNode != null && config.isWithSentenceSegmentation()) {
                         // we need a sentence segmentation of the figure caption
-                        formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+                        formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
                     }
 
                     // enclose note content in a <p> element