Merge branch 'feature/preserve-urls' into feature/identify-urls

kermitt2 · May 10, 2024 · 6370de2 · 6370de2
2 parents d58633d + 5bcb8b1
commit 6370de2
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 3 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                     }
 
                     if (desc != null && config.isWithSentenceSegmentation()) {
-                        formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
+                        formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 
                         // we need a sentence segmentation of the table caption, for that we need to introduce 
                         // a <div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
                     if (noteNode != null && config.isWithSentenceSegmentation()) {
                         // we need a sentence segmentation of the figure caption
-                        formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
+                        formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
                     }
 
                     // enclose note content in a <p> element 

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1904,7 +1904,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
         // We add URL that are identified using the PDF features for annotations, in this way we avoid mangling URLs
         // in different sentences.
-        List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations);
+        List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
         forbiddenPositions.addAll(offsetPositionsUrls);
 
         List<OffsetPosition> theSentences = 

diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -189,6 +189,7 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr
 
         assertThat(offsetPositions, hasSize(1));
         OffsetPosition url = offsetPositions.get(0);
+        // LF: we need url.end + 1 because the positions returned by tokenPositionUrlPattern are (inclusive, inclusive), while subList() excludes its upper bound
         assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2"));
     }
 
@@ -232,6 +233,42 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
         assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
     }
 
+    @Test
+    public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
+        final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
+            "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
+            "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
+            "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
+            "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
+            "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n";
+
+        List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        LayoutToken lastTokenOfTheURL = tokenisedInput.get(97);
+        lastTokenOfTheURL.setPage(19);
+        lastTokenOfTheURL.setX(465.54675000000003);
+        lastTokenOfTheURL.setY(404.908);
+        lastTokenOfTheURL.setWidth(68.727);
+        lastTokenOfTheURL.setHeight(9.0873);
+
+        PDFAnnotation annotation = new PDFAnnotation();
+        annotation.setPageNumber(19);
+        List<BoundingBox> boundingBoxes = new ArrayList<>();
+        boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987));
+        annotation.setBoundingBoxes(boundingBoxes);
+        annotation.setDestination("http://www.gencodegenes.org/releases/");
+        annotation.setType(PDFAnnotation.Type.URI);
+        List<PDFAnnotation> pdfAnnotations = List.of(annotation);
+
+        // This is the actual text that is passed to the method; it differs from the layout token text (e.g. it contains no line breaks).
+        final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38.  ";
+
+        List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal);
+
+        assertThat(offsetPositions, hasSize(1));
+        OffsetPosition url = offsetPositions.get(0);
+        assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/"));
+    }
+
 
     @Test
     public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText2() throws Exception {