diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 6356978837..14d468418c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } if (desc != null && config.isWithSentenceSegmentation()) { - formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); // we need a sentence segmentation of the table caption, for that we need to introduce // a
<div>, then a <p>
@@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form if (noteNode != null && config.isWithSentenceSegmentation()) { // we need a sentence segmentation of the figure caption - formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage()); + formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations()); } // enclose note content in a <p>
element diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index be20f49545..6c5b010f3a 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1904,7 +1904,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara // We add URL that are identified using the PDF features for annotations, in this way we avoid mangling URLs // in different sentences. - List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations); + List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text); forbiddenPositions.addAll(offsetPositionsUrls); List theSentences = diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index 7e32fb36ee..b957d12b90 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -189,6 +189,7 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr assertThat(offsetPositions, hasSize(1)); OffsetPosition url = offsetPositions.get(0); + // LF: we need a + 1 because the convention for the tokenPositionUrlPattern is inclusive, inclusive assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2")); } @@ -232,6 +233,42 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2")); } + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception { + final String 
input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" + + "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" + + "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" + + "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" + + "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" + + "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL = tokenisedInput.get(97); + lastTokenOfTheURL.setPage(19); + lastTokenOfTheURL.setX(465.54675000000003); + lastTokenOfTheURL.setY(404.908); + lastTokenOfTheURL.setWidth(68.727); + lastTokenOfTheURL.setHeight(9.0873); + + PDFAnnotation annotation = new PDFAnnotation(); + annotation.setPageNumber(19); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987)); + annotation.setBoundingBoxes(boundingBoxes); + annotation.setDestination("http://www.gencodegenes.org/releases/"); + annotation.setType(PDFAnnotation.Type.URI); + List pdfAnnotations = List.of(annotation); + + //This is the actual text that is passed and is different from the layoutToken text. + final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. 
Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. "; + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations, inputReal); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url = offsetPositions.get(0); + assertThat(inputReal.substring(url.start, url.end), is("http://www.gencodegenes. org/releases/")); + } + @Test public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText2() throws Exception {