Skip to content

Commit

Permalink
Merge branch 'feature/preserve-urls' into feature/identify-urls
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed May 10, 2024
2 parents d58633d + 5bcb8b1 commit 6370de2
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 3 deletions.
4 changes: 2 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the table caption, for that we need to introduce
// a <div>, then a <p>
Expand Down Expand Up @@ -215,7 +215,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

if (noteNode != null && config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the figure caption
formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// enclose note content in a <p> element
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1904,7 +1904,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara

// We add the URLs identified through the PDF annotations to the forbidden positions, so that
// the sentence segmentation does not split a URL across different sentences.
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations);
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
forbiddenPositions.addAll(offsetPositionsUrls);

List<OffsetPosition> theSentences =
Expand Down
37 changes: 37 additions & 0 deletions grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ public void testTokensPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorr

assertThat(offsetPositions, hasSize(1));
OffsetPosition url = offsetPositions.get(0);
// LF: we need a + 1 because the intervals returned by tokenPositionUrlPattern are inclusive on both ends (start and end), while subList() excludes its upper bound
assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(url.start, url.end + 1)), is("https://github.com/lfoppiano/ \nsupercon2"));
}

Expand Down Expand Up @@ -232,6 +233,42 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
}

/**
 * A URL broken over two lines in the layout ("http://www.gencodegenes. \norg/releases/") and covered
 * by a single URI annotation must be reported as one interval, expressed as character offsets
 * of the paragraph text handed to the method (which differs from the layout-token text).
 */
@Test
public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
    // URI annotation, on page 19, pointing at the complete URL.
    PDFAnnotation uriAnnotation = new PDFAnnotation();
    uriAnnotation.setPageNumber(19);
    List<BoundingBox> annotationBoxes = new ArrayList<>();
    annotationBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987));
    uriAnnotation.setBoundingBoxes(annotationBoxes);
    uriAnnotation.setDestination("http://www.gencodegenes.org/releases/");
    uriAnnotation.setType(PDFAnnotation.Type.URI);
    List<PDFAnnotation> annotations = List.of(uriAnnotation);

    // Text as it appears in the layout: every line ends with a space followed by a line break.
    final String layoutText =
        "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n"
            + "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n"
            + "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n"
            + "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n"
            + "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n"
            + "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n";
    List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(layoutText);

    // Only the token at index 97 (the last token of the URL) receives page and coordinates.
    LayoutToken urlTailToken = layoutTokens.get(97);
    urlTailToken.setPage(19);
    urlTailToken.setX(465.54675000000003);
    urlTailToken.setY(404.908);
    urlTailToken.setWidth(68.727);
    urlTailToken.setHeight(9.0873);

    // The text actually passed along with the tokens; it is not identical to the layout-token text
    // (the line breaks are gone), and the returned offsets must refer to this string.
    final String paragraphText = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. ";

    List<OffsetPosition> urlOffsets =
        Lexicon.characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, annotations, paragraphText);

    assertThat(urlOffsets, hasSize(1));
    OffsetPosition urlSpan = urlOffsets.get(0);
    String matchedUrl = paragraphText.substring(urlSpan.start, urlSpan.end);
    assertThat(matchedUrl, is("http://www.gencodegenes. org/releases/"));
}


@Test
public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText2() throws Exception {
Expand Down

0 comments on commit 6370de2

Please sign in to comment.