Skip to content

Commit

Permalink
Add additional test and fix to the method so that the offsets are cor…
Browse files Browse the repository at this point in the history
…rectly matching the real text (dehyphenised)
  • Loading branch information
lfoppiano committed May 9, 2024
1 parent 322bf23 commit f983f25
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -1873,7 +1873,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}

List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations);
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
forbiddenPositions.addAll(offsetPositionsUrls);

List<OffsetPosition> theSentences =
Expand Down
36 changes: 36 additions & 0 deletions grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,40 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
OffsetPosition url = offsetPositions.get(0);
assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
}

/**
 * Verifies that {@code Lexicon.characterPositionsUrlPatternWithPdfAnnotations} reports a single
 * URL position whose offsets index into the text supplied as third argument rather than into
 * the text the layout tokens were built from.
 *
 * <p>The layout tokens are produced from a paragraph that still contains line breaks, with the
 * URL split across two lines, whereas the supplied text is the same paragraph without line
 * breaks. A URI annotation on page 19 carries the full URL as destination.
 */
@Test
public void testCharacterPositionsUrlPatternWithPDFAnnotations2_URL_shouldReturnCorrectIntervalBasedOnText() throws Exception {
    // Paragraph with its line breaks; this is the text the layout tokens are created from.
    final String input = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain \n" +
        "a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for \n" +
        "GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. \n" +
        "org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the \n" +
        "union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute \n" +
        "(SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. \n";

    // Text actually handed to the method under test; it differs from the layout token text.
    final String inputReal = "Table S1: Gene annotations from which exon-exon junctions were extracted and unioned to obtain a list of annotated junctions. All tracks were taken from the UCSC Genome Browser [10] except for GENCODE [2], which was downloaded from the GENCODE website http://www.gencodegenes. org/releases/. Junction coordinates from hg38 annotations were lifted over to hg19 before the union was performed. Of all gene annotations listed here, the Swedish Bioinformatics Institute (SIB) genes has the most, with over 400,000 junctions for each of hg19 and hg38. ";

    // URI annotation on page 19 pointing at the complete (unbroken) URL.
    List<BoundingBox> annotationBoxes = new ArrayList<>();
    annotationBoxes.add(BoundingBox.fromPointAndDimensions(19, 401.551, 402.396, 139.445, 12.901999999999987));

    PDFAnnotation uriAnnotation = new PDFAnnotation();
    uriAnnotation.setPageNumber(19);
    uriAnnotation.setBoundingBoxes(annotationBoxes);
    uriAnnotation.setDestination("http://www.gencodegenes.org/releases/");
    uriAnnotation.setType(PDFAnnotation.Type.URI);
    List<PDFAnnotation> pdfAnnotations = List.of(uriAnnotation);

    // Only the token at index 97 receives page and coordinates, on the annotation's page.
    // NOTE(review): index 97 is presumably the last token of the URL on its first line - confirm
    // if the tokenizer changes.
    List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    LayoutToken urlToken = layoutTokens.get(97);
    urlToken.setPage(19);
    urlToken.setX(465.54675000000003);
    urlToken.setY(404.908);
    urlToken.setWidth(68.727);
    urlToken.setHeight(9.0873);

    List<OffsetPosition> urlPositions =
        Lexicon.characterPositionsUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations, inputReal);

    assertThat(urlPositions, hasSize(1));

    // The offsets must select the URL, including its inner space, from the supplied text.
    OffsetPosition urlPosition = urlPositions.get(0);
    String matchedText = inputReal.substring(urlPosition.start, urlPosition.end);
    assertThat(matchedText, is("http://www.gencodegenes. org/releases/"));
}
}

0 comments on commit f983f25

Please sign in to comment.