Skip to content

Commit

Permalink
When the annotations are missing and we capture a single closed paren…
Browse files Browse the repository at this point in the history
…thesis as last character of the url, we should back off
  • Loading branch information
lfoppiano committed May 10, 2024
1 parent 878d50c commit d58633d
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1375,6 +1375,12 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
// finally avoid ending a URL by a dot, because it can harm the sentence segmentation
if (Iterables.getLast(urlTokens).getText().endsWith(".")) {
endPos = endPos - 1;
} else if (Iterables.getLast(urlTokens).getText().endsWith(")")) {
long openedParenthesis = LayoutTokensUtil.toText(urlTokens).chars().filter(ch -> ch == '(').count();
long closedParenthesis = LayoutTokensUtil.toText(urlTokens).chars().filter(ch -> ch == ')').count();
if (openedParenthesis < closedParenthesis) {
endPos = endPos - 1;
}
}

OffsetPosition position = new OffsetPosition();
Expand Down
24 changes: 24 additions & 0 deletions grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,30 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
assertThat(StringUtils.substring(input, url.start, url.end), is("https://github.com/kermitt2/delft/issues/150"));
}

/**
 * Checks URL detection when the list of PDF annotations is empty and the URL in the
 * text is wrapped in parentheses: exactly one position must be returned, and the
 * substring it delimits must not include the closing parenthesis (nor the final dot)
 * that directly follows the URL in the input.
 */
@Test
public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_NoPDFAnnotationAvailable_shouldReturnCorrectInterval() throws Exception {
    // Each text line is followed by an empty line, as in the original fixture.
    final String input =
        "Data was analyzed using SPM8 software (http://www.fil.ion.ucl.ac.uk/spm). Images were \n\n" +
        "spatially aligned to the first volume to correct for small movements; no run showed more than \n\n" +
        "4mm displacement along the x, y or z dimension. Sinc interpolation minimized timing-errors \n\n" +
        "between slices; functional images were coregistered to the anatomical image, normalized to the \n\n" +
        "standard T1 Montreal Neurological Institute (MNI) template, and resliced at 4mm 3 resolution. \n\n";
    final String expectedUrl = "http://www.fil.ion.ucl.ac.uk/spm";

    // Deliberately empty: the scenario under test is the absence of PDF annotations.
    List<PDFAnnotation> noAnnotations = new ArrayList<>();
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    List<OffsetPosition> urlPositions =
        Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokens, noAnnotations);

    assertThat(urlPositions, hasSize(1));

    OffsetPosition match = urlPositions.get(0);
    String extracted = StringUtils.substring(input, match.start, match.end);
    assertThat(extracted, is(expectedUrl));
}


@Test
public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectInterval2() throws Exception {
final String input = "This work is available at https://github.com/lfoppiano/ \n" +
Expand Down

0 comments on commit d58633d

Please sign in to comment.