Skip to content

Commit

Permalink
get fixes on matchTokenAndString from PR #1099
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed May 4, 2024
1 parent 3900dc2 commit ec52f13
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1557,22 +1557,25 @@ public static org.apache.commons.lang3.tuple.Pair<OffsetPosition, OffsetPosition
return null;
}

public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutTokens, String text, List<OffsetPosition> urlPositions) {
public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutTokens, String text, List<OffsetPosition> positions) {
List<OffsetPosition> newPositions = new ArrayList<>();
StringBuilder accumulator = new StringBuilder();
int pos = 0;
int textPositionOfToken = 0;

for (OffsetPosition urlPosition : urlPositions) {
List<LayoutToken> urlTokens = layoutTokens.subList(urlPosition.start, urlPosition.end);
for (OffsetPosition position : positions) {
List<LayoutToken> annotationTokens = layoutTokens.subList(position.start, position.end);
boolean first = true;
for (int i = 0; i < urlTokens.size(); i++) {
LayoutToken token = urlTokens.get(i);
accumulator = new StringBuilder();
for (int i = 0; i < annotationTokens.size(); i++) {
LayoutToken token = annotationTokens.get(i);
if (StringUtils.isEmpty(token.getText()))
continue;
int newPos = text.indexOf(token.getText(), pos);
if (newPos != -1) {
textPositionOfToken = text.indexOf(token.getText(), pos);
if (textPositionOfToken != -1) {
//We update pos only at the first token of the annotation positions
if (first) {
pos = newPos;
pos = textPositionOfToken;
first = false;
}
accumulator.append(token);
Expand All @@ -1581,16 +1584,25 @@ public static List<OffsetPosition> matchTokenAndString(List<LayoutToken> layoutT
continue;
}
if (StringUtils.isNotEmpty(accumulator)) {
int accumulatorTextLength = accumulator.toString().length();
int start = text.indexOf(accumulator.toString(), pos);
newPositions.add(new OffsetPosition(start, start + accumulator.toString().length()));
accumulator = new StringBuilder();
pos = newPos;
first = true;
int end = start + accumulatorTextLength;
newPositions.add(new OffsetPosition(start, end));
pos = end;
break;
}
pos = newPos;
pos = textPositionOfToken;
}
}
if (StringUtils.isNotEmpty(accumulator)) {
int annotationTextLength = accumulator.toString().length();
int start = text.indexOf(accumulator.toString(), pos);
int end = start + annotationTextLength;
newPositions.add(new OffsetPosition(start, end));
pos = end;
accumulator = new StringBuilder();
}

}
if (StringUtils.isNotEmpty(accumulator)) {
int start = text.indexOf(accumulator.toString(), pos);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import java.util.regex.Matcher;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.*;
Expand Down Expand Up @@ -436,4 +435,139 @@ public void testMatchTokenAndString() throws Exception {
assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2"));

}


/**
 * Two annotations, expressed as layout-token index ranges over {@code input}, are matched
 * against {@code inputReal}, a variant of the same text in which the line breaks are gone
 * and "data-\nbase" reads "database". The returned offsets must address
 * {@code inputReal}: "This work" at [0, 9) and the URL at [26, 65).
 */
@Test
public void testMatchTokenAndString_twoElements() throws Exception {
    final String input = String.join("",
        "This work is available at https://github.com/lfoppiano/ \n",
        "supercon2. The repository contains the code of the \n",
        "SuperCon 2 interface, the curation workflow, and the \n",
        "\n",
        "Table 2. Data support, the number of entities for each label in \n",
        "each of the datasets used for evaluating the ML models. The \n",
        "base dataset is the original dataset described in [18], and the \n",
        "curation dataset is automatically collected based on the data-\n",
        "base corrections by the interface and manually corrected. \n",
        "\n");
    final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. ";

    // Token-index ranges (end exclusive) of the two annotations inside the tokenised input.
    final List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<OffsetPosition> tokenRanges = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23));

    final List<OffsetPosition> matched = TextUtilities.matchTokenAndString(layoutTokens, inputReal, tokenRanges);

    assertThat(matched, hasSize(2));

    // First annotation: plain words at the very beginning of the text.
    final OffsetPosition leadingWords = matched.get(0);
    assertThat(leadingWords.start, is(0));
    assertThat(leadingWords.end, is(9));
    assertThat(inputReal.substring(leadingWords.start, leadingWords.end), is("This work"));

    // Second annotation: the URL that spans a line break in the tokenised input.
    final OffsetPosition url = matched.get(1);
    assertThat(url.start, is(26));
    assertThat(url.end, is(65));
    assertThat(inputReal.substring(url.start, url.end), is("https://github.com/lfoppiano/ supercon2"));
}

/**
 * Four name annotations, two of which have the identical surface form "Claudie Marec",
 * must each be mapped to their own character span: the repeated name is expected at
 * [21, 34) and again at [36, 49), not twice at the same offsets.
 */
@Test
public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception {
    final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,";

    final List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<OffsetPosition> tokenRanges = Arrays.asList(
        new OffsetPosition(0, 3),
        new OffsetPosition(5, 8),
        new OffsetPosition(10, 13),
        new OffsetPosition(15, 18)
    );

    final List<OffsetPosition> matched = TextUtilities.matchTokenAndString(layoutTokens, input, tokenRanges);

    assertThat(matched, hasSize(4));

    // Expected {start, end} character offsets and the text they must cover, in result order.
    final int[][] expectedSpans = {{0, 19}, {21, 34}, {36, 49}, {51, 66}};
    final String[] expectedTexts = {
        "Christophe Castagne",
        "Claudie Marec",
        "Claudie Marec",
        "Claudio Stalder"
    };

    for (int i = 0; i < expectedSpans.length; i++) {
        final OffsetPosition span = matched.get(i);
        assertThat(span.start, is(expectedSpans[i][0]));
        assertThat(span.end, is(expectedSpans[i][1]));
        assertThat(input.substring(span.start, span.end), is(expectedTexts[i]));
    }
}

/**
 * Nine annotations over an acknowledgement text in which "IGC" occurs twice. Only the
 * last two results are inspected: they must be the two separate "IGC" occurrences, at
 * [349, 352) and [397, 400) respectively.
 */
@Test
public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception {
    final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities.";

    final List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<OffsetPosition> tokenRanges = Arrays.asList(
        new OffsetPosition(4, 7),
        new OffsetPosition(9, 12),
        new OffsetPosition(15, 18),
        new OffsetPosition(27, 30),
        new OffsetPosition(49, 52),
        new OffsetPosition(71, 74),
        new OffsetPosition(103, 106),
        new OffsetPosition(109, 110),
        new OffsetPosition(125, 126)
    );

    final List<OffsetPosition> matched = TextUtilities.matchTokenAndString(layoutTokens, input, tokenRanges);

    assertThat(matched, hasSize(9));

    // First occurrence of "IGC" (eighth annotation).
    final OffsetPosition firstIgc = matched.get(7);
    assertThat(firstIgc.start, is(349));
    assertThat(firstIgc.end, is(352));
    assertThat(input.substring(firstIgc.start, firstIgc.end), is("IGC"));

    // Second occurrence of "IGC" (ninth annotation) must land further right, not on the first one.
    final OffsetPosition secondIgc = matched.get(8);
    assertThat(secondIgc.start, is(397));
    assertThat(secondIgc.end, is(400));
    assertThat(input.substring(secondIgc.start, secondIgc.end), is("IGC"));
}

/**
 * Four annotations over a text containing both "NERC" and "ERC", where the latter is
 * also a substring of the former. The second and third results must cover "NERC" and
 * "ERC" respectively; only the covered text is checked, not the numeric offsets.
 */
@Test
public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception {
    final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter.";

    final List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<OffsetPosition> tokenRanges = Arrays.asList(
        new OffsetPosition(4, 7),
        new OffsetPosition(40, 41),
        new OffsetPosition(62, 63),
        new OffsetPosition(79, 84)
    );

    final List<OffsetPosition> matched = TextUtilities.matchTokenAndString(layoutTokens, input, tokenRanges);

    assertThat(matched, hasSize(4));

    final OffsetPosition nerc = matched.get(1);
    assertThat(input.substring(nerc.start, nerc.end), is("NERC"));

    final OffsetPosition erc = matched.get(2);
    assertThat(input.substring(erc.start, erc.end), is("ERC"));
}
}

0 comments on commit ec52f13

Please sign in to comment.