Skip to content

Commit

Permalink
Merge pull request #169 from lfoppiano/fix-segmentation-quantifiedObjects
Browse files Browse the repository at this point in the history

Fix segmentation quantified objects when processing PDF documents
  • Loading branch information
lfoppiano committed Feb 26, 2024
2 parents 128b6fa + 1bd4783 commit 9d23f3b
Show file tree
Hide file tree
Showing 13 changed files with 531 additions and 280 deletions.
36 changes: 23 additions & 13 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,21 @@ repositories {

dependencies {
//Tests
testRuntimeOnly 'org.junit.vintage:junit-vintage-engine:5.9.3'
testImplementation(platform('org.junit:junit-bom:5.9.3'))
testImplementation(platform('org.junit:junit-bom:5.10.2'))
testRuntimeOnly("org.junit.platform:junit-platform-launcher") {
because("Only needed to run tests in a version of IntelliJ IDEA that bundles older versions")
}
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine")
testRuntimeOnly("org.junit.vintage:junit-vintage-engine")
testImplementation('org.junit.jupiter:junit-jupiter')
testImplementation 'org.easymock:easymock:5.1.0'

testImplementation 'org.easymock:easymock:5.2.0'
testImplementation 'org.hamcrest:hamcrest-all:1.3'
testImplementation 'org.powermock:powermock-module-junit4:2.0.9'
testImplementation 'org.powermock:powermock-api-easymock:2.0.9'
testImplementation 'org.jetbrains.kotlin:kotlin-test'

testImplementation "io.mockk:mockk:1.13.9"

//GROBID
implementation 'org.grobid:grobid-core:0.8.0'
implementation 'org.grobid:grobid-trainer:0.8.0'
Expand Down Expand Up @@ -177,7 +183,7 @@ test {

def libraries = ""
if (Os.isFamily(Os.FAMILY_MAC)) {
if (Os.OS_ARCH.equals("aarch64")) {
if (Os.isArch("aarch64")) {
libraries = "${file("./grobid-home/lib/mac_arm-64").absolutePath}"
} else {
libraries = "${file("./grobid-home/lib/mac-64").absolutePath}"
Expand Down Expand Up @@ -211,7 +217,7 @@ run {

def libraries = ""
if (Os.isFamily(Os.FAMILY_MAC)) {
if (Os.OS_ARCH.equals("aarch64")) {
if (Os.isArch("aarch64")) {
libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}"
} else {
libraries = "${file("../grobid-home/lib/mac-64").absolutePath}"
Expand Down Expand Up @@ -239,7 +245,7 @@ task integration(type: Test) {

def libraries = ""
if (Os.isFamily(Os.FAMILY_MAC)) {
if (Os.OS_ARCH.equals("aarch64")) {
if (Os.isArch("aarch64")) {
libraries = "${file("./grobid-home/lib/mac_arm-64").absolutePath}"
} else {
libraries = "${file("./grobid-home/lib/mac-64").absolutePath}"
Expand All @@ -254,12 +260,12 @@ task integration(type: Test) {
}

if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
// jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED",
// "--add-opens", "java.base/java.io=ALL-UNNAMED",
// "--add-opens", "java.base/java.lang=ALL-UNNAMED",
// "--add-opens", "java.base/java.util.regex=ALL-UNNAMED",
// "--add-opens", "java.base/java.math=ALL-UNNAMED",
// "--add-opens", "java.base/java.text=ALL-UNNAMED"
jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED",
"--add-opens", "java.base/java.io=ALL-UNNAMED",
"--add-opens", "java.base/java.lang=ALL-UNNAMED",
"--add-opens", "java.base/java.util.regex=ALL-UNNAMED",
"--add-opens", "java.base/java.math=ALL-UNNAMED",
"--add-opens", "java.base/java.text=ALL-UNNAMED"
}
systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries
}
Expand Down Expand Up @@ -381,6 +387,10 @@ wrapper {
gradleVersion "7.2"
}

test {
useJUnitPlatform()
}

jacocoTestReport {
reports {
xml.enabled = true // coveralls plugin depends on xml format report
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/grobid/core/data/SentenceParse.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import com.googlecode.clearnlp.morphology.AbstractMPAnalyzer;
import com.googlecode.clearnlp.morphology.EnglishMPAnalyzer;
import com.googlecode.clearnlp.util.UTInput;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.analyzers.QuantityAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -278,13 +279,13 @@ public String toString() {
public String getTokenStructureByPosition(int position) {
if (tokenStructures == null)
return null;
return tokenStructures.get(valueOf(position));
return tokenStructures.get(position);
}

public String getTokenStructureByIndex(String index) {
if (tokenIndex == null)
return null;
if ( (index == null) || (index.length() == 0) )
if (StringUtils.isEmpty(index))
return null;
return tokenIndex.get(index);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ protected javax.measure.Unit tryParsing(Unit parsedUnit) throws NormalizationExc
javax.measure.Unit<?> onlyUnitParsed = formatService.parse(onlyUnit);
unitList.add(onlyUnitParsed.pow(-1));
} catch (Throwable e2) {
LOGGER.warn("Trying excluding the negative power. Cannot parse " + onlyUnit + " with " + formatService.getClass().getName(), e2);
LOGGER.warn("Trying excluding the negative power. Cannot parse " + onlyUnit + " with " + formatService.getClass().getName());
LOGGER.debug("Exception", e2);
}
break;
}
Expand Down
184 changes: 41 additions & 143 deletions src/main/java/org/grobid/core/engines/DefaultQuantifiedObjectParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

Expand All @@ -36,90 +35,48 @@ public List<Measurement> process(List<LayoutToken> tokens, List<Measurement> mea
try {
String text = LayoutTokensUtil.toText(tokens);
TextParser textParser = TextParser.getInstance();
List<Sentence> parsedSentences = textParser.parseText(text);
int firstTokenOffsetStart = tokens.get(0).getOffset();
parsedSentences.stream().forEach(s -> {
s.getOffset().start = s.getOffsetStart() + firstTokenOffsetStart;
s.getOffset().end = s.getOffsetEnd() + firstTokenOffsetStart;
});
int firstOffset = Iterables.getFirst(tokens, new LayoutToken()).getOffset();
List<OffsetPosition> measurementsOffsetsAdjusted = measurements.stream()
.map(m -> new OffsetPosition(m.getRawOffsets().start - firstOffset, m.getRawOffsets().end - firstOffset))
.collect(Collectors.toList());
List<Sentence> parsedSentences = textParser.parseText(text, measurementsOffsetsAdjusted);
int indexMeasurement = 0;
int offset = 0;

// this part is for identifying for each sentence, the measurements belonging to the sentence
for (Sentence processedSentence : parsedSentences) {
// list of measurements for the current sentence
List<Measurement> sentenceMeasurements = new ArrayList<>();
// Positions of measurements
for (Sentence sentence : parsedSentences) {
List<Measurement> measurementsInSentence = new ArrayList<>();
List<OffsetPosition> measurementsOffsetsInSentence = new ArrayList<>();
List<Integer> positionMeasurements = new ArrayList<>();
while (indexMeasurement < measurements.size()) {
while (indexMeasurement < measurementsOffsetsAdjusted.size()) {
Measurement measurement = measurements.get(indexMeasurement);
int position = -1;

// is the measurement quantities in the current sentence?
if (measurement.getType() == UnitUtilities.Measurement_Type.VALUE) {
Quantity quantity = measurement.getQuantityAtomic();
if (quantity.getOffsetStart() > processedSentence.getOffsetEnd()) {
// next sentence
break;
}
if (quantity.getOffsetEnd() < processedSentence.getOffsetStart()) {
// next measurement
indexMeasurement++;
continue;
}
position = quantity.getOffsetStart();
} else if ((measurement.getType() == UnitUtilities.Measurement_Type.INTERVAL_MIN_MAX) ||
(measurement.getType() == UnitUtilities.Measurement_Type.INTERVAL_BASE_RANGE)) {
// values of the interval do not matter if min/max or base/range
List<Quantity> sortedQuantities = QuantityOperations.toQuantityList(measurement).stream()
.sorted(Comparator.comparingInt(Quantity::getOffsetStart))
.collect(Collectors.toList());

Quantity firstQuantity = Iterables.getFirst(sortedQuantities, null);
Quantity lastQuantity = Iterables.getLast(sortedQuantities);

if ((lastQuantity != null) && (lastQuantity.getOffsetEnd() < processedSentence.getOffsetStart())) {
// next measurement
indexMeasurement++;
continue;
}
if ((firstQuantity != null) && (firstQuantity.getOffsetStart() > processedSentence.getOffsetEnd())) {
// next sentence
break;
}

position = firstQuantity.getOffsetStart();
} else if (measurement.getType() == UnitUtilities.Measurement_Type.CONJUNCTION) {
// list must be consistent in unit type, and avoid too large chunk
List<Quantity> quantities = measurement.getQuantityList();
if (CollectionUtils.isNotEmpty(quantities)) {
// just exploit the first quantity for positioning
Quantity quantity = quantities.get(0);
if (quantity.getOffsetEnd() < processedSentence.getOffsetStart()) {
// next sentence
break;
}
if (quantity.getOffsetStart() > processedSentence.getOffsetEnd()) {
// next measurement
indexMeasurement++;
continue;
}
position = quantity.getOffsetStart();
}
OffsetPosition measurementOffsets = measurementsOffsetsAdjusted.get(indexMeasurement);

if (measurementOffsets.start > sentence.getOffsetEnd()) {
// next sentence
break;
}

if (measurementOffsets.end < sentence.getOffsetStart()) {
// next measurement
indexMeasurement++;
continue;
}

// if we arrive here, this measurement is in the current sentence

sentenceMeasurements.add(measurement);
positionMeasurements.add(position);
measurementsInSentence.add(measurement);
measurementsOffsetsInSentence.add(measurementOffsets);
positionMeasurements.add(measurementOffsets.start);
indexMeasurement++;
}

// Note to myself:
// - measurementsInSentence do not have any offset normalisation, they are the original offsets.
// - measurementOffsetsInSentence, they are normalised only by firstToken. They reference to the beginning of the sentence
// - position measurements are the same of measurementOffsetsInSentence
// get the list of indexes corresponding to measurement parts
List<String> indexMeasurementTokens = getIndexMeasurementTokens(sentenceMeasurements, processedSentence);
List<String> indexMeasurementTokens = getIndexMeasurementTokens(measurementsOffsetsInSentence, sentence);

// find the syntactic head... this will define the QuantifiedObject to the measurements
setHeads(processedSentence, sentenceMeasurements, positionMeasurements, indexMeasurementTokens);
setHeads(sentence, measurementsInSentence, positionMeasurements, indexMeasurementTokens);
}
} catch (Exception e) {
logger.error("error in substance parser: ", e);
Expand All @@ -140,7 +97,7 @@ private void setHeads(Sentence processedSentence,

List<SentenceParse> parses = processedSentence.getParses();
// we're just considering the first best parse
if ((parses == null) || (parses.size() == 0))
if (CollectionUtils.isEmpty(parses))
return;
SentenceParse parse = parses.get(0);
int p = 0;
Expand Down Expand Up @@ -189,7 +146,6 @@ private void setHeads(Sentence processedSentence,
// case direct modifier "... of something"

// if case of an interval, we need to take the last quantity object for the position
UnitUtilities.Measurement_Type type = measurement.getType();
if ((measurement.getType() == UnitUtilities.Measurement_Type.INTERVAL_MIN_MAX) ||
(measurement.getType() == UnitUtilities.Measurement_Type.INTERVAL_BASE_RANGE)) {

Expand Down Expand Up @@ -396,9 +352,9 @@ private OffsetPosition getFullPhrase(OffsetPosition offsetPosition,
}


private List<String> getIndexMeasurementTokens(List<Measurement> measurements,
private List<String> getIndexMeasurementTokens(List<OffsetPosition> measurementsOffsets,
Sentence processedSentence) {
if (CollectionUtils.isEmpty(measurements)) {
if (CollectionUtils.isEmpty(measurementsOffsets)) {
return null;
}

Expand All @@ -412,102 +368,44 @@ private List<String> getIndexMeasurementTokens(List<Measurement> measurements,

SentenceParse parse = parses.get(0);

for (Measurement measurement : measurements) {
UnitUtilities.Measurement_Type type = measurement.getType();

if (type == UnitUtilities.Measurement_Type.VALUE) {
Quantity quantity = measurement.getQuantityAtomic();
int position = quantity.getOffsetStart();
addTokenIndex(position - startSentencePosition, quantity.getOffsetEnd() - quantity.getOffsetStart(), parse, result);

// unit position
Unit rawUnit = quantity.getRawUnit();
if (rawUnit != null) {
position = rawUnit.getOffsetStart();
addTokenIndex(position - startSentencePosition, rawUnit.getOffsetEnd() - rawUnit.getOffsetStart(), parse, result);
}
} else if ((type == UnitUtilities.Measurement_Type.INTERVAL_MIN_MAX) ||
(type == UnitUtilities.Measurement_Type.INTERVAL_BASE_RANGE)) {
// values of the interval do not matter if min/max or base/range
Quantity quantityLeast = measurement.getQuantityLeast();
if (quantityLeast == null)
quantityLeast = measurement.getQuantityBase();
Quantity quantityMost = measurement.getQuantityMost();
if (quantityMost == null)
quantityMost = measurement.getQuantityRange();

if (quantityLeast != null) {
int position = quantityLeast.getOffsetStart();
addTokenIndex(position - startSentencePosition, quantityLeast.getOffsetEnd() - quantityLeast.getOffsetStart(), parse, result);

// unit position
Unit rawUnit = quantityLeast.getRawUnit();
if (rawUnit != null) {
position = rawUnit.getOffsetStart();
addTokenIndex(position - startSentencePosition, rawUnit.getOffsetEnd() - rawUnit.getOffsetStart(), parse, result);
}
}

if (quantityMost != null) {
int position = quantityMost.getOffsetStart();
addTokenIndex(position - startSentencePosition, quantityMost.getOffsetEnd() - quantityMost.getOffsetStart(), parse, result);
for (OffsetPosition measurementOffset : measurementsOffsets) {
int measurementPositionInSentence = measurementOffset.start - startSentencePosition;
int measurementLength = measurementOffset.end - measurementOffset.start;

// unit position
Unit rawUnit = quantityMost.getRawUnit();
if (rawUnit != null) {
position = rawUnit.getOffsetStart();
addTokenIndex(position - startSentencePosition, rawUnit.getOffsetEnd() - rawUnit.getOffsetStart(), parse, result);
}
}
} else if (measurement.getType() == UnitUtilities.Measurement_Type.CONJUNCTION) {
// list must be consistent in unit type, and avoid too large chunk
List<Quantity> quantities = measurement.getQuantityList();
if ((quantities != null) && (quantities.size() > 0)) {
// just exploit the first quantity for positioning
Quantity quantity = quantities.get(0);
int position = quantity.getOffsetStart();
addTokenIndex(position - startSentencePosition, quantity.getOffsetEnd() - quantity.getOffsetStart(), parse, result);

// unit position
Unit rawUnit = quantity.getRawUnit();
if (rawUnit != null) {
position = rawUnit.getOffsetStart();
addTokenIndex(position - startSentencePosition, rawUnit.getOffsetEnd() - rawUnit.getOffsetStart(), parse, result);
}
}
}
addTokenIndex(measurementPositionInSentence, measurementLength, parse, result);
}
return result;
}

private List<String> addTokenIndex(int position, int length, SentenceParse parse, List<String> result) {
protected List<String> addTokenIndex(int position, int length, SentenceParse parse, List<String> result) {
String tokenStruct = parse.getTokenStructureByPosition(position);
if (tokenStruct != null) {
String[] pieces = tokenStruct.split("\t");
if (pieces.length == 8) {
String index = pieces[0].trim();
if (result == null)
result = new ArrayList<String>();
result = new ArrayList<>();
if (!result.contains(index))
result.add(index);
}
} else {
logger.info("Invalid position: " + position + " - no parse result find at this position.");
}
// brute force adding all subtokens in the specified interval
// brute force adding all sub-tokens in the specified interval
for (int i = 1; i < length; i++) {
tokenStruct = parse.getTokenStructureByPosition(position + i);
if (tokenStruct != null) {
String[] pieces = tokenStruct.split("\t");
if (pieces.length == 8) {
String index = pieces[0].trim();
if (result == null)
result = new ArrayList<String>();
result = new ArrayList<>();
if (!result.contains(index))
result.add(index);
}
}
}

return result;
}

Expand Down

0 comments on commit 9d23f3b

Please sign in to comment.