Skip to content

Commit

Permalink
Support LayoutToken input and bounding box positions
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed May 16, 2017
1 parent 7078ce4 commit b137bf1
Show file tree
Hide file tree
Showing 9 changed files with 335 additions and 27 deletions.
103 changes: 92 additions & 11 deletions grobid-ner/src/main/java/org/grobid/core/data/Entity.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,12 @@ public String getName() {
private Sense sense = null;

// optional bounding box in the source document
private BoundingBox box = null;
private List<BoundingBox> boundingBoxes = null;

// position in the global tokenization
//private int startTokenPos = -1;
//private int endTokenPos = -1;

// origin of the entity definition
private Origin origin = Origin.GROBID;

Expand All @@ -80,6 +84,9 @@ public Entity(Entity ent) {
conf = ent.conf;
sense = ent.sense;
origin = ent.origin;
boundingBoxes = ent.boundingBoxes;
//startTokenPos = ent.startTokenPos;
//endTokenPos = ent.endTokenPos;
}

public String getRawName() {
Expand Down Expand Up @@ -186,15 +193,55 @@ public void normalise() {
// TBD
}

/*public int getStartTokenPos() {
return startTokenPos;
}
public void setStartTokenPos(int startTokenPos) {
this.startTokenPos = startTokenPos;
}
public int getEndTokenPos() {
return endTokenPos;
}
public void setEndTokenPos(int endTokenPos) {
this.endTokenPos = endTokenPos;
}*/

/**
 * Replaces the bounding boxes locating this entity in the source document.
 *
 * @param boxes the new list of bounding boxes, may be null when no layout
 *              information is available
 */
public void setBoundingBoxes(List<BoundingBox> boxes) {
    this.boundingBoxes = boxes;
}

/**
 * Returns the bounding boxes locating this entity in the source document.
 *
 * @return the list of bounding boxes, or null when no layout information
 *         has been attached to this entity
 */
public List<BoundingBox> getBoundingBoxes() {
    return this.boundingBoxes;
}

/**
 * Appends one bounding box to this entity's position list.
 * The backing list is created lazily so entities without layout
 * information carry no extra allocation.
 *
 * @param boundingBox the bounding box to add
 */
public void addBoundingBoxes(BoundingBox boundingBox) {
    if (this.boundingBoxes == null)
        this.boundingBoxes = new ArrayList<>(); // diamond: type inferred from the field
    this.boundingBoxes.add(boundingBox);
}

@Override
public boolean equals(Object object) {
    // NOTE(review): hashCode() is not visible in this view — if it is not
    // overridden consistently elsewhere in the class, equal entities will
    // misbehave in hash-based collections; confirm.
    boolean result = false;
    if (object instanceof Entity) { // instanceof already rejects null
        int start = ((Entity) object).getOffsetStart();
        int end = ((Entity) object).getOffsetEnd();
        // Only compare when both offsets are defined (-1 means "unset"):
        // the previous unguarded comparison made the -1 guard dead code and
        // let two entities with unset offsets compare equal.
        if ((start != -1) && (end != -1)) {
            if ((start == offsets.start) && (end == offsets.end)) {
                result = true;
            }
        }
    }
    return result;
}
Expand All @@ -204,10 +251,26 @@ public int compareTo(Entity theEntity) {
int start = theEntity.getOffsetStart();
int end = theEntity.getOffsetEnd();

if (offsets.start != start)
return offsets.start - start;
else
return offsets.end - end;
//if ((start != -1) && (end != -1)) {
if (offsets.start != start)
return offsets.start - start;
else
return offsets.end - end;
/*} else {
int startToken = theEntity.getStartTokenPos();
int endToken =theEntity.getEndTokenPos();
if ( (startToken != -1) && (endToken != -1) ) {
if (startToken != startTokenPos)
return startTokenPos - startToken;
else
return endTokenPos - endToken;
} else {
// it's too underspecified to be comparable, and for
// sure it's not equal
// throw an exception ?
return -1;
}
}*/
}

public String toJson() {
Expand All @@ -234,9 +297,27 @@ public String toJson() {
buffer.append(" ] \"");
}

buffer.append(", \"offsetStart\" : " + offsets.start);
buffer.append(", \"offsetEnd\" : " + offsets.end);

if ( (offsets != null) && (offsets.start != -1) && (offsets.end != -1) ) {
buffer.append(", \"offsetStart\" : " + offsets.start);
buffer.append(", \"offsetEnd\" : " + offsets.end);
}

// start and end token indexes are deliberately not output in the JSON

if ( (boundingBoxes != null) && (boundingBoxes.size() > 0) ) {
buffer.append(", \"pos\" : [");
boolean start = true;
for(BoundingBox box : boundingBoxes) {
if (start) {
buffer.append("{").append(box.toJson()).append("}");
start = false;
} else {
buffer.append(", {").append(box.toJson()).append("}");
}
}
buffer.append("]");
}

buffer.append(", \"conf\" : \"" + conf + "\"");
buffer.append(", \"prob\" : \"" + prob + "\"");

Expand Down
37 changes: 35 additions & 2 deletions grobid-ner/src/main/java/org/grobid/core/engines/NEREnParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;

import org.grobid.core.layout.LayoutToken;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -36,7 +39,9 @@ public NEREnParser() {
}

/**
* Extract all occurrences of named entity from a simple piece of text.
* Extract all occurrences of named entities from a simple piece of text.
* The positions of the recognized entities are given as character offsets
* (following Java specification of characters).
*/
public List<Entity> extractNE(String text) {
List<String> tokens = null;
Expand Down Expand Up @@ -65,6 +70,34 @@ public List<Entity> extractNE(String text) {
return entities;
}

/**
* Extract all occurrences of named entities from a list of LayoutToken
* coming from a document with fixed/preserved layout, e.g. PDF.
* The positions of the recognized entities are given with coordinates in
* the input document.
*/
/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * coming from a document with fixed/preserved layout, e.g. PDF.
 * The positions of the recognized entities are given with coordinates in
 * the input document.
 *
 * @param tokens the layout tokens of the document fragment to label
 * @return the recognized entities, or null when tokens is null
 */
public List<Entity> extractNE(List<LayoutToken> tokens) {
    if (tokens == null) {
        return null;
    }

    // index lexicon matches over the token sequence
    LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
    positionsIndexes.computeIndexes(tokens);

    // build feature vectors from the layout tokens and run the NER model
    String featureVector = NERParserCommon.toFeatureVectorLayout(tokens, positionsIndexes);
    String labelledResult = label(featureVector);

    // NOTE(review): sense tagging of the recognized entities is disabled
    // here — presumably still pending for the LayoutToken path; confirm
    // against the String-based extractNE variant.
    return NERParserCommon.resultExtraction(GrobidModels.ENTITIES_NER, labelledResult, tokens);
}

public String createCONNLTrainingFromText(String text) {
if (isEmpty(text))
return null;
Expand All @@ -75,7 +108,7 @@ public String createCONNLTrainingFromText(String text) {
try {
tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(Language.EN, 1.0));
} catch(Exception e) {
LOGGER.error("Tokenization failed. ", e);
LOGGER.error("Tokenization failed", e);
return null;
}
LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
Expand Down
16 changes: 16 additions & 0 deletions grobid-ner/src/main/java/org/grobid/core/engines/NERFrParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.utilities.Pair;
import org.grobid.core.layout.LayoutToken;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -32,6 +34,8 @@ public NERFrParser() {

/**
* Extract all occurrences of named entity from a simple piece of text.
* The positions of the recognized entities are given as character offsets
* (following Java specification of characters).
*/
public List<Entity> extractNE(String text) {
List<String> tokens = null;
Expand All @@ -55,6 +59,18 @@ public List<Entity> extractNE(String text) {
return entities;
}

/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * coming from a document with fixed/preserved layout, e.g. PDF.
 * The positions of the recognized entities are given with coordinates in
 * the input document.
 * <p>
 * NOTE(review): not implemented yet for the French model — this stub
 * always returns null regardless of input, so callers must null-check
 * the result. TODO: implement as in NEREnParser.
 */
public List<Entity> extractNE(List<LayoutToken> tokens) {


    return null;
}

public String createCONNLTrainingFromText(String text) {
if (isEmpty(text))
return null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.grobid.core.engines;

import org.grobid.core.data.Entity;
import org.grobid.core.layout.LayoutToken;

import java.util.List;

Expand All @@ -13,6 +14,8 @@ public interface NERParser {

List<Entity> extractNE(String text);

List<Entity> extractNE(List<LayoutToken> tokens);

String createCONNLTrainingFromText(String text);

//String createXMLTrainingFromText(String text, StringBuilder sb);
Expand Down

0 comments on commit b137bf1

Please sign in to comment.