Skip to content

Commit

Permalink
Support LayoutToken input and bounding box positions
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed May 16, 2017
1 parent 7078ce4 commit b137bf1
Show file tree
Hide file tree
Showing 9 changed files with 335 additions and 27 deletions.
103 changes: 92 additions & 11 deletions grobid-ner/src/main/java/org/grobid/core/data/Entity.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,12 @@ public String getName() {
private Sense sense = null;

// optional bounding box in the source document
private BoundingBox box = null;
private List<BoundingBox> boundingBoxes = null;

// position in the global tokenization
//private int startTokenPos = -1;
//private int endTokenPos = -1;

// origin of the entity definition
private Origin origin = Origin.GROBID;

Expand All @@ -80,6 +84,9 @@ public Entity(Entity ent) {
conf = ent.conf;
sense = ent.sense;
origin = ent.origin;
boundingBoxes = ent.boundingBoxes;
//startTokenPos = ent.startTokenPos;
//endTokenPos = ent.endTokenPos;
}

public String getRawName() {
Expand Down Expand Up @@ -186,15 +193,55 @@ public void normalise() {
// TBD
}

/*public int getStartTokenPos() {
return startTokenPos;
}
public void setStartTokenPos(int startTokenPos) {
this.startTokenPos = startTokenPos;
}
public int getEndTokenPos() {
return endTokenPos;
}
public void setEndTokenPos(int endTokenPos) {
this.endTokenPos = endTokenPos;
}*/

/**
 * Replaces the bounding boxes locating this entity in the source document.
 *
 * @param boxes the new list of bounding boxes, may be null when no layout
 *              information is available
 */
public void setBoundingBoxes(List<BoundingBox> boxes) {
    this.boundingBoxes = boxes;
}

/**
 * Returns the bounding boxes locating this entity in the source document.
 *
 * @return the list of bounding boxes, or null when no layout information
 *         has been attached to this entity
 */
public List<BoundingBox> getBoundingBoxes() {
    return this.boundingBoxes;
}

/**
 * Appends one bounding box to this entity's position list.
 * The backing list is created lazily so entities without layout
 * information carry no extra allocation.
 *
 * @param boundingBox the bounding box to add
 */
public void addBoundingBoxes(BoundingBox boundingBox) {
    if (this.boundingBoxes == null)
        this.boundingBoxes = new ArrayList<>(); // diamond: type inferred from the field
    this.boundingBoxes.add(boundingBox);
}

@Override
public boolean equals(Object object) {
    // NOTE(review): hashCode() is not visible in this view — if it is not
    // overridden consistently elsewhere in the class, equal entities will
    // misbehave in hash-based collections; confirm.
    boolean result = false;
    if (object instanceof Entity) { // instanceof already rejects null
        int start = ((Entity) object).getOffsetStart();
        int end = ((Entity) object).getOffsetEnd();
        // Only compare when both offsets are defined (-1 means "unset"):
        // the previous unguarded comparison made the -1 guard dead code and
        // let two entities with unset offsets compare equal.
        if ((start != -1) && (end != -1)) {
            if ((start == offsets.start) && (end == offsets.end)) {
                result = true;
            }
        }
    }
    return result;
}
Expand All @@ -204,10 +251,26 @@ public int compareTo(Entity theEntity) {
int start = theEntity.getOffsetStart();
int end = theEntity.getOffsetEnd();

if (offsets.start != start)
return offsets.start - start;
else
return offsets.end - end;
//if ((start != -1) && (end != -1)) {
if (offsets.start != start)
return offsets.start - start;
else
return offsets.end - end;
/*} else {
int startToken = theEntity.getStartTokenPos();
int endToken =theEntity.getEndTokenPos();
if ( (startToken != -1) && (endToken != -1) ) {
if (startToken != startTokenPos)
return startTokenPos - startToken;
else
return endTokenPos - endToken;
} else {
// it's too underspecified to be comparable, and for
// sure it's not equal
// throw an exception ?
return -1;
}
}*/
}

public String toJson() {
Expand All @@ -234,9 +297,27 @@ public String toJson() {
buffer.append(" ] \"");
}

buffer.append(", \"offsetStart\" : " + offsets.start);
buffer.append(", \"offsetEnd\" : " + offsets.end);

if ( (offsets != null) && (offsets.start != -1) && (offsets.end != -1) ) {
buffer.append(", \"offsetStart\" : " + offsets.start);
buffer.append(", \"offsetEnd\" : " + offsets.end);
}

// start and end token indexes are deliberately not output in the JSON

if ( (boundingBoxes != null) && (boundingBoxes.size() > 0) ) {
buffer.append(", \"pos\" : [");
boolean start = true;
for(BoundingBox box : boundingBoxes) {
if (start) {
buffer.append("{").append(box.toJson()).append("}");
start = false;
} else {
buffer.append(", {").append(box.toJson()).append("}");
}
}
buffer.append("]");
}

buffer.append(", \"conf\" : \"" + conf + "\"");
buffer.append(", \"prob\" : \"" + prob + "\"");

Expand Down
37 changes: 35 additions & 2 deletions grobid-ner/src/main/java/org/grobid/core/engines/NEREnParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;

import org.grobid.core.layout.LayoutToken;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -36,7 +39,9 @@ public NEREnParser() {
}

/**
* Extract all occurrences of named entity from a simple piece of text.
* Extract all occurrences of named entities from a simple piece of text.
* The positions of the recognized entities are given as character offsets
* (following Java specification of characters).
*/
public List<Entity> extractNE(String text) {
List<String> tokens = null;
Expand Down Expand Up @@ -65,6 +70,34 @@ public List<Entity> extractNE(String text) {
return entities;
}

/**
* Extract all occurrences of named entities from a list of LayoutToken
* coming from a document with fixed/preserved layout, e.g. PDF.
* The positions of the recognized entities are given with coordinates in
* the input document.
*/
/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * coming from a document with fixed/preserved layout, e.g. PDF.
 * The positions of the recognized entities are given with coordinates in
 * the input document.
 *
 * @param tokens the layout tokens of the document fragment to label
 * @return the recognized entities, or null when tokens is null
 */
public List<Entity> extractNE(List<LayoutToken> tokens) {
    if (tokens == null) {
        return null;
    }

    // index lexicon matches over the token sequence
    LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
    positionsIndexes.computeIndexes(tokens);

    // build feature vectors from the layout tokens and run the NER model
    String featureVector = NERParserCommon.toFeatureVectorLayout(tokens, positionsIndexes);
    String labelledResult = label(featureVector);

    // NOTE(review): sense tagging of the recognized entities is disabled
    // here — presumably still pending for the LayoutToken path; confirm
    // against the String-based extractNE variant.
    return NERParserCommon.resultExtraction(GrobidModels.ENTITIES_NER, labelledResult, tokens);
}

public String createCONNLTrainingFromText(String text) {
if (isEmpty(text))
return null;
Expand All @@ -75,7 +108,7 @@ public String createCONNLTrainingFromText(String text) {
try {
tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(Language.EN, 1.0));
} catch(Exception e) {
LOGGER.error("Tokenization failed. ", e);
LOGGER.error("Tokenization failed", e);
return null;
}
LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
Expand Down
16 changes: 16 additions & 0 deletions grobid-ner/src/main/java/org/grobid/core/engines/NERFrParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.utilities.Pair;
import org.grobid.core.layout.LayoutToken;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -32,6 +34,8 @@ public NERFrParser() {

/**
* Extract all occurrences of named entity from a simple piece of text.
* The positions of the recognized entities are given as character offsets
* (following Java specification of characters).
*/
public List<Entity> extractNE(String text) {
List<String> tokens = null;
Expand All @@ -55,6 +59,18 @@ public List<Entity> extractNE(String text) {
return entities;
}

/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * coming from a document with fixed/preserved layout, e.g. PDF.
 * The positions of the recognized entities are given with coordinates in
 * the input document.
 * <p>
 * NOTE(review): not implemented yet for the French model — this stub
 * always returns null regardless of input, so callers must null-check
 * the result. TODO: implement as in NEREnParser.
 */
public List<Entity> extractNE(List<LayoutToken> tokens) {


    return null;
}

public String createCONNLTrainingFromText(String text) {
if (isEmpty(text))
return null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.grobid.core.engines;

import org.grobid.core.data.Entity;
import org.grobid.core.layout.LayoutToken;

import java.util.List;

Expand All @@ -13,6 +14,8 @@ public interface NERParser {

List<Entity> extractNE(String text);

List<Entity> extractNE(List<LayoutToken> tokens);

String createCONNLTrainingFromText(String text);

//String createXMLTrainingFromText(String text, StringBuilder sb);
Expand Down

0 comments on commit b137bf1

Please sign in to comment.