Adding trainer for English NER using the new XML format

kermitt2 · Jul 17, 2017 · 26c24f8 · 26c24f8
1 parent 22946ac
commit 26c24f8
Show file tree

Hide file tree

Showing 16 changed files with 1,119 additions and 462 deletions.
diff --git a/grobid-ner/pom.xml b/grobid-ner/pom.xml
@@ -58,6 +58,13 @@
 			<version>4.12</version>
 			<scope>test</scope>
 		</dependency>
+		<dependency>
+			<groupId>org.hamcrest</groupId>
+			<artifactId>hamcrest-all</artifactId>
+			<version>1.3</version>
+			<scope>test</scope>
+		</dependency>
+
 		<dependency>
 			<groupId>log4j</groupId>
 			<artifactId>log4j</artifactId>
@@ -66,7 +73,11 @@
 			<optional>true</optional>
 			<!--scope>runtime</scope -->
 		</dependency>
-
+		<dependency>
+			<groupId>org.slf4j</groupId>
+			<artifactId>slf4j-api</artifactId>
+			<version>1.6.6</version>
+		</dependency>
 
 		<!-- for pooling e.g. Engine -->
 		<dependency>
@@ -95,12 +106,6 @@
 			<version>4.1</version>
 		</dependency>
 
-		<dependency>
-			<groupId>org.slf4j</groupId>
-			<artifactId>slf4j-api</artifactId>
-			<version>1.6.6</version>
-		</dependency>
-
 		<dependency>
 			<groupId>directory-naming</groupId>
 			<artifactId>naming-java</artifactId>
@@ -117,6 +122,7 @@
             <artifactId>langdetect</artifactId>
             <version>1.1-20120112</version>
         </dependency>
+
         <dependency>
             <groupId>org.codehaus.woodstox</groupId>
             <artifactId>stax2-api</artifactId>
@@ -132,7 +138,7 @@
 		<dependency>
 		    <groupId>com.googlecode.clearnlp</groupId>
 		    <artifactId>clearnlp</artifactId>
-		    <version>1.3.1</version>
+		    <version>1.4.2</version>
 		</dependency>
 
     </dependencies>
@@ -189,8 +195,8 @@
 				<artifactId>maven-compiler-plugin</artifactId>
 				<version>2.5.1</version>
 				<configuration>
-					<source>1.6</source>
-					<target>1.6</target>
+					<source>1.7</source>
+					<target>1.7</target>
 					<encoding>UTF-8</encoding>
 				</configuration>
 			</plugin>

diff --git a/grobid-ner/resources/dataset/ner/reports/training-170717.txt b/grobid-ner/resources/dataset/ner/reports/training-170717.txt
@@ -0,0 +1,63 @@
+
+===== Token-level results =====
+
+
+label                accuracy     precision    recall       f1
+
+ARTIFACT             99.68        0            0            0
+BUSINESS             99.78        0            0            0
+CONCEPT              99.84        100          17.39        29.63
+CONCEPTUAL           99.87        83.33        25           38.46
+CREATION             99.83        10           40           16
+EVENT                96.76        76.78        43.89        55.86
+INSTALLATION         99.95        0            0            0
+INSTITUTION          97.82        46.27        24.41        31.96
+LEGAL                99.19        63.33        58.46        60.8
+LOCATION             96.92        80.04        81.95        80.98
+MEASURE              99.77        83.04        91.18        86.92
+MEDIA                99.86        0            0            0
+NATIONAL             98.91        74.25        57.94        65.09
+ORGANISATION         99.2         35.87        46.48        40.49
+PERIOD               98.81        83.61        83.02        83.31
+PERSON               98.4         42.37        83.89        56.31
+PERSON_TYPE          99.56        81.97        54.95        65.79
+TITLE                99.68        76.54        76.54        76.54
+UNKNOWN              99.91        0            0            0
+WEBSITE              99.97        0            0            0
+
+all fields           99.18        70.82        64.25        67.37   (micro average)
+                     99.14        49.34        41.32        41.48   (macro average)
+
+===== Field-level results =====
+
+label                accuracy     precision    recall       f1
+
+ARTIFACT             99.51        0            0            0
+BUSINESS             99.75        0            0            0
+CONCEPT              99.66        100          15.38        26.67
+CONCEPTUAL           99.85        83.33        55.56        66.67
+CREATION             99.88        40           66.67        50
+EVENT                96.88        80.72        44.08        57.02
+INSTALLATION         99.94        0            0            0
+INSTITUTION          96.23        50.98        21.14        29.89
+LEGAL                99.16        58.82        60.61        59.7
+LOCATION             96.2         86.81        88.73        87.76
+MEASURE              99.26        74.6         85.45        79.66
+MEDIA                99.75        0            0            0
+NATIONAL             97.34        73.58        72.67        73.12
+ORGANISATION         98.89        29.63        32           30.77
+PERIOD               98.42        88.27        84.04        86.1
+PERSON               98.79        63.41        85.25        72.73
+PERSON_TYPE          98.89        80.95        54.84        65.38
+TITLE                99.6         63.64        43.75        51.85
+UNKNOWN              99.85        0            0            0
+WEBSITE              99.88        0            0            0
+
+all fields           98.88        77.88        69.1         73.23   (micro average)
+                     98.78        54.15        45.01        46.52   (macro average)
+
+===== Instance-level results =====
+
+Total expected instances:   430
+Correct instances:          140
+Instance-level recall:      32.56
diff --git a/grobid-ner/src/main/java/org/grobid/core/data/Sentence.java b/grobid-ner/src/main/java/org/grobid/core/data/Sentence.java
@@ -2,33 +2,40 @@
 
 import org.grobid.core.utilities.OffsetPosition;
 
-import java.util.List;    
 import java.util.ArrayList;
+import java.util.List;
+
+import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;
 
 /**
- * This class represents a sentence with stand-off position to mark its boundaries in a text. 
- * 
- * @author Patrice Lopez
+ * This class represents a sentence with stand-off position to mark its boundaries in a text.
  *
+ * @author Patrice Lopez
  */
 public class Sentence {
-
-	// relative offset positions in context
-	private OffsetPosition offsets = null;
-
-	public Sentence() {
-		this.offsets = new OffsetPosition();
-    }
-
-	public OffsetPosition getOffsets() {
-		return this.offsets;
-	}
-
-	public void setOffsets(OffsetPosition offsets) {
-		this.offsets = offsets;
-	}
-
-	public void setOffsetStart(int start) {
+
+    private String rawValue = "";
+    private String id;
+    private List<Entity> entities = new ArrayList<>();
+
+    // relative offset positions in context
+    private OffsetPosition offsets = null;
+    private List<String> tokenisedValue;
+    private List<Integer> entityIndexList = new ArrayList<>();
+
+    public Sentence() {
+        this.offsets = new OffsetPosition();
+    }
+
+    public OffsetPosition getOffsets() {
+        return this.offsets;
+    }
+
+    public void setOffsets(OffsetPosition offsets) {
+        this.offsets = offsets;
+    }
+
+    public void setOffsetStart(int start) {
         offsets.start = start;
     }
 
@@ -43,9 +50,86 @@ public void setOffsetEnd(int end) {
     public int getOffsetEnd() {
         return offsets.end;
     }
-
-	public String toJSON() {
-		return "{ \"offsetStart\" : " + offsets.start + ", \"offsetEnd\" : " + offsets.end + " }";
-	}
-
+
+    public String toJSON() {
+        return "{ \"offsetStart\" : " + offsets.start + ", \"offsetEnd\" : " + offsets.end + " }";
+    }
+
+    public String getRawValue() {
+        return rawValue;
+    }
+
+    public void setRawValue(String rawValue) {
+        this.rawValue = rawValue;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public List<Entity> getEntities() {
+        return entities;
+    }
+
+    public void setEntities(List<Entity> entities) {
+        this.entities = entities;
+    }
+
+    public void addEntity(Entity currentEntity) {
+        this.entities.add(currentEntity);
+    }
+
+    /**
+     * Sets the tokenised form of the sentence; the tokenisation itself is left to the caller.
+     * For each token, the index of the entity whose offsets enclose it (or -1 if there is none) is
+     * appended to the entity index list, so repeated calls accumulate entries rather than replace them.
+     */
+    public void setTokenisedValue(List<String> tokenisedValue) {
+        this.tokenisedValue = tokenisedValue;
+        List<Integer> entityIndexList = new ArrayList<>();
+        if (isNotEmpty(getEntities())) {
+            int checkIndex = 0;
+            int startEntityIndex = 0;
+            out:
+            for (int i = 0; i < tokenisedValue.size(); i++) {
+                int idxExpectedStart = checkIndex;
+                int idxExpectedEnd = checkIndex + tokenisedValue.get(i).length();
+
+                for (int j = startEntityIndex; j < getEntities().size(); j++) {
+                    Entity entity = getEntities().get(j);
+                    if (idxExpectedStart >= entity.getOffsetStart() && idxExpectedEnd <= entity.getOffsetEnd()) {
+                        entityIndexList.add(j);
+                        idxExpectedStart = idxExpectedEnd;
+                        checkIndex = idxExpectedEnd;
+                        continue out;
+                    }
+                }
+                entityIndexList.add(-1);
+                idxExpectedStart = idxExpectedEnd;
+                checkIndex = idxExpectedEnd;
+            }
+
+        } else {
+            for (String token : getTokenisedValue()) {
+                entityIndexList.add(-1);
+            }
+        }
+        this.entityIndexList.addAll(entityIndexList);
+    }
+
+    public List<String> getTokenisedValue() {
+        return this.tokenisedValue;
+    }
+
+    public void setEntityIndexList(List<Integer> entityIndexList) {
+        this.entityIndexList = entityIndexList;
+    }
+
+    public List<Integer> getEntityIndexList() {
+        return entityIndexList;
+    }
 }