Skip to content

Commit

Permalink
Adding trainer for english NER using the new XML format
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jul 17, 2017
1 parent 22946ac commit 26c24f8
Show file tree
Hide file tree
Showing 16 changed files with 1,119 additions and 462 deletions.
26 changes: 16 additions & 10 deletions grobid-ner/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
Expand All @@ -66,7 +73,11 @@
<optional>true</optional>
<!--scope>runtime</scope -->
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.6</version>
</dependency>

<!-- for pooling e.g. Engine -->
<dependency>
Expand Down Expand Up @@ -95,12 +106,6 @@
<version>4.1</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.6</version>
</dependency>

<dependency>
<groupId>directory-naming</groupId>
<artifactId>naming-java</artifactId>
Expand All @@ -117,6 +122,7 @@
<artifactId>langdetect</artifactId>
<version>1.1-20120112</version>
</dependency>

<dependency>
<groupId>org.codehaus.woodstox</groupId>
<artifactId>stax2-api</artifactId>
Expand All @@ -132,7 +138,7 @@
<dependency>
<groupId>com.googlecode.clearnlp</groupId>
<artifactId>clearnlp</artifactId>
<version>1.3.1</version>
<version>1.4.2</version>
</dependency>

</dependencies>
Expand Down Expand Up @@ -189,8 +195,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
Expand Down
63 changes: 63 additions & 0 deletions grobid-ner/resources/dataset/ner/reports/training-170717.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@

===== Token-level results =====


label accuracy precision recall f1

ARTIFACT 99.68 0 0 0
BUSINESS 99.78 0 0 0
CONCEPT 99.84 100 17.39 29.63
CONCEPTUAL 99.87 83.33 25 38.46
CREATION 99.83 10 40 16
EVENT 96.76 76.78 43.89 55.86
INSTALLATION 99.95 0 0 0
INSTITUTION 97.82 46.27 24.41 31.96
LEGAL 99.19 63.33 58.46 60.8
LOCATION 96.92 80.04 81.95 80.98
MEASURE 99.77 83.04 91.18 86.92
MEDIA 99.86 0 0 0
NATIONAL 98.91 74.25 57.94 65.09
ORGANISATION 99.2 35.87 46.48 40.49
PERIOD 98.81 83.61 83.02 83.31
PERSON 98.4 42.37 83.89 56.31
PERSON_TYPE 99.56 81.97 54.95 65.79
TITLE 99.68 76.54 76.54 76.54
UNKNOWN 99.91 0 0 0
WEBSITE 99.97 0 0 0

all fields 99.18 70.82 64.25 67.37 (micro average)
99.14 49.34 41.32 41.48 (macro average)

===== Field-level results =====

label accuracy precision recall f1

ARTIFACT 99.51 0 0 0
BUSINESS 99.75 0 0 0
CONCEPT 99.66 100 15.38 26.67
CONCEPTUAL 99.85 83.33 55.56 66.67
CREATION 99.88 40 66.67 50
EVENT 96.88 80.72 44.08 57.02
INSTALLATION 99.94 0 0 0
INSTITUTION 96.23 50.98 21.14 29.89
LEGAL 99.16 58.82 60.61 59.7
LOCATION 96.2 86.81 88.73 87.76
MEASURE 99.26 74.6 85.45 79.66
MEDIA 99.75 0 0 0
NATIONAL 97.34 73.58 72.67 73.12
ORGANISATION 98.89 29.63 32 30.77
PERIOD 98.42 88.27 84.04 86.1
PERSON 98.79 63.41 85.25 72.73
PERSON_TYPE 98.89 80.95 54.84 65.38
TITLE 99.6 63.64 43.75 51.85
UNKNOWN 99.85 0 0 0
WEBSITE 99.88 0 0 0

all fields 98.88 77.88 69.1 73.23 (micro average)
98.78 54.15 45.01 46.52 (macro average)

===== Instance-level results =====

Total expected instances: 430
Correct instances: 140
Instance-level recall: 32.56
136 changes: 110 additions & 26 deletions grobid-ner/src/main/java/org/grobid/core/data/Sentence.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,40 @@

import org.grobid.core.utilities.OffsetPosition;

import java.util.List;
import java.util.ArrayList;
import java.util.List;

import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;

/**
* This class represents a sentence with stand-off position to mark its boundaries in a text.
*
* @author Patrice Lopez
* This class represents a sentence with stand-off position to mark its boundaries in a text.
*
* @author Patrice Lopez
*/
public class Sentence {

// relative offset positions in context
private OffsetPosition offsets = null;

public Sentence() {
this.offsets = new OffsetPosition();
}

public OffsetPosition getOffsets() {
return this.offsets;
}

public void setOffsets(OffsetPosition offsets) {
this.offsets = offsets;
}

public void setOffsetStart(int start) {

private String rawValue = "";
private String id;
private List<Entity> entities = new ArrayList<>();

// relative offset positions in context
private OffsetPosition offsets = null;
private List<String> tokenisedValue;
private List<Integer> entityIndexList = new ArrayList<>();

/**
 * Creates a sentence whose offsets are initialised with a fresh {@link OffsetPosition}.
 */
public Sentence() {
    offsets = new OffsetPosition();
}

/**
 * @return the offset position object currently held by this sentence
 */
public OffsetPosition getOffsets() {
    return offsets;
}

/**
 * Replaces the offset position object of this sentence.
 *
 * @param offsets the new offsets; stored as-is, without copying
 */
public void setOffsets(OffsetPosition offsets) {
this.offsets = offsets;
}

/**
 * Writes the start offset into the offsets object held by this sentence.
 *
 * @param start the start offset to store
 */
public void setOffsetStart(int start) {
    this.offsets.start = start;
}

Expand All @@ -43,9 +50,86 @@ public void setOffsetEnd(int end) {
/**
 * @return the end offset read from the offsets object held by this sentence
 */
public int getOffsetEnd() {
    return this.offsets.end;
}

public String toJSON() {
return "{ \"offsetStart\" : " + offsets.start + ", \"offsetEnd\" : " + offsets.end + " }";
}


/**
 * Serialises the sentence boundaries as a small JSON object with the two
 * fields {@code offsetStart} and {@code offsetEnd}.
 *
 * @return the JSON string describing the start and end offsets
 */
public String toJSON() {
    StringBuilder json = new StringBuilder();
    json.append("{ \"offsetStart\" : ").append(offsets.start);
    json.append(", \"offsetEnd\" : ").append(offsets.end);
    json.append(" }");
    return json.toString();
}

/**
 * @return the raw value stored for this sentence (an empty string until one is set)
 */
public String getRawValue() {
    return this.rawValue;
}

/**
 * Stores the raw value of this sentence.
 *
 * @param rawValue the value to store; no validation or normalisation is applied
 */
public void setRawValue(String rawValue) {
this.rawValue = rawValue;
}

/**
 * @return the identifier of this sentence, or {@code null} if none has been set
 */
public String getId() {
    return this.id;
}

/**
 * Stores the identifier of this sentence.
 *
 * @param id the identifier to store
 */
public void setId(String id) {
this.id = id;
}

/**
 * @return the entity list held by this sentence; the internal list itself is
 * returned, not a copy
 */
public List<Entity> getEntities() {
    return this.entities;
}

/**
 * Replaces the entity list of this sentence. Only the reference is stored;
 * the token-to-entity index is not recomputed here.
 *
 * @param entities the new entity list
 */
public void setEntities(List<Entity> entities) {
this.entities = entities;
}

/**
 * Appends an entity to the entity list of this sentence.
 *
 * @param currentEntity the entity to append
 */
public void addEntity(Entity currentEntity) {
    entities.add(currentEntity);
}

/**
 * Sets the tokenised version of the sentence; producing the tokens is the
 * responsibility of the caller.
 * <p>
 * Each call rebuilds the token-to-entity index from scratch, so the index always
 * has exactly one element per token: the position, in the entity list, of the first
 * entity whose offsets enclose the token, or {@code -1} when no entity does (or when
 * the sentence has no entities).
 * <p>
 * Token positions are derived by accumulating the token lengths starting from 0, and a
 * token matches an entity when {@code tokenStart >= entity.getOffsetStart()} and
 * {@code tokenEnd <= entity.getOffsetEnd()}.
 *
 * @param tokenisedValue the tokens of the sentence, in order; {@code null} yields an
 *                       empty index
 */
public void setTokenisedValue(List<String> tokenisedValue) {
    this.tokenisedValue = tokenisedValue;

    // Built in a fresh list and assigned at the end: appending to the previous index
    // would leave stale entries behind when the tokens are set more than once.
    final List<Integer> tokenToEntity = new ArrayList<>();

    if (tokenisedValue != null) {
        final List<Entity> knownEntities = getEntities();

        if (isNotEmpty(knownEntities)) {
            int tokenStart = 0;
            for (String token : tokenisedValue) {
                final int tokenEnd = tokenStart + token.length();

                // First entity (in list order) fully enclosing the token, -1 if none.
                int match = -1;
                for (int j = 0; j < knownEntities.size(); j++) {
                    final Entity entity = knownEntities.get(j);
                    if (tokenStart >= entity.getOffsetStart() && tokenEnd <= entity.getOffsetEnd()) {
                        match = j;
                        break;
                    }
                }

                tokenToEntity.add(match);
                tokenStart = tokenEnd;
            }
        } else {
            // No entities: every token is marked as outside any entity.
            for (int i = 0; i < tokenisedValue.size(); i++) {
                tokenToEntity.add(-1);
            }
        }
    }

    this.entityIndexList = tokenToEntity;
}

/**
 * @return the token list last stored for this sentence, or {@code null} if none
 * has been set
 */
public List<String> getTokenisedValue() {
    return tokenisedValue;
}

/**
 * Replaces the token-to-entity index list with the given one. The list is stored
 * as-is; no consistency check against the tokens or entities is performed.
 *
 * @param entityIndexList the new index list
 */
public void setEntityIndexList(List<Integer> entityIndexList) {
this.entityIndexList = entityIndexList;
}

/**
 * @return the token-to-entity index list held by this sentence; the internal list
 * itself is returned, not a copy
 */
public List<Integer> getEntityIndexList() {
    return this.entityIndexList;
}
}

0 comments on commit 26c24f8

Please sign in to comment.