Skip to content

Commit

Permalink
Revert "Adding trainer for english NER using the new XML format"
Browse files Browse the repository at this point in the history
This reverts commit 26c24f8.
  • Loading branch information
lfoppiano committed Jul 17, 2017
1 parent 26c24f8 commit 63949e2
Show file tree
Hide file tree
Showing 16 changed files with 462 additions and 1,119 deletions.
26 changes: 10 additions & 16 deletions grobid-ner/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,6 @@
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
Expand All @@ -73,11 +66,7 @@
<optional>true</optional>
<!--scope>runtime</scope -->
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.6</version>
</dependency>


<!-- for pooling e.g. Engine -->
<dependency>
Expand Down Expand Up @@ -106,6 +95,12 @@
<version>4.1</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.6</version>
</dependency>

<dependency>
<groupId>directory-naming</groupId>
<artifactId>naming-java</artifactId>
Expand All @@ -122,7 +117,6 @@
<artifactId>langdetect</artifactId>
<version>1.1-20120112</version>
</dependency>

<dependency>
<groupId>org.codehaus.woodstox</groupId>
<artifactId>stax2-api</artifactId>
Expand All @@ -138,7 +132,7 @@
<dependency>
<groupId>com.googlecode.clearnlp</groupId>
<artifactId>clearnlp</artifactId>
<version>1.4.2</version>
<version>1.3.1</version>
</dependency>

</dependencies>
Expand Down Expand Up @@ -195,8 +189,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<source>1.6</source>
<target>1.6</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
Expand Down
63 changes: 0 additions & 63 deletions grobid-ner/resources/dataset/ner/reports/training-170717.txt

This file was deleted.

136 changes: 26 additions & 110 deletions grobid-ner/src/main/java/org/grobid/core/data/Sentence.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,33 @@

import org.grobid.core.utilities.OffsetPosition;

import java.util.List;
import java.util.ArrayList;
import java.util.List;

import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;

/**
* This class represents a sentence with stand-off position to mark its boundaries in a text.
*
* This class represents a sentence with stand-off position to mark its boundaries in a text.
*
* @author Patrice Lopez
*
*/
public class Sentence {

// Raw text of the sentence; defaults to the empty string.
private String rawValue = "";
// Identifier of the sentence; stays null until setId is called.
private String id;
// Entities attached to this sentence; starts out as an empty, mutable list.
private List<Entity> entities = new ArrayList<>();

// relative offset positions in context
// (the null initialiser is replaced by a new OffsetPosition in the constructor)
private OffsetPosition offsets = null;
// Tokens of the sentence exactly as handed to setTokenisedValue; null until then.
private List<String> tokenisedValue;
// Filled by setTokenisedValue with one entry per token: the position of the matching
// entity in the entities list, or -1 when the token matches no entity.
private List<Integer> entityIndexList = new ArrayList<>();

/**
* Creates a sentence whose offsets are initialised with a new, default-constructed OffsetPosition.
*/
public Sentence() {
this.offsets = new OffsetPosition();
}

/**
* @return the offset object currently held by this sentence (the same instance, not a copy)
*/
public OffsetPosition getOffsets() {
return this.offsets;
}

/**
* Replaces the offset object of this sentence. The reference is stored as given, without copying.
*
* @param offsets the new offsets
*/
public void setOffsets(OffsetPosition offsets) {
this.offsets = offsets;
}

public void setOffsetStart(int start) {

// relative offset positions in context
// (the null initialiser is replaced by a new OffsetPosition in the constructor)
private OffsetPosition offsets = null;

/**
* Creates a sentence whose offsets are initialised with a new, default-constructed OffsetPosition.
*/
public Sentence() {
this.offsets = new OffsetPosition();
}

/**
* @return the offset object currently held by this sentence (the same instance, not a copy)
*/
public OffsetPosition getOffsets() {
return this.offsets;
}

/**
* Replaces the offset object of this sentence. The reference is stored as given, without copying.
*
* @param offsets the new offsets
*/
public void setOffsets(OffsetPosition offsets) {
this.offsets = offsets;
}

/**
* Writes the start position directly into the offset object held by this sentence.
*
* @param start the new start offset
*/
public void setOffsetStart(int start) {
offsets.start = start;
}

Expand All @@ -50,86 +43,9 @@ public void setOffsetEnd(int end) {
/**
* @return the end position read from the offset object held by this sentence
*/
public int getOffsetEnd() {
return offsets.end;
}

/**
 * Serialises the boundaries of this sentence as a small JSON object with the two
 * members "offsetStart" and "offsetEnd".
 *
 * @return the JSON string, e.g. { "offsetStart" : 0, "offsetEnd" : 12 }
 */
public String toJSON() {
    StringBuilder json = new StringBuilder();
    json.append("{ \"offsetStart\" : ").append(offsets.start);
    json.append(", \"offsetEnd\" : ").append(offsets.end);
    json.append(" }");
    return json.toString();
}

/**
* @return the raw text of the sentence (the empty string unless it has been set)
*/
public String getRawValue() {
return rawValue;
}

/**
* @param rawValue the raw text of the sentence, stored as given
*/
public void setRawValue(String rawValue) {
this.rawValue = rawValue;
}

/**
* @return the identifier of the sentence, or null if none has been set
*/
public String getId() {
return id;
}

/**
* @param id the identifier of the sentence, stored as given
*/
public void setId(String id) {
this.id = id;
}

/**
* @return the internal entity list itself (not a copy), so changes made by the caller are visible here
*/
public List<Entity> getEntities() {
return entities;
}

/**
* Replaces the entity list of this sentence. The reference is stored as given, without copying.
*
* @param entities the new entity list
*/
public void setEntities(List<Entity> entities) {
this.entities = entities;
}

/**
* Appends an entity to the end of the entity list of this sentence.
*
* @param currentEntity the entity to append
*/
public void addEntity(Entity currentEntity) {
this.entities.add(currentEntity);
}

/**
 * Set the tokenised version of the sentence; the tokenisation itself is the responsibility
 * of the caller.
 *
 * Each call also rebuilds the reversed index (token position to entity position) from scratch:
 * the index gets exactly one entry per token, holding the position in the entity list of the
 * first entity whose offsets fully enclose the token, or -1 when there is none (or when the
 * sentence has no entities at all).
 *
 * Token positions are derived by summing the lengths of the tokens, starting at 0, and are
 * compared as-is with the entity offsets.
 *
 * @param tokenisedValue the tokens of the sentence; null is stored as given and yields an empty index
 */
public void setTokenisedValue(List<String> tokenisedValue) {
    this.tokenisedValue = tokenisedValue;

    List<Integer> tokenToEntity = new ArrayList<>();
    if (tokenisedValue != null) {
        boolean hasEntities = isNotEmpty(getEntities());
        int position = 0;
        for (String token : tokenisedValue) {
            if (!hasEntities) {
                // Without entities every token is unmatched; token content is not inspected.
                tokenToEntity.add(-1);
                continue;
            }
            int tokenEnd = position + token.length();
            tokenToEntity.add(findEntityIndex(position, tokenEnd));
            position = tokenEnd;
        }
    }

    // Replace rather than append: the index must stay aligned with the current tokens,
    // and a list previously supplied by a caller must not be modified behind its back.
    this.entityIndexList = tokenToEntity;
}

/**
 * Returns the position in the entity list of the first entity whose offsets fully enclose
 * the span from start to end, or -1 when no entity does.
 */
private int findEntityIndex(int start, int end) {
    List<Entity> candidates = getEntities();
    for (int j = 0; j < candidates.size(); j++) {
        Entity entity = candidates.get(j);
        if (start >= entity.getOffsetStart() && end <= entity.getOffsetEnd()) {
            return j;
        }
    }
    return -1;
}

/**
* @return the token list last passed to setTokenisedValue (the same instance), or null if none was set
*/
public List<String> getTokenisedValue() {
return this.tokenisedValue;
}

/**
* Replaces the token-to-entity index. The reference is stored as given, without copying.
*
* @param entityIndexList the new index list
*/
public void setEntityIndexList(List<Integer> entityIndexList) {
this.entityIndexList = entityIndexList;
}

/**
* @return the token-to-entity index held by this sentence (the same instance, not a copy)
*/
public List<Integer> getEntityIndexList() {
return entityIndexList;
}

/**
 * Serialises the boundaries of this sentence as a small JSON object with the two
 * members "offsetStart" and "offsetEnd".
 *
 * @return the JSON string, e.g. { "offsetStart" : 0, "offsetEnd" : 12 }
 */
public String toJSON() {
    StringBuilder json = new StringBuilder();
    json.append("{ \"offsetStart\" : ").append(offsets.start);
    json.append(", \"offsetEnd\" : ").append(offsets.end);
    json.append(" }");
    return json.toString();
}

}

0 comments on commit 63949e2

Please sign in to comment.