Skip to content

Commit

Permalink
Add French NER
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Oct 20, 2016
1 parent 6cf223e commit 6769cf8
Show file tree
Hide file tree
Showing 27 changed files with 404,165 additions and 1,097 deletions.
402,161 changes: 402,161 additions & 0 deletions grobid-home/models/nerfr/model.wapiti

Large diffs are not rendered by default.

33 changes: 28 additions & 5 deletions grobid-ner/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
<parent>
<groupId>org.grobid</groupId>
<artifactId>grobid-parent</artifactId>
<version>0.4.1-SNAPSHOT</version>
<version>0.4.2-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>org.grobid</groupId>
<artifactId>grobid-ner</artifactId>
<version>0.4.1-SNAPSHOT</version>
<version>0.4.2-SNAPSHOT</version>
<packaging>jar</packaging>
<name>${project.artifactId}</name>

Expand Down Expand Up @@ -44,12 +44,12 @@
<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-core</artifactId>
<version>0.4.1-SNAPSHOT</version>
<version>0.4.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-trainer</artifactId>
<version>0.4.1-SNAPSHOT</version>
<version>0.4.2-SNAPSHOT</version>
</dependency>

<!--dependency>
Expand Down Expand Up @@ -93,7 +93,7 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.0.1</version>
<version>3.4</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
Expand Down Expand Up @@ -239,6 +239,29 @@
</plugins>
</build>
</profile>
<!-- mvn generate-resources -Ptrain_nerfr -e -->
<!-- Profile train_nerfr: binds the exec-maven-plugin "java" goal to the
generate-resources phase and launches org.grobid.trainer.NERFrenchTrainer. -->
<profile>
<id>train_nerfr</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.1.1</version>
<executions>
<execution>
<phase>generate-resources</phase>
<goals>
<goal>java</goal>
</goals>
<configuration>
<mainClass>org.grobid.trainer.NERFrenchTrainer</mainClass>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<!-- mvn generate-resources -Ptrain_nersense -e -->
<profile>
<id>train_nersense</id>
Expand Down
Empty file.
108 changes: 108 additions & 0 deletions grobid-ner/resources/dataset/nerfr/crfpp-templates/nerfr.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Token 0
U00:%x[-4,0]
U01:%x[-3,0]
U02:%x[-2,0]
U03:%x[-1,0]
U04:%x[0,0]
U05:%x[1,0]
U06:%x[2,0]
U07:%x[3,0]
U08:%x[4,0]
#U09:%x[-1,0]/%x[0,0]
#U0A:%x[0,0]/%x[1,0]
#U0B:%x[1,0]/%x[2,0]
#U0C:%x[-2,0]/%x[-1,0]

# Lowercase token 1
U10:%x[-2,1]
U11:%x[-1,1]
U12:%x[0,1]
U13:%x[1,1]
U14:%x[2,1]

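# Prefix 1-4 characters (2-6)
# NOTE(review): columns 2-6 are five features, but the label says 1-4 characters - confirm the prefix lengths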
U20:%x[0,2]
U21:%x[0,3]
U22:%x[0,4]
U23:%x[0,5]
U24:%x[0,6]

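# Suffix 1-4 characters (7-11)
# NOTE(review): columns 7-11 are five features, but the label says 1-4 characters - confirm the suffix lengths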
U30:%x[0,7]
U31:%x[0,8]
U32:%x[0,9]
U33:%x[0,10]
U34:%x[0,11]

# Capitalization (12)
U40:%x[0,12]
U41:%x[1,12]
U42:%x[-1,12]
U43:%x[-2,12]
#U44:%x[0,12]/%x[1,12]
#U45:%x[-1,12]/%x[0,12]

# Digits (13)
U50:%x[0,13]
U51:%x[-1,13]
U52:%x[1,13]

# Lexical information (14-20)
U80:%x[0,14]
U81:%x[0,15]
U82:%x[0,16]
U83:%x[0,17]
U84:%x[0,18]
U85:%x[0,19]
U86:%x[0,20]
U89:%x[-1,14]
U8A:%x[-1,15]
U8B:%x[-1,16]
U8C:%x[-1,17]
U8D:%x[-1,18]
U8E:%x[-1,19]
U8F:%x[-1,20]
U8I:%x[1,14]
U8J:%x[1,15]
U8K:%x[1,16]
U8L:%x[1,17]
U8M:%x[1,18]
U8N:%x[1,19]
U8O:%x[1,20]

# lexical feature: belongs to a known location (21)
U90:%x[0,21]
U91:%x[-1,21]
U92:%x[1,21]
U93:%x[-2,21]

# lexical feature: belongs to a known person title (22)
UA0:%x[0,22]
UA1:%x[-1,22]
UA2:%x[1,22]
UA3:%x[-2,22]

# lexical feature: belongs to a known organisation (23)
UB0:%x[0,23]
UB1:%x[-1,23]
UB2:%x[1,23]
UB3:%x[-2,23]

# lexical feature: belongs to a known organisation form name (24)
UC0:%x[0,24]
UC1:%x[-1,24]
UC2:%x[1,24]
UC3:%x[-2,24]

# word shape (25)
UD0:%x[0,25]
UD1:%x[-1,25]
UD2:%x[1,25]

# word shape trimmed (26)
UE0:%x[0,26]
UE1:%x[-1,26]
UE2:%x[1,26]

# Output
B
Empty file.
26 changes: 20 additions & 6 deletions grobid-ner/src/main/java/org/grobid/core/data/Entity.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,22 @@
*
*/
public class Entity implements Comparable<Entity> {

// Origin of the entity definition
public enum Origin {
    GROBID("grobid"),
    USER("user");

    // Lowercase label of the origin; assigned once in the constructor.
    private final String name;

    /**
     * Creates an origin constant carrying the given label.
     *
     * @param name label returned by {@link #getName()}
     */
    Origin(String name) {
        this.name = name;
    }

    /**
     * Returns the label attached to this origin ("grobid" or "user").
     *
     * @return the label given at construction
     */
    public String getName() {
        return name;
    }
}

// name of the entity = entity type
private String rawName = null;
Expand All @@ -32,7 +48,7 @@ public class Entity implements Comparable<Entity> {

// probability of the entity in context, if defined
private double prob = 1.0;

// confidence score of the entity in context, if defined
private double conf = 0.8;

Expand All @@ -43,9 +59,7 @@ public class Entity implements Comparable<Entity> {
private BoundingBox box = null;

// origin of the entity definition
public static int GROBID = 0;
public static int USER = 1;
private int origin = 0;
private Origin origin = Origin.GROBID;

public Entity() {
this.offsets = new OffsetPosition();
Expand Down Expand Up @@ -160,11 +174,11 @@ public void setSense(Sense sense) {
this.sense = sense;
}

public int getOrigin() {
public Origin getOrigin() {
return origin;
}

public void setOrigin(int origin) {
public void setOrigin(Origin origin) {
this.origin = origin;
}

Expand Down
61 changes: 0 additions & 61 deletions grobid-ner/src/main/java/org/grobid/core/data/TextBlocks.java

This file was deleted.

0 comments on commit 6769cf8

Please sign in to comment.