Skip to content

Commit

Permalink
Merge pull request #4 from lfoppiano/training_generation
Browse files Browse the repository at this point in the history
Training data can now be generated automatically from the command line. The training data uses the CoNLL format, which is shared among NER projects. See: http://www.cnts.ua.ac.be/conll2003/ner/
  • Loading branch information
lfoppiano committed Aug 25, 2016
2 parents 7d5c7bb + a2bd6d3 commit eb2fd7e
Show file tree
Hide file tree
Showing 31 changed files with 1,082 additions and 868 deletions.
11 changes: 0 additions & 11 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,6 @@ Thumbs.db
*.log
*.log.*
*.old

# *.log
# grobid-core/log/debug.log
# *.log
# grobid-core/log/error.log
# grobid-core/log/warn.log
# grobid-core/log/debug.log
# grobid-core/log/error.log
# grobid-core/log/warn.log


grobid-core/src/test/resources/org/grobid/core/annotations/resTeiStAXParser/out.tei.xml

grobid-home/models/affiliation-address/model.crf.old
Expand Down
22 changes: 11 additions & 11 deletions grobid-ner/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<groupId>org.grobid</groupId>
<artifactId>grobid-parent</artifactId>
<version>0.4.1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>org.grobid</groupId>
Expand Down Expand Up @@ -46,7 +46,6 @@
<artifactId>grobid-core</artifactId>
<version>0.4.1-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-trainer</artifactId>
Expand All @@ -65,7 +64,6 @@
<version>4.8.2</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
Expand All @@ -74,6 +72,8 @@
<optional>true</optional>
<!--scope>runtime</scope -->
</dependency>


<!-- for pooling e.g. Engine -->
<dependency>
<groupId>commons-pool</groupId>
Expand All @@ -95,17 +95,17 @@
<artifactId>commons-lang3</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.1</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.6</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-jdk14</artifactId>
<version>1.6.1</version>
</dependency>

<dependency>
<groupId>directory-naming</groupId>
Expand Down Expand Up @@ -141,14 +141,14 @@
<!--plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-jar-plugin</artifactId>
<configuration> <archive> <manifest> <mainClass>org.grobid.core.engines.Engine</mainClass>
</manifest> </archive> </configuration> </plugin -->
<!--plugin>
<plugin>
<groupId>com.jolira</groupId>
<artifactId>onejar-maven-plugin</artifactId>
<version>1.4.4</version>
<executions>
<execution>
<configuration>
<mainClass>org.grobid.core.main.batch.GrobidMain</mainClass>
<mainClass>org.grobid.core.main.batch.NERMain</mainClass>
<onejarVersion>0.97</onejarVersion>
<attachToBuild>true</attachToBuild>
<classifier>onejar</classifier>
Expand All @@ -158,7 +158,7 @@
</goals>
</execution>
</executions>
</plugin-->
</plugin>

<!--plugin>
<artifactId>maven-assembly-plugin</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,8 @@
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.lexicon.NERLexicon;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;
import java.util.Comparator;
import java.io.BufferedReader;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.List;

/**
* Common representation of an unresolved entity mention for the NER components.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
package org.grobid.core.data;

import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.lexicon.NERLexicon;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;

import java.io.BufferedReader;
import java.util.List;

import org.apache.commons.lang3.StringUtils;

/**
* Common representation of a sense.
Expand Down
60 changes: 60 additions & 0 deletions grobid-ner/src/main/java/org/grobid/core/data/TextBlocks.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package org.grobid.core.data;

import org.grobid.core.utilities.TextUtilities;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
 * Result of splitting a text into tokens, together with the labelled
 * "text block" line and the character offset of every non-space token.
 *
 * <p>The three lists are filled by {@link #getTextBlocks(String)}:
 * <ul>
 *   <li>{@code tokens} holds every token returned by the tokenizer,
 *       delimiters included;</li>
 *   <li>{@code textBlocks} holds, for each token that is not a single space,
 *       the token followed by {@code "\t<ner>"};</li>
 *   <li>{@code textBlocksPositions} holds, at the same index as
 *       {@code textBlocks}, the character offset of that token in the
 *       original text.</li>
 * </ul>
 *
 * <p>The getters return the internal mutable lists, not copies.
 */
public class TextBlocks {
    private List<String> textBlocks = new ArrayList<String>();
    private List<String> tokens = new ArrayList<String>();
    private List<Integer> textBlocksPositions = new ArrayList<Integer>();

    /**
     * @return the labelled text block lines (token followed by {@code "\t<ner>"})
     */
    public List<String> getTextBlocks() {
        return textBlocks;
    }

    /**
     * @param textBlocks the list to use as labelled text block lines
     */
    public void setTextBlocks(List<String> textBlocks) {
        this.textBlocks = textBlocks;
    }

    /**
     * @return all tokens, delimiters included
     */
    public List<String> getTokens() {
        return tokens;
    }

    /**
     * @param tokens the list to use as token list
     */
    public void setTokens(List<String> tokens) {
        this.tokens = tokens;
    }

    /**
     * @return the character offsets aligned with {@link #getTextBlocks()}
     */
    public List<Integer> getTextBlocksPositions() {
        return textBlocksPositions;
    }

    /**
     * @param textBlocksPositions the list to use as text block offsets
     */
    public void setTextBlocksPositions(List<Integer> textBlocksPositions) {
        this.textBlocksPositions = textBlocksPositions;
    }

    /**
     * Tokenizes {@code text} and builds the corresponding {@code TextBlocks}.
     *
     * <p>The text is split with a {@link StringTokenizer} using
     * {@code TextUtilities.fullPunctuations} as delimiter set, with delimiters
     * returned as tokens. Every token is recorded in {@link #getTokens()};
     * every token other than a single space additionally produces a text block
     * line and its offset.
     *
     * @param text the text to tokenize; may be {@code null}
     * @return the populated instance, or {@code null} when {@code text} is
     *         {@code null} or yields no token (e.g. the empty string)
     */
    public static TextBlocks getTextBlocks(String text) {
        // StringTokenizer would throw a NullPointerException on null input;
        // report it the same way as "nothing to tokenize".
        if (text == null) {
            return null;
        }

        StringTokenizer st = new StringTokenizer(text, TextUtilities.fullPunctuations, true);
        // Same condition as countTokens() == 0, without scanning the whole text.
        if (!st.hasMoreTokens()) {
            return null;
        }

        TextBlocks blocks = new TextBlocks();

        // Delimiters are returned as tokens, so the running sum of token
        // lengths is the offset of the current token in the original text.
        int pos = 0;
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            blocks.getTokens().add(tok);
            if (!" ".equals(tok)) {
                blocks.getTextBlocks().add(tok + "\t<ner>");
                blocks.getTextBlocksPositions().add(pos);
            }
            pos += tok.length();
        }
        return blocks;
    }
}

0 comments on commit eb2fd7e

Please sign in to comment.