Skip to content

Commit

Permalink
Removing TextBlocks.java and adding GrobidAnalyzer
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Oct 10, 2016
1 parent 369d651 commit 6cf223e
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 44 deletions.
4 changes: 4 additions & 0 deletions grobid-ner/src/main/java/org/grobid/core/data/Entity.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.lexicon.NERLexicon;
import org.grobid.core.layout.BoundingBox;

import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -38,6 +39,9 @@ public class Entity implements Comparable<Entity> {
// all the sense information related to the entity
private Sense sense = null;

// optional bounding box in the source document
private BoundingBox box = null;

	// origin of the entity definition
public static int GROBID = 0;
public static int USER = 1;
Expand Down
4 changes: 4 additions & 0 deletions grobid-ner/src/main/java/org/grobid/core/data/Sense.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.grobid.core.data;

import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.layout.BoundingBox;

/**
* Common representation of a sense.
Expand Down Expand Up @@ -29,6 +30,9 @@ public class Sense {
// relative offset positions in context, if defined
private OffsetPosition offsets = null;

// optional bounding box in the source document
private BoundingBox box = null;

public Sense() {
this.offsets = new OffsetPosition();
}
Expand Down
71 changes: 53 additions & 18 deletions grobid-ner/src/main/java/org/grobid/core/engines/NERParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import org.grobid.core.GrobidModels;
import org.grobid.core.data.Entity;
import org.grobid.core.data.Sense;
import org.grobid.core.data.TextBlocks;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorNER;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lang.Language;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.utilities.Pair;
import org.slf4j.Logger;
Expand Down Expand Up @@ -41,39 +42,57 @@ public NERParser() {

/**
 * Extract all occurrences of named entities from a simple piece of text.
* Default language is English...
*/
public List<Entity> extractNE(String text) {
public List<Entity> extractNE(String text) {
return extractNE(new Language(Language.EN, 1.0), text);
}

public List<Entity> extractNE(Language lang, String text) {
if (isEmpty(text))
return null;

text = text.replace("\n", " ");

TextBlocks blocks = TextBlocks.getTextBlocks(text);
if ( (!lang.getLangId().equals(Language.EN)) && (!lang.getLangId().equals(Language.FR)) ) {
// this language is not supported
throw new GrobidException("Language not supported by grobid-ner: " + lang.toString());
}

//text = text.replace("\n", " ");
List<String> tokens = null;
try {
tokens = GrobidAnalyzer.getInstance().tokenize(lang, text);
} catch(Exception e) {
LOGGER.error("Tokenization failed", e);
}
if (tokens == null)
return null;

LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
positionsIndexes.computeIndexes(text);

String res = toFeatureVector(blocks, positionsIndexes);
String res = toFeatureVector(tokens, positionsIndexes);
String result = label(res);
List<Pair<String, String>> labeled = GenericTaggerUtils.getTokensAndLabels(result);

List<Entity> entities = resultExtraction(text, labeled, blocks.getTokens());
List<Entity> entities = resultExtraction(text, labeled, tokens);

// we use now the sense tagger for the recognized named entity
List<Sense> senses = senseTagger.extractSenses(text, labeled, blocks.getTokens(), positionsIndexes);
List<Sense> senses = senseTagger.extractSenses(text, labeled, tokens, positionsIndexes);

merge(entities, senses);

return entities;
}

public String toFeatureVector(TextBlocks blocks, LexiconPositionsIndexes positionsIndexes) {
public String toFeatureVector(List<String> tokens, LexiconPositionsIndexes positionsIndexes) {
StringBuffer ress = new StringBuffer();
int posit = 0; // keep track of the position index in the list of positions

int currentPosition = 0;

for (String block : blocks.getTextBlocks()) {
currentPosition += blocks.getTextBlocksPositions().get(posit);
for (String token : tokens) {
if (token.equals(" ") || token.equals("\t") || token.equals("\n") || token.equals("\r")) {
//posit++;
continue;
}

// check if the token is a known NE
// do we have a NE at position posit?
Expand All @@ -87,7 +106,7 @@ public String toFeatureVector(TextBlocks blocks, LexiconPositionsIndexes positio
.isTokenInLexicon(positionsIndexes.getLocalOrgFormPositions(), posit);

ress.append(FeaturesVectorNER
.addFeaturesNER(block,
.addFeaturesNER(token,
isLocationToken, isPersonTitleToken, isOrganisationToken, isOrgFormToken)
.printVector());
ress.append("\n");
Expand Down Expand Up @@ -250,23 +269,39 @@ public void createTraining(String inputFile,
}

protected String createTrainingText(File file) throws IOException {
// default language is English
return createTrainingText(file, new Language(Language.EN, 1.0));
}

protected String createTrainingText(File file, Language lang) throws IOException {
String text = FileUtils.readFileToString(file);

return createTrainingFromText(text, file.getName());
return createTrainingFromText(text, file.getName(), lang);
}

protected String createTrainingFromText(String text, String fileLabel) {
protected String createTrainingFromText(String text, String fileLabel) {
// default language is English
return createTrainingFromText(text, fileLabel, new Language(Language.EN, 1.0));
}

protected String createTrainingFromText(String text, String fileLabel, Language lang) {
if (isEmpty(text))
return null;

		//TODO: find a solution to avoid losing the sentence delimiters
text = text.replace("\n", " ");

TextBlocks blocks = TextBlocks.getTextBlocks(text);
List<String> tokens = null;
try {
tokens = GrobidAnalyzer.getInstance().tokenize(lang, text);
} catch(Exception e) {
LOGGER.error("Tokenization failed", e);
return null;
}
LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
positionsIndexes.computeIndexes(text);

String featuresVector = toFeatureVector(blocks, positionsIndexes);
String featuresVector = toFeatureVector(tokens, positionsIndexes);
String res = label(featuresVector);

List<Pair<String, String>> labeledEntries = GenericTaggerUtils.getTokensAndLabels(res);
Expand Down
44 changes: 34 additions & 10 deletions grobid-ner/src/main/java/org/grobid/core/engines/SenseTagger.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,15 @@
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.lexicon.NERLexicon;
import org.grobid.core.lang.Language;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.TextUtilities;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
Expand All @@ -23,7 +28,9 @@
* @author Patrice Lopez
*/
public class SenseTagger extends AbstractParser {


private static Logger LOGGER = LoggerFactory.getLogger(SenseTagger.class);

protected NERLexicon nerLexicon = NERLexicon.getInstance();
protected Lexicon lexicon = Lexicon.getInstance();

Expand All @@ -35,32 +42,49 @@ public SenseTagger() {
 * Extract all occurrences of NER senses from a simple piece of text.
*/
public List<Sense> extractSenses(String text) throws Exception {
// default language is English
return extractSenses(text, new Language(Language.EN, 1.0));
}

/**
 * Extract all occurrences of NER senses from a simple piece of text.
*/
public List<Sense> extractSenses(String text, Language lang) throws Exception {
if (text == null)
return null;
if (text.length() == 0)
return null;
List<Sense> senses = null;
try {
text = text.replace("\n", " ");
//text = text.replace("\n", " ");
int sentence = 0;
List<OffsetPosition> localLocationPositions = lexicon.inLocationNames(text);
List<OffsetPosition> localPersonTitlePositions = lexicon.inPersonTitleNames(text);
List<OffsetPosition> localOrganisationPositions = lexicon.inOrganisationNames(text);
List<OffsetPosition> localOrgFormPositions = lexicon.inOrgFormNames(text);
int currentPosition = 0;
StringTokenizer st = new StringTokenizer(text, TextUtilities.fullPunctuations, true);
//StringTokenizer st = new StringTokenizer(text, TextUtilities.fullPunctuations, true);
List<String> tokenizations = null;
try {
tokenizations = GrobidAnalyzer.getInstance().tokenize(lang, text);
} catch(Exception e) {
LOGGER.error("Tokenization failed", e);
}
if (tokenizations == null)
return null;

if (st.countTokens() == 0)
return null;
//if (st.countTokens() == 0)
// return null;

List<String> textBlocks = new ArrayList<String>();
List<String> tokenizations = new ArrayList<String>();
//List<String> tokenizations = new ArrayList<String>();
int pos = 0; // current offset
List<Integer> positions = new ArrayList<Integer>();
while (st.hasMoreTokens()) {
String tok = st.nextToken();
tokenizations.add(tok);
if (!tok.equals(" ")) {
//while (st.hasMoreTokens()) {
for(String tok : tokenizations) {
//String tok = st.nextToken();
//tokenizations.add(tok);
if (!tok.equals(" ") && !tok.equals("\t") && !tok.equals("\n") && !tok.equals("\r")) {
textBlocks.add(tok + "\t<sense>");
positions.add(pos);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,27 @@
import com.ctc.wstx.stax.WstxInputFactory;
import org.apache.commons.lang3.StringUtils;
import org.codehaus.stax2.XMLStreamReader2;
import org.grobid.core.data.TextBlocks;
import org.grobid.core.lexicon.NERLexicon;

import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.lang.Language;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import java.io.*;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.commons.lang3.StringUtils.*;

/**
* Created by lfoppiano on 29/08/16.
*/
public class INRIALeMondeCorpusStaxHandler implements StaxParserContentHandler {
private static Logger LOGGER = LoggerFactory.getLogger(INRIALeMondeCorpusStaxHandler.class);

private Writer writer;
private StringBuilder sb;
Expand All @@ -35,10 +41,9 @@ public class INRIALeMondeCorpusStaxHandler implements StaxParserContentHandler {
private String comment = null;
private String gender = null;

TextBlocks blocks = new TextBlocks();
private GrobidAnalyzer analyzer = GrobidAnalyzer.getInstance();

public INRIALeMondeCorpusStaxHandler() {

this.sb = new StringBuilder();
}

Expand All @@ -47,7 +52,6 @@ public INRIALeMondeCorpusStaxHandler(Writer writer) {
this.writer = writer;
}


@Override
public void onStartDocument(XMLStreamReader2 xmlStreamReader2) {
}
Expand Down Expand Up @@ -117,21 +121,28 @@ public void onEndElement(XMLStreamReader2 reader) {
public void onCharacter(XMLStreamReader2 reader) {
if (inSentence || inNamedEntity) {
String text = reader.getText();
text = trim(text);
//text = trim(text);
if (isEmpty(text)) {
return;
}

TextBlocks textBlocks = blocks.getTextBlocks(text);

for (String textBlock : textBlocks.getTextBlocks()) {
String textBlockCleaned = StringUtils.replace(textBlock, TextBlocks.SUFFIX_NER, "");

List<String> tokens = null;
try {
tokens = analyzer.tokenize(new Language(Language.FR, 1.0), text);
} catch(Exception e) {
LOGGER.error("Tokenization failed", e);
}
if (tokens == null)
return;
for(String token : tokens) {
if (token.equals(" ") || token.equals("\t") || token.equals("\n") || token.equals("\r")) {
continue;
}
if ((inNamedEntity) && (isNotEmpty(entityType))) {
sb.append(textBlockCleaned).append("\t").append(translate(entityType, entitySubType));
if (isNotEmpty(entitySubType)) {
sb.append(token).append("\t").append(translate(entityType, entitySubType));
/*if (isNotEmpty(entitySubType)) {
sb.append("\t").append(entitySubType);
}
}*/

if (isNotBlank(disambiguatedName)) {
sb.append("\t").append(disambiguatedName);
Expand All @@ -143,7 +154,7 @@ public void onCharacter(XMLStreamReader2 reader) {

sb.append("\n");
} else {
sb.append(textBlockCleaned).append("\t").append("O").append("\n");
sb.append(token).append("\t").append("O").append("\n");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public void testSampleParsing_shouldWork() throws Exception {
assertThat(splitted[1], is("zzbbzb\tO"));
assertThat(splitted[2], is(",\tO"));

assertThat(splitted[52], is(""));
assertThat(splitted[50], is(""));
}

@Test
Expand Down

0 comments on commit 6cf223e

Please sign in to comment.