Updates due to API changes in grobid-core
lfoppiano committed Jan 4, 2017
1 parent 66c3fb0 commit 575052f
Showing 4 changed files with 29 additions and 40 deletions.
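Context for the diff below: the recurring change across these files is the argument order of GrobidAnalyzer.tokenize, which in the updated grobid-core takes the text first and the optional Language hint second. A minimal before/after sketch of the call, not part of the diff itself (LANG_ID stands for the parser's language constant, e.g. Language.EN):

    // Before (older grobid-core API): language hint first
    //   tokens = GrobidAnalyzer.getInstance().tokenize(new Language(LANG_ID, 1.0), text);
    // After (this commit): text first, then the language hint
    List<String> tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(LANG_ID, 1.0));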
grobid-ner/src/main/java/org/grobid/core/engines/NEREnParser.java
@@ -50,7 +50,7 @@ public NEREnParser() {
public List<Entity> extractNE(String text) {
List<String> tokens = null;
try {
- tokens = GrobidAnalyzer.getInstance().tokenize(new Language(LANG_ID, 1.0), text);
+ tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(LANG_ID, 1.0));
} catch(Exception e) {
LOGGER.error("Tokenization failed", e);
}
@@ -82,7 +82,7 @@ public String createTrainingFromText(String text) {

List<String> tokens = null;
try {
- tokens = GrobidAnalyzer.getInstance().tokenize(new Language(LANG_ID, 1.0), text);
+ tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(LANG_ID, 1.0));
} catch(Exception e) {
LOGGER.error("Tokenization failed", e);
return null;
24 changes: 7 additions & 17 deletions grobid-ner/src/main/java/org/grobid/core/engines/NERFrParser.java
@@ -1,26 +1,16 @@
package org.grobid.core.engines;

import org.apache.commons.io.FileUtils;
import org.grobid.core.GrobidModels;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.Entity;
import org.grobid.core.data.Sense;
import org.grobid.core.exceptions.GrobidResourceException;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorNER;
import org.grobid.core.lang.Language;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lexicon.LexiconPositionsIndexes;
import org.grobid.core.lang.Language;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.LanguageUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.apache.commons.lang3.StringUtils.isEmpty;
@@ -48,13 +38,13 @@ public NERFrParser() {
public List<Entity> extractNE(String text) {
List<String> tokens = null;
try {
- tokens = GrobidAnalyzer.getInstance().tokenize(new Language(LANG_ID, 1.0), text);
- } catch(Exception e) {
+ tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(LANG_ID, 1.0));
+ } catch (Exception e) {
LOGGER.error("Tokenization failed", e);
}
if (tokens == null)
return null;

LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
positionsIndexes.computeIndexes(text);

@@ -75,8 +65,8 @@ public String createTrainingFromText(String text) {

List<String> tokens = null;
try {
- tokens = GrobidAnalyzer.getInstance().tokenize(new Language(LANG_ID, 1.0), text);
- } catch(Exception e) {
+ tokens = GrobidAnalyzer.getInstance().tokenize(text, new Language(LANG_ID, 1.0));
+ } catch (Exception e) {
LOGGER.error("Tokenization failed", e);
}
LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
@@ -74,10 +74,10 @@ public List<Entity> extractNE(String text, Language lang) throws GrobidResourceException
return extractNE(text);
}

- NERParser parser = parsers.get(lang.getLangId());
+ NERParser parser = parsers.get(lang.getLang());
if (parser == null) {
throw new GrobidResourceException("The automatically identified language is currently not supported by grobid-ner: " +
- lang.getLangId());
+ lang.getLang());
}

return parser.extractNE(text);
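For reference, a hypothetical usage sketch of this language-dispatching method. It assumes the enclosing class is grobid-ner's NERParsers and that it exposes a no-argument constructor; the sample text and language are illustrative only. When no Language is supplied, the overload above falls back to extractNE(text), which identifies the language automatically:

    Language lang = new Language(Language.FR, 1.0);
    NERParsers nerParsers = new NERParsers(); // assumed entry-point class
    List<Entity> entities = nerParsers.extractNE("Emmanuel Macron est né à Amiens.", lang);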
INRIALeMondeCorpusStaxHandler.java
@@ -3,27 +3,26 @@
import com.ctc.wstx.stax.WstxInputFactory;
import org.apache.commons.lang3.StringUtils;
import org.codehaus.stax2.XMLStreamReader2;
import org.grobid.core.lexicon.NERLexicon;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.lang.Language;

import org.grobid.core.lexicon.NERLexicon;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import java.io.*;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.commons.lang3.StringUtils.*;

/**
* Created by lfoppiano on 29/08/16.
*/
public class INRIALeMondeCorpusStaxHandler implements StaxParserContentHandler {
private static Logger LOGGER = LoggerFactory.getLogger(INRIALeMondeCorpusStaxHandler.class);

private Writer writer;
private StringBuilder sb;
@@ -41,7 +40,7 @@ public class INRIALeMondeCorpusStaxHandler implements StaxParserContentHandler {
private String comment = null;
private String gender = null;

private GrobidAnalyzer analyzer = GrobidAnalyzer.getInstance();

public INRIALeMondeCorpusStaxHandler() {
this.sb = new StringBuilder();
@@ -127,18 +126,18 @@ public void onCharacter(XMLStreamReader2 reader) {
return;
}

- List<String> tokens = null;
- try {
- tokens = analyzer.tokenize(new Language(Language.FR, 1.0), text);
- } catch(Exception e) {
- LOGGER.error("Tokenization failed", e);
- }
- if (tokens == null)
- return;
- for(String token : tokens) {
- if (token.equals(" ") || token.equals("\t") || token.equals("\n") || token.equals("\r")) {
- continue;
- }
+ List<String> tokens = null;
+ try {
+ tokens = analyzer.tokenize(text, new Language(Language.FR, 1.0));
+ } catch (Exception e) {
+ LOGGER.error("Tokenization failed", e);
+ }
+ if (tokens == null)
+ return;
+ for (String token : tokens) {
+ if (token.equals(" ") || token.equals("\t") || token.equals("\n") || token.equals("\r")) {
+ continue;
+ }
if ((inNamedEntity) && (isNotEmpty(entityType))) {
sb.append(token).append("\t").append(translate(entityType, entitySubType));
/*if (isNotEmpty(entitySubType)) {
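The onCharacter changes above keep the same logic under the new tokenize signature: tokenize the French text, skip whitespace-only tokens, and write one token per line followed by a tab and its entity label. A self-contained sketch of that pattern, assuming only grobid-core's GrobidAnalyzer and Language classes; the fixed "O" label is a stand-in for the handler's translate(entityType, entitySubType) lookup:

    import org.grobid.core.analyzers.GrobidAnalyzer;
    import org.grobid.core.lang.Language;
    import java.util.List;

    public class TokenizeAndLabelSketch {
        public static void main(String[] args) throws Exception {
            String text = "Le Monde est un journal français.";
            List<String> tokens = GrobidAnalyzer.getInstance()
                    .tokenize(text, new Language(Language.FR, 1.0));
            StringBuilder sb = new StringBuilder();
            for (String token : tokens) {
                // The analyzer emits whitespace as tokens; skip them.
                if (token.equals(" ") || token.equals("\t")
                        || token.equals("\n") || token.equals("\r")) {
                    continue;
                }
                // "O" (outside any entity) is a stand-in label for this sketch.
                sb.append(token).append("\t").append("O").append("\n");
            }
            System.out.print(sb);
        }
    }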