Permalink
Browse files

fixing de-synchronisation of tokens in grobid-ner, now the layoutTokens are already included in the mention
  • Loading branch information...
lfoppiano committed Mar 1, 2018
1 parent 4b66ccb commit ce3d5082b44233b928c5abe210c409ce1e2d6255
Binary file not shown.
View
@@ -796,7 +796,7 @@
<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-ner</artifactId>
<version>0.5.0-SNAPSHOT</version>
<version>0.5.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.grobid</groupId>
@@ -9,6 +9,7 @@
import com.scienceminer.nerd.mention.ProcessText;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.data.Sense;
@@ -61,6 +62,9 @@
// optional bounding box in the source document
private List<BoundingBox> boundingBoxes = null;
// layout tokens corresponding to the entity in the text
private List<LayoutToken> layoutTokens = null;
// probability of the entity in context, if defined
private double prob = 1.0;
@@ -156,6 +160,7 @@ else if (rawName != null) {
boundingBoxes = mention.getBoundingBoxes();
isAcronym = mention.getIsAcronym();
source = mention.getSource();
layoutTokens = mention.getLayoutTokens();
}
public NerdEntity(NerdEntity entity) {
@@ -182,6 +187,7 @@ public NerdEntity(NerdEntity entity) {
source = entity.getSource();
nerdScore = entity.getNerdScore();
selectionScore = entity.getSelectionScore();
layoutTokens = entity.getLayoutTokens();
}
public String getRawName() {
@@ -431,7 +437,7 @@ public void setWikipediaMultilingualRef(Map<String,String> translations,
wikipediaMultilingualArticle = subArticleCorrespondance;
}
public int getWiktionaryExternalRef() {
return wiktionaryExternalRef;
@@ -537,21 +543,22 @@ public BiblioItem getBiblio() {
return biblio;
}
/**
 * Compares this entity with another object.
 *
 * Two entities are considered equal when they cover the same character span
 * (identical start and end offsets), point to the same Wikipedia reference and
 * carry the same Wikidata identifier.
 *
 * @param object the object to compare with; may be null
 * @return true if object is this instance, or a NerdEntity with the same offsets,
 *         Wikipedia reference and Wikidata identifier; false otherwise
 */
@Override
public boolean equals(Object object) {
    if (this == object) {
        return true;
    }
    // instanceof is false for null, so no separate null check is needed
    if (!(object instanceof NerdEntity)) {
        return false;
    }

    NerdEntity other = (NerdEntity) object;
    if (other.getOffsetStart() != offsets.start || other.getOffsetEnd() != offsets.end) {
        return false;
    }
    if (this.wikipediaExternalRef != other.getWikipediaExternalRef()) {
        return false;
    }
    // StringUtils.equals is null-safe: two null identifiers compare as equal
    return StringUtils.equals(wikidataId, other.getWikidataId());
}
@Override
@@ -1076,4 +1083,12 @@ public static boolean subSequence(NerdEntity entity, NerdEntity otherEntity, boo
return false;
}
/**
 * Gives access to the layout tokens corresponding to this entity in the text.
 *
 * @return the stored token list itself (not a copy), or null when none has been set
 */
public List<LayoutToken> getLayoutTokens() {
    return this.layoutTokens;
}
/**
 * Replaces the layout tokens corresponding to this entity in the text.
 *
 * @param layoutTokens the token list to keep; it is stored by reference, without copying
 */
public void setLayoutTokens(final List<LayoutToken> layoutTokens) {
    this.layoutTokens = layoutTokens;
}
}
@@ -1,5 +1,6 @@
package com.scienceminer.nerd.mention;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.lexicon.NERLexicon;
import org.grobid.core.layout.BoundingBox;
@@ -46,6 +47,9 @@
// optional bounding box in the source document
protected List<BoundingBox> boundingBoxes = null;
// optional layout tokens corresponding to the current mention
private List<LayoutToken> layoutTokens = null;
// if the mention is an acronym; if true, the normalisedName will give the found expanded form
private boolean isAcronym = false;
@@ -78,6 +82,7 @@ public Mention(Entity ent) {
boundingBoxes = ent.getBoundingBoxes();
isAcronym = ent.getIsAcronym();
entity = ent;
layoutTokens = ent.getLayoutTokens();
//startTokenPos = ent.startTokenPos;
//endTokenPos = ent.startTokenPos;
}
@@ -90,6 +95,7 @@ public Mention(Mention ent) {
isAcronym = ent.isAcronym;
entity = ent.entity;
source = ent.source;
layoutTokens = ent.layoutTokens;
//startTokenPos = ent.startTokenPos;
//endTokenPos = ent.startTokenPos;
}
@@ -336,4 +342,12 @@ public String toString() {
return buffer.toString();
}
/**
 * Gives access to the optional layout tokens corresponding to this mention.
 *
 * @return the stored token list itself (not a copy), or null when none has been set
 */
public List<LayoutToken> getLayoutTokens() {
    return this.layoutTokens;
}
/**
 * Replaces the optional layout tokens corresponding to this mention.
 *
 * @param layoutTokens the token list to keep; it is stored by reference, without copying
 */
public void setLayoutTokens(final List<LayoutToken> layoutTokens) {
    this.layoutTokens = layoutTokens;
}
}
@@ -238,17 +238,6 @@ public static boolean isAllLowerCase(String text) {
*/
private List<Mention> processTokens(NerdQuery nerdQuery) throws NerdException {
List<LayoutToken> tokens = nerdQuery.getTokens();
//Re-align the tokens
if(CollectionUtils.isNotEmpty(tokens)) {
final int initialOffset = tokens.get(0).getOffset();
if(initialOffset > 0) {
for (LayoutToken token : tokens) {
token.setOffset(token.getOffset() - initialOffset);
}
}
}
List<Mention> results = new ArrayList<>();
Language language = nerdQuery.getLanguage();
@@ -354,43 +343,15 @@ public static boolean isAllLowerCase(String text) {
// associate bounding boxes to identified mentions
List<Mention> finalResults = new ArrayList<>();
int tokenPos = 0;
int lastTokenIndex = 0;
int lastTokenPos = 0;
for (Mention entity : results) {
// synchronize layout token with the selected ngrams
List<LayoutToken> entityTokens = null;
tokenPos = lastTokenPos;
for (int j = lastTokenIndex; j < tokens.size(); j++) {
if (tokenPos < entity.getOffsetStart()) {
tokenPos += tokens.get(j).getText().length();
continue;
}
if (tokenPos + tokens.get(j).getText().length() > entity.getOffsetEnd()) {
break;
}
if (tokenPos == entity.getOffsetStart()) {
entityTokens = new ArrayList<>();
entityTokens.add(tokens.get(j));
lastTokenIndex = j;
lastTokenPos = tokenPos;
} else if ((tokenPos >= entity.getOffsetStart()) && (tokenPos <= entity.getOffsetEnd())) {
if (entityTokens == null) {
entityTokens = new ArrayList<>();
lastTokenIndex = j;
lastTokenPos = tokenPos;
}
entityTokens.add(tokens.get(j));
}
tokenPos += tokens.get(j).getText().length();
}
// synchronize layout token with the selected n-grams
List<LayoutToken> entityTokens = entity.getLayoutTokens();
if (entityTokens != null)
entity.setBoundingBoxes(BoundingBoxCalculator.calculate(entityTokens));
else
LOGGER.warn("LayoutToken sequence not found for mention: " + entity.getRawName());
LOGGER.warn("processNER: LayoutToken sequence not found for mention: " + entity.getRawName());
// we have an additional check of validity based on language
if (validEntity(entity, language.getLang())) {
if (!finalResults.contains(entity)) {
@@ -452,7 +413,7 @@ public static boolean isAllLowerCase(String text) {
// candidates which start and end with a stop word are removed.
// beware not to be too aggressive.
List<Integer> toRemove = new ArrayList<Integer>();
List<Integer> toRemove = new ArrayList<>();
for (int i = 0; i < pool.size(); i++) {
StringPos termPosition = pool.get(i);
@@ -465,7 +426,7 @@ public static boolean isAllLowerCase(String text) {
continue;
}*/
// remove term starting or ending with a stopword, and term starting with a separator (conservative
// remove term starting or ending with a stop word, and term starting with a separator (conservative
// it should never be the case)
if (stopwords != null) {
if ((delimiters.indexOf(termValueLowercase.charAt(0)) != -1) ||
@@ -488,11 +449,9 @@ public static boolean isAllLowerCase(String text) {
}
}
List<StringPos> subPool = new ArrayList<StringPos>();
List<StringPos> subPool = new ArrayList<>();
for (int i = 0; i < pool.size(); i++) {
if (toRemove.contains(i)) {
continue;
} else {
if (!toRemove.contains(i)) {
subPool.add(pool.get(i));
}
}
@@ -563,7 +522,7 @@ public static boolean isAllLowerCase(String text) {
if (entityTokens != null)
entity.setBoundingBoxes(BoundingBoxCalculator.calculate(entityTokens));
else
LOGGER.warn("LayoutToken sequence not found for mention: " + candidate.string);
LOGGER.warn("processWikipedia: LayoutToken sequence not found for mention: " + candidate.string);
// we have an additional check of validity based on language
if (validEntity(entity, lang.getLang())) {
if (!results.contains(entity))
@@ -804,7 +763,7 @@ else if (terms.length() == 0) {
public static List<StringPos> ngrams(String str, int ngram, Language lang) {
int actualNgram = (ngram * 2) - 1; // for taking into account separators
List<StringPos> ngrams = new ArrayList<StringPos>();
List<StringPos> ngrams = new ArrayList<>();
if (str == null) {
return ngrams;
}
@@ -204,4 +204,14 @@ public void testDeserializeQuery_singleQuote() throws Exception {
final NerdQuery nerdQuery2 = target.fromJson("{\"mentions\": [\"ner\"]}");
assertThat(nerdQuery2.getMentions(), hasSize(1));
}
/**
 * Checks that the 'minRankerScore' value of a single-quoted JSON query is
 * available through getMinRankerScore() after deserialisation.
 */
@Test
public void testDeserializeQuery_minRankScore() throws Exception {
    final NerdQuery firstQuery = target.fromJson("{'minRankerScore': 0.03}");
    assertThat(firstQuery.getMinRankerScore(), is(0.03));

    // second value with more decimals, parsed into a fresh query object
    final NerdQuery secondQuery = target.fromJson("{'minRankerScore': 0.0123}");
    assertThat(secondQuery.getMinRankerScore(), is(0.0123));
}
}

2 comments on commit ce3d508

@kermitt2

This comment has been minimized.

Show comment
Hide comment
@kermitt2

kermitt2 Mar 7, 2018

Owner

Just a remark, I think it is either stop word or stopword, but never stop-word

Owner

kermitt2 replied Mar 7, 2018

Just a remark, I think it is either stop word or stopword, but never stop-word

@lfoppiano

This comment has been minimized.

Show comment
Hide comment
@lfoppiano

lfoppiano Mar 8, 2018

Collaborator

Noted. In my defence I can say it was Intellij!! :-)

Collaborator

lfoppiano replied Mar 8, 2018

Noted. In my defence I can say it was Intellij!! :-)

Please sign in to comment.