Skip to content

Commit

Permalink
Merge pull request #562 from kermitt2/512_PDF_and_authors_with_ORCID
Browse files Browse the repository at this point in the history
Check whether annotation is orcid and add orcid to author.
  • Loading branch information
kermitt2 committed Aug 11, 2020
2 parents 227a804 + 753b072 commit 678c816
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3838,6 +3838,10 @@ public String toTEIAuthorBlock(int nbTag, GrobidAnalysisConfig config) {
TextUtilities.appendN(tei, '\t', nbTag + 1);
tei.append("<email>" + TextUtilities.HTMLEncode(author.getEmail()) + "</email>\n");
}
if (author.getORCID() != null) {
TextUtilities.appendN(tei, '\t', nbTag + 1);
tei.append("<idno type=\"ORCID\">" + TextUtilities.HTMLEncode(author.getORCID()) + "</idno>\n");
}

if (author.getAffiliations() != null) {
for (Affiliation aff : author.getAffiliations()) {
Expand Down
12 changes: 12 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Person.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class Person {
private String title = null;
private String suffix = null;
private String rawName = null; // raw full name if relevant/available, e.g. name exactly as displayed
private String orcid = null;
private boolean corresp = false;
private List<LayoutToken> layoutTokens = new ArrayList<>();

Expand Down Expand Up @@ -103,6 +104,14 @@ public void setCorresp(boolean b) {
corresp = b;
}

/**
 * @return the ORCID stored for this person, or null when none has been set
 */
public String getORCID() {
    return this.orcid;
}

/**
 * Store the ORCID of this person. The value is kept as given, without validation.
 *
 * @param id the ORCID to record
 */
public void setORCID(String id) {
    this.orcid = id;
}

/**
 * @return the list currently held in the affiliationBlocks field (the same instance, not a copy)
 */
public List<String> getAffiliationBlocks() {
    return this.affiliationBlocks;
}
Expand Down Expand Up @@ -182,6 +191,9 @@ public String toString() {
if (email != null) {
res += " (email:" + email + ")";
}
if (orcid != null) {
res += " (orcid:" + orcid + ")";
}
if (affiliations != null) {
for(Affiliation aff : affiliations) {
res += " (affiliation: " + aff.toString() + ") ";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
import org.grobid.core.engines.tagging.TaggerFactory;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorName;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.PDFAnnotation;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
Expand All @@ -28,6 +30,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;

/**
* @author Patrice Lopez
Expand All @@ -54,14 +57,14 @@ public List<Person> processingCitation(String input) throws Exception {

// for language to English for the analyser to avoid any bad surprises
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
return processing(tokens, false);
return processing(tokens, null, false);
}

/**
 * Run the name parsing on already tokenized author text, using the model for names in citation.
 *
 * @param tokens layout tokens of the author sequence
 * @return the persons identified by {@code processing}, or null when {@code tokens} is null or empty
 */
public List<Person> processingCitationLayoutTokens(List<LayoutToken> tokens) throws Exception {
    if (CollectionUtils.isEmpty(tokens)) {
        return null;
    }
    // No PDF annotations are passed on this path (null), and head=false selects the citation model.
    return processing(tokens, null, false);
}

/**
Expand All @@ -76,11 +79,11 @@ public List<Person> processingHeader(String input) throws Exception {

// for language to English for the analyser to avoid any bad surprises
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
return processing(tokens, true);
return processing(tokens, null, true);
}

public List<Person> processingHeaderWithLayoutTokens(List<LayoutToken> inputs) {
return processing(inputs, true);
public List<Person> processingHeaderWithLayoutTokens(List<LayoutToken> inputs, List<PDFAnnotation> pdfAnnotations) {
return processing(inputs, pdfAnnotations, true);
}

/**
Expand All @@ -90,7 +93,7 @@ public List<Person> processingHeaderWithLayoutTokens(List<LayoutToken> inputs) {
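* @param pdfAnnotations - PDF annotations compared with the author tokens to pick up ORCID links, may be null
* @param head - if true use the model for header's name, otherwise the model for names in citation
* @return List of identified Person entities as POJO.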
*/
public List<Person> processing(List<LayoutToken> tokens, boolean head) {
public List<Person> processing(List<LayoutToken> tokens, List<PDFAnnotation> pdfAnnotations, boolean head) {
if (CollectionUtils.isEmpty(tokens)) {
return null;
}
Expand All @@ -116,13 +119,35 @@ public List<Person> processing(List<LayoutToken> tokens, boolean head) {
continue;
}

if(pdfAnnotations !=null) {
for (LayoutToken authorsToken : cluster.concatTokens()) {
for (PDFAnnotation pdfAnnotation : pdfAnnotations) {
BoundingBox intersectBox = pdfAnnotation.getIntersectionBox(authorsToken);
if (intersectBox != null) {
BoundingBox authorsBox = BoundingBox.fromLayoutToken(authorsToken);
if (intersectBox.equals(authorsBox)) {
} else {
double pixPerChar = authorsToken.getWidth() / authorsToken.getText().length();
int charsCovered = (int) ((intersectBox.getWidth() / pixPerChar) + 0.5);
// !! here we consider the annot is at the tail or end of the names
String newToken = authorsToken.getText().substring(0, authorsToken.getText().length() - charsCovered);
Matcher orcidMatcher = TextUtilities.ORCIDPattern.matcher(pdfAnnotation.getDestination());
if (orcidMatcher.find()) {
aut.setORCID(orcidMatcher.group(1) + "-"
+ orcidMatcher.group(2) + "-" + orcidMatcher.group(3)+ "-" + orcidMatcher.group(4));
authorsToken.setText(newToken);
}
}
}
}
}
}
TaggingLabel clusterLabel = cluster.getTaggingLabel();
Engine.getCntManager().i(clusterLabel);
//String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens()));
String clusterContent = StringUtils.normalizeSpace(LayoutTokensUtil.toText(cluster.concatTokens()));
if (clusterContent.trim().length() == 0)
continue;

if (clusterLabel.equals(TaggingLabels.NAMES_HEADER_MARKER)) {
// a marker introduces a new author, and the marker could be attached to the previous (usual)
// or following author (rare)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import org.grobid.core.lang.Language;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.layout.Block;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.PDFAnnotation;
import org.grobid.core.tokenization.LabeledTokensContainer;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,27 @@ public boolean cover(LayoutToken token) {
}
return res;
}

/**
 * Compute the area shared by a layout token and this annotation.
 *
 * Returns null when the token is null, when it is not on the page of this annotation,
 * or when none of the annotation boxes intersects the token box. If an annotation box
 * contains the token box, the token box itself is returned immediately. Otherwise the
 * result is the intersection computed with the last annotation box that intersects
 * the token box.
 */
public BoundingBox getIntersectionBox(LayoutToken token) {
    if (token == null || token.getPage() != pageNumber) {
        return null;
    }

    final BoundingBox tokenArea = BoundingBox.fromLayoutToken(token);
    BoundingBox overlap = null;
    for (BoundingBox annotationArea : boundingBoxes) {
        if (!annotationArea.intersect(tokenArea)) {
            continue;
        }
        if (annotationArea.contains(tokenArea)) {
            // full coverage: the shared area is the token box itself, no need to look further
            return tokenArea;
        }
        overlap = annotationArea.boundingBoxIntersection(tokenArea);
    }
    return overlap;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ public class TextUtilities {
public static final String SLASH = "/";

// note: be careful of catastrophic backtracking here as a consequence of PDF noise!

// ORCID given either bare or prefixed by orcid.org/ (optionally with http:// or https://).
// The four capturing groups are the four blocks of the identifier; hyphens between blocks
// are optional and the last character may be a digit or the checksum character X.
// The dot of "orcid.org" is escaped so that only a literal dot is accepted in the host name.
private static final String ORCIDRegex =
    "^\\s*(?:(?:https?://)?orcid\\.org/)?([0-9]{4})\\-?([0-9]{4})\\-?([0-9]{4})\\-?([0-9]{3}[\\dX])\\s*$";
static public final Pattern ORCIDPattern = Pattern.compile(ORCIDRegex);

// the magical DOI regular expression...
static public final Pattern DOIPattern = Pattern
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.*;

@Ignore
public class TextUtilitiesTest extends EngineTest {
Expand Down Expand Up @@ -390,4 +390,21 @@ public void testIsAllUpperCaseOrDigitOrDot() throws Exception {
assertThat(TextUtilities.isAllUpperCaseOrDigitOrDot("P.C.T."), is(true));
assertThat(TextUtilities.isAllUpperCaseOrDigitOrDot("P.C,T."), is(false));
}

/**
 * Checks that ORCIDPattern rejects malformed identifiers and that, for accepted inputs,
 * the four captured groups rebuild the expected hyphenated identifier.
 */
@Test
public void testOrcidPattern() {
    String[] falseOrcids = {"1234", "1234-5698-137X", "0000-0001-9877-137Y","http://orcid.fr/0000-0001-9877-137X"};
    String[] trueOrcids = {"0000-0001-9877-137X", "http://orcid.org/0000-0001-9877-137X", "orcid.org/0000-0001-9877-137X"};
    for (String falseOrcid : falseOrcids) {
        Matcher orcidMatcher = TextUtilities.ORCIDPattern.matcher(falseOrcid);
        assertFalse(falseOrcid, orcidMatcher.find());
    }
    for (String trueOrcid : trueOrcids) {
        Matcher orcidMatcher = TextUtilities.ORCIDPattern.matcher(trueOrcid);
        // The match itself is asserted: guarding with an if would let a valid ORCID
        // that is not recognised pass the test silently.
        assertTrue(trueOrcid, orcidMatcher.find());
        assertThat(orcidMatcher.group(1) + "-"
            + orcidMatcher.group(2) + "-" + orcidMatcher.group(3) + "-" + orcidMatcher.group(4), is("0000-0001-9877-137X"));
    }
}
}

0 comments on commit 678c816

Please sign in to comment.