diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index c6abc5a11b..5734e5aa14 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -4243,22 +4243,30 @@ public static void correct(BiblioItem bib, BiblioItem bibo) { // authors present in fullAuthors list should be in the existing resources // at least the corresponding author - if (bibo.getFullAuthors() != null) { - if ( (bib.getFullAuthors() == null) || (bib.getFullAuthors().size() == 0) ) + if (!CollectionUtils.isEmpty(bibo.getFullAuthors())) { + if (CollectionUtils.isEmpty(bib.getFullAuthors())) bib.setFullAuthors(bibo.getFullAuthors()); else if (bibo.getFullAuthors().size() == 1) { - // we have the corresponding author + // we have the corresponding author // check if the author exists in the obtained list Person auto = (Person) bibo.getFullAuthors().get(0); List auts = bib.getFullAuthors(); if (auts != null) { for (Person aut : auts) { - if (aut.getLastName() != null) { - if (aut.getLastName().equals(auto.getLastName())) { - aut.setCorresp(true); - if (StringUtils.isNotBlank(auto.getEmail())) - aut.setEmail(auto.getEmail()); - // should we also check the country ? affiliation? + if (StringUtils.isNotBlank(aut.getLastName()) && StringUtils.isNotBlank(auto.getLastName())) { + if (aut.getLastName().toLowerCase().equals(auto.getLastName().toLowerCase())) { + if (StringUtils.isBlank(aut.getFirstName()) || + (auto.getFirstName() != null && + aut.getFirstName().length() <= auto.getFirstName().length() && + auto.getFirstName().toLowerCase().startsWith(aut.getFirstName().toLowerCase()))) { + aut.setFirstName(auto.getFirstName()); + aut.setCorresp(true); + if (StringUtils.isNotBlank(auto.getEmail())) + aut.setEmail(auto.getEmail()); + // should we also check the country ? affiliation? + if (StringUtils.isNotBlank(auto.getMiddleName()) && (StringUtils.isBlank(aut.getMiddleName()))) + aut.setMiddleName(auto.getMiddleName()); + } } } } @@ -4270,31 +4278,54 @@ else if (bibo.getFullAuthors().size() == 1) { for (Person aut : bibo.getFullAuthors()) { // try to find the author in the first item (we know it's not empty) for (Person aut2 : bib.getFullAuthors()) { + + if (StringUtils.isNotBlank(aut2.getLastName())) { - if (StringUtils.isNotBlank(aut.getLastName()) && aut.getLastName().equals(aut2.getLastName())) { - // check also first name if present - at least for the initial - if ( StringUtils.isNotBlank(aut2.getFirstName()) && StringUtils.isNotBlank(aut.getFirstName()) ) { - // we have a match (full first name) - if (StringUtils.isBlank(aut.getMiddleName())) - aut.setMiddleName(aut2.getMiddleName()); - if (StringUtils.isBlank(aut.getTitle())) - aut.setTitle(aut2.getTitle()); - if (StringUtils.isBlank(aut.getSuffix())) - aut.setSuffix(aut2.getSuffix()); - break; - } else if ( StringUtils.isNotBlank(aut.getFirstName()) && - StringUtils.isNotBlank(aut2.getFirstName()) && - (aut.getFirstName().length() == 1) && - (aut.getFirstName().equals(aut2.getFirstName().substring(0,1))) ) { - // we have a match (initial) - aut.setFirstName(aut2.getFirstName()); - if (StringUtils.isBlank(aut.getMiddleName())) - aut.setMiddleName(aut2.getMiddleName()); - if (StringUtils.isBlank(aut.getTitle())) - aut.setTitle(aut2.getTitle()); - if (StringUtils.isBlank(aut.getSuffix())) - aut.setSuffix(aut2.getSuffix()); - break; + String aut2_lastname = aut2.getLastName().toLowerCase(); + + if (StringUtils.isNotBlank(aut.getLastName())) { + String aut_lastname = aut.getLastName().toLowerCase(); + + if (aut_lastname.equals(aut2_lastname)) { + // check also first name if present - at least for the initial + if ( StringUtils.isBlank(aut2.getFirstName()) || + (StringUtils.isNotBlank(aut2.getFirstName()) && StringUtils.isNotBlank(aut.getFirstName())) ) { + // we have no first name or a match (full first name) + + if ( StringUtils.isBlank(aut2.getFirstName()) + || + aut.getFirstName().equals(aut2.getFirstName()) + || + ( aut.getFirstName().length() == 1 && + aut.getFirstName().equals(aut2.getFirstName().substring(0,1)) ) + ) { + // we have a match (full or initial) + if (StringUtils.isNotBlank(aut2.getFirstName()) && + aut2.getFirstName().length() > aut.getFirstName().length()) + aut.setFirstName(aut2.getFirstName()); + if (StringUtils.isBlank(aut.getMiddleName())) + aut.setMiddleName(aut2.getMiddleName()); + if (StringUtils.isBlank(aut.getTitle())) + aut.setTitle(aut2.getTitle()); + if (StringUtils.isBlank(aut.getSuffix())) + aut.setSuffix(aut2.getSuffix()); + if (StringUtils.isBlank(aut.getORCID())) + aut.setORCID(aut2.getORCID()); + if (StringUtils.isBlank(aut.getEmail())) + aut.setEmail(aut2.getEmail()); + if(!CollectionUtils.isEmpty(aut2.getAffiliations())) + aut.setAffiliations(aut2.getAffiliations()); + if (!CollectionUtils.isEmpty(aut2.getAffiliationBlocks())) + aut.setAffiliationBlocks(aut2.getAffiliationBlocks()); + if (!CollectionUtils.isEmpty(aut2.getAffiliationMarkers())) + aut.setAffiliationMarkers(aut2.getAffiliationMarkers()); + if (!CollectionUtils.isEmpty(aut2.getMarkers())) + aut.setMarkers(aut2.getMarkers()); + if (!CollectionUtils.isEmpty(aut2.getLayoutTokens())) + aut.setLayoutTokens(aut2.getLayoutTokens()); + break; + } + } } } } @@ -4303,7 +4334,6 @@ else if (bibo.getFullAuthors().size() == 1) { bib.setFullAuthors(bibo.getFullAuthors()); } } - //System.out.println("result: \n" + bib.toTEI(0)); } /** diff --git a/grobid-core/src/main/java/org/grobid/core/data/Person.java b/grobid-core/src/main/java/org/grobid/core/data/Person.java index 5dfef62689..6733ee6be6 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/Person.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Person.java @@ -29,8 +29,8 @@ public class Person { private String rawName = null; // raw full name if relevant/available, e.g. name exactly as displayed private String orcid = null; private boolean corresp = false; - private List layoutTokens = new ArrayList<>(); + private List layoutTokens = new ArrayList<>(); private List affiliationBlocks = null; private List affiliations = null; private List affiliationMarkers = null; @@ -116,6 +116,10 @@ public List getAffiliationBlocks() { return affiliationBlocks; } + public void setAffiliationBlocks(List blocks) { + this.affiliationBlocks = blocks; + } + public void addAffiliationBlocks(String f) { if (affiliationBlocks == null) affiliationBlocks = new ArrayList(); @@ -128,7 +132,7 @@ public List getAffiliations() { public void addAffiliation(org.grobid.core.data.Affiliation f) { if (affiliations == null) - affiliations = new ArrayList(); + affiliations = new ArrayList<>(); affiliations.add(f); } @@ -136,6 +140,10 @@ public List getAffiliationMarkers() { return affiliationMarkers; } + public void setAffiliationMarkers(List affiliationMarkers) { + this.affiliationMarkers = affiliationMarkers; + } + public void addAffiliationMarker(String s) { if (affiliationMarkers == null) affiliationMarkers = new ArrayList(); @@ -150,6 +158,10 @@ public List getMarkers() { return markers; } + public void setMarkers(List markers) { + this.markers = markers; + } + public void addMarker(String f) { if (markers == null) markers = new ArrayList(); @@ -176,6 +188,35 @@ public boolean notNull() { return true; } + /** + * Create a new instance of Person object from current instance (shallow copy) + */ + public Person clonePerson() { + Person person = new Person(); + person.firstName = this.firstName ; + person.middleName = this.middleName; + person.lastName = this.lastName; + person.title = this.title; + person.suffix = this.suffix; + person.rawName = this.rawName; + person.orcid = this.orcid; + person.corresp = this.corresp; + person.email = this.email; + + if (this.layoutTokens != null) + person.layoutTokens = new ArrayList<>(this.layoutTokens); + if (this.affiliationBlocks != null) + person.affiliationBlocks = new ArrayList<>(this.affiliationBlocks); + if (this.affiliations != null) + person.affiliations = new ArrayList<>(this.affiliations); + if (this.affiliationMarkers != null) + person.affiliationMarkers = new ArrayList<>(this.affiliationMarkers); + if (this.markers != null) + person.markers = new ArrayList<>(this.markers); + + return person; + } + public String toString() { String res = ""; if (title != null) @@ -206,6 +247,10 @@ public List getLayoutTokens() { return layoutTokens; } + public void setLayoutTokens(List tokens) { + this.layoutTokens = tokens; + } + /** * TEI serialization via xom. */ @@ -228,23 +273,23 @@ public String toTEI(boolean withCoordinates) { XmlBuilderUtils.addCoords(persElement, LayoutTokensUtil.getCoordsString(getLayoutTokens())); } if (title != null) { - persElement.appendChild(XmlBuilderUtils.teiElement("roleName", TextUtilities.HTMLEncode(title))); + persElement.appendChild(XmlBuilderUtils.teiElement("roleName", title)); } if (firstName != null) { - Element forename = XmlBuilderUtils.teiElement("forename", TextUtilities.HTMLEncode(firstName)); + Element forename = XmlBuilderUtils.teiElement("forename", firstName); forename.addAttribute(new Attribute("type", "first")); persElement.appendChild(forename); } if (middleName != null) { - Element mn = XmlBuilderUtils.teiElement("forename", TextUtilities.HTMLEncode(middleName)); + Element mn = XmlBuilderUtils.teiElement("forename", middleName); mn.addAttribute(new Attribute("type", "middle")); persElement.appendChild(mn); } if (lastName != null) { - persElement.appendChild(XmlBuilderUtils.teiElement("surname", TextUtilities.HTMLEncode(lastName))); + persElement.appendChild(XmlBuilderUtils.teiElement("surname", lastName)); } if (suffix != null) { - persElement.appendChild(XmlBuilderUtils.teiElement("genName", TextUtilities.HTMLEncode(suffix))); + persElement.appendChild(XmlBuilderUtils.teiElement("genName", suffix)); } return XmlBuilderUtils.toXml(persElement); @@ -531,11 +576,15 @@ public static List deduplicate(List persons) { for(int j=0; j < localPersons.size(); j++) { Person localPerson = localPersons.get(j); String localFirstName = localPerson.getFirstName(); - if (localFirstName != null) + if (localFirstName != null) { localFirstName = localFirstName.toLowerCase(); + localFirstName = localFirstName.replaceAll("[\\-\\.]", ""); + } String localMiddleName = localPerson.getMiddleName(); - if (localMiddleName != null) + if (localMiddleName != null) { localMiddleName = localMiddleName.toLowerCase(); + localMiddleName = localMiddleName.replaceAll("[\\-\\.]", ""); + } int nbClash = 0; for(int k=0; k < localPersons.size(); k++) { boolean clash = false; @@ -543,11 +592,15 @@ public static List deduplicate(List persons) { continue; Person otherPerson = localPersons.get(k); String otherFirstName = otherPerson.getFirstName(); - if (otherFirstName != null) + if (otherFirstName != null) { otherFirstName = otherFirstName.toLowerCase(); + otherFirstName = otherFirstName.replaceAll("[\\-\\.]", ""); + } String otherMiddleName = otherPerson.getMiddleName(); - if (otherMiddleName != null) + if (otherMiddleName != null) { otherMiddleName = otherMiddleName.toLowerCase(); + otherMiddleName = otherMiddleName.replaceAll("[\\-\\.]", ""); + } // test first name clash if (localFirstName != null && otherFirstName != null) { @@ -657,6 +710,10 @@ public static List deduplicate(List persons) { localSuffix = localPerson.getSuffix().toLowerCase(); } + String otherOrcid = otherPerson.getORCID(); + if (otherOrcid != null) + localPerson.setORCID(otherOrcid); + if (otherPerson.getAffiliations() != null) { for(Affiliation affOther : otherPerson.getAffiliations()) { localPerson.addAffiliation(affOther); diff --git a/grobid-core/src/test/java/org/grobid/core/data/BiblioItemTest.java b/grobid-core/src/test/java/org/grobid/core/data/BiblioItemTest.java index 7231255b18..8751885f6d 100644 --- a/grobid-core/src/test/java/org/grobid/core/data/BiblioItemTest.java +++ b/grobid-core/src/test/java/org/grobid/core/data/BiblioItemTest.java @@ -1,5 +1,16 @@ package org.grobid.core.data; +import org.grobid.core.main.LibraryLoader; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.Arrays; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.Matchers.hasSize; +import static org.junit.Assert.*; + import static org.junit.Assert.assertThat; import java.io.IOException; @@ -36,6 +47,12 @@ public class BiblioItemTest { public static final Logger LOGGER = LoggerFactory.getLogger(BiblioItemTest.class); + + @Before + public void setUp() throws Exception { + LibraryLoader.load(); + } + private GrobidAnalysisConfig.GrobidAnalysisConfigBuilder configBuilder = ( new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() ); @@ -153,4 +170,150 @@ public void shouldNotGnerateRawAffiliationTextIfNotEnabled() throws Exception { is(empty()) ); } + + @Test + public void injectDOI() { + } + + @Test + public void correct_empty_shouldNotFail() { + BiblioItem.correct(new BiblioItem(), new BiblioItem()); + } + + + @Test + public void correct_1author_shouldWork() { + BiblioItem biblio1 = new BiblioItem(); + List authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe")); + biblio1.setFullAuthors(authors); + + BiblioItem biblio2 = new BiblioItem(); + authors = new ArrayList<>(); + authors.add(createPerson("John1", "Doe")); + biblio2.setFullAuthors(authors); + + BiblioItem.correct(biblio1, biblio2); + + assertThat(biblio1.getFirstAuthorSurname(), is(biblio2.getFirstAuthorSurname())); + assertThat(biblio1.getFullAuthors().get(0).getFirstName(), is(biblio2.getFullAuthors().get(0).getFirstName())); + } + + @Test + public void correct_2authors_shouldMatchFullName_shouldUpdateAffiliation() { + BiblioItem biblio1 = new BiblioItem(); + List authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe")); + authors.add(createPerson("Jane", "Will")); + biblio1.setFullAuthors(authors); + + BiblioItem biblio2 = new BiblioItem(); + authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe", "UCLA")); + authors.add(createPerson("Jane", "Will","Harward")); + biblio2.setFullAuthors(authors); + + BiblioItem.correct(biblio1, biblio2); + + assertThat(biblio1.getFirstAuthorSurname(), is(biblio2.getFirstAuthorSurname())); + assertThat(biblio1.getFullAuthors(), hasSize(2)); + assertThat(biblio1.getFullAuthors().get(0).getFirstName(), is(biblio2.getFullAuthors().get(0).getFirstName())); + // biblio1 affiliations empty we update them with ones from biblio2 + assertThat(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString(), is(biblio2.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString())); + assertThat(biblio1.getFullAuthors().get(1).getFirstName(), is(biblio2.getFullAuthors().get(1).getFirstName())); + assertThat(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString(), is(biblio2.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString())); + } + + @Test + public void correct_2authors_shouldMatchFullName_shouldKeepAffiliation() { + BiblioItem biblio1 = new BiblioItem(); + List authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe", "Stanford")); + authors.add(createPerson("Jane", "Will", "Cambridge")); + biblio1.setFullAuthors(authors); + + BiblioItem biblio2 = new BiblioItem(); + authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe" )); + authors.add(createPerson("Jane", "Will", "UCLA")); + biblio2.setFullAuthors(authors); + + BiblioItem.correct(biblio1, biblio2); + + assertThat(biblio1.getFirstAuthorSurname(), is(biblio2.getFirstAuthorSurname())); + assertThat(biblio1.getFullAuthors(), hasSize(2)); + assertThat(biblio1.getFullAuthors().get(0).getFirstName(), is(biblio2.getFullAuthors().get(0).getFirstName())); + // biblio1 affiliations not empty, we keep biblio1 as is + assertThat(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString(), is(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString())); + assertThat(biblio1.getFullAuthors().get(1).getFirstName(), is(biblio2.getFullAuthors().get(1).getFirstName())); + assertThat(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString(), is(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString())); + assertThat(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString(), is(biblio2.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString())); + } + + @Test + public void correct_2authors_initial_2_shouldUpdateAuthor() { + BiblioItem biblio1 = new BiblioItem(); + List authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe", "ULCA")); + authors.add(createPerson("J", "Will", "Harward")); + biblio1.setFullAuthors(authors); + + BiblioItem biblio2 = new BiblioItem(); + authors = new ArrayList<>(); + authors.add(createPerson("John1", "Doe", "Stanford")); + authors.add(createPerson("Jane", "Will", "Berkeley")); + biblio2.setFullAuthors(authors); + + BiblioItem.correct(biblio1, biblio2); + + assertThat(biblio1.getFirstAuthorSurname(), is(biblio2.getFirstAuthorSurname())); + assertThat(biblio1.getFullAuthors(), hasSize(2)); + assertThat(biblio1.getFullAuthors().get(0).getFirstName(), is(biblio2.getFullAuthors().get(0).getFirstName())); + // affiliation should be kept though since not empty + assertThat(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString(), is(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString())); + assertThat(biblio1.getFullAuthors().get(1).getFirstName(), is(biblio2.getFullAuthors().get(1).getFirstName())); + assertThat(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString(), is(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString())); + } + + @Test + public void correct_2authors_initial_shouldUpdateAuthor() { + BiblioItem biblio1 = new BiblioItem(); + List authors = new ArrayList<>(); + authors.add(createPerson("John", "Doe", "ULCA")); + authors.add(createPerson("Jane", "Will", "Harward")); + biblio1.setFullAuthors(authors); + + BiblioItem biblio2 = new BiblioItem(); + authors = new ArrayList<>(); + authors.add(createPerson("John1", "Doe", "Stanford")); + authors.add(createPerson("J", "Will", "Berkeley")); + biblio2.setFullAuthors(authors); + + BiblioItem.correct(biblio1, biblio2); + + assertThat(biblio1.getFirstAuthorSurname(), is(biblio2.getFirstAuthorSurname())); + assertThat(biblio1.getFullAuthors(), hasSize(2)); + assertThat(biblio1.getFullAuthors().get(0).getFirstName(), is(biblio2.getFullAuthors().get(0).getFirstName())); + // affiliation should be kept though + assertThat(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString(), is(biblio1.getFullAuthors().get(0).getAffiliations().get(0).getAffiliationString())); + //assertThat(biblio1.getFullAuthors().get(1).getFirstName(), is(biblio2.getFullAuthors().get(0).getFirstName())); + assertThat(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString(), is(biblio1.getFullAuthors().get(1).getAffiliations().get(0).getAffiliationString())); + } + + private Person createPerson(String firstName, String secondName) { + final Person person = new Person(); + person.setFirstName(firstName); + person.setLastName(secondName); + return person; + } + + private Person createPerson(String firstName, String secondName, String affiliation) { + final Person person = createPerson(firstName, secondName); + final Affiliation affiliation1 = new Affiliation(); + affiliation1.setAffiliationString(affiliation); + List affiliations = new ArrayList<>(); + affiliations.add(affiliation1); + person.setAffiliations(affiliations); + return person; + } }