consolidate cleanDOI method within BiblioItem + some more tests
lfoppiano committed Apr 27, 2021
1 parent 4df289f commit f2d3df2
Showing 8 changed files with 139 additions and 119 deletions.
95 changes: 46 additions & 49 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
@@ -1004,50 +1004,47 @@ public void setInDOI(String id) {
}
}

private static String cleanDOI(String bibl) {
if (bibl != null) {
bibl = StringUtils.normalizeSpace(bibl);
bibl = bibl.replace(" ", "");

if (bibl.startsWith("http://dx.doi.org/") ||
bibl.startsWith("https://dx.doi.org/") ||
bibl.startsWith("http://doi.org/") ||
bibl.startsWith("https://doi.org/")) {
bibl = bibl.replaceAll("http(s)?\\://(dx\\.)?doi\\.org/", "");
}
public static String cleanDOI(String doi) {
if (doi == null) {
return doi;
}

//bibl = bibl.replace("//", "/");
if (bibl.toLowerCase().startsWith("doi:") || bibl.toLowerCase().startsWith("doi/")) {
bibl = bibl.substring(4);
}
if (bibl.toLowerCase().startsWith("doi")) {
bibl = bibl.substring(3);
}
// pretty common wrong extraction pattern:
// 43-61.DOI:10.1093/jpepsy/14.1.436/7
// 367-74.DOI:10.1080/14034940210165064
// (pages concatenated to the DOI) - easy/safe to fix
if ( (bibl.indexOf("DOI:10.") != -1) || (bibl.indexOf("doi:10.") != -1) ) {
int ind = bibl.indexOf("DOI:10.");
if (ind == -1)
ind = bibl.indexOf("doi:10.");
bibl = bibl.substring(ind+4);
}
doi = StringUtils.normalizeSpace(doi);
doi = doi.replace(" ", "");
doi = doi.replaceAll("https?\\://(dx\\.)?doi\\.org/", "");

// for DOI coming from PDF links, we have some prefix cleaning to make
if (bibl.startsWith("file://") || bibl.startsWith("https://") || bibl.startsWith("http://")) {
int ind = bibl.indexOf("/10.");
if (ind != -1)
bibl = bibl.substring(ind+1);
}

bibl = bibl.trim();
int ind = bibl.indexOf("http://");
if (ind != -1 && ind > 10) {
bibl = bibl.substring(0,ind);
}
//bibl = bibl.replace("//", "/");
if (doi.toLowerCase().startsWith("doi:") || doi.toLowerCase().startsWith("doi/")) {
doi = doi.substring(4);
}
if (doi.toLowerCase().startsWith("doi")) {
doi = doi.substring(3);
}
return bibl;
// pretty common wrong extraction pattern:
// 43-61.DOI:10.1093/jpepsy/14.1.436/7
// 367-74.DOI:10.1080/14034940210165064
// (pages concatenated to the DOI) - easy/safe to fix
if (StringUtils.containsIgnoreCase(doi, "doi:10.")) {
doi = doi.substring(StringUtils.indexOfIgnoreCase(doi, "doi:10.")+4);
}

// for DOI coming from PDF links, we have some prefix cleaning to make
if (doi.startsWith("file://") || doi.startsWith("https://") || doi.startsWith("http://")) {
int ind = doi.indexOf("/10.");
if (ind != -1)
doi = doi.substring(ind+1);
}

doi = doi.trim();
int ind = doi.indexOf("http://");
if (ind > 10) {
doi = doi.substring(0, ind);
}

doi = doi.replaceAll("[\\p{M}]", "");
doi = doi.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

return doi;
}
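// Illustrative sketch, not part of this commit: expected normalizations given
// the rules above (example inputs are hypothetical):
//   cleanDOI("https://doi.org/10.1000/xyz123")        -> "10.1000/xyz123"
//   cleanDOI("doi:10.1000/xyz123")                    -> "10.1000/xyz123"
//   cleanDOI("367-74.DOI:10.1080/14034940210165064")  -> "10.1080/14034940210165064"
//   cleanDOI(null)                                    -> null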

public void setArXivId(String id) {
@@ -4313,16 +4310,16 @@ else if (alphaPostfixEnd != null)
}

/**
* Correct/add only the DOI of the first biblio item based on the second one
* Correct/add identifiers of the first biblio item based on the second one
*/
public static void injectDOI(BiblioItem bib, BiblioItem bibo) {
bib.setDOI(bibo.getDOI());
public static void injectIdentifiers(BiblioItem destination, BiblioItem source) {
destination.setDOI(source.getDOI());
// optionally associated strong identifiers are also injected
bib.setPMID(bibo.getPMID());
bib.setPMCID(bibo.getPMCID());
bib.setPII(bibo.getPII());
bib.setIstexId(bibo.getIstexId());
bib.setArk(bibo.getArk());
destination.setPMID(source.getPMID());
destination.setPMCID(source.getPMCID());
destination.setPII(source.getPII());
destination.setIstexId(source.getIstexId());
destination.setArk(source.getArk());
}
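// Illustrative sketch, not part of this commit: with consolidation mode 2
// (identifier injection only), call sites now delegate here instead of the
// former injectDOI, which copied the DOI alone:
//   BiblioItem.injectIdentifiers(resCitation, bibo);
//   // resCitation now carries bibo's DOI, PMID, PMCID, PII, ISTEX id and ark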

/**
@@ -38,10 +38,8 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* @author Patrice Lopez
@@ -376,7 +374,7 @@ public List<BibDataSet> processingReferenceSection(Document doc, ReferenceSegmen
if (consolidate == 1)
BiblioItem.correct(resCitation, bibo);
else if (consolidate == 2)
BiblioItem.injectDOI(resCitation, bibo);
BiblioItem.injectIdentifiers(resCitation, bibo);
}
}
}
@@ -567,7 +565,7 @@ public BiblioItem consolidateCitation(BiblioItem resCitation, String rawCitation
if (consolidate == 1)
BiblioItem.correct(resCitation, bibo);
else if (consolidate == 2)
BiblioItem.injectDOI(resCitation, bibo);
BiblioItem.injectIdentifiers(resCitation, bibo);
}
} catch (Exception e) {
LOGGER.error("An exception occurred while running bibliographical data consolidation.", e);
@@ -15,8 +15,6 @@

package org.grobid.core.engines;

import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;

import org.grobid.core.data.Affiliation;
@@ -29,14 +27,10 @@
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.engines.label.SegmentationLabels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.exceptions.GrobidResourceException;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.factory.GrobidPoolingFactory;
import org.grobid.core.lang.Language;
import org.grobid.core.utilities.Consolidation;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.LanguageUtilities;
import org.grobid.core.utilities.Utilities;
import org.grobid.core.utilities.counters.CntManager;
@@ -220,7 +214,7 @@ public List<BiblioItem> processRawReferences(List<String> references, int consol
if (consolidate == 1)
BiblioItem.correct(resCitation, bibo);
else if (consolidate == 2)
BiblioItem.injectDOI(resCitation, bibo);
BiblioItem.injectIdentifiers(resCitation, bibo);
}
finalResults.add(resCitation);
}
@@ -14,8 +14,6 @@
import org.grobid.core.data.Figure;
import org.grobid.core.data.Table;
import org.grobid.core.data.Equation;
import org.grobid.core.data.Metadata;
import org.grobid.core.data.Person;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentPointer;
@@ -55,12 +53,10 @@
import java.io.OutputStreamWriter;
import java.io.Writer;

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeSet;
@@ -221,7 +217,7 @@ public Document processing(DocumentSource documentSource,
if (config.getConsolidateCitations() == 1)
BiblioItem.correct(resCitation, bibo);
else if (config.getConsolidateCitations() == 2)
BiblioItem.injectDOI(resCitation, bibo);
BiblioItem.injectIdentifiers(resCitation, bibo);
}
}
} catch(Exception e) {
@@ -1394,7 +1394,7 @@ else if (consolidate == 2)
if (consolidate == 1)
BiblioItem.correct(resHeader, bib);
else if (consolidate == 2)
BiblioItem.injectDOI(resHeader, bib);
BiblioItem.injectIdentifiers(resHeader, bib);
}
} catch (Exception e) {
throw new GrobidException("An exception occurred while running bibliographical data consolidation.", e);
@@ -17,6 +17,8 @@

import java.util.*;

import static org.grobid.core.data.BiblioItem.cleanDOI;

/**
* Singleton class for managing the extraction of bibliographical information from pdf documents.
* When consolidation operations are realized, be sure to call the close() method
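// Illustrative sketch, not part of this commit, of the documented contract
// above; getInstance() is an assumption here, while the consolidate(bib,
// rawCitation) signature appears in the hunk below:
//   Consolidation consolidation = Consolidation.getInstance();
//   try {
//       BiblioItem consolidated = consolidation.consolidate(bib, rawCitation);
//   } finally {
//       consolidation.close();
//   }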
@@ -115,7 +117,7 @@ public BiblioItem consolidate(BiblioItem bib, String rawCitation) throws Excepti

String theDOI = bib.getDOI();
if (StringUtils.isNotBlank(theDOI)) {
theDOI = cleanDoi(theDOI);
theDOI = cleanDOI(theDOI);
}
final String doi = theDOI;
String aut = bib.getFirstAuthorSurname();
@@ -327,7 +329,7 @@ public Map<Integer,BiblioItem> consolidate(List<BibDataSet> biblios) {
// first we get the exploitable metadata
String doi = theBiblio.getDOI();
if (StringUtils.isNotBlank(doi)) {
doi = cleanDoi(doi);
doi = BiblioItem.cleanDOI(doi);
}
String aut = theBiblio.getFirstAuthorSurname();
String title = theBiblio.getTitle();
@@ -609,24 +611,6 @@ public void onError(int status, String message, Exception exception) {
return false;
}*/

/**
* This is a DOI cleaning specifically adapted to CrossRef call
*/
protected static String cleanDoi(String doi) {
doi = doi.replace("\"", "");
doi = doi.replace("\n", "");
if (StringUtils.lowerCase(doi).startsWith("doi:") || StringUtils.lowerCase(doi).startsWith("doi/")) {
doi = doi.substring(4);
doi = doi.trim();
}
doi = doi.replaceAll("[\\p{M}]", "");
doi = doi.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

doi = doi.replace(" ", "");
return doi;
}


/**
* The new public CrossRef API is a search API, and thus returns
* many false positives. It is necessary to validate return results
