Skip to content

Commit

Permalink
support HAL ID from consolidation service
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Feb 25, 2024
1 parent d4822e1 commit 97cd71d
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 10 deletions.
54 changes: 45 additions & 9 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Expand Up @@ -107,6 +107,7 @@ public String toString() {
", PMID='" + PMID + '\'' +
", PMCID='" + PMCID + '\'' +
", PII='" + PII + '\'' +
", HALId='" + halId + '\'' +
", ark='" + ark + '\'' +
", istexId='" + istexId + '\'' +
", inDOI='" + inDOI + '\'' +
Expand Down Expand Up @@ -256,6 +257,7 @@ public String toString() {
private String PMID = null;
private String PMCID = null;
private String PII = null;
private String halId = null;
private String ark = null;
private String istexId = null;
private String abstract_ = null;
Expand Down Expand Up @@ -526,6 +528,10 @@ public String getDOI() {
return doi;
}

/**
 * Returns the HAL identifier held by this bibliographic item.
 *
 * @return the current value of the {@code halId} field, or {@code null}
 *         when no HAL identifier has been set
 */
public String getHalId() {
    return this.halId;
}

/**
 * Returns the ARK identifier held by this bibliographic item.
 *
 * @return the current value of the {@code ark} field, or {@code null}
 *         when no ARK identifier has been set
 */
public String getArk() {
    return this.ark;
}
Expand Down Expand Up @@ -1060,9 +1066,20 @@ public static String cleanDOI(String doi) {
doi = doi.replaceAll("[\\p{M}]", "");
doi = doi.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

// remove possible starting/trailing parenthesis
if (doi.startsWith("(") || doi.startsWith("[") || doi.startsWith("⟨"))
doi = doi.substring(1);

if (doi.endsWith(")") || doi.endsWith("]") || doi.endsWith("⟩"))
doi = doi.substring(0,doi.length()-1);

return doi;
}

/**
 * Sets the HAL identifier of this bibliographic item.
 * The value is stored exactly as given, without trimming or normalization.
 *
 * @param halId the HAL identifier to store; passing {@code null} clears the field
 */
public void setHalId(String halId) {
this.halId = halId;
}

public void setArXivId(String id) {
if (id != null) {
arXivId = StringUtils.normalizeSpace(id);
Expand Down Expand Up @@ -1591,6 +1608,7 @@ public void reset() {
type = null;
book_type = null;
doi = null;
halId = null;
istexId = null;
ark = null;
inDOI = null;
Expand Down Expand Up @@ -2101,6 +2119,9 @@ public String toBibTeX(String id, GrobidAnalysisConfig config) {
* the corresponding field and reset the generic pubnum field.
*/
public void checkIdentifier() {

// Do not trace pubnum on System.out here: this is library code and must stay silent on stdout.

// DOI
if (!StringUtils.isEmpty(pubnum) && StringUtils.isEmpty(doi)) {
Matcher doiMatcher = TextUtilities.DOIPattern.matcher(pubnum);
Expand Down Expand Up @@ -2169,7 +2190,7 @@ else if (pubnum != null && pubnum.length() == 13)
}
}

// TODO: PII
// TODO: PII and HALId

}

Expand Down Expand Up @@ -2345,6 +2366,13 @@ else if (bookTitle == null) {
tei.append("<idno type=\"DOI\">" + TextUtilities.HTMLEncode(doi) + "</idno>\n");
}

if (!StringUtils.isEmpty(halId)) {
for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
}
tei.append("<idno type=\"HALid\">" + TextUtilities.HTMLEncode(halId) + "</idno>\n");
}

if (!StringUtils.isEmpty(arXivId)) {
for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
Expand Down Expand Up @@ -2786,9 +2814,6 @@ else if (this.getYear().length() == 4)
}
}

/*for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
}*/
if ((volumeBlock != null) | (issue != null) || (pageRange != null) || (publication_date != null)
|| (publisher != null)) {
for (int i = 0; i < indent + 2; i++) {
Expand Down Expand Up @@ -2947,7 +2972,12 @@ else if (this.getYear().length() == 4)
for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
}
if ((publication_date != null) || (pageRange != null) || (location != null) || (publisher != null) || (volumeBlock != null)) {
if (normalized_publication_date != null ||
publication_date != null ||
pageRange != null ||
location != null ||
publisher != null ||
volumeBlock != null) {
tei.append("<imprint>\n");
}
else {
Expand Down Expand Up @@ -3177,12 +3207,13 @@ else if (this.getYear().length() == 4)
}

if (uri != null) {
if (uri.startsWith("http://hal.")) {
/*if (uri.startsWith("http://hal.") || ) {
for (int i = 0; i < indent + 1; i++) {
tei.append("\t");
}
tei.append("<idno type=\"HALid\">" + TextUtilities.HTMLEncode(uri) + "</idno>\n");
} else {
} else */
{
for (int i = 0; i < indent + 1; i++) {
tei.append("\t");
}
Expand All @@ -3191,7 +3222,7 @@ else if (this.getYear().length() == 4)
}

if (url != null) {
if (url.startsWith("http://hal.")) {
if (url.startsWith("http://hal.") || url.startsWith("https://hal.")) {
for (int i = 0; i < indent + 1; i++) {
tei.append("\t");
}
Expand Down Expand Up @@ -4117,6 +4148,7 @@ public static void injectIdentifiers(BiblioItem destination, BiblioItem source)
destination.setPII(source.getPII());
destination.setIstexId(source.getIstexId());
destination.setArk(source.getArk());
destination.setHalId(source.getHalId());
}

/**
Expand All @@ -4140,6 +4172,8 @@ public static void correct(BiblioItem bib, BiblioItem bibo) {
bib.setIstexId(bibo.getIstexId());
if (bibo.getArk() != null)
bib.setArk(bibo.getArk());
if (bibo.getHalId() != null)
bib.setHalId(bibo.getHalId());

if (bibo.getOAURL() != null)
bib.setOAURL(bibo.getOAURL());
Expand Down Expand Up @@ -4243,6 +4277,8 @@ public static void correct(BiblioItem bib, BiblioItem bibo) {
bib.setISBN10(bibo.getISBN10());
if (bibo.getISBN13() != null)
bib.setISBN13(bibo.getISBN13());
if (bibo.getHalId() != null)
bib.setHalId(bibo.getHalId());

if (bibo.getItem() != -1) {
bib.setItem(bibo.getItem());
Expand Down Expand Up @@ -4361,7 +4397,7 @@ public boolean rejectAsReference() {
if (fullAuthors == null && collaboration == null)
authorSet = false;
// normally properties authors and authorList are null in the current Grobid version
if (!titleSet && !authorSet && (url == null) && (doi == null))
if (!titleSet && !authorSet && url == null && doi == null && halId ==null)
return true;
else
return false;
Expand Down
Expand Up @@ -747,6 +747,10 @@ else if (biblio.getE_Year().length() == 4)
tei.append("\t\t\t\t\t<idno type=\"DOI\">" + TextUtilities.HTMLEncode(theDOI) + "</idno>\n");
}

// Emit the HAL identifier of the header item, when present, as an <idno> element.
// NOTE(review): BiblioItem writes this same identifier as <idno type="HALid">, whereas the
// type attribute used here is "halId"; confirm which casing downstream consumers expect.
if (!StringUtils.isEmpty(biblio.getHalId())) {
tei.append("\t\t\t\t\t<idno type=\"halId\">" + TextUtilities.HTMLEncode(biblio.getHalId()) + "</idno>\n");
}

if (!StringUtils.isEmpty(biblio.getArXivId())) {
tei.append("\t\t\t\t\t<idno type=\"arXiv\">" + TextUtilities.HTMLEncode(biblio.getArXivId()) + "</idno>\n");
}
Expand Down
Expand Up @@ -26,7 +26,17 @@ protected BiblioItem deserializeOneItem(JsonNode item) {
biblio = new BiblioItem();
//System.out.println(item.toString());

biblio.setDOI(item.get("DOI").asText());
// DOI: ignore absent fields and explicit JSON nulls. asText() on a JSON null
// returns the literal string "null", which must not end up stored as a DOI.
JsonNode doiNode = item.get("DOI");
if (doiNode != null && !doiNode.isMissingNode() && !doiNode.isNull()) {
    biblio.setDOI(doiNode.asText());
}

// HAL identifier: same guard as for the DOI.
JsonNode halNode = item.get("halId");
if (halNode != null && !halNode.isMissingNode() && !halNode.isNull()) {
    biblio.setHalId(halNode.asText());
}

// the following are usually provided by biblio-glutton which index augmented/aggregated
// metadata
Expand Down Expand Up @@ -170,6 +180,9 @@ protected BiblioItem deserializeOneItem(JsonNode item) {
if (publishPrintNode == null || publishPrintNode.isMissingNode()) {
publishPrintNode = item.get("published-print");
}
if (publishPrintNode == null || publishPrintNode.isMissingNode()) {
publishPrintNode = item.get("published");
}
if (publishPrintNode != null && (!publishPrintNode.isMissingNode())) {
JsonNode datePartNode = publishPrintNode.get("date-parts");
if (datePartNode != null && (!datePartNode.isMissingNode()) &&
Expand Down
Expand Up @@ -117,6 +117,12 @@ public void execute() {
doi = params.get("doi");
uriBuilder.setParameter("doi", doi);
}
// HAL identifier: accepted under either the "HALID" or the "halId" parameter key,
// "HALID" taking precedence; it is forwarded to the service as "halId".
if (params.get("HALID") != null || params.get("halId") != null) {
    String halId = (params.get("HALID") != null) ? params.get("HALID") : params.get("halId");
    uriBuilder.setParameter("halId", halId);
}
if (params.get("PMID") != null || params.get("pmid") != null) {
String pmid = params.get("PMID");
if (pmid == null)
Expand Down

0 comments on commit 97cd71d

Please sign in to comment.