Adding grobid implementation #12
lfoppiano committed Feb 2, 2019
1 parent aaea913 commit 235e07d
Showing 13 changed files with 609 additions and 19 deletions.
9 changes: 7 additions & 2 deletions lookup/build.gradle
@@ -1,5 +1,3 @@
import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar

buildscript {
repositories {
mavenLocal()
@@ -51,6 +49,7 @@ dependencies {
testCompile group: 'junit', name: 'junit', version: '4.12'
testCompile "org.hamcrest:hamcrest-all:1.3"
testCompile "org.easymock:easymock:3.5"

compile "com.google.code.gson:gson:2.8.1"

compile "io.dropwizard:dropwizard-core:1.3.8"
@@ -65,13 +64,19 @@ dependencies {

compile "com.rockymadden.stringmetric:stringmetric-core_2.10:0.27.3"

//Parsing XML/Json
compile group: 'org.codehaus.woodstox', name: 'stax2-api', version: '4.0.0'
compile group: 'org.codehaus.woodstox', name: 'woodstox-core-asl', version: '4.4.1'

compile 'org.lmdbjava:lmdbjava:0.6.1'
compile 'de.ruedigermoeller:fst:2.56'
compile 'org.xerial.snappy:snappy-java:1.1.7.2'

compile 'org.elasticsearch.client:elasticsearch-rest-high-level-client:6.5.1'

// compile group: 'org.apache.httpcomponents', name: 'httpmime', version: '4.5.3'
compile 'org.apache.commons:commons-collections4:4.1'
compile 'commons-io:commons-io:2.6'

compile group: 'org.tukaani', name: 'xz', version: '1.8'
}
15 changes: 14 additions & 1 deletion lookup/data/config/config.yml
@@ -1,5 +1,5 @@
storage: data/db
#storage: /Volumes/SEAGATE1TB/scienceminer/glutter/lmdb
#storage: /Volumes/SEAGATE1TB/scienceminer/crossref/lmdb
#storage: /Volumes/Lacie/workspace/crossref/lmdb
#storage: /media/lopez/T5/data/db
version: version_placeholder
@@ -11,6 +11,9 @@ ignoreCrossRefFields:
# Loading batch size
batchSize: 10000

# Grobid URL
grobidPath: http://localhost:8070/api

# when set to -1 it's using the number of processors available to the machine
maxAcceptedRequests: 2048

@@ -19,6 +22,16 @@ elastic:
index: crossref
type: work

#httpClient:
# timeout: 500ms
# connectionTimeout: 500ms
# timeToLive: 1h
# cookiesEnabled: false
# maxConnections: 1024
# maxConnectionsPerRoute: 1024
# keepAlive: 0ms
# retries: 0

server:
type: custom
applicationConnectors:
lookup/src/main/java/com/scienceminer/lookup/configuration/LookupConfiguration.java
@@ -1,7 +1,11 @@
package com.scienceminer.lookup.configuration;

import com.fasterxml.jackson.annotation.JsonProperty;
import io.dropwizard.Configuration;
import io.dropwizard.client.HttpClientConfiguration;

import javax.validation.Valid;
import javax.validation.constraints.NotNull;
import java.util.List;

public class LookupConfiguration extends Configuration {
@@ -16,7 +20,24 @@ public class LookupConfiguration extends Configuration {

private Elastic elastic;

private String grobidPath;

@Valid
@NotNull
private HttpClientConfiguration httpClient = new HttpClientConfiguration();

@JsonProperty("httpClient")
public HttpClientConfiguration getHttpClientConfiguration() {
return httpClient;
}

@JsonProperty("httpClient")
public void setHttpClientConfiguration(HttpClientConfiguration httpClient) {
this.httpClient = httpClient;
}

private List<String> ignoreCrossRefFields;

private int maxAcceptedRequests;

public String getStorage() {
@@ -68,7 +89,16 @@ public void setMaxAcceptedRequests(int maxAcceptedRequests) {
this.maxAcceptedRequests = maxAcceptedRequests;
}

public String getGrobidPath() {
return grobidPath;
}

public void setGrobidPath(String grobidPath) {
this.grobidPath = grobidPath;
}

public class Source {

private String unpaywall;
private String istex;

@@ -88,12 +118,13 @@ public void setIstex(String istex) {
this.istex = istex;
}
}

public class Elastic {

private String host;
private String index;
private String type;

private String type;

public String getHost() {
return host;
@@ -118,5 +149,6 @@ public String getType() {
public void setType(String type) {
this.type = type;
}

}
}
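
For orientation, a hedged sketch (not part of this commit) of how the new grobidPath setting might be consumed at startup; the helper class and method names below are hypothetical, while getGrobidPath(), the GrobidClient constructor and ping() are taken from this diff:

import com.scienceminer.lookup.configuration.LookupConfiguration;
import com.scienceminer.lookup.exception.ServiceException;
import com.scienceminer.lookup.utils.grobid.GrobidClient;

public class GrobidWiringSketch {

    // Hypothetical helper: builds the GROBID client from the new grobidPath setting
    // and fails fast when the GROBID service is not reachable.
    static GrobidClient buildGrobidClient(LookupConfiguration configuration) throws ServiceException {
        GrobidClient client = new GrobidClient(configuration.getGrobidPath());
        client.ping(); // throws ServiceException (HTTP 502) when GROBID cannot be reached
        return client;
    }
}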
lookup/src/main/java/com/scienceminer/lookup/storage/LookupEngine.java
@@ -10,6 +10,8 @@
import com.scienceminer.lookup.data.PmidData;
import com.scienceminer.lookup.exception.NotFoundException;
import com.scienceminer.lookup.storage.lookup.*;
import com.scienceminer.lookup.utils.grobid.GrobidClient;
import com.scienceminer.lookup.utils.grobid.GrobidResponseStaxHandler;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import scala.Option;
@@ -30,6 +32,7 @@ public class LookupEngine {
private MetadataMatching metadataMatching = null;
private PMIdsLookup pmidLookup = null;
public static Pattern DOIPattern = Pattern.compile("\"DOI\"\\s?:\\s?\"(10\\.\\d{4,5}\\/[^\"\\s]+[^;,.\\s])\"");
private GrobidClient grobidClient = null;

public LookupEngine() {
}
@@ -246,6 +249,37 @@ public String retrieveByBiblio(String biblio) {
return injectIdsByDoi(outputData.getJsonObject(), outputData.getDOI());
}

public void retrieveByBiblioAsync(String biblio, Boolean postValidate, String firstAuthor, String title, Boolean parseReference, Consumer<MatchingDocument> callback) {
metadataMatching.retrieveByBiblioAsync(biblio, matchingDocument -> {
if (!matchingDocument.isException()) {
if (postValidate != null && postValidate) {
// no title and first author provided: extract them with GROBID; if GROBID is unavailable, the post-validation fails
if (isBlank(title) && isBlank(firstAuthor) && parseReference) {
try {
GrobidResponseStaxHandler.GrobidResponse response = grobidClient.processCitation(biblio, "0");
if (!areMetadataMatching(response.getAtitle(), response.getFirstAuthor(), matchingDocument)) {
callback.accept(new MatchingDocument(new NotFoundException("Article found but it didn't pass the post-validation.")));
return;
}
} catch (Exception e) {
callback.accept(new MatchingDocument(new NotFoundException("Article found but it could not be post-validated: no title or first author was provided for validation and " +
"GROBID wasn't available.", e)));
}
} else {
if (!areMetadataMatching(title, firstAuthor, matchingDocument)) {
callback.accept(new MatchingDocument(new NotFoundException("Article found but it didn't pass the post-validation.")));
return;
}
}
}

final String s = injectIdsByDoi(matchingDocument.getJsonObject(), matchingDocument.getDOI());
matchingDocument.setFinalJsonObject(s);
}
callback.accept(matchingDocument);
});
}

public void retrieveByBiblioAsync(String biblio, Consumer<MatchingDocument> callback) {
metadataMatching.retrieveByBiblioAsync(biblio, matchingDocument -> {
if (!matchingDocument.isException()) {
@@ -444,4 +478,7 @@ public void setPmidLookup(PMIdsLookup pmidLookup) {
this.pmidLookup = pmidLookup;
}

public void setGrobidClient(GrobidClient grobidClient) {
this.grobidClient = grobidClient;
}
}
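
For reference, a hedged sketch of how the new post-validating retrieveByBiblioAsync overload might be called. The wrapper class, the import locations of LookupEngine, the sample citation and the getFinalJsonObject() accessor are assumptions; the engine's matching and storage backends are not wired here, so this only illustrates the callback contract:

import com.scienceminer.lookup.storage.LookupEngine; // package assumed
import com.scienceminer.lookup.utils.grobid.GrobidClient;

public class BiblioLookupSketch {

    public static void main(String[] args) {
        LookupEngine engine = new LookupEngine();
        engine.setGrobidClient(new GrobidClient("http://localhost:8070/api")); // default grobidPath from config.yml

        // Sample raw citation, illustrative only.
        String biblio = "J. Smith. An example article title. Some Journal, 2018.";

        // postValidate = true, no title/first author supplied, parseReference = true:
        // the engine asks GROBID to parse the raw citation before validating the match.
        engine.retrieveByBiblioAsync(biblio, true, null, null, true, matchingDocument -> {
            if (matchingDocument.isException()) {
                System.err.println("No validated match for the raw citation.");
            } else {
                System.out.println(matchingDocument.getFinalJsonObject()); // assumed getter for the final JSON with injected IDs
            }
        });
    }
}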
lookup/src/main/java/com/scienceminer/lookup/utils/grobid/GrobidClient.java
@@ -0,0 +1,83 @@
package com.scienceminer.lookup.utils.grobid;

import com.ctc.wstx.stax.WstxInputFactory;
import com.scienceminer.lookup.exception.ServiceException;
import com.scienceminer.lookup.utils.xml.StaxUtils;
import org.apache.http.Consts;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.codehaus.stax2.XMLStreamReader2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.List;

/**
* Created by lfoppiano on 17/08/16.
*/
public class GrobidClient {

private static final Logger LOGGER = LoggerFactory.getLogger(GrobidClient.class);

private HttpClient httpClient;
private String grobidPath;
private WstxInputFactory inputFactory = new WstxInputFactory();
private GrobidResponseStaxHandler grobidResponseStaxHandler = new GrobidResponseStaxHandler();

public GrobidClient(String grobidPath) {
this.grobidPath = grobidPath;
this.httpClient = HttpClientBuilder.create().build();
}

public void ping() throws ServiceException {
try {
final HttpResponse response = httpClient.execute(new HttpGet(grobidPath + "/isalive"));
if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_OK) {
throw new ServiceException(502, "Error while connecting to GROBID service. Error code: " + response.getStatusLine().getStatusCode());
}
} catch (IOException e) {
throw new ServiceException(502, "Error while connecting to GROBID service", e);
}
}

public GrobidResponseStaxHandler.GrobidResponse processCitation(String rawCitation, String consolidation) throws ServiceException {

try {
final HttpPost request = new HttpPost(grobidPath + "/processCitation");

List<NameValuePair> formparams = new ArrayList<>();
formparams.add(new BasicNameValuePair("citations", rawCitation));
formparams.add(new BasicNameValuePair("consolidateCitation", consolidation));
UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, Consts.UTF_8);
request.setEntity(entity);

final HttpResponse response = httpClient.execute(request);
if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_OK) {
throw new ServiceException(502, "Error while connecting to GROBID service. Error code: " + response.getStatusLine().getStatusCode());
} else {
try {
XMLStreamReader2 reader = (XMLStreamReader2) inputFactory.createXMLStreamReader(response.getEntity().getContent());

StaxUtils.traverse(reader, grobidResponseStaxHandler);

return grobidResponseStaxHandler.getResponse();
} catch (XMLStreamException e) {
throw new ServiceException(502, "Cannot parse the respons from GROBID", e);
}
}
} catch (IOException e) {
throw new ServiceException(502, "Error while connecting to GROBID service", e);
}

}
}
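
And a minimal standalone sketch of the new client itself, assuming a GROBID instance at the default grobidPath from config.yml; the wrapper class and the sample citation are illustrative only, while ping(), processCitation() and the response accessors come from this commit:

import com.scienceminer.lookup.exception.ServiceException;
import com.scienceminer.lookup.utils.grobid.GrobidClient;
import com.scienceminer.lookup.utils.grobid.GrobidResponseStaxHandler;

public class GrobidClientSketch {

    public static void main(String[] args) {
        GrobidClient client = new GrobidClient("http://localhost:8070/api"); // assumed local GROBID instance
        try {
            client.ping(); // throws ServiceException if GROBID is unreachable

            // "0" asks GROBID not to consolidate the parsed citation against external services.
            GrobidResponseStaxHandler.GrobidResponse parsed = client.processCitation(
                    "J. Smith. An example article title. Some Journal, 2018.", "0");

            System.out.println("title: " + parsed.getAtitle());
            System.out.println("first author: " + parsed.getFirstAuthor());
        } catch (ServiceException e) {
            e.printStackTrace();
        }
    }
}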
