Skip to content

Commit

Permalink
#3918 - PubMed Central support
Browse files Browse the repository at this point in the history
- DKPro Core 2.3.0 -> 2.3.1
- Sort results by relevance, which matches the ordering used on the PubMed website
- Exclude results from the last 24 hours to reduce the risk that their BioC documents are not yet available
- Better support for annotations in BioC documents
- More testing
  • Loading branch information
reckart committed Apr 11, 2023
1 parent 8426fbc commit 8a390b9
Show file tree
Hide file tree
Showing 46 changed files with 2,015 additions and 251 deletions.
11 changes: 0 additions & 11 deletions inception/inception-external-search-pubmed/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -91,16 +91,5 @@
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>

<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>uimafit-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-segmentation-asl</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,16 @@
*/
package de.tudarmstadt.ukp.inception.externalsearch.pubmed;

import static java.time.temporal.ChronoField.DAY_OF_MONTH;
import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
import static java.time.temporal.ChronoField.YEAR;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.List;

Expand All @@ -37,6 +44,7 @@
import de.tudarmstadt.ukp.inception.externalsearch.pubmed.traits.PubMedProviderTraits;
import de.tudarmstadt.ukp.inception.io.bioc.BioCFormatSupport;
import de.tudarmstadt.ukp.inception.io.bioc.model.BioCToCas;
import de.tudarmstadt.ukp.inception.schema.AnnotationSchemaService;

public class PubMedCentralProvider
implements ExternalSearchProvider<PubMedProviderTraits>
Expand All @@ -49,18 +57,25 @@ public class PubMedCentralProvider

private final PmcOaClient pmcoaClient;
private final EntrezClient entrezClient;
private final AnnotationSchemaService schemaService;

public PubMedCentralProvider(EntrezClient aEntrezClient, PmcOaClient aPmcoaClient)
public PubMedCentralProvider(EntrezClient aEntrezClient, PmcOaClient aPmcoaClient,
AnnotationSchemaService aSchemaService)
{
pmcoaClient = aPmcoaClient;
entrezClient = aEntrezClient;
schemaService = aSchemaService;
}

@Override
public List<ExternalSearchResult> executeQuery(DocumentRepository aDocumentRepository,
PubMedProviderTraits aTraits, String aQuery)
{
var query = aQuery + " AND \"free full text\"[filter]";
var date = Instant.now().atZone(ZoneOffset.UTC).minus(Duration.ofHours(24));
var query = aQuery + " AND \"free full text\"[filter] AND (\"0001/01/01\"[PubDate] : \""
+ date.get(YEAR) + "/" + date.get(MONTH_OF_YEAR) + "/" + date.get(DAY_OF_MONTH)
+ "\"[PubDate])";

var searchResponse = entrezClient.esearch(DB_PUB_MED_CENTRAL, query, 0, 100);
var summaryResponse = entrezClient.esummary(DB_PUB_MED_CENTRAL,
searchResponse.getIdList().stream().mapToInt(i -> i).toArray());
Expand Down Expand Up @@ -107,7 +122,8 @@ public String getDocumentText(DocumentRepository aDocumentRepository,
var biocXml = pmcoaClient.bioc(aTraits, PMCID_PREFIX + stripExtension(aDocumentId));

try {
var cas = WebAnnoCasUtil.createCas();
var cas = WebAnnoCasUtil.createCas(
schemaService.getFullProjectTypeSystem(aDocumentRepository.getProject()));
new BioCToCas().parseXml(new ByteArrayInputStream(biocXml), cas.getJCas());
return cas.getDocumentText();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import de.tudarmstadt.ukp.inception.externalsearch.pubmed.pmcoa.PmcOaClient;
import de.tudarmstadt.ukp.inception.externalsearch.pubmed.traits.PubMedProviderTraits;
import de.tudarmstadt.ukp.inception.externalsearch.pubmed.traits.PubMedProviderTraitsEditor;
import de.tudarmstadt.ukp.inception.schema.AnnotationSchemaService;

/**
* Support for PubMed Central.
Expand All @@ -53,13 +54,16 @@ public class PubMedCentralProviderFactory

private final EntrezClient entrezClient;
private final PmcOaClient pmcOaClient;
private final AnnotationSchemaService schemaService;

private String beanName;

public PubMedCentralProviderFactory(EntrezClient aEntrezClient, PmcOaClient aPmcOaClient)
public PubMedCentralProviderFactory(EntrezClient aEntrezClient, PmcOaClient aPmcOaClient,
AnnotationSchemaService aSchemaService)
{
entrezClient = aEntrezClient;
pmcOaClient = aPmcOaClient;
schemaService = aSchemaService;
}

@Override
Expand All @@ -83,7 +87,7 @@ public String getDisplayName()
@Override
public ExternalSearchProvider<PubMedProviderTraits> getNewExternalSearchProvider()
{
return new PubMedCentralProvider(entrezClient, pmcOaClient);
return new PubMedCentralProvider(entrezClient, pmcOaClient, schemaService);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import de.tudarmstadt.ukp.inception.externalsearch.pubmed.pmcoa.PmcOaClient;
import de.tudarmstadt.ukp.inception.io.bioc.BioCFormatSupport;
import de.tudarmstadt.ukp.inception.io.bioc.config.BioCAutoConfiguration;
import de.tudarmstadt.ukp.inception.schema.AnnotationSchemaService;

@Configuration
@AutoConfigureAfter({ ExternalSearchAutoConfiguration.class, BioCAutoConfiguration.class,
Expand All @@ -41,8 +42,8 @@ public class PubMedDocumentRepositoryAutoConfiguration
{
@Bean
public PubMedCentralProviderFactory pubMedCentralProviderFactory(EntrezClient aEntrezClient,
PmcOaClient aPmcOaClient)
PmcOaClient aPmcOaClient, AnnotationSchemaService aSchemaService)
{
return new PubMedCentralProviderFactory(aEntrezClient, aPmcOaClient);
return new PubMedCentralProviderFactory(aEntrezClient, aPmcOaClient, aSchemaService);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@ public class EntrezClient
private static final String PARAM_RETSTART = "retstart";
private static final String PARAM_DB = "db";
private static final String PARAM_RETMAX = "retmax";
private static final String PARAM_SORT = "sort";

private static final String EUTILS_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
private static final String ESEARCH_URL = EUTILS_BASE_URL
+ "/esearch.fcgi?db={db}&retstart={retstart}&retmax={retmax}&term={term}";
+ "/esearch.fcgi?db={db}&retstart={retstart}&retmax={retmax}&term={term}&sort={sort}";
private static final String ESUMMARY_URL = EUTILS_BASE_URL + "/esummary.fcgi?db={db}&id={id}";
private static final String EFETCH_URL = EUTILS_BASE_URL + "/efetch.fcgi?db={db}&id={id}";

Expand All @@ -58,7 +59,8 @@ public ESearchResult esearch(String aDb, String aQuery, int aOffset, int aPageSi
PARAM_DB, aDb, //
PARAM_RETSTART, Integer.toString(aOffset), //
PARAM_RETMAX, Integer.toString(aPageSize), //
PARAM_TERM, aQuery);
PARAM_TERM, aQuery, //
PARAM_SORT, "relevance");

var response = restTemplate.exchange(ESEARCH_URL, HttpMethod.GET, null, ESearchResult.class,
variables);
Expand Down
31 changes: 31 additions & 0 deletions inception/inception-io-bioc/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,23 @@
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-support</artifactId>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-project-initializers-basic</artifactId>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-io-webanno-tsv</artifactId>
</dependency>

<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-resources-asl</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-parameter-asl</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-io-asl</artifactId>
Expand All @@ -59,6 +71,11 @@
<artifactId>dkpro-core-api-segmentation-asl</artifactId>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>

<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
Expand All @@ -77,6 +94,10 @@
<artifactId>uimafit-core</artifactId>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
Expand All @@ -87,5 +108,15 @@
<artifactId>dkpro-core-testing-asl</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-xmi-asl</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>xmlunit</groupId>
<artifactId>xmlunit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>

0 comments on commit 8a390b9

Please sign in to comment.