
Commit

Merge 3c12100 into e0e7844
pazembrz committed Oct 21, 2020
2 parents e0e7844 + 3c12100 commit c016246
Showing 24 changed files with 441 additions and 306 deletions.
37 changes: 36 additions & 1 deletion docker-compose.test.py2.yml
@@ -17,15 +17,23 @@ services:
- APP_CELERY_RESULT_BACKEND=redis://redis:6379/1
- APP_CRAWLER_HOST_URL=http://scrapyd:6800
- APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results
- APP_FILES_STORE=/tmp/file_urls
- APP_LAST_RUNS_PATH=/code/.scrapy/last_runs
- APP_CRAWL_ONCE_PATH=/code/.scrapy
- COVERAGE_PROCESS_START=/code/.coveragerc
- APP_DOWNLOAD_BUCKET=downloaded
tty: true
volumes:
- .:/code/
- ./tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf

functional_aps:
<<: *service_base
command: py.test tests/functional/aps
depends_on:
- scrapyd
- localstack
- aps-http-server.local

functional_wsp:
<<: *service_base
command: py.test -vv tests/functional/wsp
@@ -43,6 +51,8 @@ services:
condition: service_healthy
localstack:
condition: service_healthy
elsevier-http-server.local:
condition: service_healthy

functional_desy:
<<: *service_base
@@ -100,6 +110,8 @@ services:
interval: 5s
retries: 5
test: ['CMD', "curl", "-k", "http://localhost:6800/listprojects.json"]
ports:
- "1234:1234"

scrapyd-deploy:
<<: *service_base
@@ -152,6 +164,20 @@ services:
cds-http-server.local:
condition: service_healthy

elsevier-http-server.local:
image: nginx:stable-alpine
restart: "always"
volumes:
- ${PWD}/tests/functional/elsevier/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ${PWD}/tests/functional/elsevier/fixtures/http_server/data:/etc/nginx/html/
ports:
- 80:80
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test: "curl -k http://localhost:80/arxiv-physics-hep-th.xml"

arxiv-http-server.local:
image: nginx:stable-alpine
restart: "always"
@@ -180,6 +206,15 @@ services:
retries: 5
test: "curl -k http://localhost:80/cds-single.xml"

aps-http-server.local:
image: nginx:stable-alpine
restart: "always"
volumes:
- ./tests/functional/aps/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ./tests/functional/aps/fixtures/http_server/records:/etc/nginx/html/
ports:
- 80:80

rabbitmq:
image: rabbitmq:3-management
restart: "always"
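
The two new nginx containers (aps-http-server.local and elsevier-http-server.local) serve static fixture records for the functional tests, and the compose healthcheck simply curls one of the served files until nginx answers. A minimal sketch of the same check from Python, assuming the service hostname resolves from inside the test container (it is a compose service name, not a real domain):

import time

import requests

# Hypothetical fixture URL; the compose healthcheck does the same thing with
# curl -k against a file served from the mounted fixtures directory.
FIXTURE_URL = "http://elsevier-http-server.local/arxiv-physics-hep-th.xml"


def wait_for_fixture_server(url, retries=5, interval=5):
    """Poll the nginx fixture server until it serves the test record."""
    for _ in range(retries):
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(interval)
    return False
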
1 change: 0 additions & 1 deletion hepcrawl/parsers/jats.py
@@ -12,7 +12,6 @@
from __future__ import absolute_import, division, print_function

import itertools
import re

import six

36 changes: 23 additions & 13 deletions hepcrawl/pipelines.py
@@ -16,14 +16,17 @@
from __future__ import absolute_import, division, print_function

import os
from six.moves.urllib.parse import urlparse

import shutil
import pprint
import logging

import requests

from scrapy import Request

from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.project import get_project_settings

from .api import CrawlResult
from .settings import FILES_STORE
@@ -54,36 +57,43 @@ def __init__(self, store_uri, *args, **kwargs):

def get_media_requests(self, item, info):
if item.get('file_urls'):
logging.info(
'Got the following files to download:\n%s' % pprint.pformat(
LOGGER.info(
'Got the following files to download:\n%s', pprint.pformat(
item['file_urls']
)
)
for document_url in item.file_urls:
yield Request(
url=document_url,
meta=item.ftp_params,
)

def get_absolute_file_path(self, path):
return os.path.abspath(
os.path.join(self.store.basedir, path)
return [Request(x) for x in item.get(self.files_urls_field, [])]
return list()

def generate_presigned_s3_url(self, path, expire=86400):
bucket_location = get_project_settings().get("DOWNLOAD_BUCKET", "documents")
LOGGER.info("Generating presigned url for: %s in %s", path, bucket_location)
return self.store.s3_client.generate_presigned_url(
ClientMethod='get_object',
Params={'Bucket': bucket_location, "Key": path},
ExpiresIn=expire
)

def item_completed(self, results, item, info):
"""Create a map that connects file names with downloaded files."""
LOGGER.info("results: %s, item: %s, info: %s", results, item, info)
record_files = [
RecordFile(
path=self.get_absolute_file_path(result_data['path']),
path=self.generate_presigned_s3_url(result_data['path']),
name=os.path.basename(result_data['url']),
)
for ok, result_data in results
if ok
]
LOGGER.info("Processed files to download: %s", record_files)
item.record_files = record_files

return item

def file_path(self, request, response=None, info=None):
path = super(DocumentsPipeline, self).file_path(request, response, info)
return urlparse(path).path


class InspireAPIPushPipeline(object):
"""Push to INSPIRE API via tasks API."""
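
The new generate_presigned_s3_url method replaces the old absolute-path lookup: instead of handing record_files a path on local disk, item_completed now attaches a time-limited S3 URL produced by the files store's S3 client. A rough standalone equivalent with boto3, assuming the bucket, credentials and key shown here (the pipeline itself reuses self.store.s3_client rather than building its own client):

import boto3

# Hypothetical client configuration; the endpoint and bucket defaults are
# taken from the settings added in this commit, the key is a placeholder.
s3_client = boto3.client(
    "s3",
    endpoint_url="https://s3.cern.ch",
    aws_access_key_id="<APP_AWS_ACCESS_KEY_ID>",
    aws_secret_access_key="<APP_AWS_SECRET_ACCESS_KEY>",
)

# Same call the pipeline makes: a GET URL for a stored document that expires
# after 24 hours (86400 seconds).
presigned_url = s3_client.generate_presigned_url(
    ClientMethod="get_object",
    Params={"Bucket": "documents", "Key": "full/example-document.xml"},
    ExpiresIn=86400,
)
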
11 changes: 7 additions & 4 deletions hepcrawl/settings.py
@@ -121,13 +121,16 @@

# Files Pipeline settings
# =======================
FILES_STORE = os.environ.get(
"APP_FILES_STORE",
'files'
)
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'

# S3 Settings
DOWNLOAD_BUCKET = os.environ.get("APP_DOWNLOAD_BUCKET", "documents")
AWS_ENDPOINT_URL = os.environ.get("APP_AWS_ENDPOINT_URL", "https://s3.cern.ch")
FILES_STORE = "s3://{bucket}/".format(bucket=DOWNLOAD_BUCKET)
AWS_ACCESS_KEY_ID = os.environ.get("APP_AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("APP_AWS_SECRET_ACCESS_KEY")

# INSPIRE Push Pipeline settings
# ==============================
API_PIPELINE_URL = "http://localhost:5555/api/task/async-apply"
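
The settings now derive FILES_STORE from the bucket name instead of taking a filesystem path from APP_FILES_STORE, which is why docker-compose.test.py2.yml swaps that variable for APP_DOWNLOAD_BUCKET=downloaded. A small sketch of how the values combine, assuming a localstack endpoint for the tests (the port is an assumption, not taken from this diff):

import os

# Environment as the test compose file sets it; the endpoint is hypothetical
# and would point at the localstack service during functional tests.
os.environ["APP_DOWNLOAD_BUCKET"] = "downloaded"
os.environ["APP_AWS_ENDPOINT_URL"] = "http://localstack:4566"

DOWNLOAD_BUCKET = os.environ.get("APP_DOWNLOAD_BUCKET", "documents")
AWS_ENDPOINT_URL = os.environ.get("APP_AWS_ENDPOINT_URL", "https://s3.cern.ch")
FILES_STORE = "s3://{bucket}/".format(bucket=DOWNLOAD_BUCKET)

assert FILES_STORE == "s3://downloaded/"
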
16 changes: 8 additions & 8 deletions hepcrawl/spiders/aps_spider.py
@@ -20,7 +20,6 @@

from inspire_utils.record import get_value

from . import StatefulSpider
from .common.lastrunstore_spider import LastRunStoreSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
@@ -51,13 +50,13 @@ class APSSpider(LastRunStoreSpider):
Selecting specific journals is not supported for technical reasons as it's incompatible with the way the last run time is stored.
"""
name = 'APS'
aps_base_url = "http://harvest.aps.org/v2/journals/articles"

@strict_kwargs
def __init__(self, from_date=None, until_date=None,
date="published", sets=None, per_page=100,
date="published", sets=None, per_page=100, aps_url="http://harvest.aps.org/v2/journals/articles",
**kwargs):
"""Construct APS spider."""
self.aps_url = aps_url
super(APSSpider, self).__init__(**kwargs)
self.set = sets
self.from_date = from_date
@@ -80,7 +79,7 @@ def url(self):
params['per_page'] = self.per_page
if self.date:
params['date'] = self.date
return furl(APSSpider.aps_base_url).add(params).url
return furl(self.aps_url).add(params).url


def start_requests(self):
@@ -99,9 +98,8 @@ def parse(self, response):

for article in aps_response['data']:
doi = get_value(article, 'identifiers.doi', default='')

if doi:
request = Request(url='{}/{}'.format(self.aps_base_url, doi),
request = Request(url='{}/{}'.format(self.aps_url, doi),
headers={'Accept': 'text/xml'},
callback=self._parse_jats,
errback=self._parse_json_on_failure)
@@ -124,10 +122,12 @@ def _parse_jats(self, response):

file_name = self._file_name_from_url(response.url)
parser.attach_fulltext_document(file_name, response.url)

record = parser.parse()
file_urls = [document['url'] for document in record.get('documents', [])]
return ParsedItem(
record=parser.parse(),
record=record,
record_format='hep',
file_urls=file_urls
)

def _parse_json_on_failure(self, failure):
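With aps_base_url turned into an aps_url constructor argument, the functional_aps tests can point the spider at the aps-http-server.local fixture instead of the live APS API. A rough local-run sketch, assuming the fixture hostname and date values below (the tests themselves schedule the spider through scrapyd, but the keyword argument is the same):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hepcrawl.spiders.aps_spider import APSSpider

# Hypothetical crawl against the nginx fixture server from docker-compose;
# from_date/until_date are placeholder values.
process = CrawlerProcess(get_project_settings())
process.crawl(
    APSSpider,
    aps_url="http://aps-http-server.local/",
    from_date="2020-01-01",
    until_date="2020-02-01",
)
process.start()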
