
Commit

Merge 3c12100 into e0e7844
pazembrz committed Oct 21, 2020
2 parents e0e7844 + 3c12100 commit c016246
Showing 24 changed files with 441 additions and 306 deletions.
37 changes: 36 additions & 1 deletion docker-compose.test.py2.yml
@@ -17,15 +17,23 @@ services:
- APP_CELERY_RESULT_BACKEND=redis://redis:6379/1
- APP_CRAWLER_HOST_URL=http://scrapyd:6800
- APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results
- APP_FILES_STORE=/tmp/file_urls
- APP_LAST_RUNS_PATH=/code/.scrapy/last_runs
- APP_CRAWL_ONCE_PATH=/code/.scrapy
- COVERAGE_PROCESS_START=/code/.coveragerc
- APP_DOWNLOAD_BUCKET=downloaded
tty: true
volumes:
- .:/code/
- ./tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf

functional_aps:
<<: *service_base
command: py.test tests/functional/aps
depends_on:
- scrapyd
- localstack
- aps-http-server.local

functional_wsp:
<<: *service_base
command: py.test -vv tests/functional/wsp
@@ -43,6 +51,8 @@ services:
condition: service_healthy
localstack:
condition: service_healthy
elsevier-http-server.local:
condition: service_healthy

functional_desy:
<<: *service_base
@@ -100,6 +110,8 @@ services:
interval: 5s
retries: 5
test: ['CMD', "curl", "-k", "http://localhost:6800/listprojects.json"]
ports:
- "1234:1234"

scrapyd-deploy:
<<: *service_base
@@ -152,6 +164,20 @@ services:
cds-http-server.local:
condition: service_healthy

elsevier-http-server.local:
image: nginx:stable-alpine
restart: "always"
volumes:
- ${PWD}/tests/functional/elsevier/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ${PWD}/tests/functional/elsevier/fixtures/http_server/data:/etc/nginx/html/
ports:
- 80:80
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test: "curl -k http://localhost:80/arxiv-physics-hep-th.xml"

arxiv-http-server.local:
image: nginx:stable-alpine
restart: "always"
@@ -180,6 +206,15 @@ services:
retries: 5
test: "curl -k http://localhost:80/cds-single.xml"

aps-http-server.local:
image: nginx:stable-alpine
restart: "always"
volumes:
- ./tests/functional/aps/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ./tests/functional/aps/fixtures/http_server/records:/etc/nginx/html/
ports:
- 80:80

rabbitmq:
image: rabbitmq:3-management
restart: "always"
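
The two new nginx containers (aps-http-server.local and elsevier-http-server.local) serve static fixture records for the functional tests, and the compose healthcheck simply curls one of the served files until nginx answers. A minimal sketch of the same check from Python, assuming the service hostname resolves from inside the test container (it is a compose service name, not a real domain):

import time

import requests

# Hypothetical fixture URL; the compose healthcheck does the same thing with
# curl -k against a file served from the mounted fixtures directory.
FIXTURE_URL = "http://elsevier-http-server.local/arxiv-physics-hep-th.xml"


def wait_for_fixture_server(url, retries=5, interval=5):
    """Poll the nginx fixture server until it serves the test record."""
    for _ in range(retries):
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(interval)
    return False
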
1 change: 0 additions & 1 deletion hepcrawl/parsers/jats.py
@@ -12,7 +12,6 @@
from __future__ import absolute_import, division, print_function

import itertools
import re

import six

36 changes: 23 additions & 13 deletions hepcrawl/pipelines.py
@@ -16,14 +16,17 @@
from __future__ import absolute_import, division, print_function

import os
from six.moves.urllib.parse import urlparse

import shutil
import pprint
import logging

import requests

from scrapy import Request

from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.project import get_project_settings

from .api import CrawlResult
from .settings import FILES_STORE
@@ -54,36 +57,43 @@ def __init__(self, store_uri, *args, **kwargs):

def get_media_requests(self, item, info):
if item.get('file_urls'):
logging.info(
'Got the following files to download:\n%s' % pprint.pformat(
LOGGER.info(
'Got the following files to download:\n%s', pprint.pformat(
item['file_urls']
)
)
for document_url in item.file_urls:
yield Request(
url=document_url,
meta=item.ftp_params,
)

def get_absolute_file_path(self, path):
return os.path.abspath(
os.path.join(self.store.basedir, path)
return [Request(x) for x in item.get(self.files_urls_field, [])]
return list()

def generate_presigned_s3_url(self, path, expire=86400):
bucket_location = get_project_settings().get("DOWNLOAD_BUCKET", "documents")
LOGGER.info("Generating presigned url for: %s in %s", path, bucket_location)
return self.store.s3_client.generate_presigned_url(
ClientMethod='get_object',
Params={'Bucket': bucket_location, "Key": path},
ExpiresIn=expire
)

def item_completed(self, results, item, info):
"""Create a map that connects file names with downloaded files."""
LOGGER.info("results: %s, item: %s, info: %s", results, item, info)
record_files = [
RecordFile(
path=self.get_absolute_file_path(result_data['path']),
path=self.generate_presigned_s3_url(result_data['path']),
name=os.path.basename(result_data['url']),
)
for ok, result_data in results
if ok
]
LOGGER.info("Processed files to download: %s", record_files)
item.record_files = record_files

return item

def file_path(self, request, response=None, info=None):
path = super(DocumentsPipeline, self).file_path(request, response, info)
return urlparse(path).path


class InspireAPIPushPipeline(object):
"""Push to INSPIRE API via tasks API."""
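
The new generate_presigned_s3_url method replaces the old absolute-path lookup: instead of handing record_files a path on local disk, item_completed now attaches a time-limited S3 URL produced by the files store's S3 client. A rough standalone equivalent with boto3, assuming the bucket, credentials and key shown here (the pipeline itself reuses self.store.s3_client rather than building its own client):

import boto3

# Hypothetical client configuration; the endpoint and bucket defaults are
# taken from the settings added in this commit, the key is a placeholder.
s3_client = boto3.client(
    "s3",
    endpoint_url="https://s3.cern.ch",
    aws_access_key_id="<APP_AWS_ACCESS_KEY_ID>",
    aws_secret_access_key="<APP_AWS_SECRET_ACCESS_KEY>",
)

# Same call the pipeline makes: a GET URL for a stored document that expires
# after 24 hours (86400 seconds).
presigned_url = s3_client.generate_presigned_url(
    ClientMethod="get_object",
    Params={"Bucket": "documents", "Key": "full/example-document.xml"},
    ExpiresIn=86400,
)
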
11 changes: 7 additions & 4 deletions hepcrawl/settings.py
@@ -121,13 +121,16 @@

# Files Pipeline settings
# =======================
FILES_STORE = os.environ.get(
"APP_FILES_STORE",
'files'
)
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'

# S3 Settings
DOWNLOAD_BUCKET = os.environ.get("APP_DOWNLOAD_BUCKET", "documents")
AWS_ENDPOINT_URL = os.environ.get("APP_AWS_ENDPOINT_URL", "https://s3.cern.ch")
FILES_STORE = "s3://{bucket}/".format(bucket=DOWNLOAD_BUCKET)
AWS_ACCESS_KEY_ID = os.environ.get("APP_AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("APP_AWS_SECRET_ACCESS_KEY")

# INSPIRE Push Pipeline settings
# ==============================
API_PIPELINE_URL = "http://localhost:5555/api/task/async-apply"
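
The settings now derive FILES_STORE from the bucket name instead of taking a filesystem path from APP_FILES_STORE, which is why docker-compose.test.py2.yml swaps that variable for APP_DOWNLOAD_BUCKET=downloaded. A small sketch of how the values combine, assuming a localstack endpoint for the tests (the port is an assumption, not taken from this diff):

import os

# Environment as the test compose file sets it; the endpoint is hypothetical
# and would point at the localstack service during functional tests.
os.environ["APP_DOWNLOAD_BUCKET"] = "downloaded"
os.environ["APP_AWS_ENDPOINT_URL"] = "http://localstack:4566"

DOWNLOAD_BUCKET = os.environ.get("APP_DOWNLOAD_BUCKET", "documents")
AWS_ENDPOINT_URL = os.environ.get("APP_AWS_ENDPOINT_URL", "https://s3.cern.ch")
FILES_STORE = "s3://{bucket}/".format(bucket=DOWNLOAD_BUCKET)

assert FILES_STORE == "s3://downloaded/"
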
16 changes: 8 additions & 8 deletions hepcrawl/spiders/aps_spider.py
@@ -20,7 +20,6 @@

from inspire_utils.record import get_value

from . import StatefulSpider
from .common.lastrunstore_spider import LastRunStoreSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
@@ -51,13 +50,13 @@ class APSSpider(LastRunStoreSpider):
Selecting specific journals is not supported for technical reasons as it's incompatible with the way the last run time is stored.
"""
name = 'APS'
aps_base_url = "http://harvest.aps.org/v2/journals/articles"

@strict_kwargs
def __init__(self, from_date=None, until_date=None,
date="published", sets=None, per_page=100,
date="published", sets=None, per_page=100, aps_url="http://harvest.aps.org/v2/journals/articles",
**kwargs):
"""Construct APS spider."""
self.aps_url = aps_url
super(APSSpider, self).__init__(**kwargs)
self.set = sets
self.from_date = from_date
@@ -80,7 +79,7 @@ def url(self):
params['per_page'] = self.per_page
if self.date:
params['date'] = self.date
return furl(APSSpider.aps_base_url).add(params).url
return furl(self.aps_url).add(params).url


def start_requests(self):
@@ -99,9 +98,8 @@ def parse(self, response):

for article in aps_response['data']:
doi = get_value(article, 'identifiers.doi', default='')

if doi:
request = Request(url='{}/{}'.format(self.aps_base_url, doi),
request = Request(url='{}/{}'.format(self.aps_url, doi),
headers={'Accept': 'text/xml'},
callback=self._parse_jats,
errback=self._parse_json_on_failure)
@@ -124,10 +122,12 @@ def _parse_jats(self, response):

file_name = self._file_name_from_url(response.url)
parser.attach_fulltext_document(file_name, response.url)

record = parser.parse()
file_urls = [document['url'] for document in record.get('documents', [])]
return ParsedItem(
record=parser.parse(),
record=record,
record_format='hep',
file_urls=file_urls
)

def _parse_json_on_failure(self, failure):
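With aps_base_url turned into an aps_url constructor argument, the functional_aps tests can point the spider at the aps-http-server.local fixture instead of the live APS API. A rough local-run sketch, assuming the fixture hostname and date values below (the tests themselves schedule the spider through scrapyd, but the keyword argument is the same):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hepcrawl.spiders.aps_spider import APSSpider

# Hypothetical crawl against the nginx fixture server from docker-compose;
# from_date/until_date are placeholder values.
process = CrawlerProcess(get_project_settings())
process.crawl(
    APSSpider,
    aps_url="http://aps-http-server.local/",
    from_date="2020-01-01",
    until_date="2020-02-01",
)
process.start()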
