Fix APS spider - document url header
pazembrz committed Oct 23, 2020
1 parent eb6588c commit be00a05
Showing 4 changed files with 7 additions and 11 deletions.
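Summary of the change: file_urls entries produced by the spiders switch from plain URL strings to dicts, and the pipeline now expands each dict into keyword arguments of scrapy.Request via Request(**x). That is what lets the APS spider attach a per-document Accept: text/xml header. A minimal sketch of the consuming side, assuming the dict shapes shown in the diffs below (the URLs are hypothetical):

from scrapy import Request

# Entry shapes produced by the spiders after this commit (hypothetical URLs):
aps_entry = {"url": "https://example.org/article", "headers": {"Accept": "text/xml"}}
desy_entry = {"url": "https://example.org/document.pdf"}  # "headers" stays optional

# The pipeline expands each dict into Request keyword arguments:
requests = [Request(**entry) for entry in (aps_entry, desy_entry)]
for request in requests:
    print(request.url, request.headers.get("Accept"))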
4 changes: 2 additions & 2 deletions hepcrawl/pipelines.py
@@ -62,10 +62,10 @@ def get_media_requests(self, item, info):
item['file_urls']
)
)
- return [Request(x) for x in item.get(self.files_urls_field, [])]
+ return [Request(**x) for x in item.get(self.files_urls_field, [])]
return list()

- def generate_presigned_s3_url(self, path, expire=86400):
+ def generate_presigned_s3_url(self, path, expire=7776000):
bucket_location = get_project_settings().get("DOWNLOAD_BUCKET", "documents")
LOGGER.info("Generating presigned url for: %s in %s", path, bucket_location)
return self.store.s3_client.generate_presigned_url(
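Side note on the second pipelines.py hunk: the default expiry of the presigned S3 download URL grows from 86400 seconds to 7776000 seconds, i.e. from one day to 90 days (90 * 24 * 3600 = 7776000), so generated links stay valid much longer. A quick sanity check of the arithmetic:

# 90 days expressed in seconds equals the new default expiry
assert 90 * 24 * 3600 == 7776000
assert 24 * 3600 == 86400  # the previous default: one day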
2 changes: 1 addition & 1 deletion hepcrawl/spiders/aps_spider.py
@@ -123,7 +123,7 @@ def _parse_jats(self, response):
file_name = self._file_name_from_url(response.url)
parser.attach_fulltext_document(file_name, response.url)
record = parser.parse()
- file_urls = [document['url'] for document in record.get('documents', [])]
+ file_urls = [{"url": document['url'], "headers": {'Accept': 'text/xml'}} for document in record.get('documents', [])]
return ParsedItem(
record=record,
record_format='hep',
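Why the APS change needs a header at all is not spelled out in the diff; the likely reading (an assumption) is that the same document URL serves more than one representation, and Accept: text/xml asks for the JATS XML instead of the default. A minimal sketch with a hypothetical URL, assuming the endpoint honours content negotiation:

import requests

doc_url = "https://example.org/aps/article"  # hypothetical document URL
response = requests.get(doc_url, headers={"Accept": "text/xml"})
print(response.headers.get("Content-Type"))  # expected to be an XML content type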
2 changes: 1 addition & 1 deletion hepcrawl/spiders/desy_spider.py
@@ -255,7 +255,7 @@ def _parsed_items_from_marcxml(
new_documents.append(document)
self.logger.info("Updating document %s", document)
else:
- files_to_download.append(document['url'])
+ files_to_download.append({"url": document['url']})

if new_documents:
parsed_item.record['documents'] = new_documents
10 changes: 3 additions & 7 deletions hepcrawl/spiders/elsevier_spider.py
@@ -231,13 +231,9 @@ def parse_record(self, response):
)
parser.attach_fulltext_document(file_name, document_url)
parsed_record = parser.parse()
- files_urls = [
-     document["url"] for document in parsed_record.get("documents", [])
- ]
- self.logger.info("Files to download: %s", files_urls)
- return ParsedItem(
-     record=parsed_record, file_urls=files_urls, record_format="hep"
- )
+ file_urls = [{"url": document['url']} for document in parsed_record.get('documents', [])]
+ self.logger.info("Files to download: %s", file_urls)
+ return ParsedItem(record=parsed_record, file_urls=file_urls, record_format="hep")
else:
self.logger.info(
"Document {name} is missing required metadata, skipping item creation.".format(
