Fix APS spider - document url header
pazembrz committed Oct 23, 2020
1 parent eb6588c commit be00a05
Showing 4 changed files with 7 additions and 11 deletions.
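Summary of the change: file_urls entries produced by the spiders switch from plain URL strings to dicts, and the pipeline now expands each dict into keyword arguments of scrapy.Request via Request(**x). That is what lets the APS spider attach a per-document Accept: text/xml header. A minimal sketch of the consuming side, assuming the dict shapes shown in the diffs below (the URLs are hypothetical):

from scrapy import Request

# Entry shapes produced by the spiders after this commit (hypothetical URLs):
aps_entry = {"url": "https://example.org/article", "headers": {"Accept": "text/xml"}}
desy_entry = {"url": "https://example.org/document.pdf"}  # "headers" stays optional

# The pipeline expands each dict into Request keyword arguments:
requests = [Request(**entry) for entry in (aps_entry, desy_entry)]
for request in requests:
    print(request.url, request.headers.get("Accept"))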
4 changes: 2 additions & 2 deletions hepcrawl/pipelines.py
@@ -62,10 +62,10 @@ def get_media_requests(self, item, info):
item['file_urls']
)
)
- return [Request(x) for x in item.get(self.files_urls_field, [])]
+ return [Request(**x) for x in item.get(self.files_urls_field, [])]
return list()

- def generate_presigned_s3_url(self, path, expire=86400):
+ def generate_presigned_s3_url(self, path, expire=7776000):
bucket_location = get_project_settings().get("DOWNLOAD_BUCKET", "documents")
LOGGER.info("Generating presigned url for: %s in %s", path, bucket_location)
return self.store.s3_client.generate_presigned_url(
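Side note on the second pipelines.py hunk: the default expiry of the presigned S3 download URL grows from 86400 seconds to 7776000 seconds, i.e. from one day to 90 days (90 * 24 * 3600 = 7776000), so generated links stay valid much longer. A quick sanity check of the arithmetic:

# 90 days expressed in seconds equals the new default expiry
assert 90 * 24 * 3600 == 7776000
assert 24 * 3600 == 86400  # the previous default: one day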
2 changes: 1 addition & 1 deletion hepcrawl/spiders/aps_spider.py
@@ -123,7 +123,7 @@ def _parse_jats(self, response):
file_name = self._file_name_from_url(response.url)
parser.attach_fulltext_document(file_name, response.url)
record = parser.parse()
- file_urls = [document['url'] for document in record.get('documents', [])]
+ file_urls = [{"url": document['url'], "headers": {'Accept': 'text/xml'}} for document in record.get('documents', [])]
return ParsedItem(
record=record,
record_format='hep',
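Why the APS change needs a header at all is not spelled out in the diff; the likely reading (an assumption) is that the same document URL serves more than one representation, and Accept: text/xml asks for the JATS XML instead of the default. A minimal sketch with a hypothetical URL, assuming the endpoint honours content negotiation:

import requests

doc_url = "https://example.org/aps/article"  # hypothetical document URL
response = requests.get(doc_url, headers={"Accept": "text/xml"})
print(response.headers.get("Content-Type"))  # expected to be an XML content type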
2 changes: 1 addition & 1 deletion hepcrawl/spiders/desy_spider.py
@@ -255,7 +255,7 @@ def _parsed_items_from_marcxml(
new_documents.append(document)
self.logger.info("Updating document %s", document)
else:
- files_to_download.append(document['url'])
+ files_to_download.append({"url": document['url']})

if new_documents:
parsed_item.record['documents'] = new_documents
10 changes: 3 additions & 7 deletions hepcrawl/spiders/elsevier_spider.py
@@ -231,13 +231,9 @@ def parse_record(self, response):
)
parser.attach_fulltext_document(file_name, document_url)
parsed_record = parser.parse()
- files_urls = [
-     document["url"] for document in parsed_record.get("documents", [])
- ]
- self.logger.info("Files to download: %s", files_urls)
- return ParsedItem(
-     record=parsed_record, file_urls=files_urls, record_format="hep"
- )
+ file_urls = [{"url": document['url']} for document in parsed_record.get('documents', [])]
+ self.logger.info("Files to download: %s", file_urls)
+ return ParsedItem(record=parsed_record, file_urls=file_urls, record_format="hep")
else:
self.logger.info(
"Document {name} is missing required metadata, skipping item creation.".format(
