-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
20 changed files
with
3,263 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,235 @@ | ||
import glob | ||
import os | ||
import shutil | ||
import tempfile | ||
import xml.etree.ElementTree as et | ||
import zipfile | ||
from io import BytesIO | ||
|
||
import boto3 | ||
import scrapy | ||
from scrapy import Request, Spider | ||
|
||
from . import StatefulSpider | ||
from ..parsers import ElsevierParser | ||
from ..utils import ParsedItem, strict_kwargs | ||
|
||
|
||
class ElsevierSpider(StatefulSpider):
    """Spider that mirrors Elsevier CONSYN zip packages into S3 and parses them.

    Workflow:
      1. Fetch the Elsevier batch (CONSYN) feed and collect package name/URL pairs.
      2. Upload every package not yet present in the packages bucket.
      3. Unzip each uploaded package, push the contained XML files (renamed to
         ``<doi>.xml``) to the files bucket, and parse each one into a HEP record.

    All S3 transfers go through presigned URLs so the actual byte transfer is
    handled by Scrapy requests rather than boto3.
    """

    name = "elsevier"
    start_urls = []

    @strict_kwargs
    def __init__(
        self,
        acces_key_id,  # NOTE(review): misspelled upstream; kept for backward compatibility
        secret_access_key,
        packages_bucket_name,
        files_bucket_name,
        elsevier_consyn_url,
        s3_host="https://s3.cern.ch",
        *args,
        **kwargs
    ):
        """Store the S3 credentials/bucket names and open the S3 connections.

        Args:
            acces_key_id (str): AWS access key id (parameter name typo is part
                of the public interface, so it is preserved).
            secret_access_key (str): AWS secret access key.
            packages_bucket_name (str): bucket holding the raw zip packages.
            files_bucket_name (str): bucket holding the extracted XML files.
            elsevier_consyn_url (str): URL of the Elsevier CONSYN batch feed.
            s3_host (str): S3 endpoint URL.

        Raises:
            ValueError: if any parameter needed for the S3 connection is falsy.
        """
        super(ElsevierSpider, self).__init__(*args, **kwargs)
        self.access_key_id = acces_key_id
        self.secret_access_key = secret_access_key
        self.packages_bucket_name = packages_bucket_name
        self.files_bucket_name = files_bucket_name
        self.elsevier_consyn_url = elsevier_consyn_url
        self.new_packages = set()
        self.new_xml_files = set()
        self.s3_host = s3_host

        # BUGFIX: the original tested `not (a, b, c, d)`, which is always
        # False for a non-empty tuple, so missing credentials were never
        # detected. `all()` checks each value individually.
        if not all(
            (
                self.access_key_id,
                self.secret_access_key,
                self.packages_bucket_name,
                self.files_bucket_name,
            )
        ):
            # ValueError is a subclass of Exception, so existing callers
            # catching Exception still work.  Also fixes the "parametrs" typo.
            raise ValueError("Missing parameters necessary to establish s3 connection")
        self.s3_connection = self.create_s3_connection()
        self.s3_packages_bucket_conn = self.s3_bucket_connection(
            self.packages_bucket_name
        )
        self.s3_files_bucket_conn = self.s3_bucket_connection(self.files_bucket_name)
        self.s3_client = self.connect_s3_client()

    def create_s3_connection(self):
        """Return a boto3 S3 resource bound to the configured endpoint."""
        session = boto3.Session(
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
        )
        return session.resource("s3", endpoint_url=self.s3_host)

    def s3_bucket_connection(self, bucket_name):
        """Return a boto3 Bucket resource for ``bucket_name``."""
        return self.s3_connection.Bucket(bucket_name)

    def connect_s3_client(self):
        """Return a low-level boto3 S3 client (needed for presigned URLs)."""
        return boto3.client(
            "s3",
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
            endpoint_url=self.s3_host,
        )

    def create_presigned_url(self, bucket, file, method):
        """Generate a presigned URL for ``method`` on ``bucket``/``file``.

        Args:
            bucket (str): bucket name.
            file (str): object key.
            method (str): client method name, e.g. ``"get_object"`` or
                ``"put_object"``.

        Returns:
            str: the presigned URL.
        """
        return self.s3_client.generate_presigned_url(
            ClientMethod=method,
            Params={"Bucket": bucket, "Key": file},
            # ~10.6 days: long enough for slow crawls to finish before expiry.
            ExpiresIn=920000,
        )

    def _get_keys_names_from_bucket(self):
        """Return the set of object keys currently in the packages bucket."""
        return {key.key for key in self.s3_packages_bucket_conn.objects.all()}

    def _get_package_urls_from_elsevier(self, elsevier_metadata):
        """Extract names and urls of the zip packages from the Elsevier batch feed.

        Args:
            elsevier_metadata (bytes or str): raw Atom feed XML.

        Returns:
            dict: mapping of zip package name to download url.
        """
        packages_metadata_parsed = et.fromstring(elsevier_metadata)
        urls_for_packages = {}
        # Iterate the element directly: Element.getchildren() was deprecated
        # and removed in Python 3.9.
        for child in packages_metadata_parsed:
            # The feed is namespaced, so match on the tag suffix only.
            if "entry" in child.tag:
                # Entry layout (see the feed fixture): [0] = <title> with the
                # package name, [1] = <link href=...> with the download URL.
                file_data = list(child)
                link = file_data[1].attrib["href"]
                urls_for_packages[file_data[0].text] = link
        return urls_for_packages

    def _get_all_new_packages(self, elsevier_metadata):
        """Return the feed packages that are not in the S3 bucket yet.

        Also records their names in ``self.new_packages`` as a side effect.

        Returns:
            dict: mapping of new zip package name to download url.
        """
        urls_for_packages = self._get_package_urls_from_elsevier(elsevier_metadata)
        bucket_data = self._get_keys_names_from_bucket()
        packages_not_in_bucket = {
            name: urls_for_packages[name]
            for name in set(urls_for_packages) - bucket_data
        }
        self.new_packages = set(packages_not_in_bucket)
        return packages_not_in_bucket

    def start_requests(self):
        """Kick off the crawl by fetching the Elsevier batch feed."""
        yield Request(
            self.elsevier_consyn_url, callback=self.get_packages_from_elsevier
        )

    def get_packages_from_elsevier(self, response):
        """Parse the batch feed and download each new zip package from Elsevier."""
        elsevier_metadata = response.body
        for name, url in self._get_all_new_packages(elsevier_metadata).items():
            # Case-insensitive check: feed entries use ".ZIP".
            if name.lower().endswith("zip"):
                yield Request(
                    url,
                    callback=self.populate_s3_bucket_with_elsevier_packages,
                    meta={"name": name},
                )

    def populate_s3_bucket_with_elsevier_packages(self, response):
        """Upload a downloaded zip package to the packages bucket via presigned PUT."""
        name = response.meta["name"]
        url = self.create_presigned_url(
            method="put_object", bucket=self.packages_bucket_name, file=name
        )
        yield Request(
            url,
            method="PUT",
            body=response.body,
            meta={"name": name},
            callback=self.download_zip_packages,
        )

    @staticmethod
    def _get_doi_for_xml_file(xml_file):
        """Return the DOI extracted from an Elsevier XML document string."""
        parser = ElsevierParser(xml_file)
        return parser.get_identifier()

    def download_zip_packages(self, response):
        """Fetch the just-uploaded zip package back from S3 for unzipping."""
        get_url = self.create_presigned_url(
            bucket=self.packages_bucket_name,
            method="get_object",
            file=response.meta["name"],
        )
        yield Request(
            get_url,
            callback=self.unzip_zip_package_to_s3,
            meta={"name": response.meta["name"]},
        )

    def unzip_zip_package_to_s3(self, response):
        """Extract the zip package and upload each XML as ``<doi>.xml`` to S3."""
        tempdir = tempfile.mkdtemp()
        # BUGFIX: the original leaked the temporary directory if extraction
        # or parsing raised; the finally clause guarantees cleanup.
        try:
            with zipfile.ZipFile(BytesIO(response.body)) as zip_package:
                zip_package.extractall(tempdir)
            for root, _dirnames, _filenames in os.walk(tempdir):
                for file in glob.glob(root + "/*.xml"):
                    with open(file) as f:
                        elsevier_xml = f.read()
                    file_doi = self._get_doi_for_xml_file(elsevier_xml)
                    self.new_xml_files.add("{file_doi}.xml".format(file_doi=file_doi))
                    url = self.create_presigned_url(
                        method="put_object",
                        bucket=self.files_bucket_name,
                        file="{file_doi}.xml".format(file_doi=file_doi),
                    )

                    yield Request(
                        url,
                        method="PUT",
                        body=elsevier_xml,
                        meta={"name": "{file_doi}.xml".format(file_doi=file_doi)},
                        callback=self.parse_items_from_s3,
                    )
        finally:
            shutil.rmtree(tempdir)

    def parse_items_from_s3(self, response):
        """Download an uploaded XML file from S3 and hand it to the parser."""
        download_url = self.create_presigned_url(
            bucket=self.files_bucket_name,
            method="get_object",
            file=response.meta["name"],
        )
        yield Request(download_url, callback=self.parse_record_from_s3)

    def parse_record_from_s3(self, response):
        """Parse an Elsevier XML downloaded from S3 into a HEP record."""
        parser = ElsevierParser(response.text)
        return ParsedItem(record=parser.parse(), record_format="hep")

    def parse_record(self, record):
        """Parse an Elsevier XML exported file (local path) into a HEP record."""
        with open(record, "r") as f:
            elsevier_record = f.read()

        parser = ElsevierParser(elsevier_record)
        return ParsedItem(record=parser.parse(), record_format="hep")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
1 change: 1 addition & 0 deletions
1
tests/functional/elsevier/fixtures/elsevier/elsevier_batch_feed_response_mock.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<feed xmlns="removed"><title>A feed title</title><subtitle>A subtitle</subtitle><link href="http://it-s-a-non-existing-link" rel="self"/><id>IiiDdd</id><updated>2020-09-17T08:18:16.460608Z</updated><author><name>INSPIRE</name></author><entry><title>test_zip_file (something) 1.1.1.ZIP</title><link href="http://localstack:4566/batch-feed/test_zip_file.ZIP"/><id>2.92775610124881E171</id><updated>2020-09-17T08:18:16.460608Z</updated><summary>A summary</summary></entry></feed> |
1 change: 1 addition & 0 deletions
1
tests/functional/elsevier/fixtures/elsevier/elsevier_batch_feed_response_mock_replicated.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<feed xmlns="removed"><title>A feed title</title><subtitle>A subtitle</subtitle><link href="http://it-s-a-non-existing-link" rel="self"/><id>IiiDdd</id><updated>2020-09-17T08:18:16.460608Z</updated><author><name>INSPIRE</name></author><entry><title>test_zip_file (something) 1.1.1.ZIP</title><link href="http://localstack:4566/batch-feed/test_zip_file_replicated.ZIP"/><id>2.92775610124881E171</id><updated>2020-09-17T08:18:16.460608Z</updated><summary>A summary</summary></entry></feed> |
1 change: 1 addition & 0 deletions
1
tests/functional/elsevier/fixtures/elsevier/parsed_records/j.geomphys.2020.103892.xml
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.