-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
3,212 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
import glob | ||
import os | ||
import shutil | ||
import tempfile | ||
import xml.etree.ElementTree as et | ||
import zipfile | ||
|
||
import boto3 | ||
import requests | ||
|
||
from ..parsers import ElsevierParser | ||
from ..utils import ParsedItem, strict_kwargs | ||
from scrapy import Request, Spider | ||
import scrapy | ||
from io import BytesIO | ||
|
||
|
||
class ElsevierSpider(scrapy.Spider):
    """Mirror Elsevier CONSYN article packages into S3 and parse them to HEP records.

    Workflow:
      1. Fetch the CONSYN atom batch feed.
      2. Download every zip package that is not yet in the packages bucket.
      3. Unzip each package and upload the contained XML files, renamed to
         their article DOI, into the files bucket.
      4. Parse each uploaded XML file into a HEP record.
    """

    name = 'elsevier'
    start_urls = []

    def __init__(
        self,
        # NOTE(review): parameter name keeps the original misspelling
        # ("acces_key_id") because spider arguments are passed by name;
        # renaming it would break existing invocations.
        acces_key_id,
        secret_access_key,
        packages_bucket_name,
        files_bucket_name,
        elsevier_consyn_key,
        s3_host="https://s3.cern.ch",
        *args, **kwargs
    ):
        """Store credentials/bucket names and open the S3 connections.

        Args:
            acces_key_id (str): AWS access key id (sic — historical typo).
            secret_access_key (str): AWS secret access key.
            packages_bucket_name (str): bucket holding the raw zip packages.
            files_bucket_name (str): bucket holding the extracted XML files.
            elsevier_consyn_key (str): API key for the CONSYN batch feed.
            s3_host (str): S3 endpoint URL.

        Raises:
            Exception: if any S3 credential or bucket name is missing.
        """
        super(ElsevierSpider, self).__init__(*args, **kwargs)
        self.access_key_id = acces_key_id
        self.secret_access_key = secret_access_key
        self.packages_bucket_name = packages_bucket_name
        self.files_bucket_name = files_bucket_name
        self.elsevier_consyn_key = elsevier_consyn_key
        self.new_packages = set()
        self.new_xml_files = set()
        self.s3_host = s3_host

        # BUG FIX: the original tested `not (a, b, c, d)`, which is always
        # False because a non-empty tuple is truthy — missing credentials
        # were never detected. `all(...)` checks each value individually.
        if not all((
            self.access_key_id,
            self.secret_access_key,
            self.packages_bucket_name,
            self.files_bucket_name,
        )):
            raise Exception("Missing parameters necessary to establish s3 connection")
        self.s3_connection = self.create_s3_connection()
        self.s3_packages_bucket_conn = self.s3_bucket_connection(
            self.packages_bucket_name
        )
        self.s3_files_bucket_conn = self.s3_bucket_connection(
            self.files_bucket_name
        )
        self.s3_client = self.connect_s3_client()

    def create_s3_connection(self):
        """Return a boto3 S3 resource bound to the configured endpoint."""
        session = boto3.Session(
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
        )
        return session.resource("s3", endpoint_url=self.s3_host)

    def s3_bucket_connection(self, bucket_name):
        """Return a boto3 Bucket handle for *bucket_name*."""
        return self.s3_connection.Bucket(bucket_name)

    def connect_s3_client(self):
        """Return a low-level boto3 S3 client (used for presigned URLs)."""
        return boto3.client(
            "s3",
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
            endpoint_url=self.s3_host,
        )

    def create_presigned_url(self, bucket, file, method):
        """Generate a presigned S3 URL for *file* in *bucket*.

        Args:
            bucket (str): bucket name.
            file (str): object key.
            method (str): boto3 client method name, e.g. "get_object" or
                "put_object".

        Returns:
            str: presigned URL valid for ~10.6 days (920000 seconds).
        """
        return self.s3_client.generate_presigned_url(
            ClientMethod=method,
            Params={'Bucket': bucket, "Key": file},
            ExpiresIn=920000,
        )

    def _get_keys_names_from_bucket(self):
        """Return the set of object keys already in the packages bucket."""
        return {key.key for key in self.s3_packages_bucket_conn.objects.all()}

    def _get_package_urls_from_elsevier(self, elsevier_metadata):
        """
        Extracts names and urls of the zip packages from elsevier batch feed
        Returns:
            dict(name: url): dict of zip packages names and urls
        """
        packages_metadata_parsed = et.fromstring(elsevier_metadata)
        urls_for_packages = {}
        # BUG FIX: Element.getchildren() was removed in Python 3.9;
        # elements are directly iterable instead.
        for child in packages_metadata_parsed:
            if "entry" in child.tag:
                # Feed entry layout: first child holds the package name,
                # second child holds the download link.
                file_data = list(child)
                link = file_data[1].attrib["href"]
                urls_for_packages[file_data[0].text] = link
        return urls_for_packages

    def _get_all_new_packages(self, elsevier_metadata):
        """
        Checks which packages from elsevier batch feed are not in the s3 bucket yet
        Returns:
            dict(name: url): dict of zip packages names and urls
        """
        urls_for_packages = self._get_package_urls_from_elsevier(elsevier_metadata)
        bucket_data = self._get_keys_names_from_bucket()
        packages_not_in_bucket = {
            name: urls_for_packages[name]
            for name in set(urls_for_packages.keys()) - bucket_data
        }
        self.new_packages = set(packages_not_in_bucket.keys())
        return packages_not_in_bucket

    def start_requests(self):
        """Kick off the crawl by requesting the CONSYN atom batch feed."""
        elsevier_batch_download_url = (
            "https://consyn.elsevier.com/batch/atom?key=" + self.elsevier_consyn_key
        )
        yield Request(elsevier_batch_download_url,
                      callback=self.get_packages_from_elsevier)

    def get_packages_from_elsevier(self, response):
        """Schedule a download for every new zip package listed in the feed."""
        elsevier_metadata = response.body
        for name, url in self._get_all_new_packages(elsevier_metadata).items():
            if name.lower().endswith("zip"):
                yield Request(url,
                              callback=self.populate_s3_bucket_with_elsevier_packages,
                              meta={'name': name})

    def populate_s3_bucket_with_elsevier_packages(self, response):
        """
        Uploads to s3 bucket new zip folders containing xml-s for elsevier articles
        """
        name = response.meta['name']
        url = self.create_presigned_url(method="put_object",
                                        bucket=self.packages_bucket_name,
                                        file=name)
        yield Request(url,
                      method="PUT",
                      body=response.body,
                      meta={'name': name},
                      callback=self.extract_zip_packages)

    @staticmethod
    def _get_doi_for_xml_file(xml_file):
        """Return the article DOI extracted from an Elsevier XML string."""
        parser = ElsevierParser(xml_file)
        return parser.get_identifier()

    def extract_zip_packages(self, response):
        """
        Extracts the files from zip folders downloaded from elsevier and
        uploads them with a correct name (article doi) to the correct s3 bucket
        Yields:
            HEP records
        """
        get_url = self.create_presigned_url(bucket=self.packages_bucket_name,
                                            method="get_object",
                                            file=response.meta["name"])
        yield Request(get_url,
                      callback=self.unzip_zip_package_to_s3,
                      meta={"name": response.meta["name"]})

    def unzip_zip_package_to_s3(self, response):
        """Unzip a downloaded package and PUT each XML file into the files bucket.

        Each extracted XML is renamed to "<doi>.xml" before upload.
        """
        tempdir = tempfile.mkdtemp()
        # BUG FIX: rmtree was only reached on full success, leaking the
        # temp dir whenever an exception interrupted the loop; try/finally
        # guarantees cleanup.
        try:
            with zipfile.ZipFile(BytesIO(response.body)) as zip_package:
                zip_package.extractall(tempdir)
            for root, _, _ in os.walk(tempdir):
                for file in glob.glob(os.path.join(root, "*.xml")):
                    with open(file) as f:
                        elsevier_xml = f.read()
                    file_doi = self._get_doi_for_xml_file(elsevier_xml)
                    doi_file_name = '{file_doi}.xml'.format(file_doi=file_doi)
                    self.new_xml_files.add(doi_file_name)
                    url = self.create_presigned_url(method="put_object",
                                                    bucket=self.files_bucket_name,
                                                    file=doi_file_name)

                    yield Request(url, method="PUT",
                                  body=elsevier_xml,
                                  meta={"name": doi_file_name},
                                  callback=self.parse_items_from_s3)
        finally:
            shutil.rmtree(tempdir)

    def parse_items_from_s3(self, response):
        """
        Parse xml files in the s3 bucket
        Yields:
            HEP records
        """
        download_url = self.create_presigned_url(bucket=self.files_bucket_name,
                                                 method="get_object",
                                                 file=response.meta["name"])
        yield Request(download_url,
                      callback=self.parse_record)

    def parse_record(self, response):
        """Parse an elsevier XML exported file into a HEP record."""
        parser = ElsevierParser(response.text)

        return ParsedItem(record=parser.parse(),
                          record_format="hep",)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
1 change: 1 addition & 0 deletions
1
tests/functional/elsevier/fixtures/elsevier/parsed_records/j.geomphys.2020.103892.xml
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.