Commit: Merge 1d93ef0 into 1d6c9c7
MJedr committed Sep 24, 2020
2 parents 1d6c9c7 + 1d93ef0 commit 211eaeb
Showing 20 changed files with 3,263 additions and 14 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -31,6 +31,7 @@ env:
- PYTHON=py2 SUITE=functional_desy
- PYTHON=py2 SUITE=functional_cds
- PYTHON=py2 SUITE=functional_pos
- PYTHON=py2 SUITE=functional_elsevier
- PYTHON=py3 SUITE=unit

matrix:
11 changes: 9 additions & 2 deletions docker-compose.test.py2.yml
@@ -40,6 +40,12 @@ services:
ftp_server:
condition: service_healthy

functional_elsevier:
<<: *service_base
command: py.test -vv tests/functional/elsevier
depends_on:
- scrapyd
- localstack

functional_desy:
<<: *service_base
@@ -196,12 +202,13 @@ services:
localstack:
image: localstack/localstack:latest
ports:
- '4572:4572'
- '4566:4566'
- "4566:4566"
environment:
- SERVICES=s3
- DEBUG=1
- DATA_DIR=/home/localstack/data
- HOSTNAME_EXTERNAL=localstack
- HOSTNAME=localstack

networks:
ftp:
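A minimal sketch of how a test could reach the localstack s3 service through the new single edge port 4566, which replaces the old s3-specific port 4572; the credentials and bucket name below are illustrative placeholders, not values from this commit:

import boto3

# localstack now exposes every service on the single edge port 4566,
# so the per-service mapping 4572 is gone from the compose file.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localstack:4566",
    aws_access_key_id="key",  # localstack accepts arbitrary credentials
    aws_secret_access_key="secret",
)
s3.create_bucket(Bucket="incoming")  # hypothetical bucket name
print([bucket["Name"] for bucket in s3.list_buckets()["Buckets"]])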
27 changes: 17 additions & 10 deletions hepcrawl/parsers/elsevier.py
@@ -115,7 +115,7 @@ def parse(self):
self.builder.add_doi(**doi)
for keyword in self.keywords:
self.builder.add_keyword(keyword)
self.builder.add_imprint_date(self.publication_date.dumps())
self.builder.add_imprint_date(self.publication_date.dumps() if self.publication_date else None)
for reference in self.references:
self.builder.add_reference(reference)

@@ -235,6 +235,7 @@ def dois(self):

@property
def document_type(self):
doctype = None
if self.root.xpath("./*[self::article or self::simple-article or self::book-review]"):
doctype = 'article'
elif self.root.xpath("./*[self::book or self::simple-book]"):
@@ -341,11 +342,17 @@ def page_end(self):

@property
def publication_date(self):
publication_date = None
publication_date_string = self.root.xpath(
'./RDF/Description/coverDisplayDate/text()'
).extract_first()
if publication_date_string:
publication_date = PartialDate.parse(publication_date_string)
try:
publication_date = PartialDate.parse(publication_date_string)
except ValueError:
# the date may contain a month range, e.g. "July-September 2020";
# strip the leading month so the remainder can be parsed
publication_date = re.sub("[A-Za-z]*-(?=[A-Za-z])", "", publication_date_string)
publication_date = PartialDate.parse(publication_date)
return publication_date

@property
Expand Down Expand Up @@ -383,9 +390,9 @@ def subtitle(self):
def title(self):
title = self.root.xpath(
'./*/head/title//text()'
).extract_first().strip('\n')
).extract_first()

return title
return title.strip('\n') if title else None

@property
def year(self):
@@ -484,9 +491,9 @@ def get_reference_authors(ref_node):
authors = ref_node.xpath("./contribution/authors/author")
authors_names = []
for author in authors:
given_names = author.xpath("./given-name/text()").extract_first()
last_names = author.xpath("./surname/text()").extract_first()
authors_names.append(" ".join([given_names, last_names]))
given_names = author.xpath("./given-name/text()").extract_first(default="")
last_names = author.xpath("./surname/text()").extract_first(default="")
authors_names.append(" ".join([given_names, last_names]).strip())
return authors_names

@staticmethod
@@ -502,9 +509,9 @@ def get_reference_editors(ref_node):
editors = ref_node.xpath(".//editors/authors/author")
editors_names = []
for editor in editors:
given_names = editor.xpath("./given-name/text()").extract_first()
last_names = editor.xpath("./surname/text()").extract_first()
editors_names.append(" ".join([given_names, last_names]))
given_names = editor.xpath("./given-name/text()").extract_first(default="")
last_names = editor.xpath("./surname/text()").extract_first(default="")
editors_names.append(" ".join([given_names, last_names]).strip())
return editors_names

@staticmethod
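To illustrate the month-range normalisation added to publication_date above, here is a standalone sketch; the import path for PartialDate is assumed from hepcrawl's other parsers, and the outputs noted in comments are the expected ones:

import re

from inspire_utils.date import PartialDate

# "July-September 2020" fails PartialDate.parse, so the parser strips the
# opening month of the range and retries with the remainder.
date_string = "July-September 2020"
normalised = re.sub("[A-Za-z]*-(?=[A-Za-z])", "", date_string)
print(normalised)  # expected: "September 2020"
print(PartialDate.parse(normalised).dumps())  # expected: "2020-09"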
235 changes: 235 additions & 0 deletions hepcrawl/spiders/elsevier.py
@@ -0,0 +1,235 @@
import glob
import os
import shutil
import tempfile
import xml.etree.ElementTree as et
import zipfile
from io import BytesIO

import boto3
import scrapy
from scrapy import Request, Spider

from . import StatefulSpider
from ..parsers import ElsevierParser
from ..utils import ParsedItem, strict_kwargs


class ElsevierSpider(StatefulSpider):
    name = "elsevier"
    start_urls = []

    @strict_kwargs
    def __init__(
        self,
        access_key_id,
        secret_access_key,
        packages_bucket_name,
        files_bucket_name,
        elsevier_consyn_url,
        s3_host="https://s3.cern.ch",
        *args,
        **kwargs
    ):
        super(ElsevierSpider, self).__init__(*args, **kwargs)
        self.access_key_id = access_key_id
        self.secret_access_key = secret_access_key
        self.packages_bucket_name = packages_bucket_name
        self.files_bucket_name = files_bucket_name
        self.elsevier_consyn_url = elsevier_consyn_url
        self.new_packages = set()
        self.new_xml_files = set()
        self.s3_host = s3_host

        # all() is required here: a bare non-empty tuple is always truthy.
        if not all(
            (
                self.access_key_id,
                self.secret_access_key,
                self.packages_bucket_name,
                self.files_bucket_name,
            )
        ):
            raise Exception("Missing parameters necessary to establish s3 connection")
        else:
            self.s3_connection = self.create_s3_connection()
            self.s3_packages_bucket_conn = self.s3_bucket_connection(
                self.packages_bucket_name
            )
            self.s3_files_bucket_conn = self.s3_bucket_connection(
                self.files_bucket_name
            )
            self.s3_client = self.connect_s3_client()

    def create_s3_connection(self):
        session = boto3.Session(
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
        )
        s3 = session.resource("s3", endpoint_url=self.s3_host)
        return s3

    def s3_bucket_connection(self, bucket_name):
        bucket_connection = self.s3_connection.Bucket(bucket_name)
        return bucket_connection

    def connect_s3_client(self):
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=self.access_key_id,
            aws_secret_access_key=self.secret_access_key,
            endpoint_url=self.s3_host,
        )
        return s3_client

    def create_presigned_url(self, bucket, file, method):
        url = self.s3_client.generate_presigned_url(
            ClientMethod=method,
            Params={"Bucket": bucket, "Key": file},
            ExpiresIn=920000,  # ~10.6 days
        )
        return url

    def _get_keys_names_from_bucket(self):
        keys = set([key.key for key in self.s3_packages_bucket_conn.objects.all()])
        return keys

    def _get_package_urls_from_elsevier(self, elsevier_metadata):
        """
        Extracts the names and URLs of the zip packages from the Elsevier batch feed.
        Returns:
            dict(name: url): dict of zip package names and URLs
        """
        packages_metadata_parsed = et.fromstring(elsevier_metadata)
        urls_for_packages = {}
        for children in packages_metadata_parsed.getchildren():
            if "entry" in children.tag:
                file_data = children.getchildren()
                link = file_data[1].attrib["href"]
                urls_for_packages[file_data[0].text] = link
        return urls_for_packages

    def _get_all_new_packages(self, elsevier_metadata):
        """
        Checks which packages from the Elsevier batch feed are not in the s3 bucket yet.
        Returns:
            dict(name: url): dict of zip package names and URLs
        """
        urls_for_packages = self._get_package_urls_from_elsevier(elsevier_metadata)
        bucket_data = self._get_keys_names_from_bucket()
        packages_not_in_bucket = {
            name: urls_for_packages[name]
            for name in set(urls_for_packages.keys()) - bucket_data
        }
        self.new_packages = set(packages_not_in_bucket.keys())
        return packages_not_in_bucket

    def start_requests(self):
        elsevier_batch_download_url = self.elsevier_consyn_url
        yield Request(
            elsevier_batch_download_url, callback=self.get_packages_from_elsevier
        )

    def get_packages_from_elsevier(self, response):
        """
        Parses the batch feed file from Elsevier and downloads the new zip
        packages from the Elsevier server.
        """
        elsevier_metadata = response.body
        for name, url in self._get_all_new_packages(elsevier_metadata).items():
            if name.lower().endswith("zip"):
                yield Request(
                    url,
                    callback=self.populate_s3_bucket_with_elsevier_packages,
                    meta={"name": name},
                )

    def populate_s3_bucket_with_elsevier_packages(self, response):
        """
        Uploads the new zip packages to the s3 bucket.
        """
        name = response.meta["name"]
        url = self.create_presigned_url(
            method="put_object", bucket=self.packages_bucket_name, file=name
        )
        yield Request(
            url,
            method="PUT",
            body=response.body,
            meta={"name": name},
            callback=self.download_zip_packages,
        )

    @staticmethod
    def _get_doi_for_xml_file(xml_file):
        parser = ElsevierParser(xml_file)
        doi = parser.get_identifier()
        return doi

    def download_zip_packages(self, response):
        """
        Downloads zip packages from s3 and passes them to the unzipping function.
        """
        get_url = self.create_presigned_url(
            bucket=self.packages_bucket_name,
            method="get_object",
            file=response.meta["name"],
        )
        yield Request(
            get_url,
            callback=self.unzip_zip_package_to_s3,
            meta={"name": response.meta["name"]},
        )

    def unzip_zip_package_to_s3(self, response):
        """
        Extracts the files from the zip packages downloaded from Elsevier and
        uploads them under the correct name (the article DOI) to the files
        s3 bucket.
        """
        tempdir = tempfile.mkdtemp()
        with zipfile.ZipFile(BytesIO(response.body)) as zip_package:
            zip_package.extractall(tempdir)
        for root, dirnames, filenames in os.walk(tempdir):
            for file in glob.glob(root + "/*.xml"):
                with open(file) as f:
                    elsevier_xml = f.read()
                file_doi = self._get_doi_for_xml_file(elsevier_xml)
                self.new_xml_files.add("{file_doi}.xml".format(file_doi=file_doi))
                url = self.create_presigned_url(
                    method="put_object",
                    bucket=self.files_bucket_name,
                    file="{file_doi}.xml".format(file_doi=file_doi),
                )

                yield Request(
                    url,
                    method="PUT",
                    body=elsevier_xml,
                    meta={"name": "{file_doi}.xml".format(file_doi=file_doi)},
                    callback=self.parse_items_from_s3,
                )
        shutil.rmtree(tempdir)

    def parse_items_from_s3(self, response):
        """
        Downloads XML files from s3 and passes them to the parser.
        """
        download_url = self.create_presigned_url(
            bucket=self.files_bucket_name,
            method="get_object",
            file=response.meta["name"],
        )
        yield Request(download_url, callback=self.parse_record_from_s3)

    def parse_record_from_s3(self, response):
        """Parses an Elsevier XML file downloaded from s3 into a HEP record."""
        parser = ElsevierParser(response.text)

        return ParsedItem(record=parser.parse(), record_format="hep")

    def parse_record(self, record):
        """Parses an Elsevier exported XML file into a HEP record."""
        with open(record, "r") as f:
            elsevier_record = f.read()

        parser = ElsevierParser(elsevier_record)
        return ParsedItem(record=parser.parse(), record_format="hep")
2 changes: 1 addition & 1 deletion hepcrawl/spiders/elsevier_spider.py
@@ -71,7 +71,7 @@ class ElsevierSpider(StatefulSpider, XMLFeedSpider):
* This is useful: https://www.elsevier.com/__data/assets/pdf_file/0006/58407/ja50_tagbytag5.pdf
"""

name = 'elsevier'
name = 'elsevier-spider'
start_urls = []
iterator = 'xml'
itertag = 'doc:document'
2 changes: 1 addition & 1 deletion tests/functional/desy/test_desy.py
@@ -113,7 +113,7 @@ def setup_s3_files(s3_key, s3_secret, s3_server, s3_input_bucket, s3_output_buck
def get_s3_settings():
key = 'key'
secret = 'secret'
s3_host = 'http://localstack:4572'
s3_host = 'http://localstack:4566'
input_bucket = 'incoming'
output_bucket = 'processed'

Binary file not shown.
@@ -0,0 +1 @@
<feed xmlns="removed"><title>A feed title</title><subtitle>A subtitle</subtitle><link href="http://it-s-a-non-existing-link" rel="self"/><id>IiiDdd</id><updated>2020-09-17T08:18:16.460608Z</updated><author><name>INSPIRE</name></author><entry><title>test_zip_file (something) 1.1.1.ZIP</title><link href="http://localstack:4566/batch-feed/test_zip_file.ZIP"/><id>2.92775610124881E171</id><updated>2020-09-17T08:18:16.460608Z</updated><summary>A summary</summary></entry></feed>
@@ -0,0 +1 @@
<feed xmlns="removed"><title>A feed title</title><subtitle>A subtitle</subtitle><link href="http://it-s-a-non-existing-link" rel="self"/><id>IiiDdd</id><updated>2020-09-17T08:18:16.460608Z</updated><author><name>INSPIRE</name></author><entry><title>test_zip_file (something) 1.1.1.ZIP</title><link href="http://localstack:4566/batch-feed/test_zip_file_replicated.ZIP"/><id>2.92775610124881E171</id><updated>2020-09-17T08:18:16.460608Z</updated><summary>A summary</summary></entry></feed>
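These fixtures feed the spider's _get_package_urls_from_elsevier; a standalone sketch of how that method walks such a feed, trimmed here to the two fields it reads (package name and URL):

import xml.etree.ElementTree as et

feed = (
    '<feed xmlns="removed"><title>A feed title</title>'
    '<entry><title>test_zip_file (something) 1.1.1.ZIP</title>'
    '<link href="http://localstack:4566/batch-feed/test_zip_file.ZIP"/></entry></feed>'
)

# Mirrors _get_package_urls_from_elsevier: each <entry> holds the package
# name in its first child (<title>) and the URL in its second (<link>).
urls_for_packages = {}
for children in et.fromstring(feed).getchildren():
    if "entry" in children.tag:
        file_data = children.getchildren()
        urls_for_packages[file_data[0].text] = file_data[1].attrib["href"]
print(urls_for_packages)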

Large diffs are not rendered by default.
