Skip to content

Commit

Permalink
elsevier parser: improve handling of metadata
Browse files (browse the repository at this point in the history)
  • Loading branch information
MJedr committed Oct 20, 2020
1 parent 827a6ed commit 4b9a89b
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 57 deletions.
90 changes: 45 additions & 45 deletions hepcrawl/parsers/elsevier.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def article_type(self):

@property
def artid(self):
artid = self.root.xpath("./*/item-info/aid/text()").extract_first()
artid = self.root.xpath("string(./*/item-info/aid[1])").extract_first()
return artid

@property
Expand All @@ -195,7 +195,7 @@ def authors(self):
@property
def collaborations(self):
collaborations = self.root.xpath(
"./*/head/author-group//collaboration/text/text()"
"string(./*/head/author-group//collaboration/text[1])"
).extract()
return collaborations

Expand All @@ -213,7 +213,7 @@ def copyright(self):
@property
def copyright_holder(self):
copyright_holder = self.root.xpath(
"./*/item-info/copyright[@type]/text()"
"string(./*/item-info/copyright[@type][1])"
).extract_first()
if not copyright_holder:
copyright_type = self.root.xpath(
Expand All @@ -226,11 +226,11 @@ def copyright_holder(self):
@property
def copyright_statement(self):
copyright_statement = self.root.xpath(
"./RDF/Description/copyright/text()"
"string(./RDF/Description/copyright[1])"
).extract_first()
if not copyright_statement:
copyright_statement = self.root.xpath(
"./*/item-info/copyright[@type]/text()"
"string(./*/item-info/copyright[@type][1])"
).extract_first()

return copyright_statement
Expand All @@ -245,7 +245,7 @@ def copyright_year(self):

@property
def dois(self):
doi = self.root.xpath("./*/item-info/doi/text()").extract_first()
doi = self.root.xpath("string(./RDF/Description/doi[1])").extract_first()
return [{"doi": doi, "material": self.material}]

@property
Expand All @@ -270,7 +270,7 @@ def is_conference_paper(self):
if self.root.xpath("./conference-info"):
return True
journal_issue = self.root.xpath(
"./RDF/Description/issueName/text()"
"string(./RDF/Description/issueName[1])"
).extract_first()
if journal_issue:
is_conference = re.findall(r"proceedings|proc.", journal_issue.lower())
Expand All @@ -279,33 +279,33 @@ def is_conference_paper(self):

@property
def journal_title(self):
jid = self.root.xpath("./*/item-info/jid/text()").extract_first(default="")
jid = self.root.xpath("string(./*/item-info/jid[1])").extract_first(default="")
publication = self.root.xpath(
"./RDF/Description/publicationName/text()"
"string(./RDF/Description/publicationName[1])"
).extract_first(default=jid)
publication = re.sub(" [S|s]ection", "", publication).replace(",", "").strip()
return publication

@property
def journal_issue(self):
journal_issue = self.root.xpath(
"./serial-issue/issue-info/issue-first/text()"
"string(./serial-issue/issue-info/issue-first[1])"
).extract_first()

return journal_issue

@property
def journal_volume(self):
journal_volume = self.root.xpath(
"./RDF/Description/volume/text()"
"string(./RDF/Description/volume[1])"
).extract_first()

return journal_volume

@property
def keywords(self):
keywords = self.root.xpath(
"./*/head/keywords[not(@abr)]/keyword/text/text()"
"string(./*/head/keywords[not(@abr)]/keyword/text[1])"
).extract()

return keywords
Expand All @@ -323,15 +323,15 @@ def license(self):
@property
def license_statement(self):
license_statement = self.root.xpath(
"./RDF/Description/licenseLine/text()"
"string(./RDF/Description/licenseLine[1])"
).extract_first()

return license_statement

@property
def license_url(self):
license_url = self.root.xpath(
"./RDF/Description/openAccessInformation/userLicense/text()"
"string(./RDF/Description/openAccessInformation/userLicense[1])"
).extract_first()

return license_url
Expand All @@ -356,22 +356,22 @@ def material(self):
@property
def page_start(self):
page_start = self.root.xpath(
"./RDF/Description/startingPage/text()"
"string(./RDF/Description/startingPage[1])"
).extract_first()
return page_start

@property
def page_end(self):
page_end = self.root.xpath(
"./RDF/Description/endingPage/text()"
"string(./RDF/Description/endingPage[1])"
).extract_first()
return page_end

@property
def publication_date(self):
publication_date = None
publication_date_string = self.root.xpath(
"./RDF/Description/coverDisplayDate/text()"
"string(./RDF/Description/coverDisplayDate[1])"
).extract_first()
if publication_date_string:
try:
Expand Down Expand Up @@ -401,21 +401,21 @@ def publication_info(self):

@property
def publisher(self):
publisher = self.root.xpath("./RDF/Description/publisher/text()").extract_first(
publisher = self.root.xpath("string(./RDF/Description/publisher[1])").extract_first(
"Elsevier B.V."
)

return publisher

@property
def subtitle(self):
subtitle = self.root.xpath("./*/head/subtitle/text()").extract_first()
subtitle = self.root.xpath("string(./*/head/subtitle[1])").extract_first()

return subtitle

@property
def title(self):
title = self.root.xpath("./*/head/title//text()").extract_first()
title = self.root.xpath("string(./*/head/title[1])").extract_first()

return title.strip("\n") if title else None

Expand All @@ -427,11 +427,11 @@ def year(self):
def get_author_affiliations(self, author_node, author_group_node):
"""Extract an author's affiliations."""
ref_ids = author_node.xpath(".//@refid[contains(., 'af')]").extract()
group_affs = author_group_node.xpath("./affiliation/textfn/text()")
group_affs = author_group_node.xpath("string(./affiliation/textfn[1])").getall()
if ref_ids:
affiliations = self._find_affiliations_by_id(author_group_node, ref_ids)
else:
affiliations = group_affs.extract()
affiliations = group_affs
return affiliations

@staticmethod
Expand All @@ -443,24 +443,24 @@ def _find_affiliations_by_id(author_group, ref_ids):
affiliations_by_id = []
for aff_id in ref_ids:
affiliation = author_group.xpath(
"//affiliation[@id='{}']/textfn/text()".format(aff_id)
"string(//affiliation[@id='{}']/textfn[1])".format(aff_id)
).extract_first()
affiliations_by_id.append(affiliation)

return affiliations_by_id

def get_author_emails(self, author_node):
"""Extract an author's email addresses."""
emails = author_node.xpath('./e-address[@type="email"]/text()').extract()
emails = author_node.xpath('string(./e-address[@type="email"][1])').getall()

return emails

@staticmethod
def get_author_name(author_node):
"""Extract an author's name."""
surname = author_node.xpath("./surname/text()").extract_first()
given_names = author_node.xpath("./given-name/text()").extract_first()
suffix = author_node.xpath(".//suffix/text()").extract_first()
surname = author_node.xpath("string(./surname[1])").extract_first()
given_names = author_node.xpath("string(./given-name[1])").extract_first()
suffix = author_node.xpath("string(.//suffix[1])").extract_first()
author_name = ", ".join(el for el in (surname, given_names, suffix) if el)

return author_name
Expand Down Expand Up @@ -518,8 +518,8 @@ def get_reference_authors(ref_node):
authors = ref_node.xpath("./contribution/authors/author")
authors_names = []
for author in authors:
given_names = author.xpath("./given-name/text()").extract_first(default="")
last_names = author.xpath("./surname/text()").extract_first(default="")
given_names = author.xpath("string(./given-name[1])").extract_first(default="")
last_names = author.xpath("string(./surname[1])").extract_first(default="")
authors_names.append(" ".join([given_names, last_names]).strip())
return authors_names

Expand All @@ -536,15 +536,15 @@ def get_reference_editors(ref_node):
editors = ref_node.xpath(".//editors/authors/author")
editors_names = []
for editor in editors:
given_names = editor.xpath("./given-name/text()").extract_first(default="")
last_names = editor.xpath("./surname/text()").extract_first(default="")
given_names = editor.xpath("string(./given-name[1])").extract_first(default="")
last_names = editor.xpath("string(./surname[1])").extract_first(default="")
editors_names.append(" ".join([given_names, last_names]).strip())
return editors_names

@staticmethod
def get_reference_pages(ref_node):
first_page = ref_node.xpath(".//pages/first-page/text()").extract_first()
last_page = ref_node.xpath(".//pages/last-page/text()").extract_first()
first_page = ref_node.xpath("string(.//pages/first-page[1])").extract_first()
last_page = ref_node.xpath("string(.//pages/last-page[1])").extract_first()
return first_page, last_page

def get_reference_iter(self, ref_node):
Expand All @@ -569,31 +569,31 @@ def get_reference_iter(self, ref_node):
)

fields = [
((".//series/title/maintitle/text()"), builder.set_journal_title,),
(("string(.//series/title/maintitle[1])"), builder.set_journal_title,),
(
".//title[parent::edited-book|parent::book]/maintitle/text()",
"string(.//title[parent::edited-book|parent::book]/maintitle[1])",
builder.add_parent_title,
),
("./publisher/name/text()", builder.set_publisher),
(".//volume-nr/text()", builder.set_journal_volume),
(".//issue-nr/text()", builder.set_journal_issue),
(".//date/text()", builder.set_year),
(".//inter-ref/text()", builder.add_url),
(".//doi/text()", builder.add_uid),
("string(./publisher/name[1])", builder.set_publisher),
("string(.//volume-nr[1])", builder.set_journal_volume),
("string(.//issue-nr[1])", builder.set_journal_issue),
("string(.//date[1])", builder.set_year),
("string(.//inter-ref[1])", builder.add_url),
("string(.//doi[1])", builder.add_uid),
(
'pub-id[@pub-id-type="other"]'
'[contains(preceding-sibling::text(),"Report No")]/text()',
'string(pub-id[@pub-id-type="other"]'
'[contains(preceding-sibling::text(),"Report No")][1])',
builder.add_report_number,
),
("./title/maintitle/text()", builder.add_title),
("string(./title/maintitle[1])", builder.add_title),
]
for xpath, field_handler in fields:
value = citation_node.xpath(xpath).extract_first()
citation_node.xpath(xpath)
if value:
field_handler(value)

label_value = ref_node.xpath("./label/text()").extract_first()
label_value = ref_node.xpath("string(./label[1])").extract_first()
builder.set_label(label_value.strip("[]"))

pages = self.get_reference_pages(citation_node)
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/responses/elsevier/j.nima.2019.162728_expected.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ copyright_holder: Elsevier B.V.
copyright_statement: © 2019 Elsevier B.V. All rights reserved.
copyright_year: 2019
document_type: conference paper
license_url:
license_statement:
license_url: ''
license_statement: ''
article_type: full-length article
journal_title: Nuclear Inst. and Methods in Physics Research A
material: publication
Expand Down Expand Up @@ -66,7 +66,7 @@ dois:
- material: publication
doi: 10.1016/j.nima.2019.162728
journal_volume: '958'
journal_issue:
journal_issue: ''
is_conference_paper: true
publication_date: '2020-04-01'
collaborations: ['NA61/SHINE, CBM and BM@N collaborations']
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/responses/elsevier/j.nima.2019.162787_expected.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ copyright_holder:
copyright_statement: © 2019 Published by Elsevier B.V.
copyright_year: 2019
document_type: conference paper
license_url:
license_statement:
license_url: ''
license_statement: ''
article_type: full-length article
journal_title: Nuclear Inst. and Methods in Physics Research A
material: publication
Expand All @@ -24,7 +24,7 @@ dois:
- material: publication
doi: 10.1016/j.nima.2019.162787
journal_volume: '958'
journal_issue:
journal_issue: ''
is_conference_paper: true
publication_date: '2020-04-01'
collaborations: ['on behalf of LHCb Muon group']
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/responses/elsevier/j.nimb.2019.04.063_expected.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,10 @@ dois:
- material: publication
doi: 10.1016/j.nimb.2019.04.063
journal_volume: '463'
journal_issue:
journal_issue: ''
is_conference_paper: false
publication_date: '2020-01-15'
collaborations: []
collaborations: ['']
documents:
- key: j.nimb.2019.04.063.xml
url: http://example.org/j.nimb.2019.04.063.xml
Expand Down
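8 changes: 4 additions & 4 deletions tests/unit/responses/elsevier/j.nuclphysa.2020.121991_expected.yml
Original file line number Diff line number Diff line change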
Expand Up @@ -3,8 +3,8 @@ copyright_holder: Elsevier B.V.
copyright_statement: © 2020 Elsevier B.V. All rights reserved.
copyright_year: 2020
document_type: article
license_url:
license_statement:
license_url: ''
license_statement: ''
article_type: full-length article
journal_title: Nuclear Physics A
material: publication
Expand Down Expand Up @@ -34,10 +34,10 @@ dois:
- material: publication
doi: 10.1016/j.nuclphysa.2020.121991
journal_volume: '1002'
journal_issue:
journal_issue: ''
is_conference_paper: false
publication_date: 2020-10
collaborations: []
collaborations: ['']
documents:
- key: j.nuclphysa.2020.121991.xml
url: http://example.org/j.nuclphysa.2020.121991.xml
Expand Down

0 comments on commit 4b9a89b

Please sign in to comment.