Skip to content

Commit

Permalink
RefExtract: Update eprint extraction in engine and regex
Browse files Browse the repository at this point in the history
Signed-off-by: Melissa Clegg <cleggm1@fnal.gov>
  • Loading branch information
cleggm1 committed Sep 17, 2019
1 parent 7ac5e36 commit bcdc081
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 12 deletions.
10 changes: 6 additions & 4 deletions refextract/references/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,11 +586,13 @@ def add_recid_elements(splitted_citations):


# Matches abstract/PDF URLs on arxiv.org, its cn./de./in./lanl. mirrors and
# xxx.lanl.gov.  Group 1 captures the identifier itself; a trailing version
# suffix such as "v1" and a trailing ".pdf" are matched but left out of it.
_RE_ARXIV_URL = re.compile(
    r'^https?:\/\/(?:(?:cn\.|de\.|in\.|lanl\.)?arxiv\.org|xxx\.lanl\.gov)'
    r'\/(?:abs|pdf)\/(\S+\d{4})(?:v\d{1,2})?(?:\.pdf)?',
    re.UNICODE | re.IGNORECASE)


def arxiv_urls_to_report_numbers(citation_elements):
    """Turn URL elements that point at arXiv into report-number elements.

    Every element whose ``type`` is ``'URL'`` and whose ``url_string``
    matches an arXiv abs/pdf URL is modified in place: ``type`` becomes
    ``'REPORTNUMBER'`` and ``report_num`` is set to the identifier captured
    from the URL (e.g. ``astro-ph/9508025``).  All other elements are left
    untouched.

    :param citation_elements: iterable of citation element dicts; elements
        of type ``'URL'`` must carry a ``url_string`` key.
    :return: None, the elements are updated in place.
    """
    for el in citation_elements:
        # Only non-empty URL elements are candidates for conversion.
        if el['type'] != 'URL' or not el['url_string']:
            continue
        matchobj = _RE_ARXIV_URL.match(el['url_string'])
        if matchobj:
            el['type'] = 'REPORTNUMBER'
            el['report_num'] = matchobj.group(1)


def look_for_hdl(citation_elements):
Expand Down
22 changes: 14 additions & 8 deletions refextract/references/regexs.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,20 +72,26 @@ def compute_pos_patterns(patterns):

# Pattern for arxiv numbers (4-digit sequence number), e.g.
#   arxiv 9910-1234v9 [physics.ins-det]
#   https://arxiv.org/abs/0708.0882v1
# The identifier may be introduced by the literal "ARXIV" or by an abs/pdf
# URL on arxiv.org or lanl.gov; an optional version (up to two digits), an
# optional ".pdf" and an optional "[category]" suffix may follow.
# A plain raw string is used: the pattern is pure ASCII, and the "ur" prefix
# is a syntax error on Python 3.
re_arxiv = re.compile(r"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)|
(?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
(?:ARXIV[\s:-]*))(?P<year>\d{2})-?(?P<month>\d{2})
[\s.-]*(?P<num>\d{4})(?!\d)(?:[\s-]*V(?P<version>\d{1,2}))?
(?:\.pdf)?
\s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)

# Pattern for arxiv numbers with a 5-digit sequence number, e.g.
#   arXiv:1602.03988v2 [hep-ph]
#   https://arxiv.org/pdf/1712.03976.pdf
# The year is restricted to 13-89 and the month to 01-12.  Accepts the same
# prefixes (literal "ARXIV" or an arxiv.org / lanl.gov abs|pdf URL) and the
# same optional version, ".pdf" and "[category]" tail as the 4-digit pattern.
# A plain raw string is used: the pattern is pure ASCII, and the "ur" prefix
# is a syntax error on Python 3.
re_arxiv_5digits = re.compile(r"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)|
(?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
(?:ARXIV[\s:-]*))(?P<year>(1[3-9]|[2-8][0-9]))-?(?P<month>(0[1-9]|1[0-2]))
[\s.-]*(?P<num>\d{5})(?!\d)
(?:[\s-]*V(?P<version>\d{1,2}))?
(?:\.pdf)?
\s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)

# Pattern for arxiv numbers catchup: a 3-digit sequence number that is
# followed by a mandatory "[category]" suffix, e.g.
#   arxiv:9910-123 [physics.ins-det]
# Accepts the literal "ARXIV" or an arxiv.org / lanl.gov abs|pdf URL as
# prefix.  Here the ``suffix`` group holds the category without brackets.
# A plain raw string is used: the pattern is pure ASCII, and the "ur" prefix
# is a syntax error on Python 3.
RE_ARXIV_CATCHUP = re.compile(r"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)|
(?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
(?:ARXIV[\s:-]*))(?P<year>\d{2})-?(?P<month>\d{2})
[\s.-]*(?P<num>\d{3})
\s*\[(?P<suffix>[A-Z.-]+)\]""", re.VERBOSE | re.UNICODE | re.IGNORECASE)

Expand Down
32 changes: 32 additions & 0 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,38 @@ def test_doi_subdivisions():
assert references[0]['linemarker'] == [u'10']


def test_old_arxiv():
    """An old-style ``arXiv:archive/number`` id is extracted as report number."""
    ref_line = u'[20] B. Moore, T. R. Quinn, F. Governato, J. Stadel, and G. Lake, "Cold collapse and the corecatastrophe," Mon. Not. Roy. Astron. Soc.310(1999) 1147–1152, arXiv:astro-ph/9903164 [astro-ph].'
    res = get_references(ref_line)
    references = res[0]
    # The trailing "[astro-ph]" category must not end up in the report number.
    assert references[0]['reportnumber'] == [u'astro-ph/9903164']
    assert references[0]['linemarker'] == [u'20']


def test_old_lanl_url_version():
    """An xxx.lanl.gov pdf URL with a version yields the bare old-style id."""
    ref_line = u'[44] Navarro, J.F., Frenk, C.S., White, S.D.M. http://xxx.lanl.gov/pdf/astro-ph/9508025v1'
    res = get_references(ref_line)
    references = res[0]
    # The "v1" version suffix of the URL is expected to be dropped.
    assert references[0]['reportnumber'] == [u'astro-ph/9508025']
    assert references[0]['linemarker'] == [u'44']


def test_old_arxiv_url():
    """An https arxiv.org abs URL with an old-style id yields that id."""
    ref_line = u'[298] V. Allori, D. Duerr, S. Goldstein, and N. Zanghi. 2002. Seven steps towards the classical world. Journal of Optics B : Quantum and semiclassical Optics, Volume 4, number 4. https://arxiv.org/abs/quant-ph/0112005'
    res = get_references(ref_line)
    references = res[0]
    assert references[0]['reportnumber'] == [u'quant-ph/0112005']
    assert references[0]['linemarker'] == [u'298']


def test_old_arxiv_mirror_url():
    """An abs URL on the cn.arxiv.org mirror yields the old-style id."""
    ref_line = u'[13] A. Zupanc, et al, Belle Collaboration, https://cn.arxiv.org/abs/hep-ex/0703040 2007'
    res = get_references(ref_line)
    references = res[0]
    # The year following the URL must not be absorbed into the report number.
    assert references[0]['reportnumber'] == [u'hep-ex/0703040']
    assert references[0]['linemarker'] == [u'13']


def test_get_plaintext_document_body(tmpdir):
input = [u"Some text\n", u"on multiple lines\n"]
f = tmpdir.join("plain.txt")
Expand Down
24 changes: 24 additions & 0 deletions tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,27 @@ def test_5_digits_suffix_version_new_2012():
ref_line = u"""{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"""
r = tag_arxiv(ref_line)
assert r.strip(': ') == u"{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"


def test_4_digits_new_url():
    """An arxiv.org abs URL with a 4-digit number is tagged as ``arXiv:`` id."""
    ref_line = u"""{any prefix}https://arxiv.org/abs/1311.2198{any postfix}"""
    r = tag_arxiv(ref_line)
    assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1311.2198</cds.REPORTNUMBER>{any postfix}"


def test_5_digits_new_url():
    """An arxiv.org abs URL with a 5-digit number is tagged as ``arXiv:`` id."""
    ref_line = u"""{any prefix}https://arxiv.org/abs/1602.03988{any postfix}"""
    r = tag_arxiv(ref_line)
    assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1602.03988</cds.REPORTNUMBER>{any postfix}"


def test_4_digits_version_new_url():
    """The version suffix of an arxiv.org abs URL is dropped from the tag."""
    ref_line = u"""{any prefix}https://arxiv.org/abs/0708.0882v1{any postfix}"""
    r = tag_arxiv(ref_line)
    assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:0708.0882</cds.REPORTNUMBER>{any postfix}"


def test_5_digits_new_pdf_url():
    """The ``.pdf`` extension of an arxiv.org pdf URL is dropped from the tag."""
    ref_line = u"""{any prefix}https://arxiv.org/pdf/1712.03976.pdf{any postfix}"""
    r = tag_arxiv(ref_line)
    assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1712.03976</cds.REPORTNUMBER>{any postfix}"

0 comments on commit bcdc081

Please sign in to comment.