Commit

Merge 3866ae5 into be5be0a
MJedr committed Jan 28, 2021
2 parents be5be0a + 3866ae5 commit 4a81e6c
Showing 11 changed files with 69 additions and 57 deletions.
8 changes: 6 additions & 2 deletions hepcrawl/parsers/arxiv.py
@@ -11,6 +11,7 @@

from __future__ import absolute_import, division, print_function

+from itertools import chain
import re

import six
@@ -36,6 +37,8 @@
)
RE_PAGES = re.compile(r'(?i)(\d+)\s*pages?\b')

+RE_DOIS = re.compile(r'[,;\s]+(?=\s*10[.]\d{4,})')


class ArxivParser(object):
"""Parser for the arXiv format.
@@ -227,16 +230,17 @@ def collaborations(self):
    @property
    def dois(self):
        doi_values = self.root.xpath('.//doi/text()').extract()
+        doi_values_splitted = chain.from_iterable([re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [
-            {'doi': value, 'material': 'publication'} for value in doi_values
+            {'doi': value, 'material': 'publication'} for value in doi_values_splitted
        ]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
-        return [{'url':license, 'material': self.material} for license in licenses]
+        return [{'url': license, 'material': self.material} for license in licenses]

    @property
    def material(self):
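As context for the DOI change above, a quick sketch (not part of the commit) of how the new RE_DOIS pattern behaves: it splits a single <doi> element containing several DOIs separated by commas, semicolons, or whitespace, using a zero-width lookahead so each following DOI is kept intact. The sample string is the one added to the arXiv fixture further down:

import re

# Split on runs of commas/semicolons/whitespace that are followed by
# something DOI-shaped: "10." plus at least four digits.
RE_DOIS = re.compile(r'[,;\s]+(?=\s*10[.]\d{4,})')

value = '10.1103/PhysRevD.93.016005, 10.1103/PhysRevD.98.079901;10.1103/PhysRevD.98.079903'
assert re.split(RE_DOIS, value) == [
    '10.1103/PhysRevD.93.016005',
    '10.1103/PhysRevD.98.079901',
    '10.1103/PhysRevD.98.079903',
]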
3 changes: 2 additions & 1 deletion hepcrawl/testlib/fixtures.py
@@ -12,6 +12,7 @@
import os
import json
import shutil
+import yaml

from scrapy.http import Request, TextResponse
from scrapy.selector import Selector
@@ -130,7 +131,7 @@ def expected_json_results_from_file(*path_chunks, **kwargs):
    response_file = get_test_suite_path(*path_chunks, test_suite=test_suite)

    with open(response_file) as fd:
-        expected_data = json.load(fd)
+        expected_data = yaml.safe_load(fd)

    return expected_data

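Why this swap is safe for the existing fixtures: yaml.safe_load also parses plain JSON, since JSON's braces, brackets, and quoted strings are valid YAML flow syntax, so the .json expected files keep loading unchanged while YAML fixtures become possible. A minimal sanity check of that assumption (the snippet is hypothetical):

import yaml

# A JSON expected-results snippet parses the same way under safe_load.
snippet = '{"titles": [{"title": "Some record"}], "number_of_pages": 8}'
assert yaml.safe_load(snippet) == {
    'titles': [{'title': 'Some record'}],
    'number_of_pages': 8,
}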
5 changes: 5 additions & 0 deletions hepcrawl/testlib/utils.py
@@ -39,3 +39,8 @@ def deep_sort(element):
        return sorted([deep_sort(item) for item in element])

    return element
+
+
+def sort_list_of_records_by_record_title(list_to_sort):
+    """Sort list of records by record title"""
+    return sorted(list_to_sort, key=lambda k: k['titles'][0]['title'])
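For illustration, the new helper applied to two made-up records (titles hypothetical):

records = [
    {'titles': [{'title': 'Zeta functions in field theory'}]},
    {'titles': [{'title': 'Axion dark matter'}]},
]
# Keys off the first title; this assumes every record carries at least
# one title, which holds for the records these tests compare.
assert sort_list_of_records_by_record_title(records) == [
    {'titles': [{'title': 'Axion dark matter'}]},
    {'titles': [{'title': 'Zeta functions in field theory'}]},
]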
5 changes: 0 additions & 5 deletions tests/functional/cds/fixtures/cds_expected.json
@@ -19,11 +19,6 @@
"_collections": [
"Literature"
],
"public_notes": [
{
"value": "Submitted to None"
}
],
"number_of_pages": 8,
"inspire_categories": [
{
23 changes: 14 additions & 9 deletions tests/functional/cds/test_cds.py
@@ -17,7 +17,7 @@
from deepdiff import DeepDiff
from hepcrawl.testlib.tasks import app as celery_app
from hepcrawl.testlib.celery_monitor import CeleryMonitor
-from hepcrawl.testlib.utils import get_crawler_instance
+from hepcrawl.testlib.utils import get_crawler_instance, sort_list_of_records_by_record_title
from hepcrawl.testlib.fixtures import (
    get_test_suite_path,
    expected_json_results_from_file,
@@ -116,13 +116,18 @@ def test_cds(

    crawl_result = crawl_results[0]

-    gotten_results = [
-        override_generated_fields(result['record'])
-        for result in crawl_result['results_data']
-    ]
-    expected_results = [
-        override_generated_fields(expected) for expected in expected_results
-    ]
+    gotten_results = sort_list_of_records_by_record_title(
+        [
+            override_generated_fields(result['record'])
+            for result in crawl_result['results_data']
+        ]
+    )
+    expected_results = sort_list_of_records_by_record_title(
+        [
+            override_generated_fields(expected) for expected in expected_results
+        ]
+    )

-    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
+    for record, expected_record in zip(gotten_results, expected_results):
+        assert DeepDiff(record, expected_record, ignore_order=True) == {}
    assert not crawl_result['errors']
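A note on the assertion change (my reading of the intent, not stated in the commit): with both lists sorted by title, comparing zipped pairs makes a failing test report the DeepDiff of one specific record instead of a diff spanning every record at once. An equivalent sketch that also surfaces the offending diff in the failure message:

for record, expected_record in zip(gotten_results, expected_results):
    diff = DeepDiff(record, expected_record, ignore_order=True)
    # On mismatch, the assertion message shows this record's diff only.
    assert diff == {}, diff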
4 changes: 2 additions & 2 deletions tests/functional/desy/fixtures/desy_records_s3_expected.json
@@ -224,10 +224,10 @@
"core": true,
"documents": [
{
"url":"http://localstack:4566/downloaded/full/9fa53d94584e605ee2f33c7ca0f273872a0ef693.pdf?AWSAccessKeyId=key",
"url":"http://localstack:4566/downloaded/full/2eeed0d0cb1b2fcffd6dd16e2097fcf055499152.pdf?AWSAccessKeyId=key",
"fulltext":true,
"key":"document",
"original_url":"http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf"
"original_url":"https://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf"
}
],
"thesis_info": {
@@ -238,7 +238,7 @@
<subfield code="u">DESY</subfield>
</datafield>
<datafield tag="FFT" ind1=" " ind2=" ">
<subfield code="a">http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf</subfield>
<subfield code="a">https://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf</subfield>
<subfield code="d">Fulltext</subfield>
<subfield code="t">INSPIRE-PUBLIC</subfield>
</datafield>
46 changes: 19 additions & 27 deletions tests/functional/desy/test_desy.py
@@ -27,7 +27,7 @@
    clean_dir,
)
from hepcrawl.testlib.tasks import app as celery_app
-from hepcrawl.testlib.utils import get_crawler_instance
+from hepcrawl.testlib.utils import get_crawler_instance, sort_list_of_records_by_record_title


S3_CONFIG = {
@@ -216,31 +216,26 @@ def test_desy(
)

    crawl_result = crawl_results[0]
+    gotten_records = sort_list_of_records_by_record_title(
+        [
+            result['record'] for result in crawl_result['results_data']
+        ]
+    )
+    expected_results = sort_list_of_records_by_record_title(expected_results)

-    gotten_records = [
-        result['record'] for result in crawl_result['results_data']
-    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

-    gotten_records = sorted(
-        gotten_records,
-        key=lambda record: record['titles'][0]['title'],
-    )
-    expected_results = sorted(
-        expected_results,
-        key=lambda result: result['titles'][0]['title'],
-    )

    # preprocess s3 urls
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

+    for record, expected_record in zip(gotten_records, expected_results):
+        assert DeepDiff(record, expected_record, ignore_order=True) == {}

-    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']


@@ -322,29 +317,26 @@ def test_desy_crawl_twice(expected_results, settings, cleanup):

    crawl_result = crawl_results[0]

-    gotten_records = [
-        result['record'] for result in crawl_result['results_data']
-    ]
+    gotten_records = sort_list_of_records_by_record_title(
+        [
+            result['record'] for result in crawl_result['results_data']
+        ]
+    )
+    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

-    gotten_records = sorted(
-        gotten_records,
-        key=lambda record: record['titles'][0]['title'],
-    )
-    expected_results = sorted(
-        expected_results,
-        key=lambda result: result['titles'][0]['title'],
-    )

    # preprocess s3 urls
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

-    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
+    for record, expected_record in zip(gotten_records, expected_results):
+        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
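The presigned-URL preprocessing in both tests above keeps only the stable part of each S3 document URL, since everything from &Expires= onward changes on every crawl. A sketch with a made-up URL (hash and query parameters hypothetical):

url = (
    'http://localstack:4566/downloaded/full/document.pdf'
    '?AWSAccessKeyId=key&Expires=1611800000&Signature=abc'
)
# Drop the volatile expiry/signature suffix before comparing to fixtures.
assert url.split('&Expires=')[0] == (
    'http://localstack:4566/downloaded/full/document.pdf?AWSAccessKeyId=key'
)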
18 changes: 9 additions & 9 deletions tests/functional/wsp/test_wsp.py
@@ -24,7 +24,7 @@
    clean_dir,
)
from hepcrawl.testlib.tasks import app as celery_app
-from hepcrawl.testlib.utils import get_crawler_instance
+from hepcrawl.testlib.utils import get_crawler_instance, sort_list_of_records_by_record_title


@pytest.fixture(scope="function")
@@ -146,13 +146,13 @@ def test_wsp(expected_results, settings, cleanup):

    crawl_result = crawl_results[0]

-    gotten_results = [
+    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
-    ]
-    expected_results = [
+    ])
+    expected_results = sort_list_of_records_by_record_title([
        override_generated_fields(expected) for expected in expected_results
-    ]
+    ])

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results
@@ -205,13 +205,13 @@ def test_wsp_ftp_crawl_twice(expected_results, settings, cleanup):

    crawl_result = crawl_results[0]

-    gotten_results = [
+    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
-    ]
-    expected_results = [
+    ])
+    expected_results = sort_list_of_records_by_record_title([
        override_generated_fields(expected) for expected in expected_results
-    ]
+    ])

    assert gotten_results == expected_results
    assert not crawl_result['errors']
2 changes: 1 addition & 1 deletion tests/unit/responses/arxiv/sample_arxiv_record0.xml
@@ -34,7 +34,7 @@
  <comments>6 pages, 4 figures, conference paper</comments>
  <report-no>YITP-2016-26</report-no>
  <journal-ref>Phys.Rev. D93 (2015) 016005</journal-ref>
- <doi>10.1103/PhysRevD.93.016005</doi>
+ <doi>10.1103/PhysRevD.93.016005, 10.1103/PhysRevD.98.079901;10.1103/PhysRevD.98.079903</doi>
  <license>https://creativecommons.org/licenses/by/3.0/</license>
  <abstract>We study the dynamics of quantum coherence under Unruh thermal noise and seek under which condition the coherence can be frozen in a relativistic setting. We find that the quantum coherence can not be frozen for any acceleration due to the effect of Unruh thermal noise. We also find that quantum coherence is more robust than entanglement under the effect of Unruh thermal noise and therefore the coherence type quantum resources are more accessible for relativistic quantum information processing tasks. Besides, the dynamic of quantum coherence is found to be more sensitive than entanglement to the preparation of the detectors' initial state and the atom-field coupling strength, while it is less sensitive than entanglement to the acceleration of the detector.</abstract>
</arXiv>
10 changes: 10 additions & 0 deletions tests/unit/test_arxiv_single.py
@@ -144,6 +144,16 @@ def test_dois(results):
            'source': 'arXiv',
            'value': '10.1103/PhysRevD.93.016005',
            'material': 'publication',
        },
+        {
+            "value": "10.1103/PhysRevD.98.079901",
+            "source": "arXiv",
+            "material": "publication"
+        },
+        {
+            "value": "10.1103/PhysRevD.98.079903",
+            "source": "arXiv",
+            "material": "publication"
+        }
    ]
    for record in results:
