Commit

Merge 3866ae5 into be5be0a
MJedr committed Jan 28, 2021
2 parents be5be0a + 3866ae5 commit 4a81e6c
Showing 11 changed files with 69 additions and 57 deletions.
8 changes: 6 additions & 2 deletions hepcrawl/parsers/arxiv.py
@@ -11,6 +11,7 @@

from __future__ import absolute_import, division, print_function

+from itertools import chain
import re

import six
@@ -36,6 +37,8 @@
)
RE_PAGES = re.compile(r'(?i)(\d+)\s*pages?\b')

+RE_DOIS = re.compile(r'[,;\s]+(?=\s*10[.]\d{4,})')


class ArxivParser(object):
"""Parser for the arXiv format.
@@ -227,16 +230,17 @@ def collaborations(self):
    @property
    def dois(self):
        doi_values = self.root.xpath('.//doi/text()').extract()
+        doi_values_splitted = chain.from_iterable([re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [
-            {'doi': value, 'material': 'publication'} for value in doi_values
+            {'doi': value, 'material': 'publication'} for value in doi_values_splitted
        ]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
-        return [{'url':license, 'material': self.material} for license in licenses]
+        return [{'url': license, 'material': self.material} for license in licenses]

    @property
    def material(self):
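As context for the DOI change above, a quick sketch (not part of the commit) of how the new RE_DOIS pattern behaves: it splits a single <doi> element containing several DOIs separated by commas, semicolons, or whitespace, using a zero-width lookahead so each following DOI is kept intact. The sample string is the one added to the arXiv fixture further down:

import re

# Split on runs of commas/semicolons/whitespace that are followed by
# something DOI-shaped: "10." plus at least four digits.
RE_DOIS = re.compile(r'[,;\s]+(?=\s*10[.]\d{4,})')

value = '10.1103/PhysRevD.93.016005, 10.1103/PhysRevD.98.079901;10.1103/PhysRevD.98.079903'
assert re.split(RE_DOIS, value) == [
    '10.1103/PhysRevD.93.016005',
    '10.1103/PhysRevD.98.079901',
    '10.1103/PhysRevD.98.079903',
]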
3 changes: 2 additions & 1 deletion hepcrawl/testlib/fixtures.py
@@ -12,6 +12,7 @@
import os
import json
import shutil
+import yaml

from scrapy.http import Request, TextResponse
from scrapy.selector import Selector
@@ -130,7 +131,7 @@ def expected_json_results_from_file(*path_chunks, **kwargs):
    response_file = get_test_suite_path(*path_chunks, test_suite=test_suite)

    with open(response_file) as fd:
-        expected_data = json.load(fd)
+        expected_data = yaml.safe_load(fd)

    return expected_data

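Why this swap is safe for the existing fixtures: yaml.safe_load also parses plain JSON, since JSON's braces, brackets, and quoted strings are valid YAML flow syntax, so the .json expected files keep loading unchanged while YAML fixtures become possible. A minimal sanity check of that assumption (the snippet is hypothetical):

import yaml

# A JSON expected-results snippet parses the same way under safe_load.
snippet = '{"titles": [{"title": "Some record"}], "number_of_pages": 8}'
assert yaml.safe_load(snippet) == {
    'titles': [{'title': 'Some record'}],
    'number_of_pages': 8,
}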
5 changes: 5 additions & 0 deletions hepcrawl/testlib/utils.py
@@ -39,3 +39,8 @@ def deep_sort(element):
        return sorted([deep_sort(item) for item in element])

    return element
+
+
+def sort_list_of_records_by_record_title(list_to_sort):
+    """Sort list of records by record title"""
+    return sorted(list_to_sort, key=lambda k: k['titles'][0]['title'])
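For illustration, the new helper applied to two made-up records (titles hypothetical):

records = [
    {'titles': [{'title': 'Zeta functions in field theory'}]},
    {'titles': [{'title': 'Axion dark matter'}]},
]
# Keys off the first title; this assumes every record carries at least
# one title, which holds for the records these tests compare.
assert sort_list_of_records_by_record_title(records) == [
    {'titles': [{'title': 'Axion dark matter'}]},
    {'titles': [{'title': 'Zeta functions in field theory'}]},
]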
5 changes: 0 additions & 5 deletions tests/functional/cds/fixtures/cds_expected.json
@@ -19,11 +19,6 @@
"_collections": [
"Literature"
],
"public_notes": [
{
"value": "Submitted to None"
}
],
"number_of_pages": 8,
"inspire_categories": [
{
23 changes: 14 additions & 9 deletions tests/functional/cds/test_cds.py
@@ -17,7 +17,7 @@
from deepdiff import DeepDiff
from hepcrawl.testlib.tasks import app as celery_app
from hepcrawl.testlib.celery_monitor import CeleryMonitor
-from hepcrawl.testlib.utils import get_crawler_instance
+from hepcrawl.testlib.utils import get_crawler_instance, sort_list_of_records_by_record_title
from hepcrawl.testlib.fixtures import (
    get_test_suite_path,
    expected_json_results_from_file,
@@ -116,13 +116,18 @@ def test_cds(

    crawl_result = crawl_results[0]

-    gotten_results = [
-        override_generated_fields(result['record'])
-        for result in crawl_result['results_data']
-    ]
-    expected_results = [
-        override_generated_fields(expected) for expected in expected_results
-    ]
+    gotten_results = sort_list_of_records_by_record_title(
+        [
+            override_generated_fields(result['record'])
+            for result in crawl_result['results_data']
+        ]
+    )
+    expected_results = sort_list_of_records_by_record_title(
+        [
+            override_generated_fields(expected) for expected in expected_results
+        ]
+    )

-    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
+    for record, expected_record in zip(gotten_results, expected_results):
+        assert DeepDiff(record, expected_record, ignore_order=True) == {}
    assert not crawl_result['errors']
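A note on the assertion change (my reading of the intent, not stated in the commit): with both lists sorted by title, comparing zipped pairs makes a failing test report the DeepDiff of one specific record instead of a diff spanning every record at once. An equivalent sketch that also surfaces the offending diff in the failure message:

for record, expected_record in zip(gotten_results, expected_results):
    diff = DeepDiff(record, expected_record, ignore_order=True)
    # On mismatch, the assertion message shows this record's diff only.
    assert diff == {}, diff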
4 changes: 2 additions & 2 deletions tests/functional/desy/fixtures/desy_records_s3_expected.json
@@ -224,10 +224,10 @@
"core": true,
"documents": [
{
"url":"http://localstack:4566/downloaded/full/9fa53d94584e605ee2f33c7ca0f273872a0ef693.pdf?AWSAccessKeyId=key",
"url":"http://localstack:4566/downloaded/full/2eeed0d0cb1b2fcffd6dd16e2097fcf055499152.pdf?AWSAccessKeyId=key",
"fulltext":true,
"key":"document",
"original_url":"http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf"
"original_url":"https://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf"
}
],
"thesis_info": {
@@ -238,7 +238,7 @@
<subfield code="u">DESY</subfield>
</datafield>
<datafield tag="FFT" ind1=" " ind2=" ">
<subfield code="a">http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf</subfield>
<subfield code="a">https://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf</subfield>
<subfield code="d">Fulltext</subfield>
<subfield code="t">INSPIRE-PUBLIC</subfield>
</datafield>
46 changes: 19 additions & 27 deletions tests/functional/desy/test_desy.py
@@ -27,7 +27,7 @@
    clean_dir,
)
from hepcrawl.testlib.tasks import app as celery_app
-from hepcrawl.testlib.utils import get_crawler_instance
+from hepcrawl.testlib.utils import get_crawler_instance, sort_list_of_records_by_record_title


S3_CONFIG = {
@@ -216,31 +216,26 @@ def test_desy(
)

    crawl_result = crawl_results[0]
+    gotten_records = sort_list_of_records_by_record_title(
+        [
+            result['record'] for result in crawl_result['results_data']
+        ]
+    )
+    expected_results = sort_list_of_records_by_record_title(expected_results)

-    gotten_records = [
-        result['record'] for result in crawl_result['results_data']
-    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

-    gotten_records = sorted(
-        gotten_records,
-        key=lambda record: record['titles'][0]['title'],
-    )
-    expected_results = sorted(
-        expected_results,
-        key=lambda result: result['titles'][0]['title'],
-    )

    # preprocess s3 urls
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

+    for record, expected_record in zip(gotten_records, expected_results):
+        assert DeepDiff(record, expected_record, ignore_order=True) == {}

-    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
    assert not crawl_result['errors']


@@ -322,29 +317,26 @@ def test_desy_crawl_twice(expected_results, settings, cleanup):

    crawl_result = crawl_results[0]

-    gotten_records = [
-        result['record'] for result in crawl_result['results_data']
-    ]
+    gotten_records = sort_list_of_records_by_record_title(
+        [
+            result['record'] for result in crawl_result['results_data']
+        ]
+    )
+    expected_results = sort_list_of_records_by_record_title(expected_results)

    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

-    gotten_records = sorted(
-        gotten_records,
-        key=lambda record: record['titles'][0]['title'],
-    )
-    expected_results = sorted(
-        expected_results,
-        key=lambda result: result['titles'][0]['title'],
-    )

    # preprocess s3 urls
    for rec in gotten_records:
        for document in rec.get('documents', []):
            if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
                assert "&Expires=" in document['url']
                document['url'] = document['url'].split('&Expires=')[0]

-    assert DeepDiff(gotten_records, expected_results, ignore_order=True) == {}
+    for record, expected_record in zip(gotten_records, expected_results):
+        assert DeepDiff(record, expected_record, ignore_order=True) == {}

    assert not crawl_result['errors']

    # Second crawl
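The presigned-URL preprocessing in both tests above keeps only the stable part of each S3 document URL, since everything from &Expires= onward changes on every crawl. A sketch with a made-up URL (hash and query parameters hypothetical):

url = (
    'http://localstack:4566/downloaded/full/document.pdf'
    '?AWSAccessKeyId=key&Expires=1611800000&Signature=abc'
)
# Drop the volatile expiry/signature suffix before comparing to fixtures.
assert url.split('&Expires=')[0] == (
    'http://localstack:4566/downloaded/full/document.pdf?AWSAccessKeyId=key'
)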
18 changes: 9 additions & 9 deletions tests/functional/wsp/test_wsp.py
@@ -24,7 +24,7 @@
    clean_dir,
)
from hepcrawl.testlib.tasks import app as celery_app
-from hepcrawl.testlib.utils import get_crawler_instance
+from hepcrawl.testlib.utils import get_crawler_instance, sort_list_of_records_by_record_title


@pytest.fixture(scope="function")
@@ -146,13 +146,13 @@ def test_wsp(expected_results, settings, cleanup):

    crawl_result = crawl_results[0]

-    gotten_results = [
+    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
-    ]
-    expected_results = [
+    ])
+    expected_results = sort_list_of_records_by_record_title([
        override_generated_fields(expected) for expected in expected_results
-    ]
+    ])

    assert DeepDiff(gotten_results, expected_results, ignore_order=True) == {}
    assert gotten_results == expected_results
@@ -205,13 +205,13 @@ def test_wsp_ftp_crawl_twice(expected_results, settings, cleanup):

    crawl_result = crawl_results[0]

-    gotten_results = [
+    gotten_results = sort_list_of_records_by_record_title([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
-    ]
-    expected_results = [
+    ])
+    expected_results = sort_list_of_records_by_record_title([
        override_generated_fields(expected) for expected in expected_results
-    ]
+    ])

    assert gotten_results == expected_results
    assert not crawl_result['errors']
2 changes: 1 addition & 1 deletion tests/unit/responses/arxiv/sample_arxiv_record0.xml
@@ -34,7 +34,7 @@
  <comments>6 pages, 4 figures, conference paper</comments>
  <report-no>YITP-2016-26</report-no>
  <journal-ref>Phys.Rev. D93 (2015) 016005</journal-ref>
- <doi>10.1103/PhysRevD.93.016005</doi>
+ <doi>10.1103/PhysRevD.93.016005, 10.1103/PhysRevD.98.079901;10.1103/PhysRevD.98.079903</doi>
  <license>https://creativecommons.org/licenses/by/3.0/</license>
  <abstract>We study the dynamics of quantum coherence under Unruh thermal noise and seek under which condition the coherence can be frozen in a relativistic setting. We find that the quantum coherence can not be frozen for any acceleration due to the effect of Unruh thermal noise. We also find that quantum coherence is more robust than entanglement under the effect of Unruh thermal noise and therefore the coherence type quantum resources are more accessible for relativistic quantum information processing tasks. Besides, the dynamic of quantum coherence is found to be more sensitive than entanglement to the preparation of the detectors' initial state and the atom-field coupling strength, while it is less sensitive than entanglement to the acceleration of the detector.</abstract>
</arXiv>
10 changes: 10 additions & 0 deletions tests/unit/test_arxiv_single.py
@@ -144,6 +144,16 @@ def test_dois(results):
            'source': 'arXiv',
            'value': '10.1103/PhysRevD.93.016005',
            'material': 'publication',
        },
+        {
+            "value": "10.1103/PhysRevD.98.079901",
+            "source": "arXiv",
+            "material": "publication"
+        },
+        {
+            "value": "10.1103/PhysRevD.98.079903",
+            "source": "arXiv",
+            "material": "publication"
+        }
    ]
    for record in results:
