Skip to content

Commit

Permalink
Merge pull request #219 from szymonlopaciuk/strict_spider_init
Browse files Browse the repository at this point in the history
utils: strictly check kwargs in spiders (#218)
  • Loading branch information
david-caro authored Feb 1, 2018
2 parents 5bd3c5f + dfd1f97 commit 0ee8f4e
Show file tree
Hide file tree
Showing 23 changed files with 147 additions and 10 deletions.
2 changes: 2 additions & 0 deletions hepcrawl/spiders/alpha_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ..utils import (
has_numbers,
ParsedItem,
strict_kwargs,
)


Expand All @@ -49,6 +50,7 @@ class AlphaSpider(StatefulSpider, CrawlSpider):
domain = "http://alpha.web.cern.ch/"
itertag = "//div[@class = 'node node-thesis']"

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct Alpha spider"""
super(AlphaSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/aps_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
get_nested,
build_dict,
ParsedItem,
strict_kwargs,
)


Expand All @@ -47,6 +48,7 @@ class APSSpider(StatefulSpider):
name = 'APS'
aps_base_url = "http://harvest.aps.org/v2/journals/articles"

@strict_kwargs
def __init__(self, url=None, from_date=None, until_date=None,
date="published", journals=None, sets=None, per_page=100,
**kwargs):
Expand Down
25 changes: 20 additions & 5 deletions hepcrawl/spiders/arxiv_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
get_licenses,
split_fullname,
ParsedItem,
strict_kwargs,
)

RE_CONFERENCE = re.compile(
Expand All @@ -47,15 +48,29 @@ class ArxivSpider(OAIPMHSpider):
Using OAI-PMH XML files::
$ scrapy crawl arXiv \\
-a "oai_set=physics:hep-th" -a "from_date=2017-12-13"
-a "sets=physics:hep-th" -a "from_date=2017-12-13"
"""
name = 'arXiv'

def __init__(self, *args, **kwargs):
kwargs.setdefault('url', 'http://export.arxiv.org/oai2')
kwargs.setdefault('format', 'arXiv')
super(ArxivSpider, self).__init__(*args, **kwargs)
@strict_kwargs
def __init__(
self,
url='http://export.arxiv.org/oai2',
format='arXiv',
sets=None,
from_date=None,
until_date=None,
**kwargs
):
"""Construct the arXiv spider.

Supplies arXiv-specific defaults for ``url`` (the arXiv OAI-PMH
endpoint) and ``format`` and forwards every argument, unchanged and
by keyword, to the parent class constructor.

Args:
    url: endpoint to harvest from; defaults to the arXiv OAI2 export.
    format: value passed to the parent as ``format``; defaults to
        ``'arXiv'``.
    sets: passed through as-is, e.g. ``physics:hep-th`` when given on
        the command line with ``-a "sets=..."``.
    from_date: passed through as-is, e.g. ``2017-12-13``.
    until_date: passed through as-is.
    **kwargs: any remaining keyword arguments, passed to the parent.

NOTE(review): the arguments are presumably spelled out here (rather
than accepted via a bare ``**kwargs``) so that ``strict_kwargs`` can
tell which names this spider accepts -- confirm against the decorator
in ``hepcrawl/utils``.
"""
super(ArxivSpider, self).__init__(
url=url,
format=format,
sets=sets,
from_date=from_date,
until_date=until_date,
**kwargs
)

def parse_record(self, selector):
"""Parse an arXiv XML exported file into a HEP record."""
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
parse_domain,
get_node,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -78,6 +79,7 @@ class BaseSpider(StatefulSpider, XMLFeedSpider):
("dc", "http://purl.org/dc/elements/1.1/"),
]

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct BASE spider"""
super(BaseSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/brown_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
parse_domain,
get_mime_type,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -63,6 +64,7 @@ class BrownSpider(StatefulSpider, CrawlSpider):
name = 'brown'
start_urls = ["https://repository.library.brown.edu/api/collections/355/"]

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct Brown spider."""
super(BrownSpider, self).__init__(*args, **kwargs)
Expand Down
3 changes: 2 additions & 1 deletion hepcrawl/spiders/cds_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from scrapy.spider import XMLFeedSpider

from . import StatefulSpider
from ..utils import ParsedItem
from ..utils import ParsedItem, strict_kwargs


class CDSSpider(StatefulSpider, XMLFeedSpider):
Expand All @@ -48,6 +48,7 @@ class CDSSpider(StatefulSpider, XMLFeedSpider):
('marc', 'http://www.loc.gov/MARC21/slim'),
]

@strict_kwargs
def __init__(self, source_file=None, **kwargs):
super(CDSSpider, self).__init__(**kwargs)
self.source_file = source_file
Expand Down
6 changes: 4 additions & 2 deletions hepcrawl/spiders/common/oaipmh_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from scrapy.selector import Selector

from .last_run_store import LastRunStoreSpider
from ...utils import strict_kwargs


LOGGER = logging.getLogger(__name__)
Expand All @@ -47,6 +48,7 @@ class OAIPMHSpider(LastRunStoreSpider):
__metaclass__ = abc.ABCMeta
name = 'OAI-PMH'

@strict_kwargs
def __init__(
self,
url,
Expand All @@ -55,9 +57,9 @@ def __init__(
alias=None,
from_date=None,
until_date=None,
*args, **kwargs
**kwargs
):
super(OAIPMHSpider, self).__init__(*args, **kwargs)
super(OAIPMHSpider, self).__init__(**kwargs)
self.url = url
self.format = format
if isinstance(sets, string_types):
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
ftp_list_files,
ftp_connection_info,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -71,6 +72,7 @@ class DesySpider(StatefulSpider):
"""
name = 'desy'

@strict_kwargs
def __init__(
self,
source_folder=None,
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/dnb_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
parse_domain,
get_node,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -64,6 +65,7 @@ class DNBSpider(StatefulSpider, XMLFeedSpider):
("slim", "http://www.loc.gov/MARC21/slim"),
]

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct DNB spider."""
super(DNBSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/edp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
get_node,
parse_domain,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -123,6 +124,7 @@ class EDPSpider(StatefulSpider, Jats, XMLFeedSpider):
'EPJ Web of Conferences'
}

@strict_kwargs
def __init__(self, package_path=None, ftp_folder="incoming", ftp_netrc=None, *args, **kwargs):
"""Construct EDP spider.
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/elsevier_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
range_as_string,
unzip_xml_files,
ParsedItem,
strict_kwargs,
)
from ..dateutils import format_year

Expand Down Expand Up @@ -138,6 +139,7 @@ class ElsevierSpider(StatefulSpider, XMLFeedSpider):

ERROR_CODES = range(400, 432)

@strict_kwargs
def __init__(self, atom_feed=None, zip_file=None, xml_file=None, *args, **kwargs):
"""Construct Elsevier spider."""
super(ElsevierSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/hindawi_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from ..utils import (
get_licenses,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -69,6 +70,7 @@ class HindawiSpider(StatefulSpider, XMLFeedSpider):
("mml", "http://www.w3.org/1998/Math/MathML"),
]

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct Hindawi spider."""
super(HindawiSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/infn_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from ..utils import (
get_temporary_file,
ParsedItem,
strict_kwargs,
)
from ..dateutils import format_date

Expand Down Expand Up @@ -65,6 +66,7 @@ class InfnSpider(StatefulSpider, XMLFeedSpider):
itertag = "//tr[@onmouseover]"
today = str(datetime.date.today().year)

@strict_kwargs
def __init__(self, source_file=None, year=today, *args, **kwargs):
"""Construct INFN spider"""
super(InfnSpider, self).__init__(*args, **kwargs)
Expand Down
3 changes: 2 additions & 1 deletion hepcrawl/spiders/iop_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ..extractors.nlm import NLM
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import ParsedItem
from ..utils import ParsedItem, strict_kwargs


class IOPSpider(StatefulSpider, XMLFeedSpider, NLM):
Expand Down Expand Up @@ -82,6 +82,7 @@ class IOPSpider(StatefulSpider, XMLFeedSpider, NLM):
# FIXME: add more
}

@strict_kwargs
def __init__(self, zip_file=None, xml_file=None, pdf_files=None, *args, **kwargs):
"""Construct IOP spider."""
super(IOPSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/magic_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ..utils import (
split_fullname,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -57,6 +58,7 @@ class MagicSpider(StatefulSpider, XMLFeedSpider):

ERROR_CODES = range(400, 432)

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct MAGIC spider"""
super(MagicSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/mit_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
get_temporary_file,
split_fullname,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -63,6 +64,7 @@ class MITSpider(StatefulSpider, XMLFeedSpider):
itertag = "//ul[@class='ds-artifact-list']/li"
today = str(datetime.date.today().year)

@strict_kwargs
def __init__(self, source_file=None, year=today, *args, **kwargs):
"""Construct MIT spider"""
super(MITSpider, self).__init__(*args, **kwargs)
Expand Down
3 changes: 2 additions & 1 deletion hepcrawl/spiders/phenix_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import ParsedItem
from ..utils import ParsedItem, strict_kwargs


class PhenixSpider(StatefulSpider, XMLFeedSpider):
Expand Down Expand Up @@ -50,6 +50,7 @@ class PhenixSpider(StatefulSpider, XMLFeedSpider):
iterator = "html"
itertag = "//table//td/ul/li"

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct PHENIX spider"""
super(PhenixSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/phil_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
parse_domain,
get_mime_type,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -57,6 +58,7 @@ class PhilSpider(StatefulSpider, CrawlSpider):
name = 'phil'
start_urls = ["http://philpapers.org/philpapers/raw/export/inspire.json"]

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct Phil spider."""
super(PhilSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
get_licenses,
get_first,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -76,6 +77,7 @@ class POSSpider(StatefulSpider):
"""
name = 'pos'

@strict_kwargs
def __init__(
self,
source_file=None,
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/t2k_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ..utils import (
split_fullname,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -59,6 +60,7 @@ class T2kSpider(StatefulSpider, XMLFeedSpider):
iterator = "html"
itertag = "//table[@id='folders']//tr"

@strict_kwargs
def __init__(self, source_file=None, *args, **kwargs):
"""Construct T2K spider"""
super(T2kSpider, self).__init__(*args, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/spiders/wsp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
local_list_files,
unzip_xml_files,
ParsedItem,
strict_kwargs,
)


Expand Down Expand Up @@ -89,6 +90,7 @@ class WorldScientificSpider(StatefulSpider, XMLFeedSpider):
'rapid-communications'
]

@strict_kwargs
def __init__(
self,
local_package_dir=None,
Expand Down
Loading

0 comments on commit 0ee8f4e

Please sign in to comment.