Commit
Implementation of first dict attribute support (header)
holgerd77 authored and holgerd77 committed Jul 31, 2015
1 parent fe36028 commit 7d18c74
Showing 5 changed files with 54 additions and 52 deletions.
2 changes: 1 addition & 1 deletion dynamic_scraper/models.py
@@ -82,7 +82,7 @@ class Scraper(models.Model):
max_items_read = models.IntegerField(blank=True, null=True, help_text="Max number of items to be read (empty: unlimited).")
max_items_save = models.IntegerField(blank=True, null=True, help_text="Max number of items to be saved (empty: unlimited).")
request_type = models.CharField(max_length=1, choices=REQUEST_TYPE_CHOICES, default='R', help_text="Normal GET request (default) or form request via POST, using Scrapy's corresponding request classes (not used for checker).")
headers = models.TextField(blank=True, help_text="Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {'Referer':'http://referer_url'})).")
headers = models.TextField(blank=True, help_text='Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {"Referer":"http://referer_url"}, use double quotes!)).')
cookies = models.TextField(blank=True, help_text="Optional cookies as JSON dict, can use {page} placeholder of pagination.")
meta = models.TextField(blank=True, help_text="Optional Scrapy meta attributes as JSON dict, see Scrapy docs for reference.")
form_data = models.TextField(blank=True, help_text="Optional HTML form data as JSON dict, only used with FormRequest request type, can use {page} placeholder of pagination.")
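Note: the revised help text asks for standard JSON, i.e. double-quoted keys and values. A minimal sketch of how such a field value parses, reusing the example from the help text:

import json

# Value as it would be entered into the Scraper.headers text field (double quotes):
headers_field = '{"Referer": "http://referer_url"}'
headers = json.loads(headers_field)
assert headers == {"Referer": "http://referer_url"}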
18 changes: 14 additions & 4 deletions dynamic_scraper/spiders/django_base_spider.py
@@ -1,4 +1,4 @@
import datetime, os
import datetime, json, os
from scrapy import log, signals
from scrapy.spider import Spider
from scrapy.xlib.pydispatch import dispatcher
@@ -130,16 +130,26 @@ def _check_mandatory_vars(self):


def _set_request_kwargs(self):
if self.scraper.headers != u'':
try:
headers = json.loads(self.scraper.headers)
except ValueError:
raise CloseSpider("Incorrect HTTP header attribute: not a valid JSON dict!")
if not isinstance(headers, dict):
raise CloseSpider("Incorrect HTTP header attribute: not a valid JSON dict!")
self.request_kwargs['headers'] = headers



def _set_meta_splash_args(self):
if self.scraper.detail_page_content_type == 'H' and self.scraper.render_javascript:
if 'meta' not in self.request_kwargs:
self.request_kwargs['meta'] = {}
self.request_kwargs['meta']['splash'] = {
'endpoint': 'render.html',
'args': self.conf['SPLASH_ARGS'].copy()
}
print self.request_kwargs



def spider_closed(self):
if self.conf['RUN_TYPE'] == 'TASK' and self.conf['DO_ACTION']:

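Note: _set_request_kwargs() closes the spider unless the headers field parses into a dict, which is why the help text now insists on double quotes. A small sketch of the cases the two new checks distinguish (illustrative values only):

import json

json.loads('{"Referer": "http://referer_url"}')      # valid JSON, parses to a dict

try:
    json.loads("{'Referer': 'http://referer_url'}")  # Python-style single quotes: not JSON
except ValueError:
    pass  # first check: CloseSpider("Incorrect HTTP header attribute: not a valid JSON dict!")

json.loads('["Referer"]')  # parses fine, but yields a list, not a dict,
                           # so the isinstance() check raises CloseSpider as well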
2 changes: 1 addition & 1 deletion dynamic_scraper/spiders/django_checker.py
@@ -22,7 +22,6 @@ def __init__(self, *args, **kwargs):
super(DjangoChecker, self).__init__(self, *args, **kwargs)
self._set_config(**kwargs)
self._check_checker_config()
self._set_request_kwargs()

self.start_urls.append(self.scrape_url)
self.scheduler = Scheduler(self.scraper.scraped_obj_class.checker_scheduler_conf)
@@ -92,6 +91,7 @@ def _del_ref_object(self):

def start_requests(self):
for url in self.start_urls:
self._set_meta_splash_args()
yield Request(url, callback=self.parse, **self.request_kwargs)


2 changes: 2 additions & 0 deletions dynamic_scraper/spiders/django_spider.py
@@ -179,6 +179,7 @@ def _set_loader(self, response, xs, item):

def start_requests(self):
for url in self.start_urls:
self._set_meta_splash_args()
yield Request(url, callback=self.parse, **self.request_kwargs)


@@ -289,6 +290,7 @@ def parse(self, response):
else:
url_elem = self.scraper.get_detail_page_url_elems()[0]
url = item[url_elem.scraped_obj_attr.name]
self._set_meta_splash_args()
yield Request(url, callback=self.parse_item, **self.request_kwargs)
else:
self.log("Item could not be read!", log.ERROR)
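Note: in both spiders the collected kwargs are expanded straight into the outgoing request, so the parsed headers dict (and the splash meta added by _set_meta_splash_args) reach Scrapy as ordinary Request keyword arguments. Roughly, with placeholder values:

from scrapy import Request

request_kwargs = {
    'headers': {'Referer': 'http://referer_url'},                  # parsed from Scraper.headers
    'meta': {'splash': {'endpoint': 'render.html', 'args': {}}},   # set by _set_meta_splash_args
}
request = Request('http://en.wikinews.org/wiki/Main_Page', **request_kwargs)
# equivalent to Request(url, headers={...}, meta={...})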
82 changes: 36 additions & 46 deletions example_project/open_news/open_news.json
@@ -1,8 +1,8 @@
[
{
"fields": {
"checker_scheduler_conf": "\"MIN_TIME\": 1440,\n\"MAX_TIME\": 10080,\n\"INITIAL_NEXT_ACTION_FACTOR\": 1,\n\"ZERO_ACTIONS_FACTOR_CHANGE\": 5,\n\"FACTOR_CHANGE_FACTOR\": 1.3,\n",
"scraper_scheduler_conf": "\"MIN_TIME\": 15,\n\"MAX_TIME\": 10080,\n\"INITIAL_NEXT_ACTION_FACTOR\": 10,\n\"ZERO_ACTIONS_FACTOR_CHANGE\": 20,\n\"FACTOR_CHANGE_FACTOR\": 1.3,\n",
"checker_scheduler_conf": "\"MIN_TIME\": 1440,\r\n\"MAX_TIME\": 10080,\r\n\"INITIAL_NEXT_ACTION_FACTOR\": 1,\r\n\"ZERO_ACTIONS_FACTOR_CHANGE\": 5,\r\n\"FACTOR_CHANGE_FACTOR\": 1.3,\r\n",
"scraper_scheduler_conf": "\"MIN_TIME\": 15,\r\n\"MAX_TIME\": 10080,\r\n\"INITIAL_NEXT_ACTION_FACTOR\": 10,\r\n\"ZERO_ACTIONS_FACTOR_CHANGE\": 20,\r\n\"FACTOR_CHANGE_FACTOR\": 1.3,\r\n",
"name": "Article",
"comments": ""
},
@@ -33,7 +33,7 @@
"fields": {
"obj_class": 1,
"name": "title",
"id_field": false,
"id_field": true,
"attr_type": "S"
},
"model": "dynamic_scraper.scrapedobjattr",
@@ -43,7 +43,7 @@
"fields": {
"obj_class": 1,
"name": "url",
"id_field": true,
"id_field": false,
"attr_type": "U"
},
"model": "dynamic_scraper.scrapedobjattr",
@@ -61,46 +61,56 @@
},
{
"fields": {
"status": "A",
"pagination_page_replace": "",
"checker_ref_url": "http://en.wikinews.org/wiki/This_wiki_article_doesnt_exist",
"cookies": "",
"checker_x_path_result": "",
"name": "Wikinews Scraper",
"render_javascript": false,
"checker_x_path": "",
"checker_ref_url": "http://en.wikinews.org/wiki/This_wiki_article_doesnt_exist",
"meta": "",
"pagination_append_str": "",
"comments": "",
"pagination_page_replace": "",
"checker_type": "4",
"scraped_obj_class": 1,
"status": "A",
"max_items_save": null,
"max_items_read": null,
"pagination_on_start": false,
"pagination_type": "N",
"checker_type": "4",
"comments": "",
"request_type": "R",
"content_type": "H",
"detail_page_content_type": "H",
"scraped_obj_class": 1
"name": "Wikinews Scraper",
"render_javascript": false,
"checker_x_path": "",
"pagination_on_start": false,
"pagination_type": "N",
"headers": "",
"form_data": ""
},
"model": "dynamic_scraper.scraper",
"pk": 1
},
{
"fields": {
"status": "A",
"pagination_page_replace": "",
"checker_ref_url": "",
"cookies": "",
"checker_x_path_result": "",
"name": "US Department of Justice - Press Release Scraper",
"render_javascript": false,
"checker_x_path": "",
"checker_ref_url": "",
"meta": "",
"pagination_append_str": "",
"comments": "",
"pagination_page_replace": "",
"checker_type": "N",
"scraped_obj_class": 1,
"status": "A",
"max_items_save": null,
"max_items_read": 10,
"pagination_on_start": false,
"pagination_type": "N",
"checker_type": "N",
"comments": "",
"request_type": "R",
"content_type": "J",
"detail_page_content_type": "H",
"scraped_obj_class": 1
"name": "US Department of Justice - Press Release Scraper",
"render_javascript": false,
"checker_x_path": "",
"pagination_on_start": false,
"pagination_type": "N",
"headers": "",
"form_data": ""
},
"model": "dynamic_scraper.scraper",
"pk": 2
@@ -240,25 +250,5 @@
},
"model": "dynamic_scraper.schedulerruntime",
"pk": 1
},
{
"fields": {
"url": "http://en.wikinews.org/wiki/Main_Page",
"scraper_runtime": 1,
"name": "Wikinews",
"scraper": 1
},
"model": "open_news.newswebsite",
"pk": 1
},
{
"fields": {
"url": "http://www.justice.gov/api/v1/blog_entries.json?amp%3Bpagesize=2",
"scraper_runtime": null,
"name": "US Department of Justice - Press Releases",
"scraper": 2
},
"model": "open_news.newswebsite",
"pk": 2
}
]
