Commit

Add Job.
hwms committed Apr 8, 2016
1 parent fc4f18d commit d128e58
Showing 3 changed files with 70 additions and 0 deletions.
1 change: 1 addition & 0 deletions noscrapy/__init__.py
@@ -1,3 +1,4 @@
from .job import Job
from .selector import Selector
from .selectors import *
from .sitemap import Sitemap
28 changes: 28 additions & 0 deletions noscrapy/job.py
@@ -0,0 +1,28 @@
import re
from urllib.parse import urljoin


class Job(object):
    def __init__(self, url, parent_selector=None, scraper=None, parent_job=None, base_data=None):
        if parent_job:
            self.url = self.combine_urls(parent_job.url, url)
        else:
            self.url = url
        self.parent_selector = parent_selector
        self.scraper = scraper
        self.data_items = []
        self.base_data = base_data or {}

    def combine_urls(self, parent_url, child_url):
        return urljoin(parent_url, child_url)

    def execute(self, browser, callback, scope=None):
        sitemap = self.scraper.sitemap
        results = browser.fetch_data(self.url, sitemap, self.parent_selector, callback)
        # merge base_data from initialization into each result; fetched keys win
        for result in results:
            result.update(**{k: v for k, v in self.base_data.items() if k not in result})
            self.data_items.append(result)

    def get_results(self):
        return self.data_items
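
For orientation, here is a minimal usage sketch of the new Job class. StubScraper and StubBrowser are hypothetical stand-ins for the real scraper and browser objects and are not part of this commit; the sketch only illustrates how child URLs are resolved against the parent job and how base_data is merged into fetched results.

# Minimal usage sketch (not part of the commit); StubScraper and StubBrowser
# are hypothetical stand-ins for the real scraper and browser objects.
from noscrapy import Job

class StubScraper:
    def __init__(self):
        self.sitemap = None  # a real scraper would carry a parsed Sitemap

class StubBrowser:
    def fetch_data(self, url, sitemap, parent_selector=None, callback=None):
        # a real browser would fetch the page and apply the sitemap's selectors
        return [{'title': 'Example'}]

# child URLs are resolved against the parent job's URL via urljoin
parent = Job('http://example.com/')
child = Job('items/1', parent_job=parent)
assert child.url == 'http://example.com/items/1'

# execute() merges base_data into each result without overriding fetched keys
job = Job('http://example.com/', scraper=StubScraper(),
          base_data={'source': 'example.com', 'title': 'ignored'})
job.execute(StubBrowser(), callback=None)
assert job.get_results() == [{'title': 'Example', 'source': 'example.com'}]
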
41 changes: 41 additions & 0 deletions noscrapy/tests/test_job.py
@@ -0,0 +1,41 @@
import pytest

from noscrapy import Job

URL_JOINS = {
    '0': ('http://example.com/', '/test/', 'http://example.com/test/'),
    '1': ('http://example.com/', 'test/', 'http://example.com/test/'),
    '2': ('http://example.com/asdasdad', 'http://tvnet.lv', 'http://tvnet.lv'),
    '3': ('http://example.com/asdasdad', '?test', 'http://example.com/asdasdad?test'),
    '4': ('http://example.com/1/', '2/', 'http://example.com/1/2/'),
    '5': ('http://127.0.0.1/1/', '2/', 'http://127.0.0.1/1/2/'),
    '6': ('http://xn--80aaxitdbjk.xn--p1ai/', '2/', 'http://xn--80aaxitdbjk.xn--p1ai/2/'),
    'with_slash_after_question_mark': ('http://a/b?y=5/9', 'c?x=4/9', 'http://a/c?x=4/9'),
    'port_0': ('http://a:81/http:/b/c', 'http://a:81/http:/b/d', 'http://a:81/http:/b/d'),
    'port_1': ('http://a:81/http:/b/c', 'd', 'http://a:81/http:/b/d'),
}
@pytest.mark.parametrize('parent_url,fragment,url', list(URL_JOINS.values()), ids=list(URL_JOINS))
def test_urljoins(parent_url, fragment, url):
    # should be able to create correct url from parent job
    parent = Job(parent_url)
    child = Job(fragment, parent_job=parent)
    assert url == child.url


def test_get_results():
    # should not override data with base data if it already exists
    class BrowserMock:
        def fetch_data(self, url, sitemap, parent_selector=None, callback=None):
            return [{'a': 1, 'b': 2}]

    class ScraperMock:
        def __init__(self):
            self.sitemap = None

    job = Job(url=None,
              scraper=ScraperMock(),
              base_data={'a': 'do not override', 'c': 3})

    job.execute(BrowserMock(), callback=lambda arg: arg)
    results = job.get_results()
    assert [{'a': 1, 'b': 2, 'c': 3}] == results
