From d128e58fc10cc597e0f8979aaa5774a0fd898337 Mon Sep 17 00:00:00 2001
From: Mathias Seidler
Date: Fri, 8 Apr 2016 20:59:04 +0200
Subject: [PATCH] Add Job.

---
 noscrapy/__init__.py       |  1 +
 noscrapy/job.py            | 28 ++++++++++++++++++++++++++
 noscrapy/tests/test_job.py | 41 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)
 create mode 100644 noscrapy/job.py
 create mode 100644 noscrapy/tests/test_job.py

diff --git a/noscrapy/__init__.py b/noscrapy/__init__.py
index 8d76082..fbb6f17 100644
--- a/noscrapy/__init__.py
+++ b/noscrapy/__init__.py
@@ -1,3 +1,4 @@
+from .job import Job
 from .selector import Selector
 from .selectors import *
 from .sitemap import Sitemap
diff --git a/noscrapy/job.py b/noscrapy/job.py
new file mode 100644
index 0000000..aa145a9
--- /dev/null
+++ b/noscrapy/job.py
@@ -0,0 +1,28 @@
+import re
+from urllib.parse import urljoin
+
+
+class Job(object):
+    def __init__(self, url, parent_selector=None, scraper=None, parent_job=None, base_data=None):
+        if parent_job:
+            self.url = self.combine_urls(parent_job.url, url)
+        else:
+            self.url = url
+        self.parent_selector = parent_selector
+        self.scraper = scraper
+        self.data_items = []
+        self.base_data = base_data or {}
+
+    def combine_urls(self, parent_url, child_url):
+        return urljoin(parent_url, child_url)
+
+    def execute(self, browser, callback, scope=None):
+        sitemap = self.scraper.sitemap
+        results = browser.fetch_data(self.url, sitemap, self.parent_selector, callback)
+        # merge in base data from initialization without overriding fetched values
+        for result in results:
+            result.update(**{k: v for k, v in self.base_data.items() if k not in result})
+            self.data_items.append(result)
+
+    def get_results(self):
+        return self.data_items
diff --git a/noscrapy/tests/test_job.py b/noscrapy/tests/test_job.py
new file mode 100644
index 0000000..8797ef7
--- /dev/null
+++ b/noscrapy/tests/test_job.py
@@ -0,0 +1,41 @@
+import pytest
+
+from noscrapy import Job
+
+URL_JOINS = {
+    '0': ('http://example.com/', '/test/', 'http://example.com/test/'),
+    '1': ('http://example.com/', 'test/', 'http://example.com/test/'),
+    '2': ('http://example.com/asdasdad', 'http://tvnet.lv', 'http://tvnet.lv'),
+    '3': ('http://example.com/asdasdad', '?test', 'http://example.com/asdasdad?test'),
+    '4': ('http://example.com/1/', '2/', 'http://example.com/1/2/'),
+    '5': ('http://127.0.0.1/1/', '2/', 'http://127.0.0.1/1/2/'),
+    '6': ('http://xn--80aaxitdbjk.xn--p1ai/', '2/', 'http://xn--80aaxitdbjk.xn--p1ai/2/'),
+    'with_slash_after_question_mark': ('http://a/b?y=5/9', 'c?x=4/9', 'http://a/c?x=4/9'),
+    'port_0': ('http://a:81/http:/b/c', 'http://a:81/http:/b/d', 'http://a:81/http:/b/d'),
+    'port_1': ('http://a:81/http:/b/c', 'd', 'http://a:81/http:/b/d'),
+}
+@pytest.mark.parametrize('parent_url,fragment,url', list(URL_JOINS.values()), ids=list(URL_JOINS))
+def test_urljoins(parent_url, fragment, url):
+    # should be able to create correct url from parent job
+    parent = Job(parent_url)
+    child = Job(fragment, parent_job=parent)
+    assert url == child.url
+
+
+def test_get_results():
+    # should not override data with base data if it already exists
+    class BrowserMock:
+        def fetch_data(self, url, sitemap, parent_selector=None, callback=None):
+            return [{'a': 1, 'b': 2}]
+
+    class ScraperMock:
+        def __init__(self):
+            self.sitemap = None
+
+    job = Job(url=None,
+              scraper=ScraperMock(),
+              base_data={'a': 'do not override', 'c': 3})
+
+    job.execute(BrowserMock(), callback=lambda arg: arg)
+    results = job.get_results()
+    assert [{'a': 1, 'b': 2, 'c': 3}] == results
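
Usage note (not part of the patch): a minimal sketch of how Job is meant to be driven, assuming a browser object that exposes the fetch_data(url, sitemap, parent_selector, callback) hook called in Job.execute above. FakeScraper and FakeBrowser are illustrative stand-ins, not part of noscrapy.

    from noscrapy import Job

    class FakeScraper:
        # stand-in scraper; Job.execute only reads .sitemap from it
        sitemap = None

    class FakeBrowser:
        # stand-in browser implementing the fetch_data hook used by Job.execute
        def fetch_data(self, url, sitemap, parent_selector, callback):
            return [{'title': 'example'}]

    # child URLs are resolved against the parent job's URL via urljoin
    parent = Job('http://example.com/list/')
    child = Job('item/1', parent_job=parent, scraper=FakeScraper(),
                base_data={'page': 1})
    assert child.url == 'http://example.com/list/item/1'

    # base_data keys are merged into each fetched result unless already present
    child.execute(FakeBrowser(), callback=None)
    print(child.get_results())  # [{'title': 'example', 'page': 1}]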