Add Job.

hwms · Apr 8, 2016 · d128e58 · d128e58
1 parent fc4f18d
commit d128e58
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 0 deletions.
diff --git a/noscrapy/__init__.py b/noscrapy/__init__.py
@@ -1,3 +1,4 @@
+from .job import Job
 from .selector import Selector
 from .selectors import *
 from .sitemap import Sitemap

diff --git a/noscrapy/job.py b/noscrapy/job.py
@@ -0,0 +1,28 @@
+import re
+from urllib.parse import urljoin
+
+
+class Job(object):
+    """One unit of scraping work: a URL to fetch and the rows collected from it.
+
+    Rows returned by the browser are stored in ``data_items``; every row is
+    completed with the entries of ``base_data`` whose keys it does not
+    already contain.
+    """
+
+    def __init__(self, url, parent_selector=None, scraper=None, parent_job=None, base_data=None):
+        """Initialise the job.
+
+        :param url: URL to fetch; resolved against ``parent_job.url`` when a
+            parent job is given, otherwise stored unchanged.
+        :param parent_selector: passed through to ``browser.fetch_data`` by
+            :meth:`execute`.
+        :param scraper: object whose ``sitemap`` attribute is read by
+            :meth:`execute`.
+        :param parent_job: optional job whose ``url`` is the base for ``url``.
+        :param base_data: optional mapping merged into every fetched row;
+            defaults to an empty dict.
+        """
+        if parent_job:
+            self.url = self.combine_urls(parent_job.url, url)
+        else:
+            self.url = url
+        self.parent_selector = parent_selector
+        self.scraper = scraper
+        self.data_items = []
+        self.base_data = base_data or {}
+
+    def combine_urls(self, parent_url, child_url):
+        """Return ``child_url`` resolved against ``parent_url`` via ``urljoin``."""
+        return urljoin(parent_url, child_url)
+
+    def execute(self, browser, callback, scope=None):
+        """Fetch this job's URL and append the resulting rows to ``data_items``.
+
+        :param browser: object providing
+            ``fetch_data(url, sitemap, parent_selector, callback)`` that
+            returns an iterable of dict-like rows.
+        :param callback: forwarded unchanged to ``browser.fetch_data``.
+        :param scope: accepted for interface compatibility; not used here.
+        """
+        sitemap = self.scraper.sitemap
+        results = browser.fetch_data(self.url, sitemap, self.parent_selector, callback)
+        for result in results:
+            # Fetched values win: only base_data keys absent from the row are added.
+            missing = {key: value for key, value in self.base_data.items() if key not in result}
+            # Pass the mapping positionally; ``update(**missing)`` would raise
+            # TypeError for any non-string key in base_data.
+            result.update(missing)
+            self.data_items.append(result)
+
+    def get_results(self):
+        """Return the list of rows collected so far (the live list, not a copy)."""
+        return self.data_items
diff --git a/noscrapy/tests/test_job.py b/noscrapy/tests/test_job.py
@@ -0,0 +1,41 @@
+import pytest
+
+from noscrapy import Job
+
+# Maps a test id to (parent job URL, child URL or fragment, expected child URL).
+# Ids must be unique: a repeated key in a dict literal silently replaces the
+# earlier entry, so that case would never run.
+URL_JOINS = {
+    '0': ('http://example.com/', '/test/', 'http://example.com/test/'),
+    '1': ('http://example.com/', 'test/', 'http://example.com/test/'),
+    '2': ('http://example.com/asdasdad', 'http://tvnet.lv', 'http://tvnet.lv'),
+    '3': ('http://example.com/asdasdad', '?test', 'http://example.com/asdasdad?test'),
+    '4': ('http://example.com/1/', '2/', 'http://example.com/1/2/'),
+    '5': ('http://127.0.0.1/1/', '2/', 'http://127.0.0.1/1/2/'),
+    '6': ('http://xn--80aaxitdbjk.xn--p1ai/', '2/', 'http://xn--80aaxitdbjk.xn--p1ai/2/'),
+    'with_slash_after_question_mark': ('http://a/b?y=5/9', 'c?x=4/9', 'http://a/c?x=4/9'),
+    'port_0': ('http://a:81/http:/b/c', 'http://a:81/http:/b/d', 'http://a:81/http:/b/d'),
+    'port_1': ('http://a:81/http:/b/c', 'd', 'http://a:81/http:/b/d'),
+}
+
+
+@pytest.mark.parametrize('parent_url,fragment,url', list(URL_JOINS.values()), ids=list(URL_JOINS))
+def test_urljoins(parent_url, fragment, url):
+    """A child job's URL is its own URL resolved against the parent job's URL."""
+    parent = Job(parent_url)
+    child = Job(fragment, parent_job=parent)
+    assert url == child.url
+
+
+def test_get_results():
+    """Fetched values take precedence over ``base_data`` when keys clash."""
+
+    class StubBrowser:
+        def fetch_data(self, url, sitemap, parent_selector=None, callback=None):
+            # Always yields one row sharing key 'a' with the job's base data.
+            return [{'a': 1, 'b': 2}]
+
+    class StubScraper:
+        sitemap = None
+
+    seed = {'a': 'do not override', 'c': 3}
+    job = Job(url=None, scraper=StubScraper(), base_data=seed)
+    job.execute(StubBrowser(), callback=lambda arg: arg)
+
+    # 'a' keeps the fetched value; 'c' is filled in from the base data.
+    expected = [{'a': 1, 'b': 2, 'c': 3}]
+    assert expected == job.get_results()