import re
from collections.abc import MutableSequence
from itertools import chain, zip_longest

from noscrapy.utils import Field, json, Type

from .selector import Selector

# Splits a start url around a single numeric range placeholder such as
# ``[1-3]``, ``[0-4:2]`` or ``[001-003]``.
# Groups: 1 = text before, 2 = range start, 3 = range stop,
#         5 = optional step, 6 = text after.
START_URLS_RE = re.compile(r'^(.*?)\[(\d+)\-(\d+)(:(\d+))?\](.*)$')


class Sitemap(MutableSequence, metaclass=Type):
    """Mutable sequence of selectors plus the logic to extract their data.

    Items can be addressed by position or by selector id. Selectors reference
    their parents by id; the id ``'_root'`` stands for the page itself.
    """

    # Identifier of the sitemap itself.
    id = Field(None)
    # Computed, read-only attributes backed by the method named in ``fget``
    # (they are read as plain attributes, e.g. ``sitemap.ids``).
    ids = Field(fget='_ids', ro=True)
    possible_parent_ids = Field(fget='_possible_parent_ids', ro=True)
    columns = Field(fget='_columns', ro=True)
    has_recursive_selectors = Field(fget='_has_recursive_selectors', ro=True)
    start_urls = Field(fget='_get_start_urls', ro=True)

    # Where data extraction starts: the parent id whose direct childs are
    # evaluated, and the item (e.g. html) handed to those selectors.
    parent_id = Field('_root')
    parent_item = Field(None)

    def __init__(self, *args, **features):
        """Build a sitemap from positional shortcuts and keyword features.

        Positional arguments are interpreted by type: a str is the sitemap
        id, a Selector is the only selector, another Sitemap or a dict
        contributes its features (features collected so far win), and any
        other iterable is the list of selectors.

        Recognised features are ``start_urls`` (one url or an iterable of
        urls) and ``selectors``; every remaining feature is set as an
        attribute, with string values stripped. Selectors whose id is already
        taken are silently ignored.
        """
        self.selectors = []
        for arg in args:
            if isinstance(arg, str):
                features['id'] = arg
            # NOTE: this must be ``elif``; with a plain ``if`` a string id
            # fell through to the final ``else`` and was also turned into a
            # list of one-character selectors.
            elif isinstance(arg, Selector):
                features['selectors'] = [arg]
            elif isinstance(arg, Sitemap):
                features = dict(arg.__getstate__(), **features)
            elif isinstance(arg, dict):
                features = dict(arg, **features)
            else:
                features['selectors'] = list(arg)

        start_urls = features.pop('start_urls', ())
        if isinstance(start_urls, str):
            start_urls = [start_urls]
        self._start_urls = list(start_urls)

        for value in features.pop('selectors', ()):
            try:
                self.append(value)
            except ValueError:
                # insert() rejects ids that are already taken; repeated
                # selectors are deliberately skipped instead of failing.
                pass

        for attr, value in features.items():
            setattr(self, attr, value.strip() if isinstance(value, str) else value)

    def __delitem__(self, value):
        """Delete a selector by position, slice or id.

        The deleted id is removed from the parents of every selector, and
        selectors left without any parent are deleted as well (recursively).
        """
        if isinstance(value, slice):
            # Resolve the slice to ids first: cascading deletes shift positions.
            for selector_id in [s.id for s in self.selectors[value]]:
                if selector_id in self.selectors:
                    del self[selector_id]
            return

        index = value if isinstance(value, int) else self.selectors.index(value)
        selector_id = self.selectors[index].id
        unlinked_ids = []
        for selector in self.selectors:
            selector.remove_parent(selector_id)
            if not selector.parents and selector_id != selector.id:
                unlinked_ids.append(selector.id)
        del self.selectors[index]
        for unlinked_id in unlinked_ids:
            # A nested cascade may already have removed this selector.
            if unlinked_id in self.selectors:
                del self[unlinked_id]

    def __getitem__(self, index):
        """Return the selector at a position/slice or with the given id.

        Raises ValueError for an unknown id and IndexError for a position
        that is out of range.
        """
        if not isinstance(index, (int, slice)):
            index = self.selectors.index(index)
        return self.selectors[index]

    def get(self, index, default=None):
        """Return ``self[index]`` or ``default`` if there is no such selector."""
        try:
            return self[index]
        except (ValueError, IndexError):
            return default

    def __setitem__(self, index, value):
        """Replace the selector at a position or with the given id.

        When the replacement carries a different id, parent references to the
        old id are renamed. Raises ValueError if the new id is already taken.
        """
        if not isinstance(index, (int, slice)):
            index = self.selectors.index(index)
        current = self.selectors[index]
        selector = Selector(value)
        renamed = current.id != selector.id
        if renamed and selector.id in self.selectors:
            raise ValueError('Id %r is already taken' % selector.id)
        # Store first so that the new selector's own parents are renamed too.
        self.selectors[index] = selector
        if renamed:
            self._rename_parents(current.id, selector.id)

    def insert(self, index, value):
        """Insert a selector built from ``value``; ids must be unique.

        Raises ValueError if the id is already taken.
        """
        selector = Selector(value)
        if selector.id in self.selectors:
            raise ValueError('Id %r is already taken' % selector.id)
        self.selectors.insert(index, selector)

    def __len__(self):
        return len(self.selectors)

    def __eq__(self, other):
        """Compare item by item with any iterable of selectors."""
        marker = object()  # pads the shorter side so lengths must match
        try:
            pairs = zip_longest(self, other, fillvalue=marker)
        except TypeError:
            # ``other`` is not iterable, so it cannot be compared item-wise.
            return NotImplemented
        return not any(a != b for a, b in pairs)

    def __repr__(self):
        reprs = ', '.join(repr(selector) for selector in self)
        return '%s(%r, [%s])' % (type(self).__name__, self.id, reprs)

    def __getstate__(self):
        """Return the features needed to rebuild the sitemap (id and selectors)."""
        return {'id': self.id, 'selectors': self.selectors}

    __setstate__ = __init__

    def copy(self):
        """Return a new sitemap built from this one's state."""
        return self.__class__(self.__getstate__())

    def concat(self, *other_lists):
        """Return a copy of this sitemap extended by all ``other_lists``."""
        result = self.copy()
        for other_list in other_lists:
            result.extend(other_list)
        return result

    def _ids(self):
        return ('_root',) + tuple(s.id for s in self)

    def _possible_parent_ids(self):
        return ('_root',) + tuple(s.id for s in self if s.can_have_childs)

    def _columns(self):
        return tuple(chain.from_iterable(s.columns for s in self))

    def get_all(self, parent_id=None):
        """Returns all or recursively all childs of a parent."""
        if not parent_id:
            yield from self.selectors
            return

        # Positions (not selectors) are collected so that results come back
        # in sitemap order and cyclic parent links terminate.
        found = set()

        def collect(current_id):
            for pos, selector in enumerate(self):
                if pos not in found and selector.has_parent(current_id):
                    found.add(pos)
                    collect(selector.id)

        collect(parent_id)
        for pos in sorted(found):
            yield self[pos]

    def get_direct_childs(self, parent_id):
        """Returns only selectors that are directly under a parent."""
        for selector in self:
            if selector.has_parent(parent_id):
                yield selector

    def get_one_page_selectors(self, selector_id):
        """Returns a selector with all parents and childs usable within one page.

        Results are yielded in sitemap order.
        """
        selector = self.get(selector_id)
        results = [selector]

        # recursively find all parents that could lead to the page where
        # selector_id is used.
        def find_parents(current):
            for parent_id in current.parents:
                if parent_id == '_root':
                    # skip the root but keep scanning the remaining parents
                    continue
                parent = self.get(parent_id)
                if parent is None:
                    # dangling reference to a selector that is not in the sitemap
                    continue
                if parent not in results and parent.will_return_items:
                    results.append(parent)
                    find_parents(parent)

        find_parents(selector)
        results += self.get_one_page_childs(selector.id)
        for pos in sorted(self.index(s) for s in results):
            yield self[pos]

    def get_one_page_childs(self, parent_id):
        """Returns all child selectors of a selector which can be used within one page."""
        results = []

        def add_childs(parent):
            # only selectors returning items keep their childs on the same page
            if not parent.will_return_items:
                return
            for child in self.get_direct_childs(parent.id):
                results.append(child)
                add_childs(child)

        add_childs(self.get(parent_id))
        for pos in sorted(self.index(s) for s in results):
            yield self[pos]

    def will_return_many(self, selector_id):
        """Returns whether a selector or any of its childs returns many records."""
        if self.get(selector_id).will_return_many:
            return True
        return any(child.will_return_many for child in self.get_all(selector_id))

    def get_one_page_css(self, selector_id, parent_ids):
        """Return css selector for a given element which includes all parent element selectors.
        parent_ids: array of parent selector ids from devtools Breadcrumb."""
        css = self.get(selector_id).css
        parent_css = self.get_one_page_parent_css(parent_ids)
        return ' '.join(s for s in (parent_css, css) if s)

    def get_one_page_parent_css(self, parent_ids):
        """Return css selector for parent selectors that are within one page.
        parent_ids: array of parent selector ids from devtools Breadcrumb."""
        css_parts = []
        for parent_id in parent_ids:
            parent_selector = self.get(parent_id)  # None for '_root'
            if parent_selector and parent_selector.will_return_items:
                css_parts.append(parent_selector.css)
        return ' '.join(s for s in css_parts if s)

    def _has_recursive_selectors(self):
        """Returns whether item-returning selectors form a parent/child cycle."""
        def is_recursive(selector, path):
            # ``path`` holds the selectors between the start and ``selector``
            if selector in path:
                return True
            if not selector.will_return_items:
                return False
            path.append(selector)
            try:
                return any(is_recursive(child, path)
                           for child in self.get_direct_childs(selector.id))
            finally:
                path.pop()

        return any(is_recursive(selector, []) for selector in self)

    def _rename_parents(self, current_id, new_id):
        for selector in self:
            selector.rename_parent(current_id, new_id)

    def _get_start_urls(self):
        """Yields the start urls with numeric range placeholders expanded.

        ``http://a.b/[1-3]`` yields one url per number, the stop value
        included. An optional step is given as ``[0-4:2]``. Numbers are
        zero-padded when start and stop are written with the same width.
        """
        for url in self._start_urls:
            match = START_URLS_RE.match(url)
            if not match:
                yield url
                continue
            prefix, start_str, stop_str, _, step_str, suffix = match.groups()
            step = int(step_str or 1)
            start, stop = int(start_str), int(stop_str) + step
            lpad = len(start_str) if len(start_str) == len(stop_str) else 1
            for number in range(start, stop, step):
                # prefix/suffix are passed as arguments, never as part of the
                # format string: urls may contain '%' escapes.
                yield '%s%0*d%s' % (prefix, lpad, number, suffix)

    def get_csv_rows(self, row_dicts):
        """Yields a header tuple followed by one tuple per row dict.

        Missing cells become ``''``; cells that are not strings are dumped
        as json.
        """
        headers = self.columns
        yield headers
        for row_dict in row_dicts:
            csv_row = []
            for header in headers:
                cell = row_dict.get(header, '')
                if not isinstance(cell, str):
                    cell = json.dumps(cell)
                csv_row.append(cell)
            yield tuple(csv_row)

    def get_data(self):
        """Yields the data records of every selector tree, starting at parent_id."""
        for tree in self.trees:
            yield from self.get_selector_tree_data(tree, self.parent_id, self.parent_item)

    @property
    def trees(self):
        """List of independent selector lists. follow=true splits selectors in trees.
        Two side by side type=multiple selectors split trees."""
        return self._find_trees(self.parent_id, [])

    def _find_trees(self, parent_id, common_selectors_from_parent):
        """Returns the selector trees found below ``parent_id``.

        Every tree starts with the selectors common to all trees, inherited
        from the parent level and collected on this level.
        """
        common_selectors = list(common_selectors_from_parent)
        common_selectors += self.get_selectors_common_to_all_trees(parent_id)

        # find selectors that will be making a selector tree
        trees = []
        for selector in list(self.get_direct_childs(parent_id)):
            if self.selector_is_common_to_all_trees(selector):
                continue
            # This selector makes a new selector tree, but its childs might
            # make more trees, so the search continues below it.
            tree = Sitemap(common_selectors + [selector])
            if selector.can_have_local_childs:
                # find selector trees within this selector
                trees.extend(self._find_trees(selector.id, tree))
            else:
                trees.append(tree)

        # if no selector made a separate tree then all common selectors make
        # up a single selector tree
        return trees or [Sitemap(common_selectors)]

    def get_selector_tree_data(self, tree, parent_id, parent_item, common_data=None):
        """Yields the records of one tree below ``parent_id``.

        If no selector yields a record, the (non-empty) common data is
        yielded as the only record.
        """
        merged_data = dict(common_data or {})
        merged_data.update(self.get_selector_tree_common_data(tree, parent_id, parent_item))
        yielded = False
        for selector in tree.get_direct_childs(parent_id):
            if not tree.will_return_many(selector.id):
                continue
            records = self.get_many_selector_data(tree, selector, parent_item, dict(merged_data))
            for record in records:
                yield record
                yielded = True
        if not yielded and merged_data:
            yield merged_data

    def get_selectors_common_to_all_trees(self, parent_id):
        """Returns the selectors below ``parent_id`` that belong to every tree."""
        common_selectors = []
        for selector in self.get_direct_childs(parent_id):
            if not self.selector_is_common_to_all_trees(selector):
                continue
            common_selectors.append(selector)
            # also add all childs; they were checked together with the selector
            for child in self.get_all(selector.id):
                if child not in common_selectors:
                    common_selectors.append(child)
        return common_selectors

    def selector_is_common_to_all_trees(self, selector):
        """The selector cannot return multiple records and it also cannot create new jobs.
        Also all of its child selectors must have the same features."""
        if selector.will_return_many:
            return False
        # Link selectors which will follow to a new page also cannot be common
        # to all selectors. get_direct_childs() is a generator and therefore
        # always truthy, so its content has to be probed explicitly.
        has_childs = any(True for _ in self.get_direct_childs(selector.id))
        if selector.can_create_new_jobs and has_childs:
            return False
        # also all child selectors must have the same features
        return all(self.selector_is_common_to_all_trees(child)
                   for child in self.get_all(selector.id))

    def get_selector_tree_common_data(self, tree, parent_id, parent_item):
        """Returns the merged data of all single-record selectors below ``parent_id``."""
        common_data = {}
        for child in tree.get_direct_childs(parent_id):
            if tree.will_return_many(child.id):
                continue
            for results in self.get_selector_common_data(tree, child, parent_item):
                common_data.update(results)
        return common_data

    def get_selector_common_data(self, tree, selector, parent_item):
        """Yields the common data of one selector, descending into item selectors."""
        for data in selector.get_data(parent_item):
            if selector.will_return_items:
                # NOTE(review): data[0] is taken as the item to descend into;
                # confirm against the selectors' get_data() return shape.
                yield self.get_selector_tree_common_data(tree, selector.id, data[0])
            else:
                yield data

    def get_many_selector_data(self, tree, selector, parent_item, common_data):
        """Returns all data records for a selector that can return multiple records."""
        if selector.will_return_items:
            # an item selector: every item is the parent of a sub tree
            for item in selector.get_data(parent_item):
                yield from self.get_selector_tree_data(
                    tree, selector.id, item, dict(common_data))
        else:
            # not an item selector: its fetched data is the result
            new_common_data = dict(common_data)
            for record in selector.get_data(parent_item):
                record.update(new_common_data)
                yield record

    def get_single_selector_data(self, parent_ids, selector_id):  # pragma: no cover
        """Yields the data of a single selector.

        To fetch only one selector's data, a sitemap is created that contains
        just this selector, its parents and all of its child selectors.
        parent_ids: array of parent selector ids from devtools Breadcrumb.

        NOTE(review): the previous body could not run (it read a nonexistent
        ``self.sitemap`` and added a generator to a list). This rewrite
        follows the comments of that body; nothing in the test suite
        exercises it, so verify against a real caller.
        """
        selector = self[selector_id]
        childs = list(self.get_all(selector_id))

        # walk the breadcrumb backwards until the root is reached
        parents = []
        for parent_id in reversed(parent_ids):
            if parent_id == '_root':
                break
            parents.append(self[parent_id])

        # Find the parent that led to the page where the required selector is
        # used: the nearest parent that does not return items, or the root.
        # NOTE(review): the old condition stopped at the first parent that
        # DOES return items, which contradicts this intent; confirm.
        start_id = '_root'
        for parent_id in reversed(parent_ids):
            if parent_id == '_root':
                break
            if not self[parent_id].will_return_items:
                start_id = parent_id
                break

        # A fresh sitemap is built instead of clearing a copy: deleting
        # selectors cascades and would strip the parents that are kept.
        sitemap = Sitemap(
            id=self.id,
            selectors=parents + childs + [selector],
            parent_id=start_id,
            parent_item=self.parent_item,
        )
        yield from sitemap.get_data()
one selector + assert Sitemap(Selector(dicts[0]))[0] == Selector(dicts[0]) + # create sitemap from other sitemap + assert Sitemap(sitemap) == sitemap + +def test_setitem(): + sitemap = Sitemap([dict(id='a', type='TextSelector'), dict(id='b', type='TextSelector')]) + sitemap[0] = dict(id='c', type='TextSelector') + assert sitemap == [dict(id='c', type='TextSelector'), dict(id='b', type='TextSelector')] + + with pytest.raises(ValueError): + sitemap[0] = dict(id='b', type='TextSelector') + +def test_eq(): + dicts = [dict(id='a', type='TextSelector'), dict(id='b', type='TextSelector')] + sitemap = Sitemap(dicts) + assert sitemap == Sitemap(dicts) + assert sitemap != Sitemap(reversed(dicts)) + assert sitemap != Sitemap(dicts[:1]) + +def test_repr(): + dicts = [dict(id='a', type='TextSelector'), dict(id='b', type='TextSelector')] + sitemap = Sitemap('l', dicts) + assert repr(sitemap) == \ + "Sitemap('l', [TextSelector('a'), TextSelector('b')])" + +def test_copy(): + sitemap = Sitemap([dict(id='a', type='TextSelector')]) + result_list = sitemap.copy() + assert sitemap == result_list + assert sitemap is not result_list + assert sitemap[0] == result_list[0] + assert sitemap[0] is not result_list[0] + sitemap.pop() + assert len(sitemap) == 0 + assert len(result_list) == 1 + +def test_concat(): + sitemapa = Sitemap([dict(id='a', type='TextSelector')]) + sitemapb = Sitemap([dict(id='b', type='TextSelector')]) + sitemapc = Sitemap([dict(id='c', type='TextSelector')]) + result_list = sitemapa.concat(sitemapb, sitemapc) + assert result_list == [sitemapa[0], sitemapb[0], sitemapc[0]] + +def test_possible_parent_ids(): + dicts = [dict(id='a', type='ItemSelector'), + dict(id='b', type='GroupSelector'), + dict(id='c', type='HtmlSelector'), + dict(id='d', type='ImageSelector'), + dict(id='e', type='LinkSelector'), + dict(id='f', type='TextSelector'), + ] + sitemap = Sitemap(selectors=dicts) + assert sitemap.possible_parent_ids == ('_root', 'a', 'e') + +def test_get_all(): + 
child_dicts = [ + dict(id='a', type='ItemSelector', parents=['_root', 'c']), + dict(id='b', type='ItemSelector', parents=['a']), + dict(id='c', type='ItemSelector', parents=['b']), + ] + all_dicts = child_dicts + [ + dict(id='d', type='ItemSelector'), + ] + sitemap = Sitemap(all_dicts) + # return all selectors + assert list(sitemap.get_all()) == all_dicts + # return all childs of a parent + assert list(sitemap.get_all('a')) == child_dicts + +def test_get_direct_childs(): + expected_dicts = [ + dict(id='b', type='ItemSelector', parents=['a']), + dict(id='c', type='ItemSelector', parents=['a']), + ] + dicts = expected_dicts + [ + dict(id='a', type='ItemSelector', parents=['_root', 'c']), + dict(id='d', type='ItemSelector'), + ] + actual_selectors = list(Sitemap(dicts).get_direct_childs('a')) + assert actual_selectors == expected_dicts + +def test_will_return_many(): + sitemap = Sitemap([ + dict(id='a', type='ItemSelector', many=False), + dict(id='b', type='ItemSelector', parents=['a'], many=True), + dict(id='c', type='ItemSelector', parents=['b'], many=True), + ]) + assert sitemap.will_return_many('a') + + sitemap = Sitemap([ + dict(id='a', type='ItemSelector', many=True), + dict(id='b', type='ItemSelector', parents=['a'], many=False), + dict(id='c', type='ItemSelector', parents=['b'], many=False), + ]) + assert sitemap.will_return_many('a') + + sitemap = Sitemap([ + dict(id='a', type='ItemSelector', many=False), + dict(id='b', type='ItemSelector', parents=['a'], many=False), + dict(id='c', type='ItemSelector', parents=['b'], many=False), + ]) + assert not sitemap.will_return_many('a') + +def test_json(): + dicts = [dict(id='a', type='ItemSelector', many=False)] + sitemap = Sitemap('l', dicts) + + js_dicts = json.loads(json.dumps(sitemap)) + assert js_dicts == {'id': 'l', 'selectors': dicts} + + result_list = Sitemap(js_dicts) + assert result_list == sitemap + +def test_get_one_page_childs(): + expected_dicts = [ + dict(id='child1', type='TextSelector', 
parents=['parent2'], many=False), + dict(id='child2', type='TextSelector', parents=['parent2'], many=False), + dict(id='child3', type='ItemSelector', parents=['parent2'], many=False), + dict(id='child4', type='ItemSelector', parents=['child3'], many=False), + dict(id='child5', type='TextSelector', parents=['child4'], many=False), + dict(id='link', type='LinkSelector', parents=['child3'], many=False), + ] + sitemap = Sitemap(expected_dicts + [ + dict(id='parent2', type='ItemSelector', many=True), + dict(id='ignore1', type='TextSelector', parents=['link'], many=False), + dict(id='ignore2', type='TextSelector', parents=['link'], many=False), + dict(id='ignore_root', type='TextSelector', many=False), + dict(id='ignore_parent1', type='TextSelector', parents=['parent1'], many=False), + ]) + page_child_selectors = list(sitemap.get_one_page_childs('parent2')) + assert page_child_selectors == expected_dicts + +def test_get_one_page_selectors(): + expected_dicts = [ + dict(id='parent1', type='ItemSelector', many=True), + dict(id='parent2', type='ItemSelector', parents=['parent1'], many=False), + dict(id='child1', type='TextSelector', parents=['parent2'], many=False), + dict(id='child2', type='TextSelector', parents=['parent2'], many=False), + dict(id='child3', type='ItemSelector', parents=['parent2'], many=False), + dict(id='child4', type='ItemSelector', parents=['child3'], many=False), + dict(id='child5', type='TextSelector', parents=['child4'], many=False), + dict(id='link', type='LinkSelector', parents=['parent2'], many=False), + ] + sitemap = Sitemap(expected_dicts + [ + dict(id='ignore1', type='TextSelector', parents=['link'], many=False), + dict(id='ignore2', type='TextSelector', parents=['link'], many=False), + dict(id='ignore_root', type='TextSelector', many=False), + dict(id='ignore_parent1', type='TextSelector', parents=['parent1'], many=False), + ]) + page_child_selectors = list(sitemap.get_one_page_selectors('parent2')) + assert page_child_selectors == 
expected_dicts + +def test_get_one_page_css(): + sitemap = Sitemap([ + dict(id='div', type='TextSelector', css='div'), + ]) + css = sitemap.get_one_page_css('div', ['_root']) + assert css == 'div' + + sitemap = Sitemap([ + dict(id='parent1', type='ItemSelector', css='div.parent'), + dict(id='div', type='TextSelector', css='div'), + ]) + css = sitemap.get_one_page_css('div', ['_root', 'parent1']) + assert css == 'div.parent div' + + sitemap = Sitemap([ + dict(id='parent2', type='ItemSelector', css='div.parent2'), + dict(id='parent1', type='ItemSelector', css='div.parent'), + dict(id='div', type='TextSelector', css='div'), + ]) + css = sitemap.get_one_page_css('div', ['_root', 'parent2', 'parent1']) + assert css == 'div.parent2 div.parent div' + + sitemap = Sitemap([ + dict(id='parent2', type='LinkSelector', css='div.parent2'), + dict(id='parent1', type='ItemSelector', css='div.parent'), + dict(id='div', type='TextSelector', css='div'), + ]) + css = sitemap.get_one_page_css('div', ['_root', 'parent2', 'parent1']) + assert css == 'div.parent div' + +def test_get_one_page_parent_css(): + sitemap = Sitemap([ + dict(id='parent2', type='ItemSelector', css='div.parent2'), + dict(id='parent1', type='ItemSelector', css='div.parent'), + dict(id='div', type='TextSelector', css='div'), + ]) + css = sitemap.get_one_page_parent_css(['_root', 'parent2', 'parent1']) + assert css == 'div.parent2 div.parent' + +def test_has_recursive_selectors(): + sitemap = Sitemap([ + dict(id='parent1', type='ItemSelector', css='div.parent'), + dict(id='parent2', type='ItemSelector', css='div.parent2', parents=['parent1']), + dict(id='div', type='ItemSelector', css='div', parents=['parent2']), + ]) + assert not sitemap.has_recursive_selectors + + sitemap = Sitemap([ + dict(id='parent1', type='ItemSelector', css='div.parent', parents=['div']), + dict(id='parent2', type='ItemSelector', css='div.parent2', parents=['parent1']), + dict(id='div', type='ItemSelector', css='div', parents=['parent2']), + ]) 
+ result = sitemap.has_recursive_selectors + assert result + + sitemap = Sitemap([ + dict(id='link', type='LinkSelector', css='div.parent', parents=['link', '_root']), + dict(id='parent', type='ItemSelector', css='div.parent2', parents=['link']), + dict(id='div', type='ItemSelector', css='div', parents=['parent', 'link']), + ]) + assert not sitemap.has_recursive_selectors + +def test_update_selector(): + # with parent + dicts = [ + dict(id='parent', type='ItemSelector'), + dict(id='a', type='TextSelector', parents=['parent']), + ] + sitemap = Sitemap(selectors=dicts) + expected = Selector('b', type='TextSelector', parents=['parent']) + sitemap[1] = expected + assert sitemap[1] == expected + + # with childs + dicts = [ + dict(id='child', type='TextSelector', parents=['a']), + dict(id='a', type='ItemSelector'), + ] + sitemap = Sitemap(selectors=dicts) + expected = Selector('b', type='ItemSelector') + expected_child = Selector('child', type='TextSelector', parents=['b']) + sitemap[1] = expected + assert sitemap[1] == expected + assert sitemap[0] == expected_child + + # with itself as parent + dicts = [ + dict(id='a', type='ItemSelector', parents=['a']), + ] + sitemap = Sitemap(selectors=dicts) + update = Selector('b', type='ItemSelector', parents=['a']) + expected = Selector('b', type='ItemSelector', parents=['b']) + sitemap[0] = update + assert sitemap[0] == expected + + # type change + dicts = [ + dict(id='a', type='TextSelector', parents=['a']), + ] + sitemap = Sitemap(selectors=dicts) + update = Selector('a', type='LinkSelector', parents=['a']) + assert not sitemap.selectors[0].can_create_new_jobs + sitemap[0] = update + assert sitemap[0].__class__.__name__ == 'LinkSelector' + +def test_columns(): + dicts = [ + dict(id='a', type='TextSelector', parents=['div']), + dict(id='b', type='LinkSelector', parents=['b']), + ] + sitemap = Sitemap(selectors=dicts) + assert sitemap.columns == ('a', 'b', 'b-href') + +def test_delete(): + dicts = [ + dict(id='a', 
type='TextSelector'), + dict(id='b', type='LinkSelector'), + ] + sitemap = Sitemap(selectors=dicts) + del sitemap[0] + assert len(sitemap) == 1 + + dicts = [ + dict(id='a', type='TextSelector'), + dict(id='b', type='LinkSelector', parents=['a']), + ] + sitemap = Sitemap(selectors=dicts) + del sitemap[0] + assert len(sitemap) == 0 + + dicts = [ + dict(id='a', type='TextSelector'), + dict(id='b', type='LinkSelector', parents=['a']), + dict(id='c', type='LinkSelector', parents=['b', '_root']), + ] + sitemap = Sitemap(selectors=dicts) + del sitemap[0] + expected = Selector('c', type='LinkSelector') + assert len(sitemap) == 1 + assert sitemap[0] == expected + +URLS = { + 'one': + ('http://a.b/', ['http://a.b/']), + 'tuple': + (('http://a.b/1.html', 'http://a.b/2.html'), ['http://a.b/1.html', 'http://a.b/2.html']), + 'list': + (['http://a.b/1.html', 'http://a.b/2.html'], ['http://a.b/1.html', 'http://a.b/2.html']), + 'range': + ('http://a.b/[1-3].html', ['http://a.b/1.html', 'http://a.b/2.html', 'http://a.b/3.html']), + 'range_in_get': + ('http://a.b/?id=[1-3]', ['http://a.b/?id=1', 'http://a.b/?id=2', 'http://a.b/?id=3']), + 'range_step': + ('http://a.b/?id=[0-4:2]', ['http://a.b/?id=0', 'http://a.b/?id=2', 'http://a.b/?id=4']), + 'range_with_lpad': + ('http://a.b/[001-003]/', ['http://a.b/001/', 'http://a.b/002/', 'http://a.b/003/']), +} +@pytest.mark.parametrize('start_urls,expected', list(URLS.values()), ids=list(URLS)) +def test_start_urls(start_urls, expected): + sitemap = Sitemap(dict(start_urls=start_urls)) + assert list(sitemap.start_urls) == expected + +def test_get_csv_rows(): + dicts = [ + dict(id='a', type='TextSelector', parents=['div']), + dict(id='b', type='TextSelector', parents=['b']), + ] + sitemap = Sitemap(selectors=dicts) + row_dicts = [dict(a='a', b=['b'], c='c')] + csv_rows = list(sitemap.get_csv_rows(row_dicts)) + # can't access the data so I'm just checking whether this runs + assert csv_rows == [('a', 'b'), ('a', '["b"]')] + +IS_COMMON = { + 
'one_single_is': + [TextSelector('a', many=0)], + 'one_many_is_not': + [TextSelector('a', many=1)], + 'link_is_not': + [LinkSelector('a', many=0), TextSelector('b', parents=['a'], many=0)], + 'singles_tree_is': + [ItemSelector('a', many=0), TextSelector('b', parents=['a'], many=0)], + 'singles_tree_with_many_is_not': + [ItemSelector('a', many=0), TextSelector('b', parents=['a'], many=1)], +} +@pytest.mark.parametrize('title,selectors', list(IS_COMMON.items()), ids=list(IS_COMMON.keys())) +def test_selector_is_common_to_all_trees(title, selectors): + """Should be able to tell whether a selector will be common to all selector tree groups.""" + expected = not title.endswith('not') + result = Sitemap(selectors).selector_is_common_to_all_trees(selectors[0]) + assert result is expected + + +def test_get_selectors_common_to_all_trees(): + """Should be able to find selectors common to all selector trees.""" + selectors = [ + ItemSelector('a', many=0), + TextSelector('b', many=0, parents=['a']), + TextSelector('c', many=0), + TextSelector('d', many=1), + ItemSelector('e', many=0), + TextSelector('f', many=1, parents=['e']), + ] + result = Sitemap(selectors).get_selectors_common_to_all_trees('_root') + assert result == selectors[:3] + +TREES = { + 'single_item': + ([TextSelector('a')], [['a']]), + 'link_tree': + ([LinkSelector('a', many=1), TextSelector('b', many=0, parents=['a'])], [['a']]), + 'tree_with_many': + ([ItemSelector('a', many=0), LinkSelector('b', many=1, parents=['a'])], [['a', 'b']]), + 'tree_without_many': + ([ItemSelector('a', many=0), + ItemSelector('b', many=0), + TextSelector('c', many=0, parents=['a'])], + [['a', 'c', 'b']]), + # TODO: jasmine result: [['a', 'b', 'c']] + 'many_link_trees': + ([ItemSelector('common ', many=0), + ItemSelector('parent1 ', many=0), + ItemSelector('parent2 ', many=0), + LinkSelector('follow1 ', many=1, parents=['parent1']), + LinkSelector('follow11', many=1, parents=['parent1']), + LinkSelector('follow2 ', many=1, 
parents=['parent2']), + LinkSelector('follow3 ', many=1)], + [['common', 'parent1', 'follow1'], + ['common', 'parent1', 'follow11'], + ['common', 'parent2', 'follow2'], + ['common', 'follow3']]), + 'many_results_in_many_trees': + ([ItemSelector('common ', many=0), + ItemSelector('parent1', many=0), + ItemSelector('parent2', many=0), + TextSelector('common1', many=0, parents=['parent1']), + TextSelector('many1 ', many=1, parents=['parent1']), + TextSelector('many11 ', many=1, parents=['parent1']), + TextSelector('many2 ', many=1, parents=['parent2']), + TextSelector('many3 ', many=1)], + [['common', 'parent1', 'common1', 'many1'], + ['common', 'parent1', 'common1', 'many11'], + ['common', 'parent2', 'many2'], + ['common', 'many3']]), + # TODO: jasmine result: [ + # ['common', 'common1', 'parent1', 'many1'], + # ['common', 'common1', 'parent1', 'many11'], + # ['common', 'parent2', 'many2'], + # ['common', 'many3'] + # ] + 'chained_many': + ([ItemSelector('div ', many=0, css='div'), + ItemSelector('table', many=1, css='table', parents=['div']), + ItemSelector('tr ', many=1, css='tr', parents=['table']), + TextSelector('td ', many=0, css='td', parents=['tr'])], + [['div', 'table', 'tr', 'td']]), + 'return_one_for_this_map': + # for url: http://www.centos.org/modules/tinycontent/index.php?id=30 + ([ItemSelector('mirror-row ', many=1, parents=['_root', 'mirror-page'], + css='table#cter tr:nth-of-type(n+3)'), + TextSelector('region ', many=0, parents=['mirror-row'], + css='td:nth-of-type(1)'), + TextSelector('state ', many=0, parents=['mirror-row'], + css='td:nth-of-type(2)'), + LinkSelector('url-http ', many=0, parents=['mirror-row'], + css='td:nth-of-type(7) a')], + [['mirror-row', 'region', 'state', 'url-http']]), +} +@pytest.mark.parametrize('selectors,expected', list(TREES.values()), ids=list(TREES)) +def test_trees(selectors, expected): + result = [[s.id for s in t] for t in Sitemap(selectors).trees] + assert result == expected + +HTML1 = 'ABC' +HTML2 = '
A
B
' +HTML3 = ('
result1
result2
' + '
result3
result4
') + +GET_DATA = { + 'text_from_single': (HTML1, + [TextSelector('a', many=0, css='a')], + [{'a': 'A'}]), + 'text_from_parent': (HTML1, + [TextSelector('span', many=0, css='span')], + [{'span': 'C'}]), + 'multiple_texts': (HTML1, + [TextSelector('a', many=1, css='a')], + [{'a': 'A'}, {'a': 'B'}]), + 'multiple_texts_with_common_data': (HTML1, + [TextSelector('a', many=1, css='a'), TextSelector('c', many=0, css='.c')], + [{'a': 'A', 'c': 'C'}, {'a': 'B', 'c': 'C'}]), + 'text_within_element': (HTML2, + [ItemSelector('e', css='div', many=0), TextSelector('a', css='a', parents=['e'], many=0)], + [{'a': 'A'}]), + 'texts_within_elements': (HTML2, + [ItemSelector('e', css='div', many=1), TextSelector('a', css='a', parents=['e'], many=0)], + [{'a': 'A'}, {'a': 'B'}]), + 'texts_within_element': (HTML2, + [ItemSelector('e', css='div', many=0), TextSelector('a', css='a', parents=['e'], many=1)], + [{'a': 'A'}]), + 'many_from_chained': (HTML3, + [ItemSelector('div', css='div', many=0), + ItemSelector('table', css='table', parents=['div'], many=1), + ItemSelector('tr', css='tr', parents=['table'], many=1), + TextSelector('td', css='td', parents=['tr'], many=0)], + [{'td': 'result1'}, {'td': 'result2'}, {'td': 'result3'}, {'td': 'result4'}]), + 'empty_from_single': (HTML3, + [ItemSelector('span', css='span.non', many=0)], + []), # TODO: jasmine result: [{'span': None}] + 'empty_from_many': (HTML3, + [ItemSelector('span', css='span.non', many=1)], + []), +} +@pytest.mark.parametrize('html,selectors,expected', list(GET_DATA.values()), ids=list(GET_DATA)) +def test_get_data(html, selectors, expected): + sitemap = Sitemap(selectors, parent_id='_root', parent_item=html) + assert list(sitemap.get_data()) == expected + +def test_get_selector_common_data(): + # with one selector + selectors = [Selector('a', 'TextSelector', css='a', many=0)] + sitemap = Sitemap(selectors) + result = list(sitemap.get_selector_common_data(sitemap, selectors[0], HTML1)) + assert result == [{'a': 'A'}] + +def 
test_get_selector_tree_common_data(): + # with one selector + selectors = [Selector('a', 'TextSelector', css='a', many=0)] + sitemap = Sitemap(selectors, parent_item=HTML1) + assert sitemap.get_selector_tree_common_data(sitemap, '_root', HTML1) == {'a': 'A'} + + # with multiple selectors + parent_item = """
A
""" + selectors = [ + Selector('parent1', 'ItemSelector', css='div', many=0), + Selector('parent2', 'ItemSelector', css='div', parents=['parent1'], many=0), + Selector('a', 'TextSelector', css='a', parents=['parent2'], many=0), + ] + sitemap = Sitemap(selectors, parent_item=parent_item) + assert sitemap.get_selector_tree_common_data(sitemap, '_root', parent_item) == {'a': 'A'}