Merge branch 'master' into develop
iAnanich committed Jun 20, 2017
2 parents d869b21 + 18b73b1 commit 6fa9bfb
Showing 2 changed files with 16 additions and 2 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -55,3 +55,10 @@ and two `-----` strings.
When the spider scrapes the `news` page, it first fetches the `indexes` list of
articles scraped during the last week via the Scrapy Cloud API. It then iterates
over the article links and scrapes only those whose index is not already in the
`indexes` list.
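The dedup step described above can be sketched as a plain filter. The function names and the index rule (last path segment of the link) are illustrative assumptions, not the project's actual helpers:

```python
def filter_new(links, scraped_indexes, to_index):
    """Yield only links whose index has not been scraped before."""
    scraped = set(scraped_indexes)
    for link in links:
        if to_index(link) not in scraped:
            yield link

# Assumed index rule: the last path segment identifies an article.
last_segment = lambda path: path.rstrip('/').rsplit('/', 1)[-1]

fresh = list(filter_new(['news/1', 'news/2', 'news/3'], ['2'], last_segment))
# fresh == ['news/1', 'news/3']
```

Passing the index rule in as a callable keeps the filter reusable across spiders whose URLs encode the article index differently.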

#### Inheriting

The `scrapy_climate/spider.py` Python module provides a `TemplateSpider` class
that can be used as a parent for actually running spiders. To create a new
spider, configure its selectors and define its name, domain, and the relative
path to the first page. See the docstrings for details.
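A new spider then reduces to a small subclass carrying that configuration. The attribute names below are illustrative stand-ins, not the base class's actual API (the real names are documented in the `TemplateSpider` docstrings):

```python
class OrgNewsSpider:  # would inherit from TemplateSpider in the project
    # Identity and entry point, per the checklist above.
    name = 'org_news'
    allowed_domains = ['example.org']   # domain (assumed example value)
    _start_path = 'news'                # relative path to the first page
    # Selectors to configure (names and values are assumptions).
    _link_selector = 'article a::attr(href)'
    _title_selector = 'h1.title::text'
```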
11 changes: 9 additions & 2 deletions scrapy_climate/spider.py
```diff
@@ -67,8 +67,15 @@ def _scraped_in_past(self):
         return fetch_scraped_indexes(self.name)

     ### "yield" methods that returns generators
-    def _yield_request(self, path: str):
-        url = '{protocol}://{host}/{path}'.format(protocol=self._protocol, host=self.allowed_domains[0], path=path)
+    def _yield_request(self, path_or_url: str):
+        if '://' in path_or_url:
+            url = path_or_url
+            # extracting relative path from url
+            _protocol = self._protocol + '://'
+            path = path_or_url[path_or_url[len(_protocol):].find('/') + len(_protocol) + 1:]
+        else:
+            path = path_or_url
+            url = '{protocol}://{host}/{path}'.format(protocol=self._protocol, host=self.allowed_domains[0], path=path)
         index = self._convert_path_to_index(path)
         if index not in self._scraped_indexes:
             yield scrapy.http.Request(url=url,
```
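The added branch slices the relative path out of a full URL before deduplication. The same logic as a standalone sketch, with the protocol as a parameter instead of `self._protocol` (the function name and default value are assumptions):

```python
def to_path(path_or_url: str, protocol: str = 'http') -> str:
    """Return the relative path, whether given a full URL or a bare path."""
    prefix = protocol + '://'
    if '://' in path_or_url:
        # Drop "protocol://", then everything up to and including
        # the first '/' after the host.
        rest = path_or_url[len(prefix):]
        return rest[rest.find('/') + 1:]
    return path_or_url

print(to_path('http://example.com/news/42'))  # news/42
print(to_path('news/42'))                     # news/42
```

Normalizing both inputs to a path lets `_convert_path_to_index` compute the same index regardless of which form the caller passed.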
