Commit 433c203

wip
jaesivsm committed Dec 1, 2016
1 parent bb82b44 commit 433c203
Showing 6 changed files with 259 additions and 215 deletions.
49 changes: 26 additions & 23 deletions src/crawler/lib/headers_handling.py
@@ -22,6 +22,28 @@ def rfc_1123_utc(time_obj=None, delta=None):
return time_obj.strftime(RFC_1123_FORMAT)


def _extract_max_age(headers, feed_info, now):
if 'max-age' in headers.get('cache-control', ''):
try:
max_age = int(MAX_AGE_RE.search(headers['cache-control']).group(1))
feed_info['expires'] = now + timedelta(seconds=max_age)
except Exception:
pass


def _extract_expires(headers, feed_info):
if headers.get('expires'):
try:
expires = dateutil.parser.parse(headers['expires'])
if expires.tzinfo:
expires = expires.astimezone(timezone.utc)
else:
expires = expires.replace(tzinfo=timezone.utc)
feed_info['expires'] = expires
except Exception:
pass


def extract_feed_info(headers):
"""providing the headers of a feed response, will calculate the headers
needed for basic cache control.
@@ -37,32 +59,13 @@ def extract_feed_info(headers):

feed_info = {'etag': headers.get('etag', ''),
'last_modified': headers.get('last-modified', rfc_1123_utc())}
msg = "didn't found expiring mechanism, expiring at %r"
if 'max-age' in headers.get('cache-control', ''):
msg = 'found Cache-Control "max-age" header, expiring at %r'
try:
max_age = int(MAX_AGE_RE.search(headers['cache-control']).group(1))
except Exception:
pass
else:
feed_info['expires'] = now + timedelta(seconds=max_age)
if 'expires' not in feed_info and headers.get('expires'):
msg = "found Expires header, expiring at %r"
try:
expires = dateutil.parser.parse(headers['expires'])
if expires.tzinfo:
expires = expires.astimezone(timezone.utc)
else:
expires = expires.replace(tzinfo=timezone.utc)
except Exception:
pass
else:
feed_info['expires'] = expires

_extract_max_age(headers, feed_info, now)
if 'expires' not in feed_info:
_extract_expires(headers, feed_info)
if not feed_info.get('expires'):
feed_info['expires'] = now + timedelta(
seconds=conf.FEED_DEFAULT_EXPIRES)
logger.info(msg, feed_info['expires'].isoformat())

if max_expires < feed_info['expires']:
logger.info("expiring too late, forcing expiring at %r",
max_expires.isoformat())
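The cache-expiry logic that extract_feed_info now delegates to _extract_max_age is easiest to follow with a concrete run. The sketch below is a standalone approximation: the MAX_AGE_RE pattern and the sample headers are assumptions for illustration, not copied from the repository.

import re
from datetime import datetime, timedelta, timezone

# assumed to approximate the module-level MAX_AGE_RE used above
MAX_AGE_RE = re.compile(r'max-age=(\d+)')

def extract_max_age_sketch(headers, feed_info, now):
    # same shape as _extract_max_age: pull "max-age=<seconds>" out of
    # Cache-Control and store it as an absolute expiry datetime
    if 'max-age' in headers.get('cache-control', ''):
        try:
            max_age = int(MAX_AGE_RE.search(headers['cache-control']).group(1))
            feed_info['expires'] = now + timedelta(seconds=max_age)
        except Exception:
            # malformed header: leave feed_info untouched so the caller
            # can fall back to Expires or the configured default
            pass
    return feed_info

now = datetime.now(timezone.utc)
print(extract_max_age_sketch({'cache-control': 'public, max-age=600'}, {}, now))
# {'expires': <now + 10 minutes>}

With the refactor, extract_feed_info tries max-age first, then the Expires header via _extract_expires, and finally falls back to conf.FEED_DEFAULT_EXPIRES.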
111 changes: 58 additions & 53 deletions src/lib/article_utils.py
@@ -65,32 +65,31 @@ def get_article_content(entry):
return content


def _fetch_article(link):
try:
# resolves URL behind proxies (like feedproxy.google.com)
return jarr_get(link, timeout=5)
except MissingSchema:
split = urlsplit(link)
for scheme in 'https', 'http':
new_link = urlunsplit(SplitResult(scheme, *split[1:]))
try:
return jarr_get(new_link, timeout=5)
except Exception as error:
continue
except Exception as error:
logger.info("Unable to get the real URL of %s. Won't fix "
"link or title. Error: %s", link, error)


def get_article_details(entry, fetch=True):
article_link = entry.get('link')
article_title = html.unescape(entry.get('title', ''))
tags = {tag.get('term').strip() for tag in entry.get('tags', [])
if tag.get('term').strip()}
if fetch and conf.CRAWLER_RESOLV and article_link or not article_title:
try:
# resolves URL behind proxies (like feedproxy.google.com)
response = jarr_get(article_link, timeout=5)
except MissingSchema:
split, failed = urlsplit(article_link), False
for scheme in 'https', 'http':
new_link = urlunsplit(SplitResult(scheme, *split[1:]))
try:
response = jarr_get(new_link, timeout=5)
except Exception as error:
failed = True
continue
failed = False
article_link = new_link
break
if failed:
return article_link, article_title or 'No title', tags
except Exception as error:
logger.info("Unable to get the real URL of %s. Won't fix "
"link or title. Error: %s", article_link, error)
response = _fetch_article(article_link)
if response is None:
return article_link, article_title or 'No title', tags
article_link = response.url
if not article_title:
@@ -118,47 +117,53 @@ class FiltersTrigger(Enum):
NO_MATCH = 'no match'


def _is_filter_to_skip(filter_action, only_actions, article):
if filter_action not in only_actions:
return True
if filter_action in {FiltersType.REGEX, FiltersType.MATCH,
FiltersType.EXACT_MATCH} and 'title' not in article:
return True
if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \
and 'tags' not in article:
return True
return False


def _is_filter_matching(filter_, article):
pattern = filter_.get('pattern', '')
filter_type = FiltersType(filter_.get('type'))
filter_trigger = FiltersTrigger(filter_.get('action on'))
if filter_type is not FiltersType.REGEX:
pattern = pattern.lower()
title = article.get('title', '').lower()
tags = [tag.lower() for tag in article.get('tags', [])]
if filter_type is FiltersType.REGEX:
match = re.match(pattern, title)
elif filter_type is FiltersType.MATCH:
match = pattern in title
elif filter_type is FiltersType.EXACT_MATCH:
match = pattern == title
elif filter_type is FiltersType.TAG_MATCH:
match = pattern in tags
elif filter_type is FiltersType.TAG_CONTAINS:
match = any(pattern in tag for tag in tags)
return match and filter_trigger is FiltersTrigger.MATCH \
or not match and filter_trigger is FiltersTrigger.NO_MATCH


def process_filters(filters, article, only_actions=None):
skipped, read, liked = False, None, False
filters = filters or []
if only_actions is None:
only_actions = set(FiltersAction)
for filter_ in filters:
match = False
try:
pattern = filter_.get('pattern', '')
filter_type = FiltersType(filter_.get('type'))
filter_action = FiltersAction(filter_.get('action'))
filter_trigger = FiltersTrigger(filter_.get('action on'))
if filter_type is not FiltersType.REGEX:
pattern = pattern.lower()
except ValueError:
continue
if filter_action not in only_actions:
filter_action = FiltersAction(filter_.get('action'))

if _is_filter_to_skip(filter_action, only_actions, article):
logger.debug('ignoring filter %r' % filter_)
continue
if filter_action in {FiltersType.REGEX, FiltersType.MATCH,
FiltersType.EXACT_MATCH} and 'title' not in article:
continue
if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \
and 'tags' not in article:
continue
title = article.get('title', '').lower()
tags = [tag.lower() for tag in article.get('tags', [])]
if filter_type is FiltersType.REGEX:
match = re.match(pattern, title)
elif filter_type is FiltersType.MATCH:
match = pattern in title
elif filter_type is FiltersType.EXACT_MATCH:
match = pattern == title
elif filter_type is FiltersType.TAG_MATCH:
match = pattern in tags
elif filter_type is FiltersType.TAG_CONTAINS:
match = any(pattern in tag for tag in tags)
take_action = match and filter_trigger is FiltersTrigger.MATCH \
or not match and filter_trigger is FiltersTrigger.NO_MATCH

if not take_action:

if not _is_filter_matching(filter_, article):
continue

if filter_action is FiltersAction.READ:
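The URL resolution that used to be inlined in get_article_details now lives in _fetch_article: try the link as-is, and on a missing scheme retry with https, then http. A rough standalone sketch of that behaviour, using requests.get in place of jarr_get and a made-up scheme-less link (both are assumptions):

from urllib.parse import SplitResult, urlsplit, urlunsplit

import requests
from requests.exceptions import MissingSchema

def fetch_with_scheme_fallback(link, timeout=5):
    try:
        # resolves URLs behind proxies (like feedproxy.google.com)
        return requests.get(link, timeout=timeout)
    except MissingSchema:
        # no scheme in the link: rebuild it with https, then http
        split = urlsplit(link)
        for scheme in ('https', 'http'):
            new_link = urlunsplit(SplitResult(scheme, *split[1:]))
            try:
                return requests.get(new_link, timeout=timeout)
            except Exception:
                continue
    except Exception as error:
        print('unable to resolve %s: %s' % (link, error))
    return None

response = fetch_with_scheme_fallback('//example.com/some-article')
print(response.url if response is not None else 'unreachable')

Returning None on failure is what lets the new get_article_details bail out with the original link and 'No title' instead of juggling a failed flag.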
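The other change in this file splits process_filters into _is_filter_to_skip and _is_filter_matching. The subtle part is the trailing boolean in _is_filter_matching: a filter applies either when its pattern matches (the 'match' trigger) or when it does not (the 'no match' trigger). A tiny truth-table sketch; note the MATCH value is an assumption, since only NO_MATCH = 'no match' is visible in the hunk above.

from enum import Enum

class FiltersTrigger(Enum):
    MATCH = 'match'  # assumed value, not shown in the diff
    NO_MATCH = 'no match'

def filter_applies(match, trigger):
    # same boolean expression as the return statement of _is_filter_matching
    return bool(match and trigger is FiltersTrigger.MATCH
                or not match and trigger is FiltersTrigger.NO_MATCH)

for match in (True, False):
    for trigger in FiltersTrigger:
        print(match, trigger.value, '->', filter_applies(match, trigger))
# True match -> True        True no match -> False
# False match -> False      False no match -> True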
110 changes: 71 additions & 39 deletions src/lib/feed_utils.py
@@ -2,6 +2,7 @@
import logging
import urllib
from copy import deepcopy
from functools import lru_cache

import feedparser

@@ -53,24 +54,17 @@ def get_parsed_feed(url):
return fp_parsed


@correct_feed_values
def construct_feed_from(url=None, fp_parsed=None, feed=None):
"""
Will try to construct the most complete feed dict possible.
@lru_cache(maxsize=None)
def get_splits(url, site_link=None):
# trying to make up for missing values
feed_split = urllib.parse.urlsplit(url)
site_split = None
if site_link:
site_split = urllib.parse.urlsplit(site_link)
return site_split, feed_split

url: an url of a feed or a site that might be hosting a feed
fp_parsed: a feedparser object previously obtained
feed: an existing feed dict, will be updated
"""
feed = deepcopy(feed) if feed else {}
if not url and hasattr(fp_parsed, 'get') and fp_parsed.get('href'):
url = fp_parsed.get('href')

# we'll try to obtain our first parsing from feedparser
if url and not fp_parsed:
fp_parsed = get_parsed_feed(url)
assert url is not None and fp_parsed is not None

def _extract_links(url, feed, fp_parsed):
if is_parsing_ok(fp_parsed):
feed['link'] = url
else:
@@ -93,25 +87,18 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None):
and link['type'] == 'text/html'))
feed['site_link'] = site_link or feed.get('site_link')
feed['site_link'] = feed['site_link'] or feed.get('link')
return fp_parsed

if not feed.get('title'): # not overriding user pref for title
if fp_parsed['feed'].get('title'):
feed['title'] = fp_parsed['feed'].get('title')
elif fp_parsed['feed'].get('title_detail', {}).get('value'):
feed['title'] = fp_parsed['feed']['title_detail']['value']

if fp_parsed['feed'].get('summary'):
feed['description'] = fp_parsed['feed']['summary']
elif fp_parsed['feed'].get('subtitle_detail', {}).get('value'):
feed['description'] = fp_parsed['feed']['subtitle_detail']['value']
def _update_feed_w_parsed(fkey, simple_key, value_key, feed, fp_parsed):
if fp_parsed['feed'].get(simple_key):
feed[fkey] = fp_parsed['feed'].get(simple_key)
elif fp_parsed['feed'].get(value_key, {}).get('value'):
feed[fkey] = fp_parsed['feed'][value_key]['value']

# trying to make up for missing values
feed_split = urllib.parse.urlsplit(url)
site_split = None
if feed.get('site_link'):
feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
site_split = urllib.parse.urlsplit(feed['site_link'])

def _check_and_fix_icon(url, feed, fp_parsed):
site_split, feed_split = get_splits(url, feed.get('site_link'))
new_icon_urls = [fp_parsed.get('feed', {}).get('icon')] \
+ list(_browse_feedparser_feed(fp_parsed,
lambda link: 'icon' in link['rel']))
@@ -124,16 +111,12 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None):
feed['icon_url'] = icon_url
break

nothing_to_fill = all(bool(feed.get(key))
for key in ('link', 'title', 'icon_url'))
# here we have all we want or we do not have the main url,
# either way we're leaving
if not feed.get('site_link') or nothing_to_fill:
return feed

# trying to parse the page of the site for some rel link in the header
def _fetch_url_and_enhance_feed(url, feed):
"""trying to parse the page of the site for some rel link in the header"""
site_split, feed_split = get_splits(url, feed.get('site_link'))
try:
response = jarr_get(feed['site_link'])
response = jarr_get(url)
except Exception as error:
logger.warn('failed to retreive %r: %r', feed['site_link'], error)
return feed
Expand All @@ -147,3 +130,52 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None):
if not feed.get('link'):
feed['link'] = extract_feed_link(response, feed_split)
return feed


def _is_processing_complete(feed, site_link_necessary=False):
all_filled = all(bool(feed.get(key))
for key in ('link', 'title', 'icon_url'))
# here we have all we want or we do not have the main url,
# either way we're leaving
return (site_link_necessary and not feed.get('site_link')) or all_filled


@correct_feed_values
def construct_feed_from(url=None, fp_parsed=None, feed=None):
"""
Will try to construct the most complete feed dict possible.
url: an url of a feed or a site that might be hosting a feed
fp_parsed: a feedparser object previously obtained
feed: an existing feed dict, will be updated
"""
feed = deepcopy(feed) if feed else {}
if not url and hasattr(fp_parsed, 'get') and fp_parsed.get('href'):
url = fp_parsed.get('href')

# we'll try to obtain our first parsing from feedparser
if url and not fp_parsed:
fp_parsed = get_parsed_feed(url)
assert url is not None and fp_parsed is not None

fp_parsed = _extract_links(url, feed, fp_parsed)

if not feed.get('title'): # not overriding user pref for title
_update_feed_w_parsed('title', 'title', 'title_detail',
feed, fp_parsed)
_update_feed_w_parsed('description', 'summary', 'subtitle_detail',
feed, fp_parsed)

if _is_processing_complete(feed):
return feed

if feed.get('site_link'):
feed['site_link'] = rebuild_url(feed['site_link'],
get_splits(url, feed['site_link'])[1])

_check_and_fix_icon(url, feed, fp_parsed)

if _is_processing_complete(feed, site_link_necessary=True):
return feed

return _fetch_url_and_enhance_feed(feed['site_link'], feed)
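
The lru_cache-decorated get_splits introduced here just memoizes the urlsplit work that _check_and_fix_icon, _fetch_url_and_enhance_feed and construct_feed_from now share. A standalone sketch of that behaviour with made-up URLs (the example URLs are assumptions):

import urllib.parse
from functools import lru_cache

@lru_cache(maxsize=None)
def get_splits(url, site_link=None):
    # parse the feed URL (and the site link, when known) only once per pair
    feed_split = urllib.parse.urlsplit(url)
    site_split = None
    if site_link:
        site_split = urllib.parse.urlsplit(site_link)
    return site_split, feed_split

site_split, feed_split = get_splits('https://example.com/feed.xml',
                                    'https://example.com/')
print(feed_split.netloc, feed_split.path)  # example.com /feed.xml
get_splits('https://example.com/feed.xml', 'https://example.com/')
print(get_splits.cache_info().hits)  # 1: the second call hit the cache

Because the cache key is the (url, site_link) pair, re-parsing is only avoided when the helpers are called with the same strings; an unbounded maxsize also means the cached entries live for the whole process.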
