Commit 433c203

wip
jaesivsm committed Dec 1, 2016
1 parent bb82b44 commit 433c203
Showing 6 changed files with 259 additions and 215 deletions.
49 changes: 26 additions & 23 deletions src/crawler/lib/headers_handling.py
@@ -22,6 +22,28 @@ def rfc_1123_utc(time_obj=None, delta=None):
return time_obj.strftime(RFC_1123_FORMAT)


def _extract_max_age(headers, feed_info, now):
if 'max-age' in headers.get('cache-control', ''):
try:
max_age = int(MAX_AGE_RE.search(headers['cache-control']).group(1))
feed_info['expires'] = now + timedelta(seconds=max_age)
except Exception:
pass


def _extract_expires(headers, feed_info):
if headers.get('expires'):
try:
expires = dateutil.parser.parse(headers['expires'])
if expires.tzinfo:
expires = expires.astimezone(timezone.utc)
else:
expires = expires.replace(tzinfo=timezone.utc)
feed_info['expires'] = expires
except Exception:
pass


def extract_feed_info(headers):
"""providing the headers of a feed response, will calculate the headers
needed for basic cache control.
@@ -37,32 +59,13 @@ def extract_feed_info(headers):

feed_info = {'etag': headers.get('etag', ''),
'last_modified': headers.get('last-modified', rfc_1123_utc())}
msg = "didn't found expiring mechanism, expiring at %r"
if 'max-age' in headers.get('cache-control', ''):
msg = 'found Cache-Control "max-age" header, expiring at %r'
try:
max_age = int(MAX_AGE_RE.search(headers['cache-control']).group(1))
except Exception:
pass
else:
feed_info['expires'] = now + timedelta(seconds=max_age)
if 'expires' not in feed_info and headers.get('expires'):
msg = "found Expires header, expiring at %r"
try:
expires = dateutil.parser.parse(headers['expires'])
if expires.tzinfo:
expires = expires.astimezone(timezone.utc)
else:
expires = expires.replace(tzinfo=timezone.utc)
except Exception:
pass
else:
feed_info['expires'] = expires

_extract_max_age(headers, feed_info, now)
if 'expires' not in feed_info:
_extract_expires(headers, feed_info)
if not feed_info.get('expires'):
feed_info['expires'] = now + timedelta(
seconds=conf.FEED_DEFAULT_EXPIRES)
logger.info(msg, feed_info['expires'].isoformat())

if max_expires < feed_info['expires']:
logger.info("expiring too late, forcing expiring at %r",
max_expires.isoformat())
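The cache-expiry logic that extract_feed_info now delegates to _extract_max_age is easiest to follow with a concrete run. The sketch below is a standalone approximation: the MAX_AGE_RE pattern and the sample headers are assumptions for illustration, not copied from the repository.

import re
from datetime import datetime, timedelta, timezone

# assumed to approximate the module-level MAX_AGE_RE used above
MAX_AGE_RE = re.compile(r'max-age=(\d+)')

def extract_max_age_sketch(headers, feed_info, now):
    # same shape as _extract_max_age: pull "max-age=<seconds>" out of
    # Cache-Control and store it as an absolute expiry datetime
    if 'max-age' in headers.get('cache-control', ''):
        try:
            max_age = int(MAX_AGE_RE.search(headers['cache-control']).group(1))
            feed_info['expires'] = now + timedelta(seconds=max_age)
        except Exception:
            # malformed header: leave feed_info untouched so the caller
            # can fall back to Expires or the configured default
            pass
    return feed_info

now = datetime.now(timezone.utc)
print(extract_max_age_sketch({'cache-control': 'public, max-age=600'}, {}, now))
# {'expires': <now + 10 minutes>}

With the refactor, extract_feed_info tries max-age first, then the Expires header via _extract_expires, and finally falls back to conf.FEED_DEFAULT_EXPIRES.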
111 changes: 58 additions & 53 deletions src/lib/article_utils.py
@@ -65,32 +65,31 @@ def get_article_content(entry):
return content


def _fetch_article(link):
try:
# resolves URL behind proxies (like feedproxy.google.com)
return jarr_get(link, timeout=5)
except MissingSchema:
split = urlsplit(link)
for scheme in 'https', 'http':
new_link = urlunsplit(SplitResult(scheme, *split[1:]))
try:
return jarr_get(new_link, timeout=5)
except Exception as error:
continue
except Exception as error:
logger.info("Unable to get the real URL of %s. Won't fix "
"link or title. Error: %s", link, error)


def get_article_details(entry, fetch=True):
article_link = entry.get('link')
article_title = html.unescape(entry.get('title', ''))
tags = {tag.get('term').strip() for tag in entry.get('tags', [])
if tag.get('term').strip()}
if fetch and conf.CRAWLER_RESOLV and article_link or not article_title:
try:
# resolves URL behind proxies (like feedproxy.google.com)
response = jarr_get(article_link, timeout=5)
except MissingSchema:
split, failed = urlsplit(article_link), False
for scheme in 'https', 'http':
new_link = urlunsplit(SplitResult(scheme, *split[1:]))
try:
response = jarr_get(new_link, timeout=5)
except Exception as error:
failed = True
continue
failed = False
article_link = new_link
break
if failed:
return article_link, article_title or 'No title', tags
except Exception as error:
logger.info("Unable to get the real URL of %s. Won't fix "
"link or title. Error: %s", article_link, error)
response = _fetch_article(article_link)
if response is None:
return article_link, article_title or 'No title', tags
article_link = response.url
if not article_title:
@@ -118,47 +117,53 @@ class FiltersTrigger(Enum):
NO_MATCH = 'no match'


def _is_filter_to_skip(filter_action, only_actions, article):
if filter_action not in only_actions:
return True
if filter_action in {FiltersType.REGEX, FiltersType.MATCH,
FiltersType.EXACT_MATCH} and 'title' not in article:
return True
if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \
and 'tags' not in article:
return True
return False


def _is_filter_matching(filter_, article):
pattern = filter_.get('pattern', '')
filter_type = FiltersType(filter_.get('type'))
filter_trigger = FiltersTrigger(filter_.get('action on'))
if filter_type is not FiltersType.REGEX:
pattern = pattern.lower()
title = article.get('title', '').lower()
tags = [tag.lower() for tag in article.get('tags', [])]
if filter_type is FiltersType.REGEX:
match = re.match(pattern, title)
elif filter_type is FiltersType.MATCH:
match = pattern in title
elif filter_type is FiltersType.EXACT_MATCH:
match = pattern == title
elif filter_type is FiltersType.TAG_MATCH:
match = pattern in tags
elif filter_type is FiltersType.TAG_CONTAINS:
match = any(pattern in tag for tag in tags)
return match and filter_trigger is FiltersTrigger.MATCH \
or not match and filter_trigger is FiltersTrigger.NO_MATCH


def process_filters(filters, article, only_actions=None):
skipped, read, liked = False, None, False
filters = filters or []
if only_actions is None:
only_actions = set(FiltersAction)
for filter_ in filters:
match = False
try:
pattern = filter_.get('pattern', '')
filter_type = FiltersType(filter_.get('type'))
filter_action = FiltersAction(filter_.get('action'))
filter_trigger = FiltersTrigger(filter_.get('action on'))
if filter_type is not FiltersType.REGEX:
pattern = pattern.lower()
except ValueError:
continue
if filter_action not in only_actions:
filter_action = FiltersAction(filter_.get('action'))

if _is_filter_to_skip(filter_action, only_actions, article):
logger.debug('ignoring filter %r' % filter_)
continue
if filter_action in {FiltersType.REGEX, FiltersType.MATCH,
FiltersType.EXACT_MATCH} and 'title' not in article:
continue
if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \
and 'tags' not in article:
continue
title = article.get('title', '').lower()
tags = [tag.lower() for tag in article.get('tags', [])]
if filter_type is FiltersType.REGEX:
match = re.match(pattern, title)
elif filter_type is FiltersType.MATCH:
match = pattern in title
elif filter_type is FiltersType.EXACT_MATCH:
match = pattern == title
elif filter_type is FiltersType.TAG_MATCH:
match = pattern in tags
elif filter_type is FiltersType.TAG_CONTAINS:
match = any(pattern in tag for tag in tags)
take_action = match and filter_trigger is FiltersTrigger.MATCH \
or not match and filter_trigger is FiltersTrigger.NO_MATCH

if not take_action:

if not _is_filter_matching(filter_, article):
continue

if filter_action is FiltersAction.READ:
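The URL resolution that used to be inlined in get_article_details now lives in _fetch_article: try the link as-is, and on a missing scheme retry with https, then http. A rough standalone sketch of that behaviour, using requests.get in place of jarr_get and a made-up scheme-less link (both are assumptions):

from urllib.parse import SplitResult, urlsplit, urlunsplit

import requests
from requests.exceptions import MissingSchema

def fetch_with_scheme_fallback(link, timeout=5):
    try:
        # resolves URLs behind proxies (like feedproxy.google.com)
        return requests.get(link, timeout=timeout)
    except MissingSchema:
        # no scheme in the link: rebuild it with https, then http
        split = urlsplit(link)
        for scheme in ('https', 'http'):
            new_link = urlunsplit(SplitResult(scheme, *split[1:]))
            try:
                return requests.get(new_link, timeout=timeout)
            except Exception:
                continue
    except Exception as error:
        print('unable to resolve %s: %s' % (link, error))
    return None

response = fetch_with_scheme_fallback('//example.com/some-article')
print(response.url if response is not None else 'unreachable')

Returning None on failure is what lets the new get_article_details bail out with the original link and 'No title' instead of juggling a failed flag.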
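The other change in this file splits process_filters into _is_filter_to_skip and _is_filter_matching. The subtle part is the trailing boolean in _is_filter_matching: a filter applies either when its pattern matches (the 'match' trigger) or when it does not (the 'no match' trigger). A tiny truth-table sketch; note the MATCH value is an assumption, since only NO_MATCH = 'no match' is visible in the hunk above.

from enum import Enum

class FiltersTrigger(Enum):
    MATCH = 'match'  # assumed value, not shown in the diff
    NO_MATCH = 'no match'

def filter_applies(match, trigger):
    # same boolean expression as the return statement of _is_filter_matching
    return bool(match and trigger is FiltersTrigger.MATCH
                or not match and trigger is FiltersTrigger.NO_MATCH)

for match in (True, False):
    for trigger in FiltersTrigger:
        print(match, trigger.value, '->', filter_applies(match, trigger))
# True match -> True        True no match -> False
# False match -> False      False no match -> True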
110 changes: 71 additions & 39 deletions src/lib/feed_utils.py
@@ -2,6 +2,7 @@
import logging
import urllib
from copy import deepcopy
from functools import lru_cache

import feedparser

@@ -53,24 +54,17 @@ def get_parsed_feed(url):
return fp_parsed


@correct_feed_values
def construct_feed_from(url=None, fp_parsed=None, feed=None):
"""
Will try to construct the most complete feed dict possible.
@lru_cache(maxsize=None)
def get_splits(url, site_link=None):
# trying to make up for missing values
feed_split = urllib.parse.urlsplit(url)
site_split = None
if site_link:
site_split = urllib.parse.urlsplit(site_link)
return site_split, feed_split

url: an url of a feed or a site that might be hosting a feed
fp_parsed: a feedparser object previously obtained
feed: an existing feed dict, will be updated
"""
feed = deepcopy(feed) if feed else {}
if not url and hasattr(fp_parsed, 'get') and fp_parsed.get('href'):
url = fp_parsed.get('href')

# we'll try to obtain our first parsing from feedparser
if url and not fp_parsed:
fp_parsed = get_parsed_feed(url)
assert url is not None and fp_parsed is not None

def _extract_links(url, feed, fp_parsed):
if is_parsing_ok(fp_parsed):
feed['link'] = url
else:
@@ -93,25 +87,18 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None):
and link['type'] == 'text/html'))
feed['site_link'] = site_link or feed.get('site_link')
feed['site_link'] = feed['site_link'] or feed.get('link')
return fp_parsed

if not feed.get('title'): # not overriding user pref for title
if fp_parsed['feed'].get('title'):
feed['title'] = fp_parsed['feed'].get('title')
elif fp_parsed['feed'].get('title_detail', {}).get('value'):
feed['title'] = fp_parsed['feed']['title_detail']['value']

if fp_parsed['feed'].get('summary'):
feed['description'] = fp_parsed['feed']['summary']
elif fp_parsed['feed'].get('subtitle_detail', {}).get('value'):
feed['description'] = fp_parsed['feed']['subtitle_detail']['value']
def _update_feed_w_parsed(fkey, simple_key, value_key, feed, fp_parsed):
if fp_parsed['feed'].get(simple_key):
feed[fkey] = fp_parsed['feed'].get(simple_key)
elif fp_parsed['feed'].get(value_key, {}).get('value'):
feed[fkey] = fp_parsed['feed'][value_key]['value']

# trying to make up for missing values
feed_split = urllib.parse.urlsplit(url)
site_split = None
if feed.get('site_link'):
feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
site_split = urllib.parse.urlsplit(feed['site_link'])

def _check_and_fix_icon(url, feed, fp_parsed):
site_split, feed_split = get_splits(url, feed.get('site_link'))
new_icon_urls = [fp_parsed.get('feed', {}).get('icon')] \
+ list(_browse_feedparser_feed(fp_parsed,
lambda link: 'icon' in link['rel']))
@@ -124,16 +111,12 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None):
feed['icon_url'] = icon_url
break

nothing_to_fill = all(bool(feed.get(key))
for key in ('link', 'title', 'icon_url'))
# here we have all we want or we do not have the main url,
# either way we're leaving
if not feed.get('site_link') or nothing_to_fill:
return feed

# trying to parse the page of the site for some rel link in the header
def _fetch_url_and_enhance_feed(url, feed):
"""trying to parse the page of the site for some rel link in the header"""
site_split, feed_split = get_splits(url, feed.get('site_link'))
try:
response = jarr_get(feed['site_link'])
response = jarr_get(url)
except Exception as error:
logger.warn('failed to retreive %r: %r', feed['site_link'], error)
return feed
Expand All @@ -147,3 +130,52 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None):
if not feed.get('link'):
feed['link'] = extract_feed_link(response, feed_split)
return feed


def _is_processing_complete(feed, site_link_necessary=False):
all_filled = all(bool(feed.get(key))
for key in ('link', 'title', 'icon_url'))
# here we have all we want or we do not have the main url,
# either way we're leaving
return (site_link_necessary and not feed.get('site_link')) or all_filled


@correct_feed_values
def construct_feed_from(url=None, fp_parsed=None, feed=None):
"""
Will try to construct the most complete feed dict possible.
url: an url of a feed or a site that might be hosting a feed
fp_parsed: a feedparser object previously obtained
feed: an existing feed dict, will be updated
"""
feed = deepcopy(feed) if feed else {}
if not url and hasattr(fp_parsed, 'get') and fp_parsed.get('href'):
url = fp_parsed.get('href')

# we'll try to obtain our first parsing from feedparser
if url and not fp_parsed:
fp_parsed = get_parsed_feed(url)
assert url is not None and fp_parsed is not None

fp_parsed = _extract_links(url, feed, fp_parsed)

if not feed.get('title'): # not overriding user pref for title
_update_feed_w_parsed('title', 'title', 'title_detail',
feed, fp_parsed)
_update_feed_w_parsed('description', 'summary', 'subtitle_detail',
feed, fp_parsed)

if _is_processing_complete(feed):
return feed

if feed.get('site_link'):
feed['site_link'] = rebuild_url(feed['site_link'],
get_splits(url, feed['site_link'])[1])

_check_and_fix_icon(url, feed, fp_parsed)

if _is_processing_complete(feed, site_link_necessary=True):
return feed

return _fetch_url_and_enhance_feed(feed['site_link'], feed)
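
The lru_cache-decorated get_splits introduced here just memoizes the urlsplit work that _check_and_fix_icon, _fetch_url_and_enhance_feed and construct_feed_from now share. A standalone sketch of that behaviour with made-up URLs (the example URLs are assumptions):

import urllib.parse
from functools import lru_cache

@lru_cache(maxsize=None)
def get_splits(url, site_link=None):
    # parse the feed URL (and the site link, when known) only once per pair
    feed_split = urllib.parse.urlsplit(url)
    site_split = None
    if site_link:
        site_split = urllib.parse.urlsplit(site_link)
    return site_split, feed_split

site_split, feed_split = get_splits('https://example.com/feed.xml',
                                    'https://example.com/')
print(feed_split.netloc, feed_split.path)  # example.com /feed.xml
get_splits('https://example.com/feed.xml', 'https://example.com/')
print(get_splits.cache_info().hits)  # 1: the second call hit the cache

Because the cache key is the (url, site_link) pair, re-parsing is only avoided when the helpers are called with the same strings; an unbounded maxsize also means the cached entries live for the whole process.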
