wip remaking tests
jaesivsm committed Jul 27, 2017
1 parent 8904086 commit 5bd4b3d
Showing 6 changed files with 105 additions and 109 deletions.
8 changes: 4 additions & 4 deletions requirements.txt
@@ -6,21 +6,21 @@ Flask-Babel==0.11.2
Flask-Login==0.4.0
Flask-Migrate==2.0.4
Flask-Principal==0.4.0
Flask-RESTful==0.3.5
Flask-RESTful==0.3.6
Flask-Script==2.0.5
Flask-SQLAlchemy==2.2
Flask-SSLify==0.1.5
Flask-WTF==0.14.2
Jinja2==2.9.6
lxml==3.7.3
lxml==3.8.0
nltk==3.2.4
numpy==1.12.1
numpy==1.13.1
opml==0.5
python-dateutil==2.6.1
python-postmark==0.4.12
pytz==2017.2
rauth==0.7.3
requests==2.14.2
requests==2.18.2
SQLAlchemy==1.1.12

click==6.7
32 changes: 14 additions & 18 deletions src/crawler/http_crawler.py
@@ -1,17 +1,3 @@
"""
Here's a sum up on how it works :
CrawlerScheduler.run
will retreive a list of feeds to be refreshed and pass result to
CrawlerScheduler.callback
which will retreive each feed and treat result with
FeedCrawler.callback
which will interprete the result (status_code, etag) collect ids
and match them agaisnt jarr which will cause
JarrUpdater.callback
to create the missing entries
"""

import asyncio
import json
import logging
@@ -44,6 +30,7 @@ def query_jarr(method_name, urn, auth, pool=None, data=None):
data = {}
method = getattr(requests, method_name)
url = "%s%s/%s" % (conf.PLATFORM_URL, conf.API_ROOT.strip('/'), urn)

future = loop.run_in_executor(None,
partial(method, url, auth=auth, timeout=conf.CRAWLER_TIMEOUT,
data=json.dumps(data, default=default_handler),
@@ -109,10 +96,10 @@ def response_match_cache(feed, resp):
return False


async def main(username, password, **kwargs):
async def crawl(username, password, **kwargs):
"""entry point, will retreive feeds to be fetch
and launch the whole thing"""
logger.debug('retrieving fetchable feed')
logger.debug('Crawler start - retrieving fetchable feed')
loop = asyncio.get_event_loop()
auth = username, password
no_resp_pool = []
@@ -184,8 +171,10 @@ async def main(username, password, **kwargs):
skipped_list, parsed_response)
return
logger.debug('found %d entries %r', len(ids), ids)
future = query_jarr('get', 'articles/challenge', auth, data={'ids': ids})
article_adding_pool.append((future, feed, entries, response.headers))
article_adding_pool.append((
query_jarr('get', 'articles/challenge', auth,
data={'ids': ids}),
feed, entries, response.headers))

for future, feed, entries, headers in article_adding_pool:
response = await future
@@ -214,5 +203,12 @@ async def main(username, password, **kwargs):
if not article_created:
logger.info('all article matched in db, adding nothing')

logger.debug("awaiting for all future we're not waiting response from")
for no_resp_future in no_resp_pool:
await no_resp_future
logger.info('Crawler End')


def main(username, password, **kwargs):
loop = asyncio.get_event_loop()
loop.run_until_complete(crawl(username, password, **kwargs))
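
This rewrite drops the callback-based `CrawlerScheduler`/`FeedCrawler` classes in favour of module-level coroutines: `query_jarr` hands the blocking `requests` call to the loop's default executor and returns an awaitable, `crawl` awaits the resulting futures, and the new synchronous `main` owns the event loop. A minimal sketch of that pattern, with hypothetical `fetch_url`, `crawl_sketch` and `main_sketch` names standing in for the project's functions:

```python
import asyncio
from functools import partial

import requests


def fetch_url(method_name, url, **kwargs):
    """Plain blocking HTTP call, like the requests call inside query_jarr."""
    return getattr(requests, method_name)(url, **kwargs)


async def crawl_sketch(urls):
    """Offload each blocking request to the default thread-pool executor."""
    loop = asyncio.get_event_loop()
    futures = [loop.run_in_executor(
                   None, partial(fetch_url, 'get', url, timeout=10))
               for url in urls]
    # Await every future so nothing is left dangling when the loop stops,
    # mirroring the final no_resp_pool loop above.
    return [await future for future in futures]


def main_sketch(urls):
    """Synchronous entry point owning the event loop, like the new main()."""
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(crawl_sketch(urls))
```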
7 changes: 3 additions & 4 deletions src/manager.py
@@ -1,6 +1,5 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import asyncio
import logging
from datetime import datetime, timedelta, timezone

@@ -35,9 +34,9 @@ def db_create(login='admin', password='admin'):
def fetch(limit=0, retreive_all=False):
"Crawl the feeds with the client crawler."
from crawler.http_crawler import main
loop = asyncio.get_event_loop()
loop.run_until_complete(main(conf.CRAWLER_LOGIN, conf.CRAWLER_PASSWD,
limit=limit, retreive_all=retreive_all))
main(conf.CRAWLER_LOGIN, conf.CRAWLER_PASSWD,
limit=limit, retreive_all=retreive_all)


@manager.command
def reset_feeds():
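
With the event loop handled inside `main`, the manager command no longer imports asyncio; the crawler is invoked like any other function. A usage sketch based on the command above (the `limit` and `retreive_all` values are illustrative):

```python
from bootstrap import conf
from crawler.http_crawler import main

# Equivalent to running the manager's fetch command: main() creates the
# event loop and drives crawl() to completion internally.
main(conf.CRAWLER_LOGIN, conf.CRAWLER_PASSWD, limit=10, retreive_all=False)
```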
159 changes: 78 additions & 81 deletions src/tests/crawler_test.py
@@ -1,5 +1,6 @@
from tests.base import BaseJarrTest, JarrFlaskCommon

import asyncio
import logging
import unittest
from datetime import datetime, timezone
@@ -8,14 +9,20 @@
from mock import Mock, patch

from bootstrap import conf
from crawler.http_crawler import CrawlerScheduler, FeedCrawler
from crawler.http_crawler import (main as crawler, response_match_cache,
set_feed_error, clean_feed)
from web.controllers import FeedController, UserController
from lib.utils import to_hash
from lib.const import UNIX_START

logger = logging.getLogger('web')


def get_first_call(query_jarr):
method, urn, _, _, data = query_jarr.mock_calls[0][1]
return method, urn, data


class CrawlerTest(JarrFlaskCommon):

def setUp(self):
@@ -28,7 +35,7 @@ def setUp(self):
UserController().update({'login': 'admin'}, {'is_api': True})
self._is_secure_served \
= patch('web.lib.article_cleaner.is_secure_served')
self._p_req = patch('requests.Session.request')
self._p_req = patch('crawler.http_crawler.requests.api.request')
self._p_con = patch('crawler.http_crawler.construct_feed_from')
self.is_secure_served = self._is_secure_served.start()
self.jarr_req = self._p_req.start()
@@ -51,6 +58,7 @@ def _api_req(method, url, **kwargs):

url = url.split(conf.API_ROOT)[1].strip('/')
kwargs.pop('allow_redirects', None)
kwargs.pop('params', None)
kwargs.pop('json', None)
if 'auth' in kwargs:
kwargs['user'] = kwargs['auth'][0]
@@ -80,12 +88,10 @@ def _reset_feeds_freshness(self, **kwargs):
FeedController().update({}, kwargs)

def test_http_crawler_add_articles(self):
scheduler = CrawlerScheduler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))

scheduler.run()
scheduler.wait(**self.wait_params)
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36 + self.new_entries_cnt, len(resp.json()))

@@ -94,25 +100,20 @@ def test_http_crawler_add_articles(self):
self.assertFalse('src="/' in art['content'])

self.resp_status_code = 304
scheduler.run()
scheduler.wait(**self.wait_params)
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36 + self.new_entries_cnt, len(resp.json()))

def test_no_add_on_304(self):
scheduler = CrawlerScheduler('admin', 'admin')
self.resp_status_code = 304
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))

scheduler.run()
scheduler.wait(**self.wait_params)
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))

@patch('crawler.http_crawler.JarrUpdater.callback')
def test_no_add_feed_skip(self, jarr_updated_callback):
scheduler = CrawlerScheduler('admin', 'admin')
def test_no_add_feed_skip(self):
self.resp_status_code = 304
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))
@@ -129,10 +130,7 @@ def test_no_add_feed_skip(self, jarr_updated_callback):
"pattern": "pattern5",
"action": "skipped"}]})

scheduler.run()
scheduler.wait(**self.wait_params)
self.assertFalse(jarr_updated_callback.called,
"all articles should have been skipped")
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))

@@ -141,26 +139,22 @@ def test_matching_etag(self):
self.resp_headers = {'etag': 'fake etag'}
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))
scheduler = CrawlerScheduler('admin', 'admin')

scheduler.run()
scheduler.wait(**self.wait_params)
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))

self._reset_feeds_freshness(etag='jarr/fake etag')
self.resp_headers = {'etag': 'jarr/fake etag'}

scheduler.run()
scheduler.wait(**self.wait_params)
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36, len(resp.json()))

self._reset_feeds_freshness(etag='jarr/fake etag')
self.resp_headers = {'etag': '########################'}

scheduler.run()
scheduler.wait(**self.wait_params)
crawler('admin', 'admin')
resp = self._api('get', 'articles', data={'limit': 1000}, user='admin')
self.assertEquals(36 + self.new_entries_cnt, len(resp.json()))

@@ -173,79 +167,82 @@ def setUp(self):
'description': 'description',
'etag': '', 'error_count': 5, 'link': 'link'}
self.resp = Mock(text='text', headers={}, status_code=304, history=[])
self.crawler = FeedCrawler(self.feed, Mock())
self.pool, self.auth = [], ('admin', 'admin')

def test_etag_matching_w_constructed_etag(self):
self.feed['etag'] = 'jarr/"%s"' % to_hash('text')
self.assertTrue(self.crawler.response_match_cache(self.resp))
self.assertTrue(response_match_cache(self.feed, self.resp))

def test_etag_no_matching_wo_etag(self):
self.assertFalse(self.crawler.response_match_cache(self.resp))
self.assertFalse(response_match_cache(self.feed, self.resp))

def test_etag_matching(self):
self.resp.headers['etag'] = self.feed['etag'] = 'etag'
self.assertTrue(self.crawler.response_match_cache(self.resp))
self.assertTrue(response_match_cache(self.feed, self.resp))

def test_set_feed_error_w_error(self):
@patch('crawler.http_crawler.query_jarr')
def test_set_feed_error_w_error(self, query_jarr):
original_error_count = self.feed['error_count']
self.crawler.query_jarr = Mock()
self.crawler.set_feed_error(Exception('an error'))
call = self.crawler.query_jarr.mock_calls[0][1]
set_feed_error(self.feed, self.auth, self.pool, Exception('an error'))
method, urn, data = get_first_call(query_jarr)

self.assertEquals('put', call[0])
self.assertEquals('feed/%d' % self.feed['id'], call[1])
self.assertEquals(original_error_count + 1, call[2]['error_count'])
self.assertEquals('an error', call[2]['last_error'])
self.assertEquals('put', method)
self.assertEquals('feed/%d' % self.feed['id'], urn)
self.assertEquals(original_error_count + 1, data['error_count'])
self.assertEquals('an error', data['last_error'])

def test_set_feed_error_w_parsed(self):
@patch('crawler.http_crawler.query_jarr')
def test_set_feed_error_w_parsed(self, query_jarr):
original_error_count = self.feed['error_count']
self.crawler.query_jarr = Mock()
self.crawler.set_feed_error(parsed_feed={'bozo_exception': 'an error'})
call = self.crawler.query_jarr.mock_calls[0][1]
self.assertEquals('put', call[0])
self.assertEquals('feed/%d' % self.feed['id'], call[1])
self.assertEquals(original_error_count + 1, call[2]['error_count'])
self.assertEquals('an error', call[2]['last_error'])

def test_clean_feed(self):
self.crawler.query_jarr = Mock()
self.crawler.clean_feed(self.resp)
call = self.crawler.query_jarr.mock_calls[0][1]

self.assertEquals('put', call[0])
self.assertEquals('feed/%d' % self.feed['id'], call[1])
self.assertTrue('link' not in call[2])
self.assertTrue('title' not in call[2])
self.assertTrue('description' not in call[2])
self.assertTrue('site_link' not in call[2])
self.assertTrue('icon_url' not in call[2])

def test_clean_feed_update_link(self):
self.crawler.query_jarr = Mock()
set_feed_error(self.feed, ('admin', 'admin'), self.pool,
parsed_feed={'bozo_exception': 'an error'})
method, urn, data = get_first_call(query_jarr)
self.assertEquals('put', method)
self.assertEquals('feed/%d' % self.feed['id'], urn)
self.assertEquals(original_error_count + 1, data['error_count'])
self.assertEquals('an error', data['last_error'])

@patch('crawler.http_crawler.query_jarr')
def test_clean_feed(self, query_jarr):
clean_feed(self.feed, self.auth, self.pool, self.resp)
method, urn, data = get_first_call(query_jarr)

self.assertEquals('put', method)
self.assertEquals('feed/%d' % self.feed['id'], urn)
self.assertTrue('link' not in data)
self.assertTrue('title' not in data)
self.assertTrue('description' not in data)
self.assertTrue('site_link' not in data)
self.assertTrue('icon_url' not in data)

@patch('crawler.http_crawler.query_jarr')
def test_clean_feed_update_link(self, query_jarr):
self.resp.history.append(Mock(status_code=301))
self.resp.url = 'new_link'
self.crawler.clean_feed(self.resp)
call = self.crawler.query_jarr.mock_calls[0][1]
clean_feed(self.feed, self.auth, self.pool, self.resp)
method, urn, data = get_first_call(query_jarr)

self.assertEquals('put', call[0])
self.assertEquals('feed/%d' % self.feed['id'], call[1])
self.assertEquals('new_link', call[2]['link'])
self.assertTrue('title' not in call[2])
self.assertTrue('description' not in call[2])
self.assertTrue('site_link' not in call[2])
self.assertTrue('icon_url' not in call[2])
self.assertEquals('put', method)
self.assertEquals('feed/%d' % self.feed['id'], urn)
self.assertEquals('new_link', data['link'])
self.assertTrue('title' not in data)
self.assertTrue('description' not in data)
self.assertTrue('site_link' not in data)
self.assertTrue('icon_url' not in data)

@patch('crawler.http_crawler.construct_feed_from')
def test_clean_feed_w_constructed(self, construct_feed_mock):
@patch('crawler.http_crawler.query_jarr')
def test_clean_feed_w_constructed(self, query_jarr, construct_feed_mock):
construct_feed_mock.return_value = {'description': 'new description'}
self.crawler.query_jarr = Mock()
self.crawler.clean_feed(self.resp, True)
call = self.crawler.query_jarr.mock_calls[0][1]

self.assertEquals('put', call[0])
self.assertEquals('feed/%d' % self.feed['id'], call[1])
self.assertEquals('new description', call[2]['description'])
self.assertTrue('link' not in call[2])
self.assertTrue('title' not in call[2])
self.assertTrue('site_link' not in call[2])
self.assertTrue('icon_url' not in call[2])
clean_feed(self.feed, self.auth, self.pool, self.resp, True)
method, urn, data = get_first_call(query_jarr)

print(data)

self.assertEquals('put', method)
self.assertEquals('feed/%d' % self.feed['id'], urn)
self.assertEquals('new description', data['description'])
self.assertTrue('link' not in data)
self.assertTrue('title' not in data)
self.assertTrue('site_link' not in data)
self.assertTrue('icon_url' not in data)
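
The reworked tests patch `crawler.http_crawler.query_jarr` and call the module-level helpers directly, then unpack the recorded positional arguments to check what would have been sent to the API. A standalone sketch of that pattern, assuming `set_feed_error` takes `(feed, auth, pool, error)` and calls `query_jarr` with five positional arguments as the tests above do; the feed dict is a hypothetical minimal payload:

```python
from mock import patch

from crawler.http_crawler import set_feed_error


def get_first_call(query_jarr):
    """Unpack the positional args of the first recorded call (as in the tests)."""
    method, urn, _, _, data = query_jarr.mock_calls[0][1]
    return method, urn, data


with patch('crawler.http_crawler.query_jarr') as query_jarr:
    feed = {'id': 1, 'title': 'title', 'description': 'description',
            'link': 'link', 'etag': '', 'error_count': 0}
    set_feed_error(feed, ('admin', 'admin'), [], Exception('boom'))

    method, urn, data = get_first_call(query_jarr)
    assert method == 'put'
    assert urn == 'feed/1'
    assert data['error_count'] == 1
    assert data['last_error'] == 'boom'
```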
2 changes: 1 addition & 1 deletion src/web/controllers/abstract.py
@@ -171,7 +171,7 @@ def _get_attrs_desc(cls, role, right=None):
result = {}
for column in cls._get_columns(role, right):
if isinstance(getattr(cls._db_cls, column), AssociationProxy):
result[column] = {'type': list, 'default': []}
result[column] = {'default': [], 'action': 'append'}
continue
try:
db_col = getattr(cls._db_cls, column).property.columns[0]
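
The `abstract.py` change swaps the explicit `{'type': list}` declaration for a reqparse-style `action='append'` on association-proxy columns. Assuming these attribute descriptions end up as keyword arguments to Flask-RESTful's `RequestParser.add_argument` (not shown in this diff), the effect is that repeated occurrences of a parameter accumulate into a list; `category_ids` below is a hypothetical column name:

```python
from flask_restful import reqparse

parser = reqparse.RequestParser()
# action='append' collects every occurrence of the argument into a list,
# replacing the previous explicit {'type': list} declaration.
parser.add_argument('category_ids', default=[], action='append')

# e.g. a query string like ?category_ids=1&category_ids=2 would parse to
# {'category_ids': ['1', '2']} inside a request context.
```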
