Skip to content

Commit

Permalink
redoing and testing cluster control switch
Browse files Browse the repository at this point in the history
  • Loading branch information
jaesivsm committed Aug 30, 2018
1 parent d5368e6 commit b301e7a
Show file tree
Hide file tree
Showing 18 changed files with 489 additions and 207 deletions.
49 changes: 20 additions & 29 deletions jarr/api/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,31 @@
ClusterController)

prout = {'debug': False}
TO_DENORM = {'cluster_enabled', 'cluster_tfidf', 'cluster_same_feed',
'cluster_tfidf_same_cat', 'cluster_tfidf_min_score', 'cluster_wake_up'}
category_ns = Namespace('category', path='/categor',
description='Category related operation')
parser = category_ns.parser()
parser_edit = parser.copy()
parser.add_argument('name', type=str, required=True)
model = category_ns.model('Category', {
'id': fields.Integer(readOnly=True),
'unread_cnt': fields.Integer(default=0, readOnly=True),
})
suffix = "(if your global settings " \
"and the article's feed settings allows it)"
set_model_n_parser(model, parser, 'cluster_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized" + suffix)
set_model_n_parser(model, parser, 'cluster_tfidf_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized through document comparison" + suffix)
set_model_n_parser(model, parser, 'cluster_same_category', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same category" + suffix)
set_model_n_parser(model, parser, 'cluster_same_feed', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same feed" + suffix)

parser_edit = parser.copy()
parser.add_argument('name', type=str, required=True)
set_model_n_parser(model, parser_edit, 'name', str)
suffix = ' (will be denormed on all feeds below)'
parser_edit.add_argument('cluster_enabled', type=bool,
help='is clustering enabled whitin this feed' + suffix)
parser_edit.add_argument('cluster_tfidf', type=bool,
help='is clustering through document comparison enabled' + suffix)
parser_edit.add_argument('cluster_tfidf_same_cat', type=bool,
help='is clustering through document comparison within a single '
'category allowed' + suffix)
parser_edit.add_argument('cluster_same_feed', type=bool,
help='is clustering several article from the same feed allowed'
+ suffix)
parser_edit.add_argument('cluster_tfidf_min_score', type=float,
help='minimum score for clustering with TFIDF algorithm' + suffix)
parser_edit.add_argument('cluster_wake_up', type=bool,
help='if true, on clustering if the cluster is already read, '
'it will be unread' + suffix)


@category_ns.route('y')
Expand Down Expand Up @@ -83,16 +80,10 @@ def put(self, category_id):
"Update an existing category"
cctrl = CategoryController(current_identity.id)
attrs = parse_meaningful_params(parser_edit)
feed_attrs = {key: attrs[key] for key in TO_DENORM.intersection(attrs)}
attrs = {key: attrs[key] for key in attrs if key not in TO_DENORM}
changed = 0
if feed_attrs:
changed += FeedController(current_identity.id).update(
{'category_id': category_id}, feed_attrs)
if attrs:
changed += cctrl.update({'id': category_id}, attrs)
if not changed:
cctrl.assert_right_ok(category_id)
changed = cctrl.update({'id': category_id}, attrs)
if not changed:
cctrl.assert_right_ok(category_id)
return None, 204

@category_ns.expect(parser)
Expand Down
28 changes: 15 additions & 13 deletions jarr/api/feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,23 @@
'last_retrieved': fields.DateTime(readOnly=True,
description='Date of the last time this feed was fetched'),
})
suffix = "(if your global settings " \
"and the article's category settings allows it)"
set_model_n_parser(feed_model, feed_parser, 'cluster_enabled', bool,
description='is clustering enabled whitin this feed')
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf', bool,
description='is clustering through document comparison enabled')
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf_same_cat', bool,
description='is clustering through document comparison within a '
'single category allowed')
description="will allow article in your feeds and categories to be "
"clusterized" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized through document comparison" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_same_category', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same category" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_same_feed', bool,
description='is clustering several article from the same feed allowed')
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf_min_score', float,
default=conf.cluster_tfidf_min_score,
description='minimum score for clustering with TFIDF algorithm')
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same feed" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_wake_up', bool,
description='if true, on clustering if the cluster is already read, '
'it will be unread')
description='will unread cluster when article '
'from that feed are added to it')
set_model_n_parser(feed_model, feed_parser, 'category_id', int)
set_model_n_parser(feed_model, feed_parser, 'site_link', str)
set_model_n_parser(feed_model, feed_parser, 'description', str)
Expand All @@ -70,7 +72,7 @@ class NewFeedResource(Resource):
@jwt_required()
def post(self):
"Create an new feed"
attrs = feed_parser.parse_args()
attrs = parse_meaningful_params(feed_parser)
return FeedController(current_identity.id).create(**attrs), 201


Expand Down
13 changes: 13 additions & 0 deletions jarr/api/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@
set_model_n_parser(user_model, user_parser, 'login', str)
set_model_n_parser(user_model, user_parser, 'email', str)
set_model_n_parser(user_model, user_parser, 'timezone', str)
suffix = "(if the article feed's and category's settings allows it)"
set_model_n_parser(user_model, user_parser, 'cluster_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized" + suffix)
set_model_n_parser(user_model, user_parser, 'cluster_tfidf_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized through document comparison" + suffix)
set_model_n_parser(user_model, user_parser, 'cluster_same_category', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same category" + suffix)
set_model_n_parser(user_model, user_parser, 'cluster_same_feed', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same feed" + suffix)
user_parser.add_argument('password', type=str)


Expand Down
6 changes: 5 additions & 1 deletion jarr/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
'source_order': ['env', 'cmd', 'files'],
'parameters': [
{'jarr_testing': {'default': False, 'type': bool}},
{'cluster_tfidf_min_score': {'default': .75, 'type': float}},
{'cluster_default': [
{'time_delta': {'default': 7, 'type': int}},
{'tfidf_enabled': {'default': True, 'type': bool}},
{'tfidf_min_sample_size': {'default': 10, 'type': int}},
{'tfidf_min_score': {'default': .75, 'type': float}}]},
{'timezone': {'default': 'Europe/Paris', 'type': str}},
{'platform_url': {'default': 'http://0.0.0.0:5000/'}},
{'sqlalchemy': [{'db_uri': {}},
Expand Down
6 changes: 4 additions & 2 deletions jarr/controllers/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sqlalchemy import and_, or_
from werkzeug.exceptions import Forbidden, NotFound, Unauthorized

from jarr.bootstrap import session
from jarr.bootstrap import session, Base

logger = logging.getLogger(__name__)

Expand All @@ -18,7 +18,7 @@ def cast_to_utc(dt_obj):


class AbstractController:
_db_cls = object # reference to the database class
_db_cls = Base # reference to the database class, to redefine in child cls
_user_id_key = 'user_id'

def __init__(self, user_id=None, ignore_context=False):
Expand All @@ -39,6 +39,8 @@ def _to_comparison(key, model):
if '__' not in key:
return getattr(model, key).__eq__
attr, ope = key.rsplit('__', 1)
if ope == 'nin':
return getattr(model, attr).notin_
if ope == 'in':
return getattr(model, attr).in_
if ope not in {'like', 'ilike'}:
Expand Down
102 changes: 66 additions & 36 deletions jarr/controllers/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from jarr_common.reasons import ClusterReason, ReadReason
from jarr_common.clustering_af.grouper import get_best_match_and_score

from jarr.utils import get_cluster_pref
from jarr.bootstrap import SQLITE_ENGINE, session
from jarr.controllers.article import ArticleController
from jarr.models import Article, Cluster
from jarr.models import Article, Cluster, Feed, User

from .abstract import AbstractController

Expand All @@ -25,49 +26,77 @@
JR_LENGTH = 1000


def _get_parent_attr(obj, attr):
return (getattr(obj.user, attr) and getattr(obj.category, attr, True)
and getattr(obj.feed, attr))


def is_same_ok(obj, parent):
    """Tell whether clustering articles sharing the same *parent* is allowed.

    *parent* is 'feed' or 'category'; the corresponding cluster_same_*
    switch must be enabled on the user, the category (if any) and the feed.
    """
    attr_name = 'cluster_same_%s' % parent
    return _get_parent_attr(obj, attr_name)


class ClusterController(AbstractController):
_db_cls = Cluster
max_day_dist = timedelta(days=7)

def _get_cluster_by_link(self, article):
filters = {'user_id': article.user_id,
'main_date__lt': article.date + self.max_day_dist,
'main_date__gt': article.date - self.max_day_dist,
'main_link': article.link}

cluster = self.read(**filters).first()
if not cluster:
return None
if not article.feed.cluster_same_feed:
for clustered_article in cluster.articles:
if clustered_article.feed_id == article.feed_id:
return None
article.cluster_reason = ClusterReason.link
return cluster
def _get_query_for_clustering(self, article, filters, join_filters=None):
    """Yield articles that are candidates for clustering with *article*.

    filters -- dict of extra ArticleController filters; NOTE: mutated in
        place with the common candidate conditions.
    join_filters -- optional iterable of SQLAlchemy criteria applied to the
        query after the Feed/User joins.
    """
    # Candidates must be close in time to the article, either by
    # publication date or by retrieval date (within +/- time_delta days).
    time_delta = timedelta(
            days=get_cluster_pref(article.feed, 'time_delta'))
    date_cond = {'date__lt': article.date + time_delta,
                 'date__gt': article.date - time_delta}
    retr_cond = {'retrieved_date__lt': article.retrieved_date + time_delta,
                 'retrieved_date__gt': article.retrieved_date - time_delta}
    # Only other articles of the same user that already belong to a cluster.
    filters.update({'cluster_id__ne': None,
                    'user_id': article.user_id,
                    'id__ne': article.id,
                    '__or__': [date_cond, retr_cond]})
    # Honor the cluster_same_category / cluster_same_feed switches by
    # excluding candidates from the same category or feed when disallowed.
    if article.category_id and not is_same_ok(article, 'category'):
        filters['category_id__ne'] = article.category_id
    if not is_same_ok(article, 'feed'):
        filters['feed_id__ne'] = article.feed_id

    # Joining on Feed and User lets us require cluster_enabled on both at
    # the SQL level; __eq__(True) keeps the SQL-side boolean comparison.
    query = ArticleController(self.user_id).read(**filters)\
            .join(Feed, Feed.id == Article.feed_id)\
            .join(User, User.id == Article.user_id)\
            .filter(User.cluster_enabled.__eq__(True),
                    Feed.cluster_enabled.__eq__(True))

    for join_filter in join_filters or []:
        query = query.filter(join_filter)

    # operations involving categories are complicated, handling in software
    for candidate in query:
        if candidate.category_id:
            if not candidate.category.cluster_enabled:
                continue
        yield candidate

def _get_cluster_by_similarity(self, article, min_sample_size=10):
if not article.lang:
return None
art_contr = ArticleController(self.user_id)

filters = {'user_id': article.user_id,
'date__lt': article.date + self.max_day_dist,
'date__gt': article.date - self.max_day_dist,
# article isn't this one, and already in a cluster
'cluster_id__ne': None, 'id__ne': article.id,
# article is matchable
'valuable_tokens__ne': []}
if article.feed.cluster_tfidf_same_cat:
filters['category_id'] = article.category_id

neighbors = list(art_contr.read(**filters))
def _get_cluster_by_link(self, article):
    """Return the cluster of a candidate article sharing *article*'s link.

    Returns None when no clusterizable article has the same link; on
    success, marks the article as clustered by link.
    """
    candidates = self._get_query_for_clustering(article,
                                                {'link': article.link})
    match = next(iter(candidates), None)
    if match is None:
        return None
    article.cluster_reason = ClusterReason.link
    return match.cluster

def _get_cluster_by_similarity(self, article):
query = self._get_query_for_clustering(article,
# article is matchable
{'valuable_tokens__ne': []},
(User.cluster_tfidf_enabled.__eq__(True),
Feed.cluster_tfidf_enabled.__eq__(True))
)

neighbors = [neighbor for neighbor in query
if not neighbor.category_id
or neighbor.category.cluster_tfidf_enabled]

min_sample_size = get_cluster_pref(article.feed,
'tfidf_min_sample_size')
if len(neighbors) < min_sample_size:
logger.info('only %d docs against %d required, no TFIDF for %r',
len(neighbors), min_sample_size, article)
return None

best_match, score = get_best_match_and_score(article, neighbors)
if score > article.feed.cluster_tfidf_min_score:
if score > get_cluster_pref(article.feed, 'tfidf_min_score'):
article.cluster_reason = ClusterReason.tf_idf
article.cluster_score = int(score * 1000)
article.cluster_tfidf_neighbor_size = len(neighbors)
Expand Down Expand Up @@ -116,9 +145,10 @@ def enrich_cluster(cluster, article,
def clusterize(self, article, cluster_read=None, cluster_liked=False):
"""Will add given article to a fitting cluster or create a cluster
fitting that article."""
if article.feed.cluster_enabled:
if _get_parent_attr(article, 'cluster_enabled'):
cluster = self._get_cluster_by_link(article)
if not cluster and article.feed.cluster_tfidf:
if not cluster \
and _get_parent_attr(article, 'cluster_tfidf_enabled'):
cluster = self._get_cluster_by_similarity(article)
if cluster:
return self.enrich_cluster(cluster, article,
Expand Down
9 changes: 8 additions & 1 deletion jarr/models/category.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import (Column, Integer, String,
from sqlalchemy import (Column, Integer, String, Boolean, PickleType,
Index, ForeignKeyConstraint)
from sqlalchemy.orm import relationship

Expand All @@ -11,6 +11,13 @@ class Category(Base):
id = Column(Integer, primary_key=True)
name = Column(String)

# clustering control
cluster_enabled = Column(Boolean, default=True)
cluster_tfidf_enabled = Column(Boolean, default=True)
cluster_same_category = Column(Boolean, default=True)
cluster_same_feed = Column(Boolean, default=True)
cluster_conf = Column(PickleType, default={})

# foreign keys
user_id = Column(Integer, nullable=False)

Expand Down
17 changes: 8 additions & 9 deletions jarr/models/feed.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from sqlalchemy import (Boolean, Column, Integer, PickleType, String, Enum,
FLOAT, Index, ForeignKeyConstraint)
Index, ForeignKeyConstraint)
from sqlalchemy.orm import relationship, validates

from jarr_common.utils import utc_now
from jarr_common.const import UNIX_START
from jarr_common.reasons import CacheReason
from jarr.bootstrap import Base, conf
from jarr.bootstrap import Base
from jarr.models.utc_datetime_type import UTCDateTime


Expand All @@ -24,13 +24,12 @@ class Feed(Base):
integration_reddit = Column(Boolean, default=False)

# clustering control
cluster_enabled = Column(Boolean, default=None)
cluster_tfidf = Column(Boolean, default=None)
cluster_tfidf_same_cat = Column(Boolean, default=None)
cluster_same_feed = Column(Boolean, default=None)
cluster_tfidf_min_score = Column(FLOAT,
default=conf.cluster_tfidf_min_score)
cluster_wake_up = Column(Boolean, default=False)
cluster_enabled = Column(Boolean, default=True)
cluster_tfidf_enabled = Column(Boolean, default=True)
cluster_same_category = Column(Boolean, default=True)
cluster_same_feed = Column(Boolean, default=True)
cluster_wake_up = Column(Boolean, default=True)
cluster_conf = Column(PickleType, default={})

# cache reasons
cache_type = Column(Enum(CacheReason), default=None)
Expand Down
13 changes: 11 additions & 2 deletions jarr/models/user.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re

from sqlalchemy import Boolean, Column, Integer, String
from sqlalchemy import Boolean, Column, Integer, String, PickleType
from sqlalchemy.orm import relationship, validates

from jarr_common.utils import utc_now
Expand All @@ -21,6 +21,14 @@ class User(Base):
renew_password_token = Column(String, default='')

timezone = Column(String, default=conf.timezone)

# clustering control
cluster_enabled = Column(Boolean, default=True)
cluster_tfidf_enabled = Column(Boolean, default=True)
cluster_same_category = Column(Boolean, default=True)
cluster_same_feed = Column(Boolean, default=True)
cluster_conf = Column(PickleType, default={})

# user rights
is_active = Column(Boolean, default=True)
is_admin = Column(Boolean, default=False)
Expand All @@ -47,5 +55,6 @@ class User(Base):
foreign_keys='[Cluster.user_id]')

@validates('login')
def validates_login(self, key, value):
@staticmethod
def validates_login(key, value):
return re.sub(r'[^a-zA-Z0-9_\.]', '', value.strip())

0 comments on commit b301e7a

Please sign in to comment.