Skip to content

Commit

Permalink
redoing and testing cluster control switch
Browse files Browse the repository at this point in the history
  • Loading branch information
jaesivsm committed Aug 30, 2018
1 parent d5368e6 commit b301e7a
Show file tree
Hide file tree
Showing 18 changed files with 489 additions and 207 deletions.
49 changes: 20 additions & 29 deletions jarr/api/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,31 @@
ClusterController)

prout = {'debug': False}
TO_DENORM = {'cluster_enabled', 'cluster_tfidf', 'cluster_same_feed',
'cluster_tfidf_same_cat', 'cluster_tfidf_min_score', 'cluster_wake_up'}
category_ns = Namespace('category', path='/categor',
description='Category related operation')
parser = category_ns.parser()
parser_edit = parser.copy()
parser.add_argument('name', type=str, required=True)
model = category_ns.model('Category', {
'id': fields.Integer(readOnly=True),
'unread_cnt': fields.Integer(default=0, readOnly=True),
})
suffix = "(if your global settings " \
"and the article's feed settings allows it)"
set_model_n_parser(model, parser, 'cluster_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized" + suffix)
set_model_n_parser(model, parser, 'cluster_tfidf_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized through document comparison" + suffix)
set_model_n_parser(model, parser, 'cluster_same_category', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same category" + suffix)
set_model_n_parser(model, parser, 'cluster_same_feed', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same feed" + suffix)

parser_edit = parser.copy()
parser.add_argument('name', type=str, required=True)
set_model_n_parser(model, parser_edit, 'name', str)
suffix = ' (will be denormed on all feeds below)'
parser_edit.add_argument('cluster_enabled', type=bool,
help='is clustering enabled whitin this feed' + suffix)
parser_edit.add_argument('cluster_tfidf', type=bool,
help='is clustering through document comparison enabled' + suffix)
parser_edit.add_argument('cluster_tfidf_same_cat', type=bool,
help='is clustering through document comparison within a single '
'category allowed' + suffix)
parser_edit.add_argument('cluster_same_feed', type=bool,
help='is clustering several article from the same feed allowed'
+ suffix)
parser_edit.add_argument('cluster_tfidf_min_score', type=float,
help='minimum score for clustering with TFIDF algorithm' + suffix)
parser_edit.add_argument('cluster_wake_up', type=bool,
help='if true, on clustering if the cluster is already read, '
'it will be unread' + suffix)


@category_ns.route('y')
Expand Down Expand Up @@ -83,16 +80,10 @@ def put(self, category_id):
"Update an existing category"
cctrl = CategoryController(current_identity.id)
attrs = parse_meaningful_params(parser_edit)
feed_attrs = {key: attrs[key] for key in TO_DENORM.intersection(attrs)}
attrs = {key: attrs[key] for key in attrs if key not in TO_DENORM}
changed = 0
if feed_attrs:
changed += FeedController(current_identity.id).update(
{'category_id': category_id}, feed_attrs)
if attrs:
changed += cctrl.update({'id': category_id}, attrs)
if not changed:
cctrl.assert_right_ok(category_id)
changed = cctrl.update({'id': category_id}, attrs)
if not changed:
cctrl.assert_right_ok(category_id)
return None, 204

@category_ns.expect(parser)
Expand Down
28 changes: 15 additions & 13 deletions jarr/api/feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,23 @@
'last_retrieved': fields.DateTime(readOnly=True,
description='Date of the last time this feed was fetched'),
})
suffix = "(if your global settings " \
"and the article's category settings allows it)"
set_model_n_parser(feed_model, feed_parser, 'cluster_enabled', bool,
description='is clustering enabled whitin this feed')
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf', bool,
description='is clustering through document comparison enabled')
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf_same_cat', bool,
description='is clustering through document comparison within a '
'single category allowed')
description="will allow article in your feeds and categories to be "
"clusterized" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized through document comparison" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_same_category', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same category" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_same_feed', bool,
description='is clustering several article from the same feed allowed')
set_model_n_parser(feed_model, feed_parser, 'cluster_tfidf_min_score', float,
default=conf.cluster_tfidf_min_score,
description='minimum score for clustering with TFIDF algorithm')
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same feed" + suffix)
set_model_n_parser(feed_model, feed_parser, 'cluster_wake_up', bool,
description='if true, on clustering if the cluster is already read, '
'it will be unread')
description='will unread cluster when article '
'from that feed are added to it')
set_model_n_parser(feed_model, feed_parser, 'category_id', int)
set_model_n_parser(feed_model, feed_parser, 'site_link', str)
set_model_n_parser(feed_model, feed_parser, 'description', str)
Expand All @@ -70,7 +72,7 @@ class NewFeedResource(Resource):
@jwt_required()
def post(self):
"Create an new feed"
attrs = feed_parser.parse_args()
attrs = parse_meaningful_params(feed_parser)
return FeedController(current_identity.id).create(**attrs), 201


Expand Down
13 changes: 13 additions & 0 deletions jarr/api/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@
set_model_n_parser(user_model, user_parser, 'login', str)
set_model_n_parser(user_model, user_parser, 'email', str)
set_model_n_parser(user_model, user_parser, 'timezone', str)
suffix = "(if the article feed's and category's settings allows it)"
set_model_n_parser(user_model, user_parser, 'cluster_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized" + suffix)
set_model_n_parser(user_model, user_parser, 'cluster_tfidf_enabled', bool,
description="will allow article in your feeds and categories to be "
"clusterized through document comparison" + suffix)
set_model_n_parser(user_model, user_parser, 'cluster_same_category', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same category" + suffix)
set_model_n_parser(user_model, user_parser, 'cluster_same_feed', bool,
description="will allow article in your feeds and categories to be "
"clusterized while beloning to the same feed" + suffix)
user_parser.add_argument('password', type=str)


Expand Down
6 changes: 5 additions & 1 deletion jarr/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
'source_order': ['env', 'cmd', 'files'],
'parameters': [
{'jarr_testing': {'default': False, 'type': bool}},
{'cluster_tfidf_min_score': {'default': .75, 'type': float}},
{'cluster_default': [
{'time_delta': {'default': 7, 'type': int}},
{'tfidf_enabled': {'default': True, 'type': bool}},
{'tfidf_min_sample_size': {'default': 10, 'type': int}},
{'tfidf_min_score': {'default': .75, 'type': float}}]},
{'timezone': {'default': 'Europe/Paris', 'type': str}},
{'platform_url': {'default': 'http://0.0.0.0:5000/'}},
{'sqlalchemy': [{'db_uri': {}},
Expand Down
6 changes: 4 additions & 2 deletions jarr/controllers/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sqlalchemy import and_, or_
from werkzeug.exceptions import Forbidden, NotFound, Unauthorized

from jarr.bootstrap import session
from jarr.bootstrap import session, Base

logger = logging.getLogger(__name__)

Expand All @@ -18,7 +18,7 @@ def cast_to_utc(dt_obj):


class AbstractController:
_db_cls = object # reference to the database class
_db_cls = Base # reference to the database class, to redefine in child cls
_user_id_key = 'user_id'

def __init__(self, user_id=None, ignore_context=False):
Expand All @@ -39,6 +39,8 @@ def _to_comparison(key, model):
if '__' not in key:
return getattr(model, key).__eq__
attr, ope = key.rsplit('__', 1)
if ope == 'nin':
return getattr(model, attr).notin_
if ope == 'in':
return getattr(model, attr).in_
if ope not in {'like', 'ilike'}:
Expand Down
102 changes: 66 additions & 36 deletions jarr/controllers/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from jarr_common.reasons import ClusterReason, ReadReason
from jarr_common.clustering_af.grouper import get_best_match_and_score

from jarr.utils import get_cluster_pref
from jarr.bootstrap import SQLITE_ENGINE, session
from jarr.controllers.article import ArticleController
from jarr.models import Article, Cluster
from jarr.models import Article, Cluster, Feed, User

from .abstract import AbstractController

Expand All @@ -25,49 +26,77 @@
JR_LENGTH = 1000


def _get_parent_attr(obj, attr):
return (getattr(obj.user, attr) and getattr(obj.category, attr, True)
and getattr(obj.feed, attr))


def is_same_ok(obj, parent):
    """Tell whether clustering articles sharing the same *parent* is allowed.

    *parent* is 'feed' or 'category'; the corresponding cluster_same_*
    switch must be enabled on the user, the category (if any) and the feed.
    """
    attr_name = 'cluster_same_%s' % parent
    return _get_parent_attr(obj, attr_name)


class ClusterController(AbstractController):
_db_cls = Cluster
max_day_dist = timedelta(days=7)

def _get_cluster_by_link(self, article):
filters = {'user_id': article.user_id,
'main_date__lt': article.date + self.max_day_dist,
'main_date__gt': article.date - self.max_day_dist,
'main_link': article.link}

cluster = self.read(**filters).first()
if not cluster:
return None
if not article.feed.cluster_same_feed:
for clustered_article in cluster.articles:
if clustered_article.feed_id == article.feed_id:
return None
article.cluster_reason = ClusterReason.link
return cluster
def _get_query_for_clustering(self, article, filters, join_filters=None):
    """Yield articles that are candidates for clustering with *article*.

    filters -- dict of extra ArticleController filters; NOTE: mutated in
        place with the common candidate conditions.
    join_filters -- optional iterable of SQLAlchemy criteria applied to the
        query after the Feed/User joins.
    """
    # Candidates must be close in time to the article, either by
    # publication date or by retrieval date (within +/- time_delta days).
    time_delta = timedelta(
            days=get_cluster_pref(article.feed, 'time_delta'))
    date_cond = {'date__lt': article.date + time_delta,
                 'date__gt': article.date - time_delta}
    retr_cond = {'retrieved_date__lt': article.retrieved_date + time_delta,
                 'retrieved_date__gt': article.retrieved_date - time_delta}
    # Only other articles of the same user that already belong to a cluster.
    filters.update({'cluster_id__ne': None,
                    'user_id': article.user_id,
                    'id__ne': article.id,
                    '__or__': [date_cond, retr_cond]})
    # Honor the cluster_same_category / cluster_same_feed switches by
    # excluding candidates from the same category or feed when disallowed.
    if article.category_id and not is_same_ok(article, 'category'):
        filters['category_id__ne'] = article.category_id
    if not is_same_ok(article, 'feed'):
        filters['feed_id__ne'] = article.feed_id

    # Joining on Feed and User lets us require cluster_enabled on both at
    # the SQL level; __eq__(True) keeps the SQL-side boolean comparison.
    query = ArticleController(self.user_id).read(**filters)\
            .join(Feed, Feed.id == Article.feed_id)\
            .join(User, User.id == Article.user_id)\
            .filter(User.cluster_enabled.__eq__(True),
                    Feed.cluster_enabled.__eq__(True))

    for join_filter in join_filters or []:
        query = query.filter(join_filter)

    # operations involving categories are complicated, handling in software
    for candidate in query:
        if candidate.category_id:
            if not candidate.category.cluster_enabled:
                continue
        yield candidate

def _get_cluster_by_similarity(self, article, min_sample_size=10):
if not article.lang:
return None
art_contr = ArticleController(self.user_id)

filters = {'user_id': article.user_id,
'date__lt': article.date + self.max_day_dist,
'date__gt': article.date - self.max_day_dist,
# article isn't this one, and already in a cluster
'cluster_id__ne': None, 'id__ne': article.id,
# article is matchable
'valuable_tokens__ne': []}
if article.feed.cluster_tfidf_same_cat:
filters['category_id'] = article.category_id

neighbors = list(art_contr.read(**filters))
def _get_cluster_by_link(self, article):
    """Return the cluster of a candidate article sharing *article*'s link.

    Returns None when no clusterizable article has the same link; on
    success, marks the article as clustered by link.
    """
    candidates = self._get_query_for_clustering(article,
                                                {'link': article.link})
    match = next(iter(candidates), None)
    if match is None:
        return None
    article.cluster_reason = ClusterReason.link
    return match.cluster

def _get_cluster_by_similarity(self, article):
query = self._get_query_for_clustering(article,
# article is matchable
{'valuable_tokens__ne': []},
(User.cluster_tfidf_enabled.__eq__(True),
Feed.cluster_tfidf_enabled.__eq__(True))
)

neighbors = [neighbor for neighbor in query
if not neighbor.category_id
or neighbor.category.cluster_tfidf_enabled]

min_sample_size = get_cluster_pref(article.feed,
'tfidf_min_sample_size')
if len(neighbors) < min_sample_size:
logger.info('only %d docs against %d required, no TFIDF for %r',
len(neighbors), min_sample_size, article)
return None

best_match, score = get_best_match_and_score(article, neighbors)
if score > article.feed.cluster_tfidf_min_score:
if score > get_cluster_pref(article.feed, 'tfidf_min_score'):
article.cluster_reason = ClusterReason.tf_idf
article.cluster_score = int(score * 1000)
article.cluster_tfidf_neighbor_size = len(neighbors)
Expand Down Expand Up @@ -116,9 +145,10 @@ def enrich_cluster(cluster, article,
def clusterize(self, article, cluster_read=None, cluster_liked=False):
"""Will add given article to a fitting cluster or create a cluster
fitting that article."""
if article.feed.cluster_enabled:
if _get_parent_attr(article, 'cluster_enabled'):
cluster = self._get_cluster_by_link(article)
if not cluster and article.feed.cluster_tfidf:
if not cluster \
and _get_parent_attr(article, 'cluster_tfidf_enabled'):
cluster = self._get_cluster_by_similarity(article)
if cluster:
return self.enrich_cluster(cluster, article,
Expand Down
9 changes: 8 additions & 1 deletion jarr/models/category.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import (Column, Integer, String,
from sqlalchemy import (Column, Integer, String, Boolean, PickleType,
Index, ForeignKeyConstraint)
from sqlalchemy.orm import relationship

Expand All @@ -11,6 +11,13 @@ class Category(Base):
id = Column(Integer, primary_key=True)
name = Column(String)

# clustering control
cluster_enabled = Column(Boolean, default=True)
cluster_tfidf_enabled = Column(Boolean, default=True)
cluster_same_category = Column(Boolean, default=True)
cluster_same_feed = Column(Boolean, default=True)
cluster_conf = Column(PickleType, default={})

# foreign keys
user_id = Column(Integer, nullable=False)

Expand Down
17 changes: 8 additions & 9 deletions jarr/models/feed.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from sqlalchemy import (Boolean, Column, Integer, PickleType, String, Enum,
FLOAT, Index, ForeignKeyConstraint)
Index, ForeignKeyConstraint)
from sqlalchemy.orm import relationship, validates

from jarr_common.utils import utc_now
from jarr_common.const import UNIX_START
from jarr_common.reasons import CacheReason
from jarr.bootstrap import Base, conf
from jarr.bootstrap import Base
from jarr.models.utc_datetime_type import UTCDateTime


Expand All @@ -24,13 +24,12 @@ class Feed(Base):
integration_reddit = Column(Boolean, default=False)

# clustering control
cluster_enabled = Column(Boolean, default=None)
cluster_tfidf = Column(Boolean, default=None)
cluster_tfidf_same_cat = Column(Boolean, default=None)
cluster_same_feed = Column(Boolean, default=None)
cluster_tfidf_min_score = Column(FLOAT,
default=conf.cluster_tfidf_min_score)
cluster_wake_up = Column(Boolean, default=False)
cluster_enabled = Column(Boolean, default=True)
cluster_tfidf_enabled = Column(Boolean, default=True)
cluster_same_category = Column(Boolean, default=True)
cluster_same_feed = Column(Boolean, default=True)
cluster_wake_up = Column(Boolean, default=True)
cluster_conf = Column(PickleType, default={})

# cache reasons
cache_type = Column(Enum(CacheReason), default=None)
Expand Down
13 changes: 11 additions & 2 deletions jarr/models/user.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re

from sqlalchemy import Boolean, Column, Integer, String
from sqlalchemy import Boolean, Column, Integer, String, PickleType
from sqlalchemy.orm import relationship, validates

from jarr_common.utils import utc_now
Expand All @@ -21,6 +21,14 @@ class User(Base):
renew_password_token = Column(String, default='')

timezone = Column(String, default=conf.timezone)

# clustering control
cluster_enabled = Column(Boolean, default=True)
cluster_tfidf_enabled = Column(Boolean, default=True)
cluster_same_category = Column(Boolean, default=True)
cluster_same_feed = Column(Boolean, default=True)
cluster_conf = Column(PickleType, default={})

# user rights
is_active = Column(Boolean, default=True)
is_admin = Column(Boolean, default=False)
Expand All @@ -47,5 +55,6 @@ class User(Base):
foreign_keys='[Cluster.user_id]')

@validates('login')
def validates_login(self, key, value):
@staticmethod
def validates_login(key, value):
return re.sub(r'[^a-zA-Z0-9_\.]', '', value.strip())

0 comments on commit b301e7a

Please sign in to comment.