Merge pull request #275 from mdekstrand/feature/ii-diagnostics

Improve item-item docs and diagnostics for implicit feedback
lenskit · Oct 19, 2021 · aa8a64a · aa8a64a
2 parents 8e47505 + 12b6646
commit aa8a64a
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 1 deletion.
diff --git a/doc/knn.rst b/doc/knn.rst
@@ -6,6 +6,18 @@ implementations.  These lightly-configurable implementations are intended
 to capture the behavior of the Java-based LensKit implementations to provide
 a good upgrade path and enable basic experiments out of the box.
 
+There are two primary ways to use these algorithms.  When using **explicit
+feedback** (rating values), you usually want to use the defaults of weighted-average aggregation and
+mean-centering normalization.
+
+With **implicit feedback** (unary data such as clicks and purchases, typically represented with
+rating values of 1 for positive items), the usual design is sum aggregation and no centering::
+
+    implicit_knn = ItemItem(20, center=False, aggregate='sum')
+
+Attempting to center data in which every rating has the same value (all 1, for example) will
+typically produce invalid results.  ``ItemItem`` has diagnostics to warn you about this.
+
 .. toctree::
 
 
@@ -14,6 +26,8 @@ Item-based k-NN
 
 .. module:: lenskit.algorithms.item_knn
 
+This is LensKit's item-based k-NN model, based on the description by :cite:t:`Deshpande2004-ht`.
+
 .. autoclass:: ItemItem
     :members:
     :show-inheritance:

diff --git a/lenskit/algorithms/item_knn.py b/lenskit/algorithms/item_knn.py
@@ -205,7 +205,8 @@ class ItemItem(Predictor):
     """
     Item-item nearest-neighbor collaborative filtering with ratings. This item-item implementation
     is not terribly configurable; it hard-codes design decisions found to work well in the previous
-    Java-based LensKit code.
+    Java-based LensKit code :cite:p:`Ekstrand2011-bp`.  This implementation is based on the description
+    of item-based CF by :cite:t:`Deshpande2004-ht`, and produces results equivalent to Java LensKit.
 
     The k-NN predictor supports several aggregate functions:
 
@@ -320,6 +321,10 @@ def _mean_center(self, ratings, rmat, items):
         mcvals = rmat.values - item_means[rmat.colinds]
         nmat = rmat.copy(False)
         nmat.values = mcvals
+        if np.allclose(nmat.values, 0):
+            _logger.warn('normalized ratings are zero, centering is not recommended')
+            warnings.warn("Ratings seem to have the same value, centering is not recommended.",
+                          DataWarning)
         _logger.info('[%s] computed means for %d items', self._timer, len(item_means))
         return nmat, item_means
 

diff --git a/tests/test_knn_item_item.py b/tests/test_knn_item_item.py
@@ -152,6 +152,14 @@ def test_ii_warn_duplicates():
         pass  # this is fine
 
 
+def test_ii_warns_center():
+    "Test that item-item warns if you center non-centerable data"
+    data = simple_ratings.assign(rating=1)
+    algo = knn.ItemItem(5)
+    with pytest.warns(DataWarning):
+        algo.fit(data)
+
+
 @lktu.wantjit
 @mark.skip("redundant with large_models")
 def test_ii_train_big():