Merge pull request #27 from lenskit/feature/ii-sync
Make item-item CF match LensKit
mdekstrand committed Sep 12, 2018
2 parents 03bbbb5 + 7e0a56f commit d1aba29
Showing 6 changed files with 112 additions and 34 deletions.
36 changes: 23 additions & 13 deletions lenskit/algorithms/_item_knn.pyx
@@ -3,6 +3,7 @@ from cpython cimport array
import array
import pandas as pd
import numpy as np
from scipy import sparse as sps
cimport numpy as np
from numpy cimport math as npm
from cython.parallel cimport parallel, prange, threadid
@@ -20,7 +21,7 @@ ELSE:
cdef int omp_get_num_threads():
return 0

cdef _logger = logging.getLogger('_item_knn')
cdef _logger = logging.getLogger('lenskit._item_knn')

cdef class BuildContext:
cdef readonly int n_users
@@ -43,7 +44,8 @@ cdef class BuildContext:
self.n_users, self.n_items, matrix.nnz)

self.matrix = matrix
self.cscmat = matrix.tocsc(copy=False)
self.cscmat = matrix.tocsc()
assert sps.isspmatrix_csc(self.cscmat)

self.uptrs = matrix.indptr
self.items = matrix.indices
@@ -151,7 +153,7 @@ cdef void tr_add_nitems(ThreadState* self, int item, size_t nitems,
lku.ah_free(acc)


cdef dict tr_results(ThreadState* self):
cdef object tr_results(ThreadState* self):
cdef np.npy_intp size = self.size
cdef np.ndarray items, nbrs, sims
items = np.empty(size, dtype=np.int32)
@@ -163,7 +165,7 @@ cdef dict tr_results(ThreadState* self):
# items = np.PyArray_SimpleNewFromData(1, &size, np.NPY_INT32, self.items)
# nbrs = np.PyArray_SimpleNewFromData(1, &size, np.NPY_INT32, self.nbrs)
# sims = np.PyArray_SimpleNewFromData(1, &size, np.NPY_DOUBLE, self.sims)
return {'item': items, 'neighbor': nbrs, 'similarity': sims}
return pd.DataFrame({'item': items, 'neighbor': nbrs, 'similarity': sims})


cpdef sim_matrix(BuildContext context, double threshold, int nnbrs):
@@ -186,12 +188,15 @@ cpdef sim_matrix(BuildContext context, double threshold, int nnbrs):
_logger.debug('thread %d computed %d pairs', omp_get_thread_num(), tres.size)
if tres.size > 0:
neighborhoods.append(tr_results(tres))
_logger.debug('finished parallel item-item build, %d neighbors',
len(neighborhoods[-1]))
else:
_logger.debug('canceling with no neighbors')
tr_free(tres)
_logger.debug('finished parallel item-item build')

tres = NULL
_logger.debug('stacking %d neighborhood frames', len(neighborhoods))
return pd.concat([pd.DataFrame(d) for d in neighborhoods],
ignore_index=True)
return pd.concat(neighborhoods, ignore_index=True)


cdef void train_row(int item, ThreadState* tres, BuildContext context,
@@ -228,16 +233,19 @@ cdef void train_row(int item, ThreadState* tres, BuildContext context,
tr_add_all(tres, item, context.n_items, threshold)


cpdef void predict(matrix, int nitems, int min_nbrs, int max_nbrs,
np.float_t[:] ratings,
np.int64_t[:] targets,
np.float_t[:] scores):
cpdef predict(matrix, int nitems, int min_nbrs, int max_nbrs,
np.float_t[:] ratings,
np.int64_t[:] targets,
np.float_t[:] scores):
cdef int[:] indptr = matrix.indptr
cdef int[:] indices = matrix.indices
cdef double[:] similarity = matrix.data
cdef int i, j, iidx, rptr, rend, nidx, nnbrs
cdef double num, denom

assert ratings.shape[0] == nitems
assert scores.shape[0] == nitems

with nogil:
for i in range(targets.shape[0]):
iidx = targets[i]
@@ -261,6 +269,8 @@ cpdef void predict(matrix, int nitems, int min_nbrs, int max_nbrs,
break

if nnbrs < min_nbrs:
break
continue

scores[iidx] = num / denom

return None
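
The control-flow change above is behavioral, not cosmetic: an item with fewer than min_nbrs usable neighbors is now skipped with `continue`, where the old `break` abandoned scoring for every remaining target. A plain-Python sketch of the intended loop behavior (hypothetical inputs, not the Cython kernel itself):

    import numpy as np

    def score(targets, num, denom, nnbrs, min_nbrs, scores):
        for iidx in targets:
            if nnbrs[iidx] < min_nbrs:
                continue  # was `break`: one under-supported item ended all scoring
            scores[iidx] = num[iidx] / denom[iidx]

    scores = np.full(5, np.nan)
    score([0, 2, 4], num=[2.0] * 5, denom=[1.0] * 5,
          nnbrs=[3, 0, 1, 0, 5], min_nbrs=2, scores=scores)
    # items 0 and 4 are scored; item 2 is skipped instead of also killing item 4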
40 changes: 27 additions & 13 deletions lenskit/algorithms/item_knn.py
@@ -37,9 +37,11 @@ def __init__(self, nnbrs, min_nbrs=1, min_sim=1.0e-6, save_nbrs=None):
(``None`` for unlimited)
"""
self.max_neighbors = nnbrs
if self.max_neighbors is not None and self.max_neighbors < 0:
self.max_neighbors = 0
if self.max_neighbors is not None and self.max_neighbors < 1:
self.max_neighbors = -1
self.min_neighbors = min_nbrs
if self.min_neighbors is not None and self.min_neighbors < 1:
self.min_neighbors = 1
self.min_similarity = min_sim
self.save_neighbors = save_nbrs
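
The coerced bounds flow straight into the Cython predict kernel (see the call later in this file), with -1 serving as an "unlimited" sentinel for the neighbor cap. A quick hedged check of the new normalization (assuming the usual import path):

    from lenskit.algorithms import item_knn as knn

    algo = knn.ItemItem(0)           # non-positive nnbrs ...
    assert algo.max_neighbors == -1  # ... becomes the "unlimited" sentinel
    assert algo.min_neighbors == 1   # min_nbrs is clamped to at least 1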

@@ -68,8 +70,9 @@ def train(self, ratings):

def normalize(x):
xmc = x - x.mean()
if xmc.abs().sum() > 1.0e-10:
return xmc / np.linalg.norm(xmc)
norm = np.linalg.norm(xmc)
if norm > 1.0e-10:
return xmc / norm
else:
return xmc
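
The rewritten guard computes the norm once and tests the actual divisor; a mean-centered constant column is all zeros, and dividing by its (zero) norm would fill the column with NaNs. A small illustration with made-up values:

    import numpy as np
    import pandas as pd

    x = pd.Series([4.0, 4.0, 4.0])  # an item every user rated identically
    xmc = x - x.mean()              # mean-centering yields all zeros
    norm = np.linalg.norm(xmc)
    assert norm <= 1.0e-10          # the guard returns xmc unnormalized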

@@ -93,6 +96,7 @@ def _cy_matrix(self, ratings, uir, watch):
# the Cython implementation requires contiguous numeric IDs.
# so let's make those
rmat, user_idx, item_idx = matrix.sparse_ratings(uir)
assert rmat.nnz == len(uir)
n_items = len(item_idx)

context = accel.BuildContext(rmat)
@@ -156,7 +160,10 @@ def sim_row(irdf):
return neighborhoods

def predict(self, model, user, items, ratings=None):
_logger.debug('predicting %d items for user %s', len(items), user)
if ratings is None:
if user not in model.rating_matrix.index:
return pd.Series(np.nan, index=items)
ratings = model.rating_matrix.loc[user]

# set up rating array
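
With the new fallback, callers may omit `ratings` and the user's stored vector is pulled from `model.rating_matrix`; users absent from training get an all-NaN result instead of an error. Hypothetical calls (invented IDs, given `model = algo.train(...)`):

    preds = algo.predict(model, 42, [10, 20, 30])    # ratings looked up from the model
    cold = algo.predict(model, 99999, [10, 20, 30])  # user not seen in training
    assert cold.isna().all()                         # unknown users score as NaN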
@@ -167,6 +174,7 @@ def predict(self, model, user, items, ratings=None):
rate_v = np.full(len(model.items), np.nan, dtype=np.float_)
rate_v[ri_pos] = m_rates.values - model.means[ri_pos]
_logger.debug('user %s: %d of %d rated items in model', user, len(ri_pos), len(ratings))
assert np.sum(np.logical_not(np.isnan(rate_v))) == len(ri_pos)

# set up item result vector
# ipos will be an array of item indices
@@ -179,8 +187,7 @@

# now compute the predictions
accel.predict(model.sim_matrix, len(model.items),
self.min_neighbors if self.min_neighbors else 0,
self.max_neighbors if self.max_neighbors else -1,
self.min_neighbors, self.max_neighbors,
rate_v, i_pos, iscore)

nscored = np.sum(np.logical_not(np.isnan(iscore)))
@@ -198,29 +205,36 @@ def predict(self, model, user, items, ratings=None):
return results

def save_model(self, model, file):
_logger.info('saving I-I model to %s', file)
with pd.HDFStore(file, 'w') as hdf:
h5 = hdf._handle
group = h5.create_group('/', 'ii-model')
group = h5.create_group('/', 'ii_model')
h5.create_array(group, 'items', model.items.values)
h5.create_array(group, 'means', model.means)
_logger.debug('saving matrix with %d entries (%d nnz)',
model.sim_matrix.nnz, np.sum(model.sim_matrix.data != 0))
h5.create_array(group, 'col_ptrs', model.sim_matrix.indptr)
h5.create_array(group, 'row_nums', model.sim_matrix.indices)
h5.create_array(group, 'sim_values', model.sim_matrix.data)

hdf['ratings'] = model.rating_matrix

def load_model(self, file):
_logger.info('loading I-I model from %s', file)
with pd.HDFStore(file, 'r') as hdf:
ratings = hdf['ratings']
h5 = hdf._handle

items = h5.get_node('/ii-model', 'items').read()
items = h5.get_node('/ii_model', 'items').read()
items = pd.Index(items)
means = h5.get_node('/ii-model', 'means').read()

indptr = h5.get_node('/ii-model', 'col_ptrs').read()
indices = h5.get_node('/ii-model', 'row_nums').read()
values = h5.get_node('/ii-model', 'sim_values').read()
means = h5.get_node('/ii_model', 'means').read()

indptr = h5.get_node('/ii_model', 'col_ptrs').read()
indices = h5.get_node('/ii_model', 'row_nums').read()
values = h5.get_node('/ii_model', 'sim_values').read()
_logger.debug('loading matrix with %d entries (%d nnz)',
len(values), np.sum(values != 0))
assert np.all(values > self.min_similarity)

matrix = sps.csr_matrix((values, indices, indptr))

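The HDF5 group moves from 'ii-model' to 'ii_model'; hyphens are not valid Python identifiers, so the old name fell afoul of PyTables' natural-naming rules. A hedged peek at the new on-disk layout ('ii.mod' is a hypothetical file produced by save_model):

    import pandas as pd

    with pd.HDFStore('ii.mod', 'r') as hdf:
        ratings = hdf['ratings']  # the stored per-user rating matrix
        h5 = hdf._handle          # underlying PyTables file, as in load_model
        sims = h5.get_node('/ii_model', 'sim_values').read()
        print(len(sims), 'similarity values')
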
9 changes: 6 additions & 3 deletions lenskit/batch.py
@@ -49,9 +49,12 @@ def predict(algo, pairs, model=None):
else:
pfun = algo

ures = (pfun(user, udf.item).reset_index(name='prediction').assign(user=user)
for (user, udf) in pairs.groupby('user'))
res = pd.concat(ures).loc[:, ['user', 'item', 'prediction']]
def run(user, udf):
res = pfun(user, udf.item)
return pd.DataFrame({'user': user, 'item': res.index, 'prediction': res.values})

ures = (run(user, udf) for (user, udf) in pairs.groupby('user'))
res = pd.concat(ures)
if 'rating' in pairs:
return pairs.join(res.set_index(['user', 'item']), on=('user', 'item'))
return res
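
Building each per-user frame explicitly keeps `user`, `item`, and `prediction` as ordinary columns rather than relying on the index round-trip. A hypothetical call (assuming `import lenskit.batch as lkb` and a trained `algo`/`model` pair):

    import pandas as pd
    import lenskit.batch as lkb

    pairs = pd.DataFrame({'user': [1, 1, 2], 'item': [31, 64, 31]})
    preds = lkb.predict(algo, pairs, model)
    # one row per pair with user, item, and prediction columns; if `pairs`
    # carries a 'rating' column, the predictions are joined back onto it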
17 changes: 15 additions & 2 deletions tasks.py
@@ -10,7 +10,7 @@


@task
def build(c, cover=False):
def build(c, cover=False, openmp=None):
try:
if cover:
print('enabling coverage & profiling in Cython build')
@@ -23,6 +23,14 @@ def build(c, cover=False):
if 'COVERAGE' in os.environ:
del os.environ['COVERAGE']

if openmp is not None:
if not openmp:
os.environ['USE_OPENMP'] = 'no'
elif openmp is True:
os.environ['USE_OPENMP'] = 'yes'
else:
os.environ['USE_OPENMP'] = openmp

ldir = Path('build/lib.%s-%d.%d' % (du.get_platform(), *sys.version_info[:2]))
files = set()
for ext in importlib.machinery.EXTENSION_SUFFIXES:
@@ -32,7 +40,12 @@
path = pyd.relative_to(ldir)
if not path.exists() or pyd.stat().st_mtime > path.stat().st_mtime:
print('copying', pyd, '->', path)
shutil.copy2(str(pyd), str(path))
try:
shutil.copy2(str(pyd), str(path))
except PermissionError:
print(path, 'in use, renaming')
path.replace(str(path) + '.old.pyd')
shutil.copy2(str(pyd), str(path))
else:
print(path, 'is up to date')

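The new option simply exports USE_OPENMP before the extensions are built, so OpenMP can be toggled per build. A hedged sketch of driving it programmatically (roughly what `invoke build --openmp=no` does from the shell; direct Task invocation assumed):

    from invoke import Context
    from tasks import build

    # falsy values map to USE_OPENMP=no, True to yes,
    # and any other string is passed through verbatim
    build(Context(), openmp='no')
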
8 changes: 5 additions & 3 deletions tests/test_batch_predict.py
@@ -28,7 +28,8 @@ def test_predict_single(mlb):

assert len(res) == 1
assert all(res.user == 1)
assert list(res.columns) == ['user', 'item', 'prediction']
assert set(res.columns) == set(['user', 'item', 'prediction'])
assert all(res.item == 31)

expected = mlb.model.mean + mlb.model.items.loc[31] + mlb.model.users.loc[1]
assert res.prediction.iloc[0] == pytest.approx(expected)
@@ -40,7 +41,8 @@ def test_predict_single_model(mlb):

assert len(res) == 1
assert all(res.user == 1)
assert list(res.columns) == ['user', 'item', 'prediction']
assert set(res.columns) == set(['user', 'item', 'prediction'])
assert all(res.item == 31)

expected = mlb.model.mean + mlb.model.items.loc[31] + mlb.model.users.loc[1]
assert res.prediction.iloc[0] == pytest.approx(expected)
@@ -59,7 +61,7 @@ def test_predict_user(mlb):
res = lkb.predict(mlb.predictor, tf)

assert len(res) == 15
assert list(res.columns) == ['user', 'item', 'prediction']
assert set(res.columns) == set(['user', 'item', 'prediction'])
assert all(res.user == uid)
assert set(res.item) == set(test_items)

36 changes: 36 additions & 0 deletions tests/test_knn_item_item.py
@@ -141,6 +141,42 @@ def test_ii_train_big_unbounded():
assert means[model.items].values == approx(model.means)


@mark.slow
@mark.skipif(not lktu.ml100k.available, reason='ML100K data not present')
def test_ii_train_ml100k(tmpdir):
"Test an unbounded model on ML-100K"
ratings = lktu.ml100k.load_ratings()
algo = knn.ItemItem(30)
_log.info('training model')
model = algo.train(ratings)

_log.info('testing model')
assert model is not None

assert all(np.logical_not(np.isnan(model.sim_matrix.data)))
assert all(model.sim_matrix.data > 0)

# a little tolerance
assert all(model.sim_matrix.data < 1 + 1.0e-6)

assert model.counts.sum() == model.sim_matrix.nnz

means = ratings.groupby('item').rating.mean()
assert means[model.items].values == approx(model.means)

# save
fn = os.path.join(tmpdir, 'ii.mod')
_log.info('saving model to %s', fn)
algo.save_model(model, fn)
_log.info('reloading model')
restored = algo.load_model(fn)
assert restored is not None and restored is not model
assert all(restored.sim_matrix.data > 0)
assert all(restored.sim_matrix.indptr == model.sim_matrix.indptr)
assert all(restored.sim_matrix.indices == model.sim_matrix.indices)
assert all(restored.sim_matrix.data == model.sim_matrix.data)


@mark.slow
def test_ii_large_models():
"Several tests of large trained I-I models"
