Commit
Merge branch 'feature/implicit'
mdekstrand committed Nov 28, 2018
2 parents 75bc90b + 59f6c2a commit 16d72f8
Showing 10 changed files with 203 additions and 19 deletions.
20 changes: 12 additions & 8 deletions azure-pipelines.yml
@@ -1,16 +1,14 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/vsts/pipelines/languages/python
variables:
conda.deps: >
python=$(python.version)
pandas scipy pytables fastparquet python-snappy numba cffi
invoke coverage pytest pytest-cov
invoke coverage pytest pytest-cov cython
pip.deps: >
invoke pytest coverage pytest-cov
pandas scipy pyarrow
numba
numba cython
pip.extras: >
hpfrec==0.2.2.5 implicit
jobs:

@@ -31,7 +29,10 @@ jobs:
createCustomEnvironment: true
environmentName: lkpy
packageSpecs: $(conda.deps)
updateConda: false

- script: |
sudo /usr/envs/lkpy/bin/pip install $(pip.extras)
displayName: 'Extra PyPI deps'
- script: |
if [ ! -r ~/ml-100k/u.data ]; then
@@ -98,7 +99,7 @@ jobs:
python -m pip install --upgrade pip
pip install $(pip.deps)
displayName: 'Install dependencies'
- script: |
if [ ! -r ~/ml-100k/u.data ]; then
wget --no-verbose -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
@@ -247,6 +248,9 @@ jobs:
packageSpecs: $(conda.deps)
updateConda: false

- script: sudo conda install -y llvm-openmp
displayName: 'Install OpenMP'

- script: |
if [ ! -r ~/ml-100k/u.data ]; then
wget --no-verbose -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
1 change: 1 addition & 0 deletions doc/algorithms.rst
@@ -13,3 +13,4 @@ algorithms.
knn
mf
hpf
implicit
3 changes: 2 additions & 1 deletion doc/conf.py
@@ -180,5 +180,6 @@
'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', None),
'numpy': ('https://docs.scipy.org/doc/numpy/', None),
'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None),
'hpfrec': ('https://hpfrec.readthedocs.io/en/latest/', None)
'hpfrec': ('https://hpfrec.readthedocs.io/en/latest/', None),
'implicit': ('https://implicit.readthedocs.io/en/latest/', None)
}
15 changes: 15 additions & 0 deletions doc/implicit.rst
@@ -0,0 +1,15 @@
Implicit
========

.. module:: lenskit.algorithms.implicit

This module provides a LensKit bridge to the implicit_ library, which implements
several implicit-feedback recommenders.

.. _implicit: https://implicit.readthedocs.io/en/latest/

.. autoclass:: ALS
    :members:

.. autoclass:: BPR
    :members:
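
For orientation, a minimal usage sketch of this bridge (not part of the commit; it assumes a ratings DataFrame with the usual LensKit user, item, and rating columns, and the user ID is made up):

    from lenskit.algorithms.implicit import ALS

    algo = ALS(25)                           # arguments pass straight through to AlternatingLeastSquares
    model = algo.train(ratings)              # returns an ImplicitModel namedtuple
    recs = algo.recommend(model, 10, n=10)   # DataFrame of 'item' and 'score' for user 10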
84 changes: 84 additions & 0 deletions lenskit/algorithms/implicit.py
@@ -0,0 +1,84 @@
from collections import namedtuple
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking

from ..matrix import sparse_ratings
from . import Trainable, Recommender

ImplicitModel = namedtuple('ImplicitModel', [
    'algo', 'matrix', 'users', 'items'
])
ImplicitModel.__doc__ = '''
Model for *implicit*-backed recommenders.

Attributes:
    algo(implicit.RecommenderBase): the underlying algorithm.
    matrix(scipy.sparse.csr_matrix): the user-item matrix.
    users(pandas.Index): the user ID to user position index.
    items(pandas.Index): the item ID to item position index.
'''


class BaseRec(Trainable, Recommender):
    """
    Base class for Implicit-backed recommenders.
    """
    def __init__(self, algo, *args, **kwargs):
        self.algo_class = algo
        self.algo_args = args
        self.algo_kwargs = kwargs

    def train(self, ratings):
        matrix, users, items = sparse_ratings(ratings, scipy=True)
        # implicit's fit() takes an item-user matrix, so transpose the user-item matrix
        iur = matrix.T.tocsr()

        algo = self.algo_class(*self.algo_args, **self.algo_kwargs)
        algo.fit(iur)

        return ImplicitModel(algo, matrix, users, items)

    def recommend(self, model: ImplicitModel, user, n=None, candidates=None, ratings=None):
        try:
            uid = model.users.get_loc(user)
        except KeyError:
            # unknown user - return an empty recommendation frame
            return pd.DataFrame({'item': []})

        if candidates is None:
            recs = model.algo.recommend(uid, model.matrix, N=n)
        else:
            cands = model.items.get_indexer(candidates)
            # drop candidate items that did not appear in the training data
            cands = cands[cands >= 0]
            recs = model.algo.rank_items(uid, model.matrix, cands)

        if n is not None:
            recs = recs[:n]
        rec_df = pd.DataFrame.from_records(recs, columns=['item_pos', 'score'])
        rec_df['item'] = model.items[rec_df.item_pos]
        return rec_df.loc[:, ['item', 'score']]


class ALS(BaseRec):
    """
    LensKit interface to :py:mod:`implicit.als`.
    """
    def __init__(self, *args, **kwargs):
        """
        Construct an ALS recommender. The arguments are passed as-is to
        :py:class:`implicit.als.AlternatingLeastSquares`.
        """
        super().__init__(AlternatingLeastSquares, *args, **kwargs)


class BPR(BaseRec):
    """
    LensKit interface to :py:mod:`implicit.bpr`.
    """
    def __init__(self, *args, **kwargs):
        """
        Construct a BPR recommender. The arguments are passed as-is to
        :py:class:`implicit.bpr.BayesianPersonalizedRanking`.
        """
        super().__init__(BayesianPersonalizedRanking, *args, **kwargs)
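
Candidate restriction works through the same recommend() call; a hypothetical sketch continuing the example above (the candidate item IDs are invented, and any ID absent from training is dropped before ranking):

    cands = [31, 64, 150]                               # hypothetical item IDs to rank
    recs = algo.recommend(model, 10, candidates=cands)  # scores only the surviving candidates via rank_items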
4 changes: 2 additions & 2 deletions lenskit/algorithms/mf_common.py
@@ -43,12 +43,12 @@ def n_features(self):
@property
def n_users(self):
"The number of users."
return len(self.users)
return len(self.user_index)

@property
def n_items(self):
"The number of items."
return len(self.items)
return len(self.item_index)

def lookup_user(self, user):
"""
6 changes: 6 additions & 0 deletions setup.py
@@ -41,6 +41,12 @@
'sphinx >= 1.8',
'sphinx_rtd_theme',
'nbsphinx'
],
'hpf': [
'hpfrec'
],
'implicit': [
'implicit'
]
},
packages=find_packages()
10 changes: 2 additions & 8 deletions tests/test_batch_recommend.py
@@ -34,10 +34,7 @@ def test_recommend_single(mlb):
assert len(res) == 1
assert all(res['user'] == 1)
assert all(res['rank'] == 1)
if sys.version_info >= (3, 6):
assert list(res.columns) == ['user', 'rank', 'item', 'score']
else:
warnings.warn('Python 3.5 loses column order')
assert set(res.columns) == set(['user', 'rank', 'item', 'score'])

expected = mlb.model.mean + mlb.model.items.loc[31] + mlb.model.users.loc[1]
assert res.score.iloc[0] == pytest.approx(expected)
@@ -54,10 +51,7 @@ def candidates(user):
res = lkb.recommend(mlb.algo, mlb.model, [5], 10, candidates)

assert len(res) == 10
if sys.version_info >= (3, 6):
assert list(res.columns) == ['user', 'rank', 'item', 'score']
else:
warnings.warn('Python 3.5 loses column order')
assert set(res.columns) == set(['user', 'rank', 'item', 'score'])
assert all(res['user'] == uid)
assert all(res['rank'] == np.arange(10) + 1)
# they should be in decreasing order
1 change: 1 addition & 0 deletions tests/test_hpf.py
@@ -27,6 +27,7 @@
def test_hpf_train_large():
algo = hpf.HPF(20)
ratings = lktu.ml_pandas.renamed.ratings
ratings = ratings.assign(rating=ratings.rating + 0.5)
model = algo.train(ratings)

assert model is not None
78 changes: 78 additions & 0 deletions tests/test_implicit.py
@@ -0,0 +1,78 @@
import logging

import pandas as pd
import numpy as np

from pytest import mark

import lk_test_utils as lktu

try:
    from lenskit.algorithms import implicit
    have_implicit = True
except ImportError:
    have_implicit = False

_log = logging.getLogger(__name__)

simple_df = pd.DataFrame({'item': [1, 1, 2, 3],
                          'user': [10, 12, 10, 13],
                          'rating': [4.0, 3.0, 5.0, 2.0]})


@mark.slow
@mark.skipif(not have_implicit, reason='implicit not installed')
def test_implicit_als_train_rec():
    algo = implicit.ALS(25)
    ratings = lktu.ml_pandas.renamed.ratings

    model = algo.train(ratings)
    assert model is not None

    recs = algo.recommend(model, 100, n=20)
    assert len(recs) == 20


@mark.slow
@mark.eval
@mark.skipif(not have_implicit, reason='implicit not installed')
@mark.skipif(not lktu.ml100k.available, reason='ML100K data not present')
def test_implicit_als_batch_accuracy():
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()

    algo = implicit.ALS(25)

    def eval(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        model = algo.train(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, model, users, 100, candidates, test)
        return recs

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    recs = pd.concat(eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    ndcg = recs.groupby('user').rating.apply(lm.ndcg)
    _log.info('ndcg for users is %.4f', ndcg.mean())
    assert ndcg.mean() > 0


@mark.slow
@mark.skipif(not have_implicit, reason='implicit not installed')
def test_implicit_bpr_train_rec():
    algo = implicit.BPR(25)
    ratings = lktu.ml_pandas.renamed.ratings

    model = algo.train(ratings)
    assert model is not None

    recs = algo.recommend(model, 100, n=20)
    assert len(recs) == 20
