Merge pull request #38 from lenskit/feature/clean-infra
Some infrastructure cleanups
mdekstrand committed Oct 21, 2018
2 parents 5926ec6 + f32e69e commit c7e6196
Showing 22 changed files with 615 additions and 180 deletions.
42 changes: 28 additions & 14 deletions azure-pipelines.yml
@@ -5,12 +5,11 @@
variables:
conda.deps: >
python=$(python.version)
pandas scipy pytables numba
pytest pytest-arraydiff pytest-xdist
invoke coverage pytest-cov
pandas scipy pytables fastparquet python-snappy numba
invoke coverage pytest pytest-cov
pip.deps: >
invoke pytest
pandas scipy tables
invoke pytest coverage pytest-cov
pandas scipy pyarrow
numba
jobs:
@@ -29,16 +28,13 @@ jobs:
maxParallel: 4

steps:
- script: sudo install -d -m 0777 /usr/envs
displayName: Fix Conda permissions

- task: CondaEnvironment@1
inputs:
createCustomEnvironment: true
environmentName: lkpy
packageSpecs: $(conda.deps)
updateConda: false

- script: |
if [ ! -r ~/ml-100k/u.data ]; then
wget --no-verbose -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
@@ -84,8 +80,8 @@ jobs:
python.version: '3.5'
Python36:
python.version: '3.6'
# Python37:
# python.version: '3.7'
Python37:
python.version: '3.7'
maxParallel: 4

steps:
@@ -109,6 +105,27 @@ jobs:
- script: |
python3 setup.py test
displayName: 'pytest'
- script: |
env NUMBA_DISABLE_JIT=1 invoke test --cover --no-eval --no-slow --verbose
displayName: 'Test Coverage'
- script: |
coverage xml
echo "Fetching Codecov script"
curl -o /tmp/codecov.sh https://codecov.io/bash
echo "Building for $BUILD_REASON"
cc_args=
if [ -n "$SYSTEM_PULLREQUEST_PULLREQUESTNUMBER" ]; then
ccargs="-P $SYSTEM_PULLREQUEST_PULLREQUESTNUMBER"
fi
if [ -z "$CODECOV_TOKEN" ]; then
echo "no CODECOV_TOKEN :(" >&2
fi
bash /tmp/codecov.sh -C "$BUILD_SOURCEVERSION" -B $BUILD_SOURCEBRANCH $cc_args
displayName: 'Upload Coverage'
- job: 'WindowsVanilla'
pool:
@@ -168,9 +185,6 @@
maxParallel: 4

steps:
- script: sudo chmod 0777 /usr/local/miniconda/envs
displayName: Fix Conda permissions

- task: CondaEnvironment@1
inputs:
createCustomEnvironment: true
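The new 'Test Coverage' step above runs the suite with NUMBA_DISABLE_JIT=1 so that coverage can trace lines Numba's JIT compilation would otherwise hide. A minimal sketch of the effect (the add function is illustrative, not from this repository):

    import os
    os.environ['NUMBA_DISABLE_JIT'] = '1'   # must be set before any jitted function is compiled

    from numba import njit

    @njit
    def add(a, b):
        return a + b

    assert add(1, 2) == 3  # runs as plain Python, so coverage.py sees each line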
6 changes: 4 additions & 2 deletions conda/meta.yaml
@@ -21,13 +21,15 @@ requirements:
- pandas
- numpy
- scipy
- pytables
- pyarrow
- python-snappy
run:
- python
- pandas
- scipy
- numba >= 0.40
- pytables
- pyarrow
- python-snappy

test:
source_files:
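This swaps the pytables dependency for pyarrow and python-snappy, matching the move from HDF5 to snappy-compressed Parquet model files. A rough sketch of the round trip these packages enable (frame contents and file name are illustrative):

    import pandas as pd

    df = pd.DataFrame({'item': [1, 2, 3], 'mean': [3.5, 4.0, 2.5]})
    df.to_parquet('items.parquet', compression='snappy')  # pandas writes via pyarrow
    df2 = pd.read_parquet('items.parquet')
    assert df.equals(df2)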
22 changes: 13 additions & 9 deletions lenskit/algorithms/__init__.py
@@ -9,6 +9,7 @@

from abc import ABCMeta, abstractmethod
import pickle
import pathlib


class Predictor(metaclass=ABCMeta):
@@ -99,31 +100,34 @@ def train(self, ratings):
"""
raise NotImplementedError()

def save_model(self, model, file):
def save_model(self, model, path):
"""
Save a trained model to a file. The default implementation pickles the model.
Save a trained model to a file or directory. The default implementation pickles
the model.
Algorithms are allowed to use any format for saving their models, including
directories.
Args:
model: the trained model.
file(str):
the file in which to save the model.
path(str):
the path at which to save the model.
"""

with open(file, 'wb') as f:
path = pathlib.Path(path)
with path.open('wb') as f:
pickle.dump(model, f)

def load_model(self, file):
def load_model(self, path):
"""
Load a trained model from a file.
Args:
file(str): the path to file from which to load the model.
path(str): the path to the file from which to load the model.
Returns:
the re-loaded model (of an implementation-defined type).
"""
with open(file, 'rb') as f:

path = pathlib.Path(path)
with path.open('rb') as f:
return pickle.load(f)
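With the pathlib change, the default save_model and load_model accept anything pathlib.Path can wrap. A hypothetical round trip through the default pickle implementation (the toy ratings frame is illustrative; Bias is the bias predictor defined in basic.py below):

    import pandas as pd
    from lenskit.algorithms import basic

    ratings = pd.DataFrame({'user': [1, 1, 2], 'item': [10, 20, 10],
                            'rating': [4.0, 3.0, 5.0]})
    algo = basic.Bias()
    model = algo.train(ratings)
    algo.save_model(model, 'bias-model.dat')   # pickles the model
    model2 = algo.load_model('bias-model.dat')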
16 changes: 16 additions & 0 deletions lenskit/algorithms/als.py
@@ -208,6 +208,12 @@ def predict(self, model: BiasMFModel, user, items, ratings=None):
# look up user index
return model.score_by_ids(user, items)

def save_model(self, model, path):
model.save(path)

def load_model(self, path):
return BiasMFModel.load(path)

def __str__(self):
return 'als.BiasedMF(features={}, regularization={})'.\
format(self.features, self.regularization)
@@ -286,3 +292,13 @@ def _initial_model(self, ratings):
def predict(self, model: MFModel, user, items, ratings=None):
# look up user index
return model.score_by_ids(user, items)

def save_model(self, model, path):
model.save(path)

def load_model(self, path):
return MFModel.load(path)

def __str__(self):
return 'als.ImplicitMF(features={}, regularization={})'.\
format(self.features, self.regularization)
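Both ALS classes now bypass pickling and delegate persistence to their model classes. A sketch of how this might be used (the toy ratings frame, feature count, and positional constructor are assumptions):

    import pandas as pd
    from lenskit.algorithms import als

    ratings = pd.DataFrame({'user': [1, 1, 2, 2], 'item': [10, 20, 10, 30],
                            'rating': [4.0, 3.0, 5.0, 2.0]})
    algo = als.BiasedMF(10)                   # feature count; signature assumed
    model = algo.train(ratings)
    algo.save_model(model, 'biasmf-model')    # delegates to BiasMFModel.save
    model2 = algo.load_model('biasmf-model')  # delegates to BiasMFModel.load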
22 changes: 16 additions & 6 deletions lenskit/algorithms/basic.py
@@ -57,6 +57,7 @@ def __init__(self, items=True, users=True, damping=0.0):
if isinstance(damping, tuple):
self.user_damping, self.item_damping = damping
else:
self.damping = damping
self.user_damping = damping
self.item_damping = damping

@@ -144,6 +145,9 @@ def _mean(self, series, damping):
else:
return series.mean()

def __str__(self):
return 'Bias(ud={}, id={})'.format(self.user_damping, self.item_damping)


class Popular(Recommender, Trainable):
def train(self, ratings):
@@ -163,6 +167,9 @@ def recommend(self, model, user, n=None, candidates=None, ratings=None):
else:
return scores.nlargest(n).reset_index()

def __str__(self):
return 'Popular'


class Memorized:
"""
@@ -223,8 +230,8 @@ def predict(self, model, user, items, ratings=None):

return preds.reindex(items)

def save_model(self, model, file):
path = pathlib.Path(file)
def save_model(self, model, path):
path = pathlib.Path(path)
path.mkdir(parents=True, exist_ok=True)
for i, algo in enumerate(self.algorithms):
mp = path / 'algo-{}.dat'.format(i+1)
@@ -248,6 +255,9 @@ def load_model(self, file):

return model

def __str__(self):
return 'Fallback([{}])'.format(', '.join(str(a) for a in self.algorithms))


class TopN(Recommender):
"""
@@ -287,8 +297,8 @@ class _TrainableTopN(TopN, Trainable):
def train(self, ratings):
return self.predictor.train(ratings)

def save_model(self, model, file):
self.predictor.save_model(model, file)
def save_model(self, model, path):
self.predictor.save_model(model, path)

def load_model(self, file):
return self.predictor.load_model(file)
def load_model(self, path):
return self.predictor.load_model(path)
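Fallback.save_model now writes each component model to a numbered file under a single directory. A toy sketch of the layout it produces (directory and component names are illustrative):

    import pathlib

    path = pathlib.Path('fallback-model')
    path.mkdir(parents=True, exist_ok=True)
    for i, name in enumerate(['bias', 'popular']):
        mp = path / 'algo-{}.dat'.format(i + 1)
        print(mp)   # fallback-model/algo-1.dat, fallback-model/algo-2.dat, ...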
10 changes: 10 additions & 0 deletions lenskit/algorithms/funksvd.py
@@ -292,3 +292,13 @@ def predict(self, model, user, items, ratings=None):
res = pd.Series(rv, index=good_items)
res = res.reindex(items)
return res

def save_model(self, model, path):
model.save(path)

def load_model(self, path):
return BiasMFModel.load(path)

def __str__(self):
return 'FunkSVD(features={}, regularization={})'.\
format(self.features, self.regularization)
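FunkSVD shares the biased-MF model format, so its load_model deserializes through BiasMFModel. A hypothetical round trip (the toy ratings frame, feature count, and positional constructor are assumptions):

    import pandas as pd
    from lenskit.algorithms import funksvd

    ratings = pd.DataFrame({'user': [1, 1, 2, 2], 'item': [10, 20, 10, 30],
                            'rating': [4.0, 3.0, 5.0, 2.0]})
    algo = funksvd.FunkSVD(10)                  # feature count; signature assumed
    model = algo.train(ratings)
    algo.save_model(model, 'funksvd-model')
    model2 = algo.load_model('funksvd-model')   # comes back as a BiasMFModel, per the diff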
80 changes: 45 additions & 35 deletions lenskit/algorithms/item_knn.py
@@ -2,6 +2,7 @@
Item-based k-NN collaborative filtering.
"""

import pathlib
from collections import namedtuple
import logging

@@ -265,41 +266,50 @@ def predict(self, model, user, items, ratings=None):

return results

def save_model(self, model, file):
_logger.info('saving I-I model to %s', file)
with pd.HDFStore(file, 'w') as hdf:
h5 = hdf._handle
group = h5.create_group('/', 'ii_model')
h5.create_array(group, 'items', model.items.values)
h5.create_array(group, 'means', model.means)
_logger.debug('saving matrix with %d entries (%d nnz)',
model.sim_matrix.nnz, np.sum(model.sim_matrix.data != 0))
h5.create_array(group, 'col_ptrs', model.sim_matrix.indptr)
h5.create_array(group, 'row_nums', model.sim_matrix.indices)
h5.create_array(group, 'sim_values', model.sim_matrix.data)

hdf['ratings'] = model.rating_matrix

def load_model(self, file):
_logger.info('loading I-I model from %s', file)
with pd.HDFStore(file, 'r') as hdf:
ratings = hdf['ratings']
h5 = hdf._handle

items = h5.get_node('/ii_model', 'items').read()
items = pd.Index(items)
means = h5.get_node('/ii_model', 'means').read()

indptr = h5.get_node('/ii_model', 'col_ptrs').read()
indices = h5.get_node('/ii_model', 'row_nums').read()
values = h5.get_node('/ii_model', 'sim_values').read()
_logger.debug('loading matrix with %d entries (%d nnz)',
len(values), np.sum(values != 0))
assert np.all(values > self.min_similarity)

matrix = sps.csr_matrix((values, indices, indptr))

return IIModel(items, means, np.diff(indptr), matrix, ratings)
def save_model(self, model, path):
path = pathlib.Path(path)
_logger.info('saving I-I model to %s', path)
path.mkdir(parents=True, exist_ok=True)

imeans = pd.DataFrame({'item': model.items.values, 'mean': model.means})
imeans.to_parquet(str(path / 'items.parquet'))

coo = model.sim_matrix.tocoo()
coo_df = pd.DataFrame({'item': coo.row, 'neighbor': coo.col, 'similarity': coo.data})
coo_df.to_parquet(str(path / 'similarities.parquet'))

model.rating_matrix.reset_index().to_parquet(str(path / 'ratings.parquet'))

def load_model(self, path):
path = pathlib.Path(path)
_logger.info('loading I-I model from %s', path)

imeans = pd.read_parquet(str(path / 'items.parquet'))
items = pd.Index(imeans.item)
means = imeans['mean'].values
nitems = len(items)

coo_df = pd.read_parquet(str(path / 'similarities.parquet'))
_logger.info('read %d similarities for %d items', len(coo_df), nitems)
csr = sps.csr_matrix((coo_df['similarity'].values,
(coo_df['item'].values, coo_df['neighbor'].values)),
(nitems, nitems))

# rows rebuilt from COO are ordered by column, so re-sort each row by descending similarity
for i in range(nitems):
sp = csr.indptr[i]
ep = csr.indptr[i+1]
if ep == sp:
continue

ord = np.argsort(csr.data[sp:ep])
ord = ord[::-1]
csr.indices[sp:ep] = csr.indices[sp + ord]
csr.data[sp:ep] = csr.data[sp + ord]

rmat = pd.read_parquet(str(path / 'ratings.parquet'))
rmat = rmat.set_index(['user', 'item'])

return IIModel(items, means, np.diff(csr.indptr), csr, rmat)

def __str__(self):
return 'ItemItem(nnbrs={}, msize={})'.format(self.max_neighbors, self.save_neighbors)
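The new format stores each nonzero similarity as one (item, neighbor, similarity) row. A self-contained sketch of the COO round trip at the heart of save_model and load_model, minus the Parquet I/O:

    import numpy as np
    import pandas as pd
    import scipy.sparse as sps

    sim = sps.csr_matrix(np.array([[0.0, 0.5, 0.2],
                                   [0.25, 0.0, 0.0],
                                   [0.0, 0.1, 0.0]]))
    coo = sim.tocoo()
    df = pd.DataFrame({'item': coo.row, 'neighbor': coo.col, 'similarity': coo.data})

    # rebuild the CSR matrix from the data frame, as load_model does
    sim2 = sps.csr_matrix((df['similarity'].values,
                           (df['item'].values, df['neighbor'].values)),
                          (3, 3))
    assert np.all(sim.toarray() == sim2.toarray())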
