Merge pull request #38 from lenskit/feature/clean-infra
Some infrastructure cleanups
mdekstrand committed Oct 21, 2018
2 parents 5926ec6 + f32e69e commit c7e6196
Showing 22 changed files with 615 additions and 180 deletions.
42 changes: 28 additions & 14 deletions azure-pipelines.yml
@@ -5,12 +5,11 @@
variables:
conda.deps: >
python=$(python.version)
pandas scipy pytables numba
pytest pytest-arraydiff pytest-xdist
invoke coverage pytest-cov
pandas scipy pytables fastparquet python-snappy numba
invoke coverage pytest pytest-cov
pip.deps: >
invoke pytest
pandas scipy tables
invoke pytest coverage pytest-cov
pandas scipy pyarrow
numba
jobs:
@@ -29,16 +28,13 @@ jobs:
maxParallel: 4

steps:
- script: sudo install -d -m 0777 /usr/envs
displayName: Fix Conda permissions

- task: CondaEnvironment@1
inputs:
createCustomEnvironment: true
environmentName: lkpy
packageSpecs: $(conda.deps)
updateConda: false

- script: |
if [ ! -r ~/ml-100k/u.data ]; then
wget --no-verbose -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
@@ -84,8 +80,8 @@ jobs:
python.version: '3.5'
Python36:
python.version: '3.6'
# Python37:
# python.version: '3.7'
Python37:
python.version: '3.7'
maxParallel: 4

steps:
@@ -109,6 +105,27 @@ jobs:
- script: |
python3 setup.py test
displayName: 'pytest'
- script: |
env NUMBA_DISABLE_JIT=1 invoke test --cover --no-eval --no-slow --verbose
displayName: 'Test Coverage'
- script: |
coverage xml
echo "Fetching Codecov script"
curl -o /tmp/codecov.sh https://codecov.io/bash
echo "Building for $BUILD_REASON"
cc_args=
if [ -n "$SYSTEM_PULLREQUEST_PULLREQUESTNUMBER" ]; then
ccargs="-P $SYSTEM_PULLREQUEST_PULLREQUESTNUMBER"
fi
if [ -z "$CODECOV_TOKEN" ]; then
echo "no CODECOV_TOKEN :(" >&2
fi
bash /tmp/codecov.sh -C "$BUILD_SOURCEVERSION" -B $BUILD_SOURCEBRANCH $cc_args
displayName: 'Upload Coverage'
- job: 'WindowsVanilla'
pool:
@@ -168,9 +185,6 @@
maxParallel: 4

steps:
- script: sudo chmod 0777 /usr/local/miniconda/envs
displayName: Fix Conda permissions

- task: CondaEnvironment@1
inputs:
createCustomEnvironment: true
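The new 'Test Coverage' step above runs the suite with NUMBA_DISABLE_JIT=1 so that coverage can trace lines Numba's JIT compilation would otherwise hide. A minimal sketch of the effect (the add function is illustrative, not from this repository):

    import os
    os.environ['NUMBA_DISABLE_JIT'] = '1'   # must be set before any jitted function is compiled

    from numba import njit

    @njit
    def add(a, b):
        return a + b

    assert add(1, 2) == 3  # runs as plain Python, so coverage.py sees each line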
6 changes: 4 additions & 2 deletions conda/meta.yaml
@@ -21,13 +21,15 @@ requirements:
- pandas
- numpy
- scipy
- pytables
- pyarrow
- python-snappy
run:
- python
- pandas
- scipy
- numba >= 0.40
- pytables
- pyarrow
- python-snappy

test:
source_files:
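This swaps the pytables dependency for pyarrow and python-snappy, matching the move from HDF5 to snappy-compressed Parquet model files. A rough sketch of the round trip these packages enable (frame contents and file name are illustrative):

    import pandas as pd

    df = pd.DataFrame({'item': [1, 2, 3], 'mean': [3.5, 4.0, 2.5]})
    df.to_parquet('items.parquet', compression='snappy')  # pandas writes via pyarrow
    df2 = pd.read_parquet('items.parquet')
    assert df.equals(df2)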
22 changes: 13 additions & 9 deletions lenskit/algorithms/__init__.py
@@ -9,6 +9,7 @@

from abc import ABCMeta, abstractmethod
import pickle
import pathlib


class Predictor(metaclass=ABCMeta):
@@ -99,31 +100,34 @@ def train(self, ratings):
"""
raise NotImplementedError()

def save_model(self, model, file):
def save_model(self, model, path):
"""
Save a trained model to a file. The default implementation pickles the model.
Save a trained model to a file or directory. The default implementation pickles
the model.
Algorithms are allowed to use any format for saving their models, including
directories.
Args:
model: the trained model.
file(str):
the file in which to save the model.
path(str):
the path at which to save the model.
"""

with open(file, 'wb') as f:
path = pathlib.Path(path)
with path.open('wb') as f:
pickle.dump(model, f)

def load_model(self, file):
def load_model(self, path):
"""
Load a trained model from a file.
Args:
file(str): the path to file from which to load the model.
path(str): the path to the file from which to load the model.
Returns:
the re-loaded model (of an implementation-defined type).
"""
with open(file, 'rb') as f:

path = pathlib.Path(path)
with path.open('rb') as f:
return pickle.load(f)
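With the pathlib change, the default save_model and load_model accept anything pathlib.Path can wrap. A hypothetical round trip through the default pickle implementation (the toy ratings frame is illustrative; Bias is the bias predictor defined in basic.py below):

    import pandas as pd
    from lenskit.algorithms import basic

    ratings = pd.DataFrame({'user': [1, 1, 2], 'item': [10, 20, 10],
                            'rating': [4.0, 3.0, 5.0]})
    algo = basic.Bias()
    model = algo.train(ratings)
    algo.save_model(model, 'bias-model.dat')   # pickles the model
    model2 = algo.load_model('bias-model.dat')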
16 changes: 16 additions & 0 deletions lenskit/algorithms/als.py
@@ -208,6 +208,12 @@ def predict(self, model: BiasMFModel, user, items, ratings=None):
# look up user index
return model.score_by_ids(user, items)

def save_model(self, model, path):
model.save(path)

def load_model(self, path):
return BiasMFModel.load(path)

def __str__(self):
return 'als.BiasedMF(features={}, regularization={})'.\
format(self.features, self.regularization)
@@ -286,3 +292,13 @@ def _initial_model(self, ratings):
def predict(self, model: MFModel, user, items, ratings=None):
# look up user index
return model.score_by_ids(user, items)

def save_model(self, model, path):
model.save(path)

def load_model(self, path):
return MFModel.load(path)

def __str__(self):
return 'als.ImplicitMF(features={}, regularization={})'.\
format(self.features, self.regularization)
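Both ALS classes now bypass pickling and delegate persistence to their model classes. A sketch of how this might be used (the toy ratings frame, feature count, and positional constructor are assumptions):

    import pandas as pd
    from lenskit.algorithms import als

    ratings = pd.DataFrame({'user': [1, 1, 2, 2], 'item': [10, 20, 10, 30],
                            'rating': [4.0, 3.0, 5.0, 2.0]})
    algo = als.BiasedMF(10)                   # feature count; signature assumed
    model = algo.train(ratings)
    algo.save_model(model, 'biasmf-model')    # delegates to BiasMFModel.save
    model2 = algo.load_model('biasmf-model')  # delegates to BiasMFModel.load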
22 changes: 16 additions & 6 deletions lenskit/algorithms/basic.py
@@ -57,6 +57,7 @@ def __init__(self, items=True, users=True, damping=0.0):
if isinstance(damping, tuple):
self.user_damping, self.item_damping = damping
else:
self.damping = damping
self.user_damping = damping
self.item_damping = damping

@@ -144,6 +145,9 @@ def _mean(self, series, damping):
else:
return series.mean()

def __str__(self):
return 'Bias(ud={}, id={})'.format(self.user_damping, self.item_damping)


class Popular(Recommender, Trainable):
def train(self, ratings):
@@ -163,6 +167,9 @@ def recommend(self, model, user, n=None, candidates=None, ratings=None):
else:
return scores.nlargest(n).reset_index()

def __str__(self):
return 'Popular'


class Memorized:
"""
@@ -223,8 +230,8 @@ def predict(self, model, user, items, ratings=None):

return preds.reindex(items)

def save_model(self, model, file):
path = pathlib.Path(file)
def save_model(self, model, path):
path = pathlib.Path(path)
path.mkdir(parents=True, exist_ok=True)
for i, algo in enumerate(self.algorithms):
mp = path / 'algo-{}.dat'.format(i+1)
@@ -248,6 +255,9 @@ def load_model(self, file):

return model

def __str__(self):
return 'Fallback([{}])'.format(', '.join(str(a) for a in self.algorithms))


class TopN(Recommender):
"""
@@ -287,8 +297,8 @@ class _TrainableTopN(TopN, Trainable):
def train(self, ratings):
return self.predictor.train(ratings)

def save_model(self, model, file):
self.predictor.save_model(model, file)
def save_model(self, model, path):
self.predictor.save_model(model, path)

def load_model(self, file):
return self.predictor.load_model(file)
def load_model(self, path):
return self.predictor.load_model(path)
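Fallback.save_model now writes each component model to a numbered file under a single directory. A toy sketch of the layout it produces (directory and component names are illustrative):

    import pathlib

    path = pathlib.Path('fallback-model')
    path.mkdir(parents=True, exist_ok=True)
    for i, name in enumerate(['bias', 'popular']):
        mp = path / 'algo-{}.dat'.format(i + 1)
        print(mp)   # fallback-model/algo-1.dat, fallback-model/algo-2.dat, ...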
10 changes: 10 additions & 0 deletions lenskit/algorithms/funksvd.py
@@ -292,3 +292,13 @@ def predict(self, model, user, items, ratings=None):
res = pd.Series(rv, index=good_items)
res = res.reindex(items)
return res

def save_model(self, model, path):
model.save(path)

def load_model(self, path):
return BiasMFModel.load(path)

def __str__(self):
return 'FunkSVD(features={}, regularization={})'.\
format(self.features, self.regularization)
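FunkSVD shares the biased-MF model format, so its load_model deserializes through BiasMFModel. A hypothetical round trip (the toy ratings frame, feature count, and positional constructor are assumptions):

    import pandas as pd
    from lenskit.algorithms import funksvd

    ratings = pd.DataFrame({'user': [1, 1, 2, 2], 'item': [10, 20, 10, 30],
                            'rating': [4.0, 3.0, 5.0, 2.0]})
    algo = funksvd.FunkSVD(10)                  # feature count; signature assumed
    model = algo.train(ratings)
    algo.save_model(model, 'funksvd-model')
    model2 = algo.load_model('funksvd-model')   # comes back as a BiasMFModel, per the diff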
80 changes: 45 additions & 35 deletions lenskit/algorithms/item_knn.py
@@ -2,6 +2,7 @@
Item-based k-NN collaborative filtering.
"""

import pathlib
from collections import namedtuple
import logging

@@ -265,41 +266,50 @@ def predict(self, model, user, items, ratings=None):

return results

def save_model(self, model, file):
_logger.info('saving I-I model to %s', file)
with pd.HDFStore(file, 'w') as hdf:
h5 = hdf._handle
group = h5.create_group('/', 'ii_model')
h5.create_array(group, 'items', model.items.values)
h5.create_array(group, 'means', model.means)
_logger.debug('saving matrix with %d entries (%d nnz)',
model.sim_matrix.nnz, np.sum(model.sim_matrix.data != 0))
h5.create_array(group, 'col_ptrs', model.sim_matrix.indptr)
h5.create_array(group, 'row_nums', model.sim_matrix.indices)
h5.create_array(group, 'sim_values', model.sim_matrix.data)

hdf['ratings'] = model.rating_matrix

def load_model(self, file):
_logger.info('loading I-I model from %s', file)
with pd.HDFStore(file, 'r') as hdf:
ratings = hdf['ratings']
h5 = hdf._handle

items = h5.get_node('/ii_model', 'items').read()
items = pd.Index(items)
means = h5.get_node('/ii_model', 'means').read()

indptr = h5.get_node('/ii_model', 'col_ptrs').read()
indices = h5.get_node('/ii_model', 'row_nums').read()
values = h5.get_node('/ii_model', 'sim_values').read()
_logger.debug('loading matrix with %d entries (%d nnz)',
len(values), np.sum(values != 0))
assert np.all(values > self.min_similarity)

matrix = sps.csr_matrix((values, indices, indptr))

return IIModel(items, means, np.diff(indptr), matrix, ratings)
def save_model(self, model, path):
path = pathlib.Path(path)
_logger.info('saving I-I model to %s', path)
path.mkdir(parents=True, exist_ok=True)

imeans = pd.DataFrame({'item': model.items.values, 'mean': model.means})
imeans.to_parquet(str(path / 'items.parquet'))

coo = model.sim_matrix.tocoo()
coo_df = pd.DataFrame({'item': coo.row, 'neighbor': coo.col, 'similarity': coo.data})
coo_df.to_parquet(str(path / 'similarities.parquet'))

model.rating_matrix.reset_index().to_parquet(str(path / 'ratings.parquet'))

def load_model(self, path):
path = pathlib.Path(path)
_logger.info('loading I-I model from %s', path)

imeans = pd.read_parquet(str(path / 'items.parquet'))
items = pd.Index(imeans.item)
means = imeans['mean'].values
nitems = len(items)

coo_df = pd.read_parquet(str(path / 'similarities.parquet'))
_logger.info('read %d similarities for %d items', len(coo_df), nitems)
csr = sps.csr_matrix((coo_df['similarity'].values,
(coo_df['item'].values, coo_df['neighbor'].values)),
(nitems, nitems))

# rows rebuilt from COO are ordered by column, so re-sort each row by descending similarity
for i in range(nitems):
sp = csr.indptr[i]
ep = csr.indptr[i+1]
if ep == sp:
continue

ord = np.argsort(csr.data[sp:ep])
ord = ord[::-1]
csr.indices[sp:ep] = csr.indices[sp + ord]
csr.data[sp:ep] = csr.data[sp + ord]

rmat = pd.read_parquet(str(path / 'ratings.parquet'))
rmat = rmat.set_index(['user', 'item'])

return IIModel(items, means, np.diff(csr.indptr), csr, rmat)

def __str__(self):
return 'ItemItem(nnbrs={}, msize={})'.format(self.max_neighbors, self.save_neighbors)
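The new format stores each nonzero similarity as one (item, neighbor, similarity) row. A self-contained sketch of the COO round trip at the heart of save_model and load_model, minus the Parquet I/O:

    import numpy as np
    import pandas as pd
    import scipy.sparse as sps

    sim = sps.csr_matrix(np.array([[0.0, 0.5, 0.2],
                                   [0.25, 0.0, 0.0],
                                   [0.0, 0.1, 0.0]]))
    coo = sim.tocoo()
    df = pd.DataFrame({'item': coo.row, 'neighbor': coo.col, 'similarity': coo.data})

    # rebuild the CSR matrix from the data frame, as load_model does
    sim2 = sps.csr_matrix((df['similarity'].values,
                           (df['item'].values, df['neighbor'].values)),
                          (3, 3))
    assert np.all(sim.toarray() == sim2.toarray())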
