Skip to content

Commit

Permalink
Merge pull request #55 from lenskit/feature/perf
Browse files Browse the repository at this point in the history
Item-item memory improvements & test fixes
  • Loading branch information
mdekstrand committed Jan 10, 2019
2 parents cfa4856 + a205ff5 commit 36bc4c4
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ build/
dist/
.coverage
coverage.xml
htmlcov/
.doit.db*
ml-100k
*.pyd
Expand Down
29 changes: 23 additions & 6 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
variables:
conda.deps: >
python=$(python.version)
pandas scipy pytables fastparquet python-snappy numba cffi
pandas scipy fastparquet python-snappy numba cffi
invoke coverage pytest pytest-cov pytest-doctestplus cython
pip.deps: >
invoke pytest coverage pytest-cov pytest-doctestplus
Expand Down Expand Up @@ -33,6 +33,10 @@ jobs:
- script: |
pip install $(pip.extras)
displayName: 'Extra PyPI deps'
- script: |
numba -s
displayName: 'Inspect Numba environment'
- script: |
if [ ! -r ml-100k/u.data ]; then
Expand All @@ -42,19 +46,26 @@ jobs:
displayName: 'Download ML-100K'
- script: |
mkdir -p build
python3 setup.py build
displayName: 'Build LKPY'
- script: |
python3 -m pytest
export NUMBA_THREADING_LAYER=omp # Tests don't work with TBB
python3 -m pytest --junitxml=build/test-results.xml --verbose
displayName: 'Test LKPY'
- task: PublishTestResults@2
condition: succeededOrFailed()
inputs:
testResultsFiles: 'build/test-results.xml'
testRunTitle: 'Publish test results for Python $(python.version)'

- script: |
env NUMBA_DISABLE_JIT=1 invoke test --cover --no-eval
displayName: 'Test Coverage'
env NUMBA_DISABLE_JIT=1 python3 -m pytest --cov=lenskit --cov-report=xml --cov-report=html -m 'not eval'
displayName: 'Run Tests with Coverage'
- script: |
coverage xml
echo "Fetching Codecov script"
curl -o /tmp/codecov.sh https://codecov.io/bash
Expand All @@ -68,7 +79,13 @@ jobs:
fi
bash /tmp/codecov.sh -C "$BUILD_SOURCEVERSION" -B $BUILD_SOURCEBRANCH $cc_args
displayName: 'Upload Coverage'
displayName: 'Update CodeCov'
- task: PublishCodeCoverageResults@1
inputs:
codeCoverageTool: Cobertura
summaryFileLocation: '$(System.DefaultWorkingDirectory)/coverage.xml'
reportDirectory: '$(System.DefaultWorkingDirectory)/htmlcov'

- job: 'LinuxVanilla'
pool:
Expand Down
4 changes: 3 additions & 1 deletion lenskit/_mkl_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def handle(self):

def __del__(self):
    """Destroy the underlying MKL sparse matrix when this object is finalized.

    Nothing happens when the first slot of ``h_ptr`` is falsy (presumably
    meaning no native handle was ever created — confirm against ``handle``).
    Otherwise a debug message is logged and ``mkl_sparse_destroy`` is called
    on ``self.handle``.
    """
    if not self.h_ptr[0]:
        # No matrix to release.
        return

    _logger.debug('destroying MKL sparse matrix')
    _mkl_lib.mkl_sparse_destroy(self.handle)

def export(self):
Expand Down Expand Up @@ -165,8 +166,9 @@ def csr_syrk(csr: CSR):
mult = SparseM()
rv = _mkl_lib.mkl_sparse_syrk(11, src.handle, mult.h_ptr)
_mkl_check_return(rv, 'mkl_sparse_syrk')
_logger.debug('syrk: exporting matrix')
del src # free a little memory

_logger.debug('syrk: exporting matrix')
result = mult.export()
_logger.debug('syrk: received %dx%d matrix (%d nnz)',
result.nrows, result.ncols, result.nnz)
Expand Down
2 changes: 1 addition & 1 deletion lenskit/algorithms/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,4 +288,4 @@ def recommend(self, user, n=None, candidates=None, ratings=None):
return scores.reset_index()

def __str__(self):
return 'TN/' + str(self.predidctor)
return 'TopN/' + str(self.predictor)
10 changes: 7 additions & 3 deletions lenskit/algorithms/item_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np
import scipy.sparse as sps
import scipy.sparse.linalg as spla
from numba import njit
from numba import njit, prange

from lenskit import util, matrix
from . import Predictor
Expand All @@ -22,7 +22,7 @@ def _predict_weighted_average(model, nitems, nrange, ratings, targets):
min_nbrs, max_nbrs = nrange
scores = np.full(nitems, np.nan, dtype=np.float_)

for i in range(targets.shape[0]):
for i in prange(targets.shape[0]):
iidx = targets[i]
rptr = model.rowptrs[iidx]
rend = model.rowptrs[iidx + 1]
Expand Down Expand Up @@ -56,7 +56,7 @@ def _predict_sum(model, nitems, nrange, ratings, targets):
min_nbrs, max_nbrs = nrange
scores = np.full(nitems, np.nan, dtype=np.float_)

for i in range(targets.shape[0]):
for i in prange(targets.shape[0]):
iidx = targets[i]
rptr = model.rowptrs[iidx]
rend = model.rowptrs[iidx + 1]
Expand Down Expand Up @@ -239,6 +239,7 @@ def _mkl_similarities(self, mkl, rmat):
vals = smat.values

rows, cols, vals = self._filter_similarities(rows, cols, vals)
del smat
nnz = len(rows)

_logger.info('[%s] making matrix symmetric (%d nnz)', self._timer, nnz)
Expand Down Expand Up @@ -282,6 +283,7 @@ def _select_similarities(self, nitems, rows, cols, vals):
_logger.debug('will have %d rows in size range [%d,%d]',
len(ncounts), np.min(ncounts), np.max(ncounts))
assert np.all(ncounts <= self.save_nbrs)
assert np.all(ncounts >= 0)
nnz = np.sum(ncounts)

rp2 = np.zeros_like(csr.rowptrs)
Expand All @@ -294,6 +296,8 @@ def _select_similarities(self, nitems, rows, cols, vals):

ep1 = sp1 + ncounts[i]
ep2 = sp2 + ncounts[i]
assert ep1 - sp1 == ep2 - sp2

ci2[sp2:ep2] = csr.colinds[sp1:ep1]
vs2[sp2:ep2] = csr.values[sp1:ep1]

Expand Down
7 changes: 4 additions & 3 deletions lenskit/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def row_nnzs(self):
return diff

def sort_values(self):
    """Sort CSR rows in nonincreasing order by value.

    Passes this matrix's row count together with its row-pointer,
    column-index and value arrays to ``_csr_sort``.  Nothing is returned;
    the arrays are handed over directly, so the sort presumably happens in
    place (confirm against ``_csr_sort``).
    """
    sort_args = (self.nrows, self.rowptrs, self.colinds, self.values)
    _csr_sort(*sort_args)

def transpose(self):
Expand Down Expand Up @@ -204,10 +205,10 @@ def csr_from_coo(rows, cols, vals, shape=None):

_csr_align(rows, nrows, rowptrs, align)

colinds = cols[align].copy()
values = vals[align].copy() if vals is not None else None
cols = cols[align].copy()
vals = vals[align].copy() if vals is not None else None

return CSR(nrows, ncols, nnz, rowptrs, colinds, values)
return CSR(nrows, ncols, nnz, rowptrs, cols, vals)


@njit
Expand Down
47 changes: 36 additions & 11 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np

import lk_test_utils as lktu
from pytest import approx, mark
from pytest import approx

simple_df = pd.DataFrame({'item': [1, 1, 2, 3],
'user': [10, 12, 10, 13],
Expand Down Expand Up @@ -117,28 +117,36 @@ def test_fallback_predict():
assert isinstance(bias, basic.Bias)
assert bias.mean_ == approx(lktu.ml_pandas.ratings.rating.mean())

def exp_val(user, item):
    """Return the prediction expected from the enclosing test's ``bias`` model.

    The result is ``bias.mean_`` plus the user's offset and the item's
    offset; passing ``None`` for either one leaves that offset out.
    """
    user_part = 0 if user is None else bias.user_offsets_.loc[user]
    item_part = 0 if item is None else bias.item_offsets_.loc[item]
    # Same left-to-right order as accumulating onto the mean.
    return bias.mean_ + user_part + item_part

# first user + item
preds = algo.predict_for_user(10, [1])
assert preds.loc[1] == 4.0
# second user + first item
preds = algo.predict_for_user(15, [1])
assert preds.loc[1] == approx(bias.mean_ + bias.user_offsets_.loc[15] + bias.item_offsets_.loc[1])
assert preds.loc[1] == approx(exp_val(15, 1))

# second item + user item
preds = algo.predict_for_user(12, [2])
assert preds.loc[2] == approx(bias.mean_ + bias.user_offsets_.loc[12] + bias.item_offsets_.loc[2])
assert preds.loc[2] == approx(exp_val(12, 2))

# blended
preds = algo.predict_for_user(10, [1, 5])
assert preds.loc[1] == 4.0
assert preds.loc[5] == approx(bias.mean_ + bias.user_offsets_.loc[10] + bias.item_offsets_.loc[5])
assert preds.loc[5] == approx(exp_val(10, 5))

# blended unknown
preds = algo.predict_for_user(10, [5, 1, -23081])
assert len(preds) == 3
assert preds.loc[1] == 4.0
assert preds.loc[5] == approx(bias.mean_ + bias.user_offsets_.loc[10] + bias.item_offsets_.loc[5])
assert preds.loc[-23081] == approx(bias.mean_ + bias.user_offsets_.loc[10])
assert preds.loc[5] == approx(exp_val(10, 5))
assert preds.loc[-23081] == approx(exp_val(10, None))


def test_fallback_save_load(tmp_path):
Expand All @@ -156,33 +164,42 @@ def test_fallback_save_load(tmp_path):
bias = algo.algorithms[1]
assert bias.mean_ == approx(lktu.ml_pandas.ratings.rating.mean())

def exp_val(user, item):
    """Return the prediction expected from the enclosing test's ``bias`` model.

    Starts from ``bias.mean_`` and adds the user offset and the item offset,
    skipping whichever of ``user`` / ``item`` is ``None``.
    """
    user_term = bias.user_offsets_.loc[user] if user is not None else 0
    item_term = bias.item_offsets_.loc[item] if item is not None else 0
    # Keep the mean-then-user-then-item addition order.
    return bias.mean_ + user_term + item_term

# first user + item
preds = algo.predict_for_user(10, [1])
assert preds.loc[1] == 4.0
# second user + first item
preds = algo.predict_for_user(15, [1])
assert preds.loc[1] == approx(bias.mean_ + bias.user_offsets_.loc[15] + bias.item_offsets_.loc[1])
assert preds.loc[1] == approx(exp_val(15, 1))

# second item + user item
preds = algo.predict_for_user(12, [2])
assert preds.loc[2] == approx(bias.mean_ + bias.user_offsets_.loc[12] + bias.item_offsets_.loc[2])
assert preds.loc[2] == approx(exp_val(12, 2))

# blended
preds = algo.predict_for_user(10, [1, 5])
assert preds.loc[1] == 4.0
assert preds.loc[5] == approx(bias.mean_ + bias.user_offsets_.loc[10] + bias.item_offsets_.loc[5])
assert preds.loc[5] == approx(exp_val(10, 5))

# blended unknown
preds = algo.predict_for_user(10, [5, 1, -23081])
assert len(preds) == 3
assert preds.loc[1] == 4.0
assert preds.loc[5] == approx(bias.mean_ + bias.user_offsets_.loc[10] + bias.item_offsets_.loc[5])
assert preds.loc[-23081] == approx(bias.mean_ + bias.user_offsets_.loc[10])
assert preds.loc[5] == approx(exp_val(10, 5))
assert preds.loc[-23081] == approx(exp_val(10, None))


def test_topn_recommend():
pred = basic.Memorized(simple_df)
rec = basic.TopN(pred)
rec.fit(simple_df)

rec10 = rec.recommend(10, candidates=[1, 2])
assert all(rec10.item == [2, 1])
Expand All @@ -199,6 +216,14 @@ def test_topn_recommend():
assert all(rec10.score == [5])


def test_topn_config():
    """The string form of a ``TopN`` recommender starts with ``TopN/``."""
    recommender = basic.TopN(basic.Memorized(simple_df))

    description = str(recommender)
    assert description.startswith('TopN/')


def test_popular():
algo = basic.Popular()
algo.fit(lktu.ml_pandas.renamed.ratings)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_batch_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import logging
from collections import namedtuple
from functools import partial
import pandas as pd
import numpy as np

Expand Down Expand Up @@ -104,6 +103,7 @@ def test_predict_include_rating(mlb):


@pytest.mark.skipif(not lktu.ml100k.available, reason='ML-100K required')
@pytest.mark.eval
@pytest.mark.parametrize('ncpus', [None, 2])
def test_bias_batch_predict(ncpus):
from lenskit.algorithms import basic
Expand Down

0 comments on commit 36bc4c4

Please sign in to comment.