Merge pull request #27 from lenskit/feature/ii-sync
Make item-item CF match LensKit
mdekstrand committed Sep 12, 2018
2 parents 03bbbb5 + 7e0a56f commit d1aba29
Showing 6 changed files with 112 additions and 34 deletions.
36 changes: 23 additions & 13 deletions lenskit/algorithms/_item_knn.pyx
@@ -3,6 +3,7 @@ from cpython cimport array
import array
import pandas as pd
import numpy as np
from scipy import sparse as sps
cimport numpy as np
from numpy cimport math as npm
from cython.parallel cimport parallel, prange, threadid
@@ -20,7 +21,7 @@ ELSE:
cdef int omp_get_num_threads():
return 0

cdef _logger = logging.getLogger('_item_knn')
cdef _logger = logging.getLogger('lenskit._item_knn')

cdef class BuildContext:
cdef readonly int n_users
@@ -43,7 +44,8 @@ cdef class BuildContext:
self.n_users, self.n_items, matrix.nnz)

self.matrix = matrix
self.cscmat = matrix.tocsc(copy=False)
self.cscmat = matrix.tocsc()
assert sps.isspmatrix_csc(self.cscmat)

self.uptrs = matrix.indptr
self.items = matrix.indices
@@ -151,7 +153,7 @@ cdef void tr_add_nitems(ThreadState* self, int item, size_t nitems,
lku.ah_free(acc)


cdef dict tr_results(ThreadState* self):
cdef object tr_results(ThreadState* self):
cdef np.npy_intp size = self.size
cdef np.ndarray items, nbrs, sims
items = np.empty(size, dtype=np.int32)
@@ -163,7 +165,7 @@ cdef dict tr_results(ThreadState* self):
# items = np.PyArray_SimpleNewFromData(1, &size, np.NPY_INT32, self.items)
# nbrs = np.PyArray_SimpleNewFromData(1, &size, np.NPY_INT32, self.nbrs)
# sims = np.PyArray_SimpleNewFromData(1, &size, np.NPY_DOUBLE, self.sims)
return {'item': items, 'neighbor': nbrs, 'similarity': sims}
return pd.DataFrame({'item': items, 'neighbor': nbrs, 'similarity': sims})


cpdef sim_matrix(BuildContext context, double threshold, int nnbrs):
@@ -186,12 +188,15 @@ cpdef sim_matrix(BuildContext context, double threshold, int nnbrs):
_logger.debug('thread %d computed %d pairs', omp_get_thread_num(), tres.size)
if tres.size > 0:
neighborhoods.append(tr_results(tres))
_logger.debug('finished parallel item-item build, %d neighbors',
len(neighborhoods[-1]))
else:
_logger.debug('canceling with no neighbors')
tr_free(tres)
_logger.debug('finished parallel item-item build')

tres = NULL
_logger.debug('stacking %d neighborhood frames', len(neighborhoods))
return pd.concat([pd.DataFrame(d) for d in neighborhoods],
ignore_index=True)
return pd.concat(neighborhoods, ignore_index=True)


cdef void train_row(int item, ThreadState* tres, BuildContext context,
@@ -228,16 +233,19 @@ cdef void train_row(int item, ThreadState* tres, BuildContext context,
tr_add_all(tres, item, context.n_items, threshold)


cpdef void predict(matrix, int nitems, int min_nbrs, int max_nbrs,
np.float_t[:] ratings,
np.int64_t[:] targets,
np.float_t[:] scores):
cpdef predict(matrix, int nitems, int min_nbrs, int max_nbrs,
np.float_t[:] ratings,
np.int64_t[:] targets,
np.float_t[:] scores):
cdef int[:] indptr = matrix.indptr
cdef int[:] indices = matrix.indices
cdef double[:] similarity = matrix.data
cdef int i, j, iidx, rptr, rend, nidx, nnbrs
cdef double num, denom

assert ratings.shape[0] == nitems
assert scores.shape[0] == nitems

with nogil:
for i in range(targets.shape[0]):
iidx = targets[i]
@@ -261,6 +269,8 @@ cpdef void predict(matrix, int nitems, int min_nbrs, int max_nbrs,
break

if nnbrs < min_nbrs:
break
continue

scores[iidx] = num / denom

return None
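
The control-flow change above is behavioral, not cosmetic: an item with fewer than min_nbrs usable neighbors is now skipped with `continue`, where the old `break` abandoned scoring for every remaining target. A plain-Python sketch of the intended loop behavior (hypothetical inputs, not the Cython kernel itself):

    import numpy as np

    def score(targets, num, denom, nnbrs, min_nbrs, scores):
        for iidx in targets:
            if nnbrs[iidx] < min_nbrs:
                continue  # was `break`: one under-supported item ended all scoring
            scores[iidx] = num[iidx] / denom[iidx]

    scores = np.full(5, np.nan)
    score([0, 2, 4], num=[2.0] * 5, denom=[1.0] * 5,
          nnbrs=[3, 0, 1, 0, 5], min_nbrs=2, scores=scores)
    # items 0 and 4 are scored; item 2 is skipped instead of also killing item 4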
40 changes: 27 additions & 13 deletions lenskit/algorithms/item_knn.py
@@ -37,9 +37,11 @@ def __init__(self, nnbrs, min_nbrs=1, min_sim=1.0e-6, save_nbrs=None):
(``None`` for unlimited)
"""
self.max_neighbors = nnbrs
if self.max_neighbors is not None and self.max_neighbors < 0:
self.max_neighbors = 0
if self.max_neighbors is not None and self.max_neighbors < 1:
self.max_neighbors = -1
self.min_neighbors = min_nbrs
if self.min_neighbors is not None and self.min_neighbors < 1:
self.min_neighbors = 1
self.min_similarity = min_sim
self.save_neighbors = save_nbrs
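
The coerced bounds flow straight into the Cython predict kernel (see the call later in this file), with -1 serving as an "unlimited" sentinel for the neighbor cap. A quick hedged check of the new normalization (assuming the usual import path):

    from lenskit.algorithms import item_knn as knn

    algo = knn.ItemItem(0)           # non-positive nnbrs ...
    assert algo.max_neighbors == -1  # ... becomes the "unlimited" sentinel
    assert algo.min_neighbors == 1   # min_nbrs is clamped to at least 1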

@@ -68,8 +70,9 @@ def train(self, ratings):

def normalize(x):
xmc = x - x.mean()
if xmc.abs().sum() > 1.0e-10:
return xmc / np.linalg.norm(xmc)
norm = np.linalg.norm(xmc)
if norm > 1.0e-10:
return xmc / norm
else:
return xmc
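
The rewritten guard computes the norm once and tests the actual divisor; a mean-centered constant column is all zeros, and dividing by its (zero) norm would fill the column with NaNs. A small illustration with made-up values:

    import numpy as np
    import pandas as pd

    x = pd.Series([4.0, 4.0, 4.0])  # an item every user rated identically
    xmc = x - x.mean()              # mean-centering yields all zeros
    norm = np.linalg.norm(xmc)
    assert norm <= 1.0e-10          # the guard returns xmc unnormalized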

@@ -93,6 +96,7 @@ def _cy_matrix(self, ratings, uir, watch):
# the Cython implementation requires contiguous numeric IDs.
# so let's make those
rmat, user_idx, item_idx = matrix.sparse_ratings(uir)
assert rmat.nnz == len(uir)
n_items = len(item_idx)

context = accel.BuildContext(rmat)
@@ -156,7 +160,10 @@ def sim_row(irdf):
return neighborhoods

def predict(self, model, user, items, ratings=None):
_logger.debug('predicting %d items for user %s', len(items), user)
if ratings is None:
if user not in model.rating_matrix.index:
return pd.Series(np.nan, index=items)
ratings = model.rating_matrix.loc[user]

# set up rating array
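
With the new fallback, callers may omit `ratings` and the user's stored vector is pulled from `model.rating_matrix`; users absent from training get an all-NaN result instead of an error. Hypothetical calls (invented IDs, given `model = algo.train(...)`):

    preds = algo.predict(model, 42, [10, 20, 30])    # ratings looked up from the model
    cold = algo.predict(model, 99999, [10, 20, 30])  # user not seen in training
    assert cold.isna().all()                         # unknown users score as NaN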
@@ -167,6 +174,7 @@ def predict(self, model, user, items, ratings=None):
rate_v = np.full(len(model.items), np.nan, dtype=np.float_)
rate_v[ri_pos] = m_rates.values - model.means[ri_pos]
_logger.debug('user %s: %d of %d rated items in model', user, len(ri_pos), len(ratings))
assert np.sum(np.logical_not(np.isnan(rate_v))) == len(ri_pos)

# set up item result vector
# ipos will be an array of item indices
@@ -179,8 +187,7 @@

# now compute the predictions
accel.predict(model.sim_matrix, len(model.items),
self.min_neighbors if self.min_neighbors else 0,
self.max_neighbors if self.max_neighbors else -1,
self.min_neighbors, self.max_neighbors,
rate_v, i_pos, iscore)

nscored = np.sum(np.logical_not(np.isnan(iscore)))
@@ -198,29 +205,36 @@ def predict(self, model, user, items, ratings=None):
return results

def save_model(self, model, file):
_logger.info('saving I-I model to %s', file)
with pd.HDFStore(file, 'w') as hdf:
h5 = hdf._handle
group = h5.create_group('/', 'ii-model')
group = h5.create_group('/', 'ii_model')
h5.create_array(group, 'items', model.items.values)
h5.create_array(group, 'means', model.means)
_logger.debug('saving matrix with %d entries (%d nnz)',
model.sim_matrix.nnz, np.sum(model.sim_matrix.data != 0))
h5.create_array(group, 'col_ptrs', model.sim_matrix.indptr)
h5.create_array(group, 'row_nums', model.sim_matrix.indices)
h5.create_array(group, 'sim_values', model.sim_matrix.data)

hdf['ratings'] = model.rating_matrix

def load_model(self, file):
_logger.info('loading I-I model from %s', file)
with pd.HDFStore(file, 'r') as hdf:
ratings = hdf['ratings']
h5 = hdf._handle

items = h5.get_node('/ii-model', 'items').read()
items = h5.get_node('/ii_model', 'items').read()
items = pd.Index(items)
means = h5.get_node('/ii-model', 'means').read()

indptr = h5.get_node('/ii-model', 'col_ptrs').read()
indices = h5.get_node('/ii-model', 'row_nums').read()
values = h5.get_node('/ii-model', 'sim_values').read()
means = h5.get_node('/ii_model', 'means').read()

indptr = h5.get_node('/ii_model', 'col_ptrs').read()
indices = h5.get_node('/ii_model', 'row_nums').read()
values = h5.get_node('/ii_model', 'sim_values').read()
_logger.debug('loading matrix with %d entries (%d nnz)',
len(values), np.sum(values != 0))
assert np.all(values > self.min_similarity)

matrix = sps.csr_matrix((values, indices, indptr))

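The HDF5 group moves from 'ii-model' to 'ii_model'; hyphens are not valid Python identifiers, so the old name fell afoul of PyTables' natural-naming rules. A hedged peek at the new on-disk layout ('ii.mod' is a hypothetical file produced by save_model):

    import pandas as pd

    with pd.HDFStore('ii.mod', 'r') as hdf:
        ratings = hdf['ratings']  # the stored per-user rating matrix
        h5 = hdf._handle          # underlying PyTables file, as in load_model
        sims = h5.get_node('/ii_model', 'sim_values').read()
        print(len(sims), 'similarity values')
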
9 changes: 6 additions & 3 deletions lenskit/batch.py
@@ -49,9 +49,12 @@ def predict(algo, pairs, model=None):
else:
pfun = algo

ures = (pfun(user, udf.item).reset_index(name='prediction').assign(user=user)
for (user, udf) in pairs.groupby('user'))
res = pd.concat(ures).loc[:, ['user', 'item', 'prediction']]
def run(user, udf):
res = pfun(user, udf.item)
return pd.DataFrame({'user': user, 'item': res.index, 'prediction': res.values})

ures = (run(user, udf) for (user, udf) in pairs.groupby('user'))
res = pd.concat(ures)
if 'rating' in pairs:
return pairs.join(res.set_index(['user', 'item']), on=('user', 'item'))
return res
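
Building each per-user frame explicitly keeps `user`, `item`, and `prediction` as ordinary columns rather than relying on the index round-trip. A hypothetical call (assuming `import lenskit.batch as lkb` and a trained `algo`/`model` pair):

    import pandas as pd
    import lenskit.batch as lkb

    pairs = pd.DataFrame({'user': [1, 1, 2], 'item': [31, 64, 31]})
    preds = lkb.predict(algo, pairs, model)
    # one row per pair with user, item, and prediction columns; if `pairs`
    # carries a 'rating' column, the predictions are joined back onto it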
17 changes: 15 additions & 2 deletions tasks.py
@@ -10,7 +10,7 @@


@task
def build(c, cover=False):
def build(c, cover=False, openmp=None):
try:
if cover:
print('enabling coverage & profiling in Cython build')
@@ -23,6 +23,14 @@ def build(c, cover=False):
if 'COVERAGE' in os.environ:
del os.environ['COVERAGE']

if openmp is not None:
if not openmp:
os.environ['USE_OPENMP'] = 'no'
elif openmp is True:
os.environ['USE_OPENMP'] = 'yes'
else:
os.environ['USE_OPENMP'] = openmp

ldir = Path('build/lib.%s-%d.%d' % (du.get_platform(), *sys.version_info[:2]))
files = set()
for ext in importlib.machinery.EXTENSION_SUFFIXES:
@@ -32,7 +40,12 @@
path = pyd.relative_to(ldir)
if not path.exists() or pyd.stat().st_mtime > path.stat().st_mtime:
print('copying', pyd, '->', path)
shutil.copy2(str(pyd), str(path))
try:
shutil.copy2(str(pyd), str(path))
except PermissionError:
print(path, 'in use, renaming')
path.replace(str(path) + '.old.pyd')
shutil.copy2(str(pyd), str(path))
else:
print(path, 'is up to date')

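The new option simply exports USE_OPENMP before the extensions are built, so OpenMP can be toggled per build. A hedged sketch of driving it programmatically (roughly what `invoke build --openmp=no` does from the shell; direct Task invocation assumed):

    from invoke import Context
    from tasks import build

    # falsy values map to USE_OPENMP=no, True to yes,
    # and any other string is passed through verbatim
    build(Context(), openmp='no')
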
8 changes: 5 additions & 3 deletions tests/test_batch_predict.py
@@ -28,7 +28,8 @@ def test_predict_single(mlb):

assert len(res) == 1
assert all(res.user == 1)
assert list(res.columns) == ['user', 'item', 'prediction']
assert set(res.columns) == set(['user', 'item', 'prediction'])
assert all(res.item == 31)

expected = mlb.model.mean + mlb.model.items.loc[31] + mlb.model.users.loc[1]
assert res.prediction.iloc[0] == pytest.approx(expected)
@@ -40,7 +41,8 @@ def test_predict_single_model(mlb):

assert len(res) == 1
assert all(res.user == 1)
assert list(res.columns) == ['user', 'item', 'prediction']
assert set(res.columns) == set(['user', 'item', 'prediction'])
assert all(res.item == 31)

expected = mlb.model.mean + mlb.model.items.loc[31] + mlb.model.users.loc[1]
assert res.prediction.iloc[0] == pytest.approx(expected)
@@ -59,7 +61,7 @@ def test_predict_user(mlb):
res = lkb.predict(mlb.predictor, tf)

assert len(res) == 15
assert list(res.columns) == ['user', 'item', 'prediction']
assert set(res.columns) == set(['user', 'item', 'prediction'])
assert all(res.user == uid)
assert set(res.item) == set(test_items)

36 changes: 36 additions & 0 deletions tests/test_knn_item_item.py
@@ -141,6 +141,42 @@ def test_ii_train_big_unbounded():
assert means[model.items].values == approx(model.means)


@mark.slow
@mark.skipif(not lktu.ml100k.available, reason='ML100K data not present')
def test_ii_train_ml100k(tmpdir):
"Test an unbounded model on ML-100K"
ratings = lktu.ml100k.load_ratings()
algo = knn.ItemItem(30)
_log.info('training model')
model = algo.train(ratings)

_log.info('testing model')
assert model is not None

assert all(np.logical_not(np.isnan(model.sim_matrix.data)))
assert all(model.sim_matrix.data > 0)

# a little tolerance
assert all(model.sim_matrix.data < 1 + 1.0e-6)

assert model.counts.sum() == model.sim_matrix.nnz

means = ratings.groupby('item').rating.mean()
assert means[model.items].values == approx(model.means)

# save
fn = os.path.join(tmpdir, 'ii.mod')
_log.info('saving model to %s', fn)
algo.save_model(model, fn)
_log.info('reloading model')
restored = algo.load_model(fn)
assert restored is not None and restored is not model
assert all(restored.sim_matrix.data > 0)
assert all(restored.sim_matrix.indptr == model.sim_matrix.indptr)
assert all(restored.sim_matrix.indices == model.sim_matrix.indices)
assert all(restored.sim_matrix.data == model.sim_matrix.data)


@mark.slow
def test_ii_large_models():
"Several tests of large trained I-I models"
