Commit a1a6fb6
Merge pull request #28 from lenskit/feature/ii-fast-norm
Use SciPy for accelerated item-item normalization
mdekstrand committed Sep 20, 2018
2 parents 2dc0e97 + 42b16ea
Showing 2 changed files with 33 additions and 23 deletions.
lenskit/algorithms/_item_knn.pyx (10 additions, 1 deletion)
@@ -9,6 +9,7 @@ from numpy cimport math as npm
 from cython.parallel cimport parallel, prange, threadid
 from libc.stdlib cimport malloc, free, realloc, abort, calloc
 from libc.math cimport isnan, fabs
+import time
 import logging
 
 from lenskit cimport _cy_util as lku
@@ -172,6 +173,8 @@ cpdef sim_matrix(BuildContext context, double threshold, int nnbrs):
     cdef int i
     cdef ThreadState* tres
     cdef list neighborhoods = []
+    cdef double start = time.perf_counter()
+    cdef double now
 
     with nogil, parallel():
         tres = tr_new(context.n_items)
@@ -181,8 +184,14 @@
                           omp_get_thread_num(), omp_get_num_threads(),
                           <unsigned long> tres)
 
-        for i in prange(context.n_items, schedule='dynamic', chunksize=10):
+        for i in prange(context.n_items, schedule='dynamic', chunksize=100):
             train_row(i, tres, context, threshold, nnbrs)
+            if i % 1000 == 999:
+                with gil:
+                    now = time.perf_counter()
+                    _logger.info('finished %d of %d rows in %.2fs (eta: %.2fs)',
+                                 i+1, context.n_items, now-start,
+                                 (context.n_items - i) * (now - start) / i)
 
         with gil:
             _logger.debug('thread %d computed %d pairs', omp_get_thread_num(), tres.size)
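The new logging in sim_matrix estimates time to completion from throughput so far: elapsed time divided by rows finished, scaled by rows remaining. A minimal pure-Python sketch of the same ETA arithmetic (run_with_progress and its train_row argument are illustrative stand-ins; the real loop is a Cython prange that briefly reacquires the GIL to log):

import time

def run_with_progress(n_items, train_row, log_every=1000):
    """Sketch of the progress/ETA logic added to sim_matrix above."""
    start = time.perf_counter()
    for i in range(n_items):
        train_row(i)
        if i % log_every == log_every - 1:
            now = time.perf_counter()
            elapsed = now - start
            # remaining rows times average time per finished row,
            # mirroring the commit's (n_items - i) * elapsed / i estimate
            eta = (n_items - i) * elapsed / i
            print('finished %d of %d rows in %.2fs (eta: %.2fs)'
                  % (i + 1, n_items, elapsed, eta))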
lenskit/algorithms/item_knn.py (23 additions, 22 deletions)
@@ -8,6 +8,7 @@
 import pandas as pd
 import numpy as np
 import scipy.sparse as sps
+import scipy.sparse.linalg as spla
 
 from lenskit import util, matrix
 from . import _item_knn as accel
@@ -63,42 +64,42 @@ def train(self, ratings):
         # 1. Normalize item vectors to be mean-centered and unit-normalized
         # 2. Compute similarities with pairwise dot products
         watch = util.Stopwatch()
 
         item_means = ratings.groupby('item').rating.mean()
         _logger.info('[%s] computed means for %d items', watch, len(item_means))
 
+        _logger.info('[%s] normalizing user-item ratings', watch)
+        rmat, users, items = matrix.sparse_ratings(ratings)
+        item_means = item_means.reindex(items).values
+        _logger.info('[%s] made sparse matrix for %d items (%d ratings)',
+                     watch, len(items), rmat.nnz)
 
-        def normalize(x):
-            xmc = x - x.mean()
-            norm = np.linalg.norm(xmc)
-            if norm > 1.0e-10:
-                return xmc / norm
-            else:
-                return xmc
+        # stupid trick: indices are items, look up means, subtract!
+        rmat.data = rmat.data - item_means[rmat.indices]
+        m2 = rmat.mean(axis=0)
+        _logger.info('min mean: %f, max mean: %f', m2.A1.min(), m2.A1.max())
 
-        uir = ratings.set_index(['item', 'user']).rating
-        uir = uir.groupby('item').transform(normalize)
-        uir = uir.reset_index()
-        assert uir.rating.notna().all()
+        # now we have normalized vectors
+        # compute column norms
+        norms = spla.norm(rmat, 2, axis=0)
+        # and multiply by a diagonal to normalize columns
+        norm_mat = rmat @ sps.diags(np.reciprocal(norms))
+        # and reset NaN
+        norm_mat.data[np.isnan(norm_mat.data)] = 0
         _logger.info('[%s] normalized user-item ratings', watch)
 
         _logger.info('[%s] computing similarity matrix', watch)
-        sim_matrix, items = self._cy_matrix(ratings, uir, watch)
-        item_means = item_means.reindex(items)
+        sim_matrix = self._cy_matrix(norm_mat, watch)
 
         _logger.info('[%s] computed %d neighbor pairs', watch, sim_matrix.nnz)
 
-        return IIModel(items, item_means.values, np.diff(sim_matrix.indptr),
+        return IIModel(items, item_means, np.diff(sim_matrix.indptr),
                        sim_matrix, ratings.set_index(['user', 'item']).rating)
 
-    def _cy_matrix(self, ratings, uir, watch):
+    def _cy_matrix(self, rmat, watch):
         _logger.debug('[%s] preparing Cython data launch', watch)
-        # the Cython implementation requires contiguous numeric IDs.
-        # so let's make those
-        rmat, user_idx, item_idx = matrix.sparse_ratings(uir)
-        assert rmat.nnz == len(uir)
-        n_items = len(item_idx)
 
+        n_items = rmat.shape[1]
         context = accel.BuildContext(rmat)
 
         _logger.debug('[%s] running accelerated matrix computations', watch)
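The train() rewrite above replaces a per-item pandas transform with vectorized sparse-matrix operations: subtract each item's mean from its stored ratings, then scale each column to unit length with a diagonal matrix. A self-contained sketch of the same pipeline on a toy matrix (the bincount-based means are an illustrative stand-in for the groupby in the actual code):

import numpy as np
import scipy.sparse as sps
import scipy.sparse.linalg as spla

# toy user-by-item rating matrix in CSR form: rows are users, columns items
rmat = sps.csr_matrix(np.array([[4.0, 0.0, 5.0],
                                [2.0, 3.0, 0.0],
                                [0.0, 5.0, 1.0]]))

# per-item means over the stored ratings only (stand-in for the groupby)
counts = np.bincount(rmat.indices, minlength=rmat.shape[1])
sums = np.bincount(rmat.indices, weights=rmat.data, minlength=rmat.shape[1])
item_means = sums / counts

# the "stupid trick": in CSR, rmat.indices holds the column (item) of each
# stored value, so fancy indexing subtracts the right mean from each rating
rmat.data = rmat.data - item_means[rmat.indices]

# unit-normalize columns: L2 norm per column, then multiply by a diagonal
# of reciprocals so each column is divided by its own norm
norms = spla.norm(rmat, 2, axis=0)
norm_mat = rmat @ sps.diags(np.reciprocal(norms))

# a column whose centered ratings are all zero has norm 0, and its stored
# zeros times the infinite reciprocal come out NaN; reset those entries
norm_mat.data[np.isnan(norm_mat.data)] = 0

With mean-centered, unit-normed columns, the pairwise dot products computed by the Cython kernel are cosine similarities, which is what the comment at the top of train() describes.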
@@ -120,9 +121,9 @@ def _cy_matrix(self, ratings, uir, watch):
             smat.indices[start:end] = tmp
             tmp = smat.data[sorti[::-1] + start]
             smat.data[start:end] = tmp
+        _logger.info('[%s] sorted neighborhoods', watch)
 
-        # clean up neighborhoods
-        return smat, item_idx
+        return smat
 
     def _py_matrix(self, ratings, uir, watch):
         _logger.info('[%s] computing item-item similarities for %d items with %d ratings',
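The context lines in the final hunk reorder each item's row of the similarity matrix so the most similar neighbors come first. A sketch of that per-row CSR sort as a hypothetical standalone helper (the repository does the same thing inline over smat.indptr with a temporary buffer):

import numpy as np

def sort_neighborhoods(smat):
    """Reorder each CSR row of smat so its data is in descending order."""
    for i in range(smat.shape[0]):
        start, end = smat.indptr[i], smat.indptr[i + 1]
        sorti = np.argsort(smat.data[start:end])   # ascending positions
        order = sorti[::-1]                        # descending similarity
        smat.indices[start:end] = smat.indices[start:end][order]
        smat.data[start:end] = smat.data[start:end][order]
    return smat

With rows sorted this way, the first k stored entries of a row are that item's top-k neighbors, so truncating a neighborhood is a simple slice.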
