Merge pull request #35 from lenskit/feature/implicit-als
Implement Implicit ALS
mdekstrand committed Oct 14, 2018
2 parents 3ceef08 + 5231221 commit 159afe4
Showing 12 changed files with 444 additions and 87 deletions.
6 changes: 6 additions & 0 deletions doc/mf.rst
@@ -14,6 +14,9 @@ Common Support
The :py:mod:`mf_common` module contains common support code for matrix factorization
algorithms.

.. autoclass:: MFModel
:members:

.. autoclass:: BiasMFModel
:members:

@@ -29,6 +32,9 @@ best with the MKL from Conda.
.. autoclass:: BiasedMF
:members:

.. autoclass:: ImplicitMF
:members:

FunkSVD
-------

195 changes: 145 additions & 50 deletions lenskit/algorithms/als.py
@@ -1,13 +1,12 @@
import logging
from collections import namedtuple
from copy import copy

import pandas as pd
import numpy as np
from numba import njit, jitclass, prange, float64, int32, int64

from . import basic
from . import Predictor, Trainable
from .mf_common import BiasMFModel
from .mf_common import BiasMFModel, MFModel
from ..matrix import sparse_ratings
from .. import util

@@ -42,6 +41,8 @@ def __init__(self, nr, nf, ps, cs, vs):

@njit(parallel=True, nogil=True)
def _train_matrix(ctx: _Ctx, other: np.ndarray, reg: float):
"One half of an explicit ALS training round."
assert other.shape[1] == ctx.n_features
result = np.zeros((ctx.n_rows, ctx.n_features))
for i in prange(ctx.n_rows):
sp = ctx.ptrs[i]
@@ -64,6 +65,47 @@ def _train_matrix(ctx: _Ctx, other: np.ndarray, reg: float):
return result
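
[Editor's note: the jitted loop body of _train_matrix is collapsed in this view. As a hedged reference, here is a plain-NumPy sketch of the per-row update that explicit ALS performs: a regularized least-squares solve against the fixed factor matrix, done independently for each row. Names are illustrative and any scaling details in the committed numba loop may differ.]

import numpy as np

def explicit_row_update(other, cols, vals, reg):
    # solve (M^T M + reg * I) x = M^T r for one row, where M holds the
    # rows of the fixed factor matrix at this row's observed columns
    M = other[cols, :]
    A = M.T @ M + reg * np.identity(other.shape[1])
    return np.linalg.solve(A, M.T @ vals)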


@njit(parallel=True, nogil=True)
def _train_implicit_matrix(ctx: _Ctx, other: np.ndarray, reg: float):
"One half of an implicit ALS training round."
n_items = other.shape[0]
assert other.shape[1] == ctx.n_features
OtO = other.T @ other
assert OtO.shape[0] == OtO.shape[1]
assert OtO.shape[0] == ctx.n_features
result = np.zeros((ctx.n_rows, ctx.n_features))
for i in prange(ctx.n_rows):
sp = ctx.ptrs[i]
ep = ctx.ptrs[i+1]
if sp == ep:
continue

cols = ctx.cols[sp:ep]
rates = ctx.vals[sp:ep]
        # we can optimize by only considering the nonzero entries of Cu-I,
        # which means we only need the rows of 'other' for this row's items
M = other[cols, :]
# Compute M^T C_u M, restricted to these nonzero entries
MMT = (M.T.copy() * rates) @ M
# assert MMT.shape[0] == ctx.n_features
# assert MMT.shape[1] == ctx.n_features
# Build and invert the matrix
A = OtO + MMT + np.identity(ctx.n_features) * reg
Ainv = np.linalg.inv(A)
# And now we can compute the final piece of the update rule
AiYt = Ainv @ other.T
cu = np.ones(n_items)
cu[cols] = rates + 1.0
AiYtCu = AiYt * cu
pu = np.zeros(n_items)
pu[cols] = 1.0
uv = AiYtCu @ pu
# assert len(uv) == ctx.n_features
result[i, :] = uv

return result
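
[Editor's note: this is the update rule from Hu, Koren, and Volinsky: with confidence c_ui = 1 + α·r_ui and preference p_ui = 1 on observed pairs, each row solves x_u = (YᵀC_uY + λI)⁻¹ YᵀC_u p_u. Because C_u − I is nonzero only at the rated items, YᵀC_uY reduces to YᵀY plus a small correction over those items, which is exactly what the loop above exploits. A hedged single-row sketch in plain NumPy follows; names are illustrative, and unlike the committed loop it calls solve rather than forming the explicit inverse, though the result is the same.]

import numpy as np

def implicit_row_update(Y, cols, rates, reg):
    # Y: fixed factor matrix; cols: this row's observed items;
    # rates: confidence offsets alpha * r_ui for those items
    nf = Y.shape[1]
    YtY = Y.T @ Y                     # OtO above; shared across all rows
    M = Y[cols, :]
    # Y^T C_u Y = Y^T Y + M^T diag(rates) M
    A = YtY + (M.T * rates) @ M + reg * np.identity(nf)
    # Y^T C_u p_u is nonzero only at the observed items
    b = M.T @ (rates + 1.0)
    return np.linalg.solve(A, b)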


class BiasedMF(Predictor, Trainable):
"""
Biased matrix factorization trained with alternating least squares [ZWSP2008]_. This is a
@@ -106,7 +148,6 @@ def train(self, ratings, bias=None):
_logger.info('[%s] training biased MF model with ALS for %d features',
self.timer, self.features)
for epoch, model in enumerate(self._train_iters(current, uctx, ictx)):

current = model

_logger.info('trained model in %s', self.timer)
@@ -115,13 +156,13 @@

def _initial_model(self, ratings, bias=None):
"Initialize a model and build contexts."
gbias, ubias, ibias = self._get_bias(bias, ratings)
bias = self._get_bias(bias, ratings)
rmat, users, items = sparse_ratings(ratings)
n_users = len(users)
n_items = len(items)

rmat, ubias, ibias = self._normalize(rmat, users, items, gbias, ubias, ibias)
assert len(ubias) == n_users and len(ibias) == n_items
rmat, bias = self._normalize(rmat, users, items, bias)
assert len(bias.users) == n_users and len(bias.items) == n_items

_logger.debug('setting up contexts')
uctx = _Ctx(n_users, self.features,
@@ -132,28 +173,23 @@ def _initial_model(self, ratings, bias=None):

_logger.debug('initializing item matrix')
imat = np.random.randn(n_items, self.features) * 0.01
umat = np.full((n_users, self.features), np.nan)

return BiasMFModel(users, items, gbias, ubias, ibias, None, imat), uctx, ictx
return BiasMFModel(users, items, bias, umat, imat), uctx, ictx

def _get_bias(self, bias, ratings):
"Extract or construct bias terms for the model."
"Ensure we have a suitable set of bias terms for the model."
if bias is None:
_logger.info('[%s] training bias model', self.timer)
bias = basic.Bias(damping=self.damping).train(ratings)
# unpack the bias
if isinstance(bias, basic.BiasModel):
gbias = bias.mean
ibias = bias.items
ubias = bias.users
return bias
else:
# we have a single global bias (for e.g. implicit feedback data)
gbias = bias
ibias = None
ubias = None
# we have a single global bias
return basic.BiasModel(bias, None, None)

return gbias, ubias, ibias

def _normalize(self, ratings, users, items, gbias, ubias, ibias):
def _normalize(self, ratings, users, items, bias):
"Apply bias normalization to the data in preparation for training."
n_users = len(users)
n_items = len(items)
@@ -163,12 +199,14 @@ def _normalize(self, ratings, users, items, gbias, ubias, ibias):

_logger.info('[%s] normalizing %dx%d matrix (%d nnz)',
self.timer, n_users, n_items, ratings.nnz)
ratings.data = ratings.data - gbias
ratings.data = ratings.data - bias.mean
ibias = bias.items
if ibias is not None:
ibias = ibias.reindex(items)
ibias = ibias.reindex(items, fill_value=0)
ratings.data = ratings.data - ibias.values[ratings.indices]
ubias = bias.users
if ubias is not None:
ubias = ubias.reindex(users)
ubias = ubias.reindex(users, fill_value=0)
# create a user index array the size of the data
reps = np.repeat(np.arange(len(users), dtype=np.int32),
np.diff(ratings.indptr))
@@ -177,7 +215,7 @@ def _normalize(self, ratings, users, items, gbias, ubias, ibias):
ratings.data = ratings.data - ubias.values[reps]
del reps

return ratings, ubias, ibias
return ratings, basic.BiasModel(bias.mean, ibias, ubias)
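
[Editor's note: the fill_value=0 added to both reindex calls above is load-bearing. A plain reindex fills missing index entries with NaN, which would propagate into ratings.data for any user or item absent from the bias series. A small pandas illustration with hypothetical values:]

import pandas as pd

ibias = pd.Series([0.5], index=['a'])
ibias.reindex(['a', 'b'])                # 'b' becomes NaN, poisoning the data
ibias.reindex(['a', 'b'], fill_value=0)  # 'b' becomes 0.0, a harmless offset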

def _train_iters(self, current, uctx, ictx):
"Generator of training iterations."
@@ -187,38 +225,95 @@ def _train_iters(self, current, uctx, ictx):
imat = _train_matrix(ictx, umat, self.regularization)
_logger.debug('[%s] finished item epoch %d', self.timer, epoch)
di = np.linalg.norm(imat - current.item_features, 'fro')
if current.user_features is not None:
du = np.linalg.norm(umat - current.user_features, 'fro')
else:
du = np.nan
du = np.linalg.norm(umat - current.user_features, 'fro')
_logger.info('[%s] finished epoch %d (|ΔI|=%.3f, |ΔU|=%.3f)', self.timer, epoch, di, du)
current = BiasMFModel(current.user_index, current.item_index,
current.global_bias, current.user_bias, current.item_bias,
umat, imat)
            # shallow-copy the model and swap in the updated factor matrices,
            # so models yielded from earlier epochs are left untouched
            current = copy(current)
            current.user_features = umat
            current.item_features = imat
yield current

def predict(self, model, user, items, ratings=None):
def predict(self, model: BiasMFModel, user, items, ratings=None):
# look up user index
uidx = model.lookup_user(user)
if uidx < 0:
_logger.debug('user %s not in model', user)
return pd.Series(np.nan, index=items)

# get item index & limit to valid ones
items = np.array(items)
iidx = model.lookup_items(items)
good = iidx >= 0
good_items = items[good]
good_iidx = iidx[good]

# multiply
_logger.debug('scoring %d items for user %s', len(good_items), user)
rv = model.score(uidx, good_iidx)

res = pd.Series(rv, index=good_items)
res = res.reindex(items)
return res
return model.score_by_ids(user, items)
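
[Editor's note: the index lookup, filtering, and reindexing deleted above now live on the model. A sketch of what score_by_ids presumably does, reconstructed from those removed lines — hypothetical, since the actual method is in mf_common, which this excerpt does not show:]

import numpy as np
import pandas as pd

def score_by_ids(self, user, items):
    uidx = self.lookup_user(user)
    if uidx < 0:
        return pd.Series(np.nan, index=items)
    items = np.array(items)
    iidx = self.lookup_items(items)
    good = iidx >= 0
    res = pd.Series(self.score(uidx, iidx[good]), index=items[good])
    return res.reindex(items)           # unscorable items come back as NaN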

def __str__(self):
return 'als.BiasedMF(features={}, regularization={})'.\
format(self.features, self.regularization)


class ImplicitMF(Predictor, Trainable):
"""
    Implicit matrix factorization trained with alternating least squares [HKV2008]_. This
    algorithm outputs 'predictions', but they are not on a meaningful scale. If its input
    data contains ``rating`` values, these will be used as the 'confidence' values; otherwise,
    confidence will be 1 for every rated item.

    .. [HKV2008] Y. Hu, Y. Koren, and C. Volinsky. 2008.
        Collaborative Filtering for Implicit Feedback Datasets.
        In *Proceedings of the 2008 Eighth IEEE International Conference on Data Mining*, 263–272.
        DOI `10.1109/ICDM.2008.22 <http://dx.doi.org/10.1109/ICDM.2008.22>`_

    Args:
        features(int): the number of features to train
        iterations(int): the number of iterations to train
        reg(double): the regularization factor
        weight(double): the scaling weight for positive samples (:math:`\\alpha` in [HKV2008]_).
"""
timer = None

def __init__(self, features, iterations=20, reg=0.1, weight=40):
self.features = features
self.iterations = iterations
self.regularization = reg
self.weight = weight

def train(self, ratings):
self.timer = util.Stopwatch()
current, uctx, ictx = self._initial_model(ratings)

for model in self._train_iters(current, uctx, ictx):
current = model

_logger.info('[%s] finished training model with %d features',
self.timer, current.n_features)

return current

def _train_iters(self, current, uctx, ictx):
"Generator of training iterations."
for epoch in range(self.iterations):
umat = _train_implicit_matrix(uctx, current.item_features, self.regularization)
_logger.debug('[%s] finished user epoch %d', self.timer, epoch)
imat = _train_implicit_matrix(ictx, umat, self.regularization)
_logger.debug('[%s] finished item epoch %d', self.timer, epoch)
di = np.linalg.norm(imat - current.item_features, 'fro')
du = np.linalg.norm(umat - current.user_features, 'fro')
_logger.info('[%s] finished epoch %d (|ΔI|=%.3f, |ΔU|=%.3f)', self.timer, epoch, di, du)
current = copy(current)
current.user_features = umat
current.item_features = imat
yield current

def _initial_model(self, ratings):
"Initialize a model and build contexts."

rmat, users, items = sparse_ratings(ratings)
n_users = len(users)
n_items = len(items)

_logger.debug('setting up contexts')
        # scale rating values by the confidence weight (α in the paper)
        uctx = _Ctx(n_users, self.features,
                    rmat.indptr, rmat.indices, rmat.data * self.weight)
        trmat = rmat.tocsc()
        ictx = _Ctx(n_items, self.features,
                    trmat.indptr, trmat.indices, trmat.data * self.weight)

        # square the small Gaussian draws so the initial item features
        # are nonnegative
        imat = np.random.randn(n_items, self.features) * 0.01
        imat = np.square(imat)
umat = np.full((n_users, self.features), np.nan)

return MFModel(users, items, umat, imat), uctx, ictx

def predict(self, model: MFModel, user, items, ratings=None):
# look up user index
return model.score_by_ids(user, items)
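
[Editor's note: a hypothetical usage sketch based on the constructor above; the ratings frame (user/item pairs, with an optional rating column used as confidence) is assumed rather than shown:]

from lenskit.algorithms.als import ImplicitMF

algo = ImplicitMF(50, iterations=20, reg=0.1, weight=40)
model = algo.train(ratings)
scores = algo.predict(model, 42, [1, 2, 3])   # scores, not rating predictions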
19 changes: 6 additions & 13 deletions lenskit/algorithms/funksvd.py
@@ -225,15 +225,8 @@ def train(self, ratings, bias=None):
_logger.info('[%s] training bias model', timer)
bias = basic.Bias(damping=self.damping).train(ratings)
# unpack the bias
if isinstance(bias, basic.BiasModel):
gbias = bias.mean
ibias = bias.items
ubias = bias.users
else:
# we have a single global bias (for e.g. implicit feedback data)
gbias = bias
ibias = None
ubias = None
if not isinstance(bias, basic.BiasModel):
bias = basic.BiasModel(bias, None, None)

_logger.info('[%s] preparing rating data for %d samples', timer, len(ratings))
_logger.debug('shuffling rating data')
@@ -251,9 +244,9 @@
assert np.all(items >= 0)

_logger.debug('[%s] computing initial estimates', timer)
initial = pd.Series(gbias, index=ratings.index, dtype=np.float_)
ibias, initial = _align_add_bias(ibias, iidx, ratings.item, initial)
ubias, initial = _align_add_bias(ubias, uidx, ratings.user, initial)
initial = pd.Series(bias.mean, index=ratings.index, dtype=np.float_)
ibias, initial = _align_add_bias(bias.items, iidx, ratings.item, initial)
ubias, initial = _align_add_bias(bias.users, uidx, ratings.user, initial)

_logger.debug('have %d estimates for %d ratings', len(initial), len(ratings))
assert len(initial) == len(ratings)
@@ -269,7 +262,7 @@
train(context, params, model, timer)
_logger.info('finished model training in %s', timer)

return BiasMFModel(uidx, iidx, gbias, ubias, ibias,
return BiasMFModel(uidx, iidx, basic.BiasModel(bias.mean, ibias, ubias),
model.user_features, model.item_features)

def predict(self, model, user, items, ratings=None):
4 changes: 1 addition & 3 deletions lenskit/algorithms/item_knn.py
@@ -5,13 +5,11 @@
from collections import namedtuple
import logging

import ctypes
import pandas as pd
import numpy as np
import scipy.sparse as sps
import scipy.sparse.linalg as spla
import numba as n
from numba import njit, jitclass
from numba import njit

from lenskit import util, matrix
from . import Trainable, Predictor
