In [1]:
from typing import Optional

import numpy as np
from lightfm import LightFM
from scipy.sparse import csr_matrix, hstack, eye, coo_matrix

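# scipy.sparse.random is similar, but not exactly the same: here every stored
# value is 1 and the same (row, col) coordinate may be drawn more than once.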
def get_sparse_matrix(
    row_count: int,
    col_count: int,
    density: float,
    random_state: Optional[int] = None
) -> coo_matrix:
    """Build a random sparse matrix whose stored values are all ones.

    ``int(row_count * col_count * density)`` coordinates are drawn
    independently and uniformly at random, so the same (row, col) pair can be
    drawn more than once. Duplicates are kept as separate COO entries (scipy
    sums them when the matrix is converted to CSR/CSC), hence the number of
    distinct non-zero cells can be slightly lower than requested.

    Parameters
    ----------
    row_count : int
        Number of rows of the resulting matrix.
    col_count : int
        Number of columns of the resulting matrix.
    density : float
        Requested fraction of stored entries, relative to
        ``row_count * col_count``.
    random_state : int, optional
        Seed for a private ``np.random.RandomState``. When ``None`` (the
        default) the global ``np.random`` state is used, exactly as before,
        so ``np.random.seed`` still controls the output.

    Returns
    -------
    coo_matrix
        Matrix of shape ``(row_count, col_count)`` with float ones as data.
    """
    # Fall back to the module-level generator so that existing callers
    # (and any global np.random.seed call) behave exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    data_length = int(row_count * col_count * density)
    # Rows are drawn before columns; keep this order so the stream of random
    # numbers consumed from the generator stays the same.
    rows = rng.randint(low=0, high=row_count, size=data_length)
    cols = rng.randint(low=0, high=col_count, size=data_length)
    data = np.ones(data_length)
    return coo_matrix(
        (data, (rows, cols)),
        shape=(row_count, col_count)
    )

In [2]:
%%time
# some kind of quite large model
model = LightFM()
user_count = 100000000
item_count = 100000
user_feature_count = 1000
item_feature_count = 1000
# we wil get recommendations by batches
# assuming #users >> #items
user_batch_size = 10000
user_item = get_sparse_matrix(user_count, item_count, 0.0001)
user_features = hstack([
    eye(user_count),
    get_sparse_matrix(user_count, user_feature_count, 0.001)
]).tocsr()
item_features = hstack([
    eye(item_count),
    get_sparse_matrix(item_count, item_feature_count, 0.001)
]).tocsr()
model.fit(
    interactions=user_item,
    user_features=user_features,
    item_features=item_features,
    sample_weight=user_item,
    verbose=True,
    num_threads=20
)

Epoch 0
CPU times: user 49min 29s, sys: 18.2 s, total: 49min 47s
Wall time: 4min 37s


In [3]:
%%timeit
# simple numpy multiplication
item_biases, item_embeddings = model.get_item_representations(item_features)
user_biases, user_embeddings = model.get_user_representations(user_features[:user_batch_size])
other_pred = (
        user_embeddings.dot(item_embeddings.T) +
        item_biases.reshape(1, -1) + user_biases.reshape(-1, 1)
)

4.04 s ± 22.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
# another way of getting recommendations
# for a batch of users
user_ids = np.repeat(np.arange(user_batch_size), item_count)
item_ids = np.tile(np.arange(item_count), user_batch_size)
pred = model.predict(
    user_ids=user_ids,
    item_ids=item_ids,
    item_features=item_features,
    user_features=user_features,
    num_threads=20
).reshape(user_batch_size, item_count)

18.9 s ± 223 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
# creating indices is already very slow
user_ids = np.repeat(np.arange(user_batch_size), item_count)
item_ids = np.tile(np.arange(item_count), user_batch_size)

6.6 s ± 42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Build the index arrays once, outside of any timing cell.
user_ids = np.arange(user_batch_size).repeat(item_count)
item_ids = np.tile(np.arange(item_count), reps=user_batch_size)

In [6]:
%%timeit
# and predict method is even slower
pred = model.predict(
    user_ids=user_ids,
    item_ids=item_ids,
    item_features=item_features,
    user_features=user_features,
    num_threads=20
).reshape(user_batch_size, item_count)

12.4 s ± 318 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
