In [1]:
from scipy.sparse import load_npz

# Load the pre-built user-item matrix from disk.
matrix_path = "/data/other/user_item_lastfm.npz"
user_item_matrix = load_npz(matrix_path)

In [2]:
# Show the matrix summary (shape, dtype, number of stored elements, storage format).
user_item_matrix

<359337x160163 sparse matrix of type '<class 'numpy.float16'>'
	with 17332977 stored elements in COOrdinate format>

In [3]:
import numpy as np

# Split the stored entries of the sparse matrix into a train part (80%)
# and a test part (the remaining 20%).
total_len = user_item_matrix.data.size
train_len = int(total_len * 0.8)
all_indices = np.arange(total_len)
# A fixed seed keeps the split reproducible between runs.
np.random.seed(42)
train_indices = np.random.choice(all_indices, train_len, replace=False)
# Flag the sampled positions directly. Because `all_indices` is simply
# 0..total_len-1, this yields exactly the mask that
# np.in1d(all_indices, train_indices) produced, without relying on the
# deprecated np.in1d and without a membership search over both arrays.
train_mask = np.zeros(total_len, dtype=bool)
train_mask[train_indices] = True

In [4]:
# Quick look at the index arrays, the boolean train mask and its length.
print(all_indices, train_indices, train_mask, len(train_mask), sep="\n")

[       0        1        2 ... 17332974 17332975 17332976]
[ 4343353 10204053  1135877 ...  3975992  7428974  3148521]
[ True False  True ... False  True  True]
17332977


In [5]:
from scipy.sparse import coo_matrix

def get_masked(arr, mask):
    """Build a float32 COO matrix from the entries of ``arr`` selected by ``mask``.

    Parameters
    ----------
    arr : COO-style sparse matrix
        Must expose ``data``, ``row``, ``col`` and ``shape``.
    mask : boolean array
        One flag per stored element of ``arr``; True keeps the element.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with the same shape as ``arr`` that holds only the selected
        entries, with values converted to float32.
    """
    # Vectorised cast instead of converting element by element in Python;
    # it also keeps the float32 dtype when the selection is empty.
    values = arr.data[mask].astype(np.float32)
    return coo_matrix((values, (arr.row[mask], arr.col[mask])), shape=arr.shape)

In [6]:
# Training part of the user-item matrix, converted to CSR.
train_csr = get_masked(user_item_matrix, train_mask).tocsr()
# Fitting needs the transposed (item x user) matrix.
train = train_csr.transpose()
# Held-out part: every stored entry that was not selected for training.
test_mask = np.logical_not(train_mask)
test_coo = get_masked(user_item_matrix, test_mask)
test_csr = test_coo.tocsr()

In [7]:
from implicit.als import AlternatingLeastSquares
import os

# The package author states that this setting makes training faster.
# NOTE(review): numpy has already been imported by this point; confirm that
# setting the variable this late still affects the BLAS thread pool.
os.environ.update(OPENBLAS_NUM_THREADS="1")
# Train with the same hyperparameters that were used in Spark.
als_params = {"factors": 10, "iterations": 10, "regularization": 0.1}
als = AlternatingLeastSquares(**als_params)

In [8]:
%%time

# trains faster than Spark
als.fit(train)

CPU times: user 1min 47s, sys: 2min 3s, total: 3min 51s
Wall time: 1min 1s


In [9]:
%%time

# train once more with the default parameters
# (this rebinds `als`, so later cells use the default-parameter model)
als = AlternatingLeastSquares()
als.fit(train)

CPU times: user 8min 14s, sys: 8min 38s, total: 16min 53s
Wall time: 4min 16s


In [10]:
import pickle

# File used to cache the precomputed recommendations.
pickle_filename = "/data/other/implicit_top50.pkl"
# Distinct row indices that have at least one stored entry in the test split.
users = {row_index for row_index in test_coo.row}

In [11]:
%%time

def get_recs(users, model, n=50, user_items=None):
    """Collect recommendations from ``model`` for every user in ``users``.

    Parameters
    ----------
    users : iterable
        User ids to produce recommendations for.
    model : object
        Must provide ``recommend(userid=..., user_items=..., N=...)``.
    n : int, optional
        Number of recommendations requested per user (50 by default).
    user_items : optional
        Matrix passed to ``model.recommend`` as ``user_items``. When omitted,
        the notebook-level ``train_csr`` is used.

    Returns
    -------
    dict
        Maps each user id to whatever ``model.recommend`` returned for it.
    """
    if user_items is None:
        # Default to the training matrix built earlier in the notebook.
        user_items = train_csr
    return {
        user: model.recommend(userid=user, user_items=user_items, N=n)
        for user in users
    }

# Compute 50 recommendations for every user from the test split.
recs = get_recs(model=als, users=users)
# Save the precomputed recommendations to disk.
with open(pickle_filename, mode="wb") as recs_file:
    pickle.dump(recs, recs_file)

CPU times: user 2h 21min 22s, sys: 43min 37s, total: 3h 5min
Wall time: 50min 1s


In [12]:
# Load the saved precomputed recommendations back from disk.
with open(pickle_filename, mode="rb") as recs_file:
    recs = pickle.load(recs_file)

In [13]:
%%time

def hitrate(k, recs, users, test_matrix=None):
    """Share of users with at least one held-out item among their top-``k`` recommendations.

    Parameters
    ----------
    k : int
        Number of leading recommendations considered per user.
    recs : mapping
        Maps a user id to a sequence of ``(item, score)`` pairs; an empty
        sequence counts as a miss.
    users : sized iterable
        User ids to evaluate; every id must be a key of ``recs``.
    test_matrix : scipy.sparse CSR matrix, optional
        Held-out interactions, one row per user. When omitted, the
        notebook-level ``test_csr`` is used.

    Returns
    -------
    float
        ``hits / len(users)``, or 0.0 when ``users`` is empty.
    """
    if test_matrix is None:
        test_matrix = test_csr
    if len(users) == 0:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    indptr, indices = test_matrix.indptr, test_matrix.indices
    hits = 0
    for user in users:
        user_recs = recs[user]
        if not user_recs:
            continue
        # Cut to the first k pairs before extracting the item ids.
        top_items = {item for item, _ in user_recs[:k]}
        # Column indices of the user's row, read straight from the CSR
        # arrays instead of building a one-row sub-matrix per user.
        held_out = indices[indptr[user]:indptr[user + 1]]
        hits += not top_items.isdisjoint(held_out)
    return hits / len(users)

# Report the hit rate at several cut-offs (one loop instead of repeated calls).
for k in (50, 20, 10, 5, 1):
    print("hitrate@{}:".format(k), hitrate(k, recs, users))

# compare with the baseline
# hitrate@100 = 0.9254293323537515
# hitrate@50 = 0.869095027787286
# hitrate@20 = 0.7547928546183666
# hitrate@10 = 0.6508709094805155
# hitrate@5 = 0.5333739637165112
# hitrate@1 = 0.21251638434116163

hitrate@50: 0.9155975673288814
hitrate@20: 0.8192852983866699
hitrate@10: 0.709606153554703
hitrate@5: 0.5763942866217018
hitrate@1: 0.2740878179786988
CPU times: user 2min 26s, sys: 0 ns, total: 2min 26s
Wall time: 2min 25s
