In [1]:
# 1. magic for inline plot
# 3. magic so that the notebook will reload external python modules
%matplotlib inline
%load_ext autoreload
!pip install implicit hnswlib > install.txt

import os
import time
import hnswlib
import numpy as np
import pandas as pd
from subprocess import call
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking

In [2]:
users_col = 'user_id'
items_col = 'category'
value_col = 'total_rating_mean'
time_col = 'weekday_max'

In [3]:
Train = pd.read_parquet('/kaggle/input/next-orders/Train (1).parquet')

In [4]:
def train_test_user_time_split(df: pd.DataFrame, test_size: float=0.2):
    train_size = 1 - test_size

    df_train_user = []
    df_test_user = []
    df_grouped = df.sort_values(time_col).groupby(users_col)
    for name, df_group in df_grouped:
        n_train = int(df_group.shape[0] * train_size)
        df_group_train = df_group.iloc[:n_train]
        df_group_test = df_group.iloc[n_train:]
        df_train_user.append(df_group_train)
        df_test_user.append(df_group_test)

    df_train = pd.concat(df_train_user, ignore_index=True)
    df_test = pd.concat(df_test_user, ignore_index=True)
    return df_train, df_test

In [5]:
test_size = 0.2
df_train, df_test = train_test_user_time_split(Train, test_size)

print('train size: ', df_train.shape[0])
print('test size: ', df_test.shape[0])

train size:  7868843
test size:  1970091


In [6]:
n_users = Train[users_col].shape[0]
n_items = Train[items_col].shape[0]
rows = df_train[items_col].values
cols = df_train[users_col].values
values = df_train[value_col].astype(np.float32)
item_user = csr_matrix((values, (rows, cols)), shape=(n_items, n_users))
item_user

<9838934x9838934 sparse matrix of type '<class 'numpy.float32'>'
	with 7868843 stored elements in Compressed Sparse Row format>

In [7]:
bpr = BayesianPersonalizedRanking()
bpr.fit(item_user)

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
import tqdm
def recommend_all(query_factors, index_factors):
    num_queries = query_factors.shape[0]
    labels = []
    progress_bar = tqdm(total=num_queries, desc='Recommendations')
    for i in range(num_queries):
        query = query_factors[i]
        distances_i = np.dot(index_factors, query)
        top_index = np.argmin(distances_i)
        labels.append(top_index)
        progress_bar.update(1)

    progress_bar.close()

    return labels

# labels = recommend_all(bpr.user_factors, bpr.item_factors)
# print(labels)


In [None]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

max_norm, augmented_index_factors = augment_inner_product(bpr.item_factors)

from tqdm import tqdm

def build_hnsw(factors, space, ef_construction, M):
    # Declaring index
    max_elements, dim = factors.shape
    hnsw = hnswlib.Index(space, dim) # possible options for space are l2, cosine or ip

    # Initing index - the maximum number of elements should be known beforehand
    hnsw.init_index(max_elements, M, ef_construction)

    # Element insertion (can be called several times)
    with tqdm(total=max_elements, desc="Building HNSW Index") as pbar:
        for i in range(max_elements):
            hnsw.add_items(factors[i:i+1])
            pbar.update(1)

    return hnsw


space = 'ip'
ef_construction = 400
M = 24

start = time.time()
hnsw = build_hnsw(augmented_index_factors, space, ef_construction, M)
build_time = time.time() - start

extra_zero = np.zeros((bpr.user_factors.shape[0], 1))
augmented_query_factors = np.append(bpr.user_factors, extra_zero, axis=1)

k = 1

# Controlling the recall by setting ef, should always be > k
hnsw.set_ef(70)

# retrieve the top-n search neighbors
label, distance = hnsw.knn_query(augmented_query_factors, k=k)
print(label)