In [1]:
import pandas as pd
import numpy as np
import json
from scipy import sparse as sp
from tqdm.notebook import tqdm
from collections import defaultdict

In [2]:
import sys
sys.path.append('../')

from src.utils import get_shard_path, ProductEncoder, make_coo_row
from src.metrics import normalized_average_precision

In [3]:
# Encoder built from the product catalogue CSV. Below it is passed to make_coo_row
# to encode transaction histories, and its toPid method converts ranked column
# indices into the product ids that are scored against the target.
product_encoder = ProductEncoder('../data/raw/products.csv')

In [4]:
# Build one sparse row per record of training shards 0-3 from its transaction history.
rows = []
for shard_id in range(4):
    # The context manager closes each shard file as soon as it has been consumed;
    # a bare open() inside the generator left all four handles open.
    with open(get_shard_path(shard_id)) as shard_file:
        for line in tqdm(shard_file):
            js = json.loads(line)
            rows.append(make_coo_row(js["transaction_history"], product_encoder))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [5]:
# Stack the per-record rows vertically into a single sparse matrix
# (one matrix row per entry of `rows`).
X_sparse = sp.vstack(rows)

In [6]:
# Sanity check: (number of stacked rows, number of columns).
# Presumably columns correspond to encoded products - TODO confirm against make_coo_row.
X_sparse.shape

(92161, 43038)

In [7]:
# Keep a CSR copy: the evaluation loops below select whole rows by neighbour index
# (X_stored[neighbors[0]]), and CSR is the scipy format suited to row slicing.
X_stored = X_sparse.tocsr()

In [8]:
from sklearn.decomposition import TruncatedSVD

In [9]:
# Reduce each sparse row to a 128-dimensional dense embedding.
# The default TruncatedSVD solver is randomized, so a fixed random_state is needed
# for the embedding (and every metric computed from it) to be reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=128, random_state=42)
X_dense = svd.fit_transform(X_sparse)

In [10]:
from sklearn.neighbors import NearestNeighbors

In [11]:
# Neighbourhood size shared by the exact (sklearn) and approximate (Faiss) searches below.
num_neighbours = 256

# Exact nearest-neighbour search over the SVD embeddings using cosine distance.
knn = NearestNeighbors(metric="cosine", n_neighbors=num_neighbours)
knn.fit(X_dense)

In [13]:
# Validation set: the first 3000 records of shard 7.
with open(get_shard_path(7)) as valid_file:
    # zip() stops once range(3000) is exhausted, so the remainder of the shard is
    # neither read nor JSON-decoded, and the file is closed on leaving the block.
    valid_data = [json.loads(line) for _, line in zip(range(3000), valid_file)]

In [14]:
# Score the exact cosine kNN recommender on the validation records
# (normalized average precision at k=30, averaged over records).
m_ap = []
for js in tqdm(valid_data):
    # Embed the record's transaction history with the fitted SVD.
    row_dense = svd.transform(make_coo_row(js["transaction_history"], product_encoder))
    # kneighbors returns (distances, indices); only the indices of the single query row are used.
    _distances, neighbor_ids = knn.kneighbors(row_dense, n_neighbors=num_neighbours)
    # Column-wise sum over the neighbours' stored rows gives one score per column.
    neighbor_rows = X_stored[neighbor_ids[0]]
    scores = np.asarray(neighbor_rows.sum(axis=0)).ravel()
    # Highest score first; the 30 best columns become the recommendation list.
    ranked_columns = np.argsort(-scores)
    recommended_items = product_encoder.toPid(ranked_columns[:30])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

  0%|          | 0/3000 [00:00<?, ?it/s]

0.09338607503819348


In [None]:
import os

# Create the output directory for the saved artifacts. Pure Python instead of a
# shell escape so the cell also works where `mkdir -p` is unavailable;
# exist_ok=True keeps it idempotent on re-runs.
os.makedirs('../tmp/u2u', exist_ok=True)

In [None]:
import pickle

# Persist the CSR matrix, the fitted SVD and the fitted kNN index.
# Each file is written inside a context manager so it is flushed and closed
# before the directory listing in the next cell; the bare open() calls leaked
# the handles.
with open('../tmp/u2u/X_stored.pkl', "wb") as out_file:
    pickle.dump(X_stored, out_file)
with open('../tmp/u2u/svd.pkl', "wb") as out_file:
    pickle.dump(svd, out_file)
with open('../tmp/u2u/knn.pkl', "wb") as out_file:
    pickle.dump(knn, out_file)

In [None]:
# List the artifact directory with human-readable sizes to check what was written.
! ls -lah ../tmp/u2u

# FAISS
[Вики faiss](https://github.com/facebookresearch/faiss/wiki)

In [16]:
import faiss

In [17]:
# Faiss operates on C-contiguous float32 matrices; make the layout explicit rather
# than relying on whatever dtype scikit-learn returned.
X_dense_f32 = np.ascontiguousarray(X_dense, dtype=np.float32)

# IVF256,PQ32: inverted-file index with 256 lists and product-quantised codes.
# The dimensionality is taken from the data so it cannot drift from the SVD's n_components.
# NOTE(review): inner product on unnormalized vectors is not the cosine metric used by
# the sklearn kNN above; if cosine is intended, L2-normalize both the indexed vectors
# and the queries (faiss.normalize_L2) - confirm.
index = faiss.index_factory(X_dense_f32.shape[1], "IVF256,PQ32", faiss.METRIC_INNER_PRODUCT)
index.train(X_dense_f32)
index.add(X_dense_f32)

In [20]:
# Number of inverted lists (out of the 256 in the IVF index) visited per query;
# higher values trade search speed for recall.
index.nprobe = 32

[Индексы в faiss](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes)

In [21]:
# Score the approximate (Faiss) neighbour search with the same protocol as the
# sklearn kNN evaluation: normalized average precision at k=30, averaged over records.
m_ap = []
for js in tqdm(valid_data):
    row_sparse = make_coo_row(js["transaction_history"], product_encoder)
    # Faiss expects C-contiguous float32 queries.
    row_dense = np.ascontiguousarray(svd.transform(row_sparse), dtype=np.float32)
    # search returns (distances, labels); labels has one row per query.
    knn_result = index.search(row_dense, num_neighbours)
    neighbors = knn_result[1]
    # Faiss pads the labels with -1 when the probed lists contain fewer than
    # num_neighbours vectors. Indexing X_stored with -1 would silently add the
    # last stored row to the scores, so drop the padding first.
    found = neighbors[0][neighbors[0] >= 0]
    # Column-wise sum over the neighbours' stored rows gives one score per column.
    scores = np.asarray(X_stored[found].sum(axis=0)[0]).flatten()
    top_indices = np.argsort(-scores)
    recommended_items = product_encoder.toPid(top_indices[:30])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

  0%|          | 0/3000 [00:00<?, ?it/s]

0.08529721925490799


In [None]:
# ???

In [None]:
# Serialize the trained and populated Faiss index next to the pickled artifacts.
faiss.write_index(index, '../tmp/u2u/faiss.idx')

In [None]:
# List the artifact directory again to check the saved Faiss index and its size.
! ls -lah ../tmp/u2u