In [11]:
import numpy as np
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix

In [2]:
item_features = load_npz('data/item_features_matrix.npz')
item_features.shape

(383771, 4349)

## 1. Рекомендации, основанные на описании товаров
### воспользуемся HNSW

In [4]:
item_ids = item_features.getcol(0).toarray().reshape(383771) #id товаров

In [5]:
#понизим кол-во признаков для каждого товара до 300
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=300, n_iter=7, random_state=42)
item_features = svd.fit_transform(item_features)

In [6]:
import hnswlib
dim = 300
num_items = 383771

In [7]:
HNSW = hnswlib.Index(space = 'cosine', dim = dim)
HNSW.init_index(max_elements = num_items, ef_construction = 100, M = 16)

In [8]:
HNSW.add_items(item_features, item_ids, num_threads = 4)

In [9]:
labels, distances = HNSW.knn_query(item_features, k = 5)

In [11]:
id1, id2, M = [], [], []

for i in range(num_items):
    for k in range(5):     
        id1.append(int(item_ids[i]))
        id2.append(int(labels[i][k]))
        M.append(distances[i][k])
    print(i, end='\r')

383770

In [12]:
df = pd.DataFrame({'id1':id1, 'id2':id2, 'distance':M})

In [13]:
df.head(5)

Unnamed: 0,id1,id2,distance
0,335486,76108,0.0
1,335486,99413,0.0
2,335486,114077,0.0
3,335486,164597,0.0
4,335486,201064,0.0


In [14]:
df.to_csv('ilin_pavel_1.csv', index = False)
!cat ilin_pavel_1.csv | head

id1,id2,distance
335486,76108,0.0
335486,99413,0.0
335486,114077,0.0
335486,164597,0.0
335486,201064,0.0
322530,76108,0.0
322530,99413,0.0
322530,114077,0.0
322530,164597,0.0
cat: write error: Broken pipe


## 2. Рекомендации, основанные на коллаборативной фильтрации

In [34]:
df = pd.read_csv('data/Interactions.csv')

In [35]:
df.head(5)

Unnamed: 0,vid,product_id,page_type
0,0,0,PRODUCT
1,1,1,PRODUCT
2,3,3,CART
3,4,4,PURCHASE
4,5,5,PRODUCT


In [37]:
print(df.product_id.unique())
print(len(df.product_id.unique()))

[     0      1      3 ..., 509394 509395 509399]
383771


In [43]:
users = df.vid.unique()
print(len(users))

917486


In [40]:
#соберем разряженную матрицу взаимодейтсвий
#PRODUCT - просмотр вес 1
#CART - корзина вес 2
#PURCHASE - покупка вес 3
action_dict = {'PRODUCT':1, 'CART':2, 'PURCHASE':3}
df['page_type'] = df['page_type'].map(action_dict)
data = df.product_id.values
row = df.vid.values
col = df.product_id
I = coo_matrix((data, (row, col)), shape = (975006, 509400))

In [None]:
for user in users:
    metrics = cosine_similarity(I.getrow(user), I).reshape(-1,1)
    I *= metrics