### Co-occurrence matrix

In [None]:
import glob
import os
from datetime import timedelta
from itertools import combinations

import numpy as np
import pandas as pd
import polars as pl
from scipy.sparse import coo_matrix, lil_matrix
from tqdm import tqdm

In [5]:
# Load delivered apparel orders created after 2025-05-21 from every parquet
# shard, reading only the four columns this notebook uses.
# glob returns paths in filesystem-dependent order; sorting makes the
# concatenation order of `orders` the same on every run/machine.
order_files = sorted(glob.glob('data/final_apparel_orders_data_07/*/*.parquet'))
if not order_files:
    # Fail early with an actionable message instead of an error from
    # pl.concat() on an empty list further down.
    raise FileNotFoundError(
        "No parquet files matched 'data/final_apparel_orders_data_07/*/*.parquet'"
    )

all_dfs = []
for file_path in tqdm(order_files, desc='Processing files'):
    df = pl.read_parquet(
        file_path,
        columns=['user_id', 'item_id', 'created_timestamp', 'last_status'],
    ).filter(
        (pl.col("last_status") == "delivered_orders")
        & (pl.col('created_timestamp') > pl.datetime(2025, 5, 21))
    )
    all_dfs.append(df)

orders = pl.concat(all_dfs)

Processing files: 100%|██████████| 33/33 [00:00<00:00, 36.53it/s] 


#### Получение матрицы

In [8]:
# Build one basket per (user, calendar day): the distinct items the user
# ordered on that day. Single-item baskets carry no co-occurrence signal and
# are dropped.
orders_with_day = orders.with_columns(
    pl.col("created_timestamp").dt.date().alias("date_window")
)
daily_baskets = orders_with_day.group_by(["user_id", "date_window"]).agg(
    pl.col("item_id").unique().alias("items")
)
basket_size = pl.col("items").list.len()
user_item_sets = daily_baskets.filter(basket_size > 1)

In [10]:
# Convert the polars baskets to pandas; the cells below read the `items`
# column through the pandas API (explode / plain iteration).
user_item_sets_pd = user_item_sets.to_pandas()

In [11]:
# Preview the (user_id, date_window, items) baskets.
user_item_sets_pd

Unnamed: 0,user_id,date_window,items
0,4918181,2025-06-02,"[82203591, 247059816, 282827968]"
1,1383031,2025-06-13,"[117426377, 174821200, 310976689]"
2,571280,2025-05-31,"[20587943, 266650402]"
3,4972881,2025-05-23,"[45971975, 122287509, 338922730]"
4,3362070,2025-06-22,"[53144500, 69704539, 93914016, 171451725, 2768..."
...,...,...,...
519320,104581,2025-05-22,"[120325561, 206589773, 308572668]"
519321,186461,2025-05-21,"[74014662, 204532885, 312107635]"
519322,4315781,2025-06-18,"[170462492, 313839177]"
519323,589181,2025-06-02,"[46620845, 223920102]"


In [None]:
# NOTE(review): `user_item_sets_5_day_window` is not assigned in any visible
# cell of this notebook. On a fresh kernel this line raises NameError unless
# the name is defined elsewhere — confirm, and remove this cell if it is a
# leftover from an earlier 5-day-window experiment.
user_item_sets_5_day_window

Unnamed: 0,user_id,5_day_window,item_id
1,1,2025-01-22,"{187012969, 204338469}"
3,1,2025-02-16,"{182622868, 264778815}"
4,1,2025-05-17,"{336690475, 42233071}"
5,30,2025-02-06,"{336800348, 175117598}"
6,30,2025-02-21,"{309473154, 118090266}"
...,...,...,...
2107929,5061670,2025-02-26,"{93838194, 334669811, 7253950, 292406902, 3266..."
2107934,5061671,2025-03-23,"{234800754, 220831565}"
2107941,5061680,2025-01-27,"{203055740, 72516606}"
2107942,5061680,2025-02-11,"{139938162, 183348475}"


In [13]:
# Give every item that appears in at least one multi-item basket a dense
# row/column index (in order of first appearance) and keep the reverse map.
item_ids_from_windows = user_item_sets_pd['items'].explode().unique()
item_to_idx = {item: idx for idx, item in enumerate(item_ids_from_windows)}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}
num_items = len(item_ids_from_windows)

# Collect the index pair of every unordered item pair in every basket.
# Building the matrix once from these coordinates avoids the very slow
# element-by-element `matrix[i, j] += 1` updates on a LIL matrix.
pair_rows = []
pair_cols = []
for items_in_window in user_item_sets_pd['items']:
    window_indices = [item_to_idx[item] for item in items_in_window]
    for idx1, idx2 in combinations(window_indices, 2):
        pair_rows.append(idx1)
        pair_cols.append(idx2)

rows_arr = np.asarray(pair_rows, dtype=np.int64)
cols_arr = np.asarray(pair_cols, dtype=np.int64)

# Each pair contributes 1 to both (i, j) and (j, i) so the matrix is symmetric.
# Repeated coordinates are summed when the COO matrix is converted to CSR,
# which yields the co-occurrence counts.
cooccurrence_sparse = coo_matrix(
    (
        np.ones(2 * len(rows_arr), dtype=np.int32),
        (
            np.concatenate([rows_arr, cols_arr]),
            np.concatenate([cols_arr, rows_arr]),
        ),
    ),
    shape=(num_items, num_items),
)

cooccurrence_csr = cooccurrence_sparse.tocsr()

In [None]:
# For every item, collect up to `k` items it co-occurs with most often,
# ordered by descending co-occurrence count.
item_ids_list = []
neighbors_list = []

k = 15

# Read rows straight from the CSR arrays: row `i` occupies
# indices/data[indptr[i]:indptr[i + 1]]. This avoids constructing a new
# sparse-matrix object for every item.
csr_indptr = cooccurrence_csr.indptr
csr_indices = cooccurrence_csr.indices
csr_data = cooccurrence_csr.data

for item in item_ids_from_windows:
    idx = item_to_idx[item]

    row_start, row_end = csr_indptr[idx], csr_indptr[idx + 1]
    nonzero_indices = csr_indices[row_start:row_end]
    nonzero_values = csr_data[row_start:row_end]

    # An item is never its own neighbour.
    mask = nonzero_indices != idx
    nonzero_indices = nonzero_indices[mask]
    nonzero_values = nonzero_values[mask]

    if len(nonzero_values) <= k:
        # Few enough candidates: sort them all. An empty row simply yields
        # an empty ordering and therefore an empty neighbour list.
        order = np.argsort(-nonzero_values)
    else:
        # Pick the k largest counts without a full sort, then order only those.
        candidates = np.argpartition(-nonzero_values, k)[:k]
        order = candidates[np.argsort(-nonzero_values[candidates])]

    neighbors = [int(idx_to_item[neighbor_idx]) for neighbor_idx in nonzero_indices[order]]

    item_ids_list.append(item)
    neighbors_list.append(neighbors)


cooccurrence_neighbors_df = pd.DataFrame({
    'item_id': item_ids_list,
    'neighbors': neighbors_list
})

In [19]:
# Persist the neighbour table. The cache directory is created first because
# writing the parquet file fails when the parent directory does not exist.
os.makedirs('candidate_cache', exist_ok=True)
cooccurrence_neighbors_df.to_parquet(
    'candidate_cache/cooccurrence_neighbors.parquet',
    index=False,
    compression='snappy',
)

#### Загружаем готовую. Окно - 1 день

In [None]:
# Reload the cached neighbour table from the same location the save cell
# writes to ('candidate_cache/'); the previous bare filename pointed at the
# working directory and would load a different (or missing) file.
cooccurrence_neighbors_df = pd.read_parquet('candidate_cache/cooccurrence_neighbors.parquet')

In [18]:
# Preview the item -> neighbours table.
cooccurrence_neighbors_df

Unnamed: 0,item_id,neighbors
0,82203591,"[3848213, 247059816, 197225976, 282827968, 141..."
1,247059816,"[82203591, 282827968, 79949320, 305514457, 315..."
2,282827968,"[82203591, 247059816, 81141914, 110100062, 290..."
3,117426377,"[174821200, 310976689, 188218763]"
4,174821200,"[117426377, 310976689]"
...,...,...
733391,204532885,"[74014662, 312107635]"
733392,312107635,"[74014662, 204532885]"
733393,170462492,[313839177]
733394,313839177,[170462492]
